In [1]:
import pandas as pd
import numpy as np
from functions import *

Municipal tourism capacity (2013-2024) (https://esploradati.istat.it/databrowser/#/it/dw/categories/IT1,Z0700SER,1.0/SER_TOURISM/SER_TOURISM_RELATED_FILES)

In [2]:
file_path = 'C:/Users/HP/Desktop/Traineeship/data/tourism/DCSC_Capacity_ of_tourist_accommodation_municipal/Capacità comunale 2013-2024.xlsx'

xls = pd.ExcelFile(file_path)

sheets = pd.read_excel(xls, sheet_name=None)

print(sheets.keys())

dict_keys(['Index', '2024', '2023', '2022', '2021', '2020', '2019', '2018', '2017', '2016', '2015', '2014', '2013'])


Merging .xlsx sheets

In [3]:
# New column names
col_names = [
    "region","reg_code","prov","prov_code","mun_name", "mun_code", "mun_istat",
    "five_stars_count","five_stars_beds","five_stars_bedrooms","five_stars_bathrooms",
    "four_stars_count","four_stars_beds","four_stars_bedrooms","four_stars_bathrooms",
    "three_stars_count","three_stars_beds","three_stars_bedrooms","three_stars_bathrooms",
    "two_stars_count","two_stars_beds","two_stars_bedrooms","two_stars_bathrooms",
    "one_stars_count","one_stars_beds","one_stars_bedrooms","one_stars_bathrooms",
    "tourism_residence_count","tourism_residence_beds","tourism_residence_bedrooms","tourism_residence_bathrooms",
    "camp_sites_count","camp_sites_beds","holiday_dwelling_count","holiday_dwelling_beds",
    "farmhouses_count","farmhouses_beds","hostels_count","hostels_beds",
    "tourist_dormitories_count","tourist_dormitories_beds","mountain_huts_count","mountain_huts_beds",
    "others_count","others_beds","bnb_counts","bnb_beds",
    "tot_acc_count","tot_acc_beds","tot_count","tot_beds"
]

# Rows to skip
rows_to_skip = 4

# Columns to drop
cols_to_drop = [31, 32, 33, 34]

# Exclude the 'Index' sheet
sheets_to_process = [s for s in xls.sheet_names if s.lower() != 'index']

df_all = merge_xlsx(file_path, sheets_to_process, cols_to_drop, col_names, rows_to_skip)

df_all.info()

Successfully combined 12 sheets. Final shape: (96961, 52)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96961 entries, 0 to 96960
Data columns (total 52 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   region                       96893 non-null  object 
 1   reg_code                     96890 non-null  float64
 2   prov                         96890 non-null  object 
 3   prov_code                    96890 non-null  float64
 4   mun_name                     96890 non-null  object 
 5   mun_code                     96783 non-null  float64
 6   mun_istat                    95594 non-null  float64
 7   five_stars_count             96902 non-null  float64
 8   five_stars_beds              96902 non-null  float64
 9   five_stars_bedrooms          96902 non-null  float64
 10  five_stars_bathrooms         96902 non-null  float64
 11  four_stars_count             96902 non-null  float64
 12  four_stars_beds 

Data cleaning

In [4]:
# Delete aggregate rows
df_clean = df_all[~df_all['mun_name'].isin(['TOTALE', 'TOTALE ITALIA'])]

# Print number of deleted rows
print('Number of deleted rows:', len(df_all) - len(df_clean))

Number of deleted rows: 1308


In [5]:
# Place 'year' as the first column
df_clean.insert(0, 'year', df_clean.pop('year'))

# Convert 'year' to int for filtering
df_clean['year'] = df_clean['year'].astype(int)

# Select the period 2014-2024
df_clean = df_clean[df_clean['year'] >= 2014]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['year'] = df_clean['year'].astype(int)


In [6]:
# Convert floats to ints
int_columns = df_clean.loc[:, ~df_clean.columns.isin(['year', 'region', 'mun_name','prov'])]

df_clean[int_columns.columns] = int_columns.fillna(0).astype(int)

The values of mun_istat should have a length of 6.

In [7]:
# Add missing zeroes 
add_zeroes(df_clean, ['mun_istat'], 6)

Unnamed: 0,year,region,reg_code,prov,prov_code,mun_name,mun_code,mun_istat,five_stars_count,five_stars_beds,...,mountain_huts_count,mountain_huts_beds,others_count,others_beds,bnb_counts,bnb_beds,tot_acc_count,tot_acc_beds,tot_count,tot_beds
0,2024,PIEMONTE,10,TORINO,1,Agliè,1,001001,0,0,...,0,0,0,0,4,24,10,82,10,82
1,2024,PIEMONTE,10,TORINO,1,Airasca,2,001002,0,0,...,0,0,0,0,1,6,2,12,2,12
2,2024,PIEMONTE,10,TORINO,1,Ala di Stura,3,001003,0,0,...,0,0,0,0,2,10,4,78,7,184
3,2024,PIEMONTE,10,TORINO,1,Albiano d'Ivrea,4,001004,0,0,...,0,0,0,0,1,2,3,35,3,35
4,2024,PIEMONTE,10,TORINO,1,Almese,6,001006,0,0,...,0,0,0,0,1,2,3,22,3,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88750,2014,SARDEGNA,200,CARBONIA-IGLESIAS,107,Sant'Antioco,20,107020,0,0,...,0,0,0,0,22,94,32,979,40,1292
88751,2014,SARDEGNA,200,CARBONIA-IGLESIAS,107,Tratalias,21,107021,0,0,...,0,0,0,0,3,16,8,85,8,85
88752,2014,SARDEGNA,200,CARBONIA-IGLESIAS,107,Villamassargia,22,107022,0,0,...,0,0,0,0,4,21,7,51,7,51
88753,2014,SARDEGNA,200,CARBONIA-IGLESIAS,107,Villaperuccio,23,107023,0,0,...,0,0,0,0,2,10,3,16,3,16


Municipal tourism presence (2014-2024) (https://esploradati.istat.it/databrowser/#/it/dw/categories/IT1,Z0700SER,1.0/SER_TOURISM/SER_TOURISM_RELATED_FILES)

In [8]:
file_path = 'C:/Users/HP/Desktop/Traineeship/data/tourism/DCSC_Occupancy_in_collective_accommodation/2. Dati comunali 2014-2024.xlsx'

xls = pd.ExcelFile(file_path)

sheets = pd.read_excel(xls, sheet_name=None)

print(sheets.keys())

dict_keys(['Indice-Index', '2024', '2023', '2022', '2021', '2020', '2019', '2018', '2017', '2016', '2015', '2014', 'Dati Mensili-Monthly data', 'Classificazione', 'Brand', 'Comuni Classificazione-Brand'])


Merging .xlsx sheets

In [9]:
# New column names
col_names = [
    'reg_code','region','prov_code','prov','mun_name','mun_istat','tot_arrivals_residents','tot_arrivals_foreigners',
    'tot_arrivals','arrivals_hotel_residents','arrivals_hotel_foreigners','tot_arrivals_hotel','arrivals_acc_residents',
    'arrivals_acc_foreigners','tot_arrivals_acc','tot_nights_residents','tot_nights_foreigners','tot_nights',
    'nights_hotel_residents','nights_hotel_foreigners','tot_nights_hotel','nights_acc_residents','nights_acc_foreigners',
    'tot_nights_acc'
]

skiprows = 5

# Columns to drop
cols_to_drop = [6,16]

# Exclude useless sheets
excluded_sheets = ['indice-index', 'dati mensili-monthly data', 
                   'classificazione', 'brand', 'comuni classificazione-brand']

sheets_to_process = [
    s for s in xls.sheet_names 
    if s.lower() not in excluded_sheets
]

df_all_1 = merge_xlsx(file_path, sheets_to_process, cols_to_drop, col_names, skiprows)

df_all_1.info()

Successfully combined 11 sheets. Final shape: (42025, 25)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42025 entries, 0 to 42024
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   reg_code                   42000 non-null  object 
 1   region                     41970 non-null  object 
 2   prov_code                  41792 non-null  object 
 3   prov                       41729 non-null  object 
 4   mun_name                   40541 non-null  object 
 5   mun_istat                  40237 non-null  object 
 6   tot_arrivals_residents     41970 non-null  float64
 7   tot_arrivals_foreigners    41970 non-null  object 
 8   tot_arrivals               41970 non-null  float64
 9   arrivals_hotel_residents   41970 non-null  object 
 10  arrivals_hotel_foreigners  41970 non-null  object 
 11  tot_arrivals_hotel         41970 non-null  object 
 12  arrivals_acc_residents     41970 non-null  o

Data cleaning

In [10]:
df_clean_1 = df_all_1.copy()

In [11]:
# drop rows with missing mun_istat
df_clean_1['mun_istat'] = df_clean_1['mun_istat'].replace(r'^\s*$', np.nan, regex=True)

df_clean_1 = df_clean_1.dropna(subset=['mun_istat'])

In [12]:
# add missing zeroes to mun_istat
df_clean_1['mun_istat'] = df_clean_1['mun_istat'].astype(float).astype(int)
add_zeroes(df_clean_1, ['mun_istat'], 6)

Unnamed: 0,reg_code,region,prov_code,prov,mun_name,mun_istat,tot_arrivals_residents,tot_arrivals_foreigners,tot_arrivals,arrivals_hotel_residents,...,tot_nights_residents,tot_nights_foreigners,tot_nights,nights_hotel_residents,nights_hotel_foreigners,tot_nights_hotel,nights_acc_residents,nights_acc_foreigners,tot_nights_acc,year
0,010,PIEMONTE,1.0,TORINO,Agliè,001001,163.0,144.0,307.0,0,...,259.0,235.0,494.0,0,0,0,259,235,494,2024
1,010,PIEMONTE,1.0,TORINO,Ala di Stura,001003,1825.0,81.0,1906.0,1240,...,6542.0,148.0,6690.0,4323,78,4401,2219,70,2289,2024
2,010,PIEMONTE,1.0,TORINO,Albiano d'Ivrea,001004,329.0,224.0,553.0,0,...,1109.0,518.0,1627.0,0,0,0,1109,518,1627,2024
3,010,PIEMONTE,1.0,TORINO,Almese,001006,56.0,29.0,85.0,0,...,193.0,71.0,264.0,0,0,0,193,71,264,2024
4,010,PIEMONTE,1.0,TORINO,Alpignano,001008,380.0,253.0,633.0,0,...,916.0,622.0,1538.0,0,0,0,916,622,1538,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42011,200,SARDEGNA,107,CARBONIA-IGLESIAS,San Giovanni Suergiu,107017,706.0,204.0,910.0,(*),...,2673.0,975.0,3648.0,(*),(*),(*),(*),(*),(*),2014
42012,200,SARDEGNA,107,CARBONIA-IGLESIAS,Sant'Anna Arresi,107019,9069.0,1967.0,11036.0,6471,...,52507.0,5347.0,57854.0,42128,3533,45661,10379,1814,12193,2014
42013,200,SARDEGNA,107,CARBONIA-IGLESIAS,Sant'Antioco,107020,7578.0,5256.0,12834.0,2993,...,28926.0,21336.0,50262.0,8273,1883,10156,20653,19453,40106,2014
42014,200,SARDEGNA,107,CARBONIA-IGLESIAS,Tratalias,107021,242.0,54.0,296.0,(*),...,1364.0,247.0,1611.0,(*),(*),(*),(*),(*),(*),2014


In [13]:
# '(*)' and ' (*) ' were used instead of NaNs, '-' instead of 0s. It must be sobstituted before proceding.
df_clean_1 = df_clean_1.replace({'(*)': np.nan, ' (*) ': np.nan, '-': 0})

  df_clean_1 = df_clean_1.replace({'(*)': np.nan, ' (*) ': np.nan, '-': 0})


In [14]:
# drop duplicate columns
col_to_drop = ['region','reg_code','prov','prov_code','mun_name']

df_clean_1 = df_clean_1.drop(columns = col_to_drop)

Merge df_clean and df_clean_1 on ['mun_istat', 'year']

In [15]:
# Ensure merging columns are of the same type in df_clean_1 is integer for merging
df_clean_1['year'] = df_clean_1['year'].astype(int)
df_clean['year'] = df_clean['year'].astype(int)

df_clean_1['mun_istat'] = df_clean_1['mun_istat'].astype(object)
df_clean['mun_istat'] = df_clean['mun_istat'].astype(object)

# Keep rows from df_clean (municipalities with 0 arrivals but with tourist infrastructures are saved)
df_merged = pd.merge(df_clean, df_clean_1, on=['year', 'mun_istat'], how='left')

In [16]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87560 entries, 0 to 87559
Data columns (total 70 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   year                         87560 non-null  int64  
 1   region                       87505 non-null  object 
 2   reg_code                     87560 non-null  int64  
 3   prov                         87502 non-null  object 
 4   prov_code                    87560 non-null  int64  
 5   mun_name                     87491 non-null  object 
 6   mun_code                     87560 non-null  int64  
 7   mun_istat                    87560 non-null  object 
 8   five_stars_count             87560 non-null  int64  
 9   five_stars_beds              87560 non-null  int64  
 10  five_stars_bedrooms          87560 non-null  int64  
 11  five_stars_bathrooms         87560 non-null  int64  
 12  four_stars_count             87560 non-null  int64  
 13  four_stars_beds 

In [17]:
# Keep only column of interest
new_order = [
    'year', "region", "mun_name", "mun_istat", "five_stars_count", "four_stars_count", "three_stars_count",
    "two_stars_count", "one_stars_count", "holiday_dwelling_count", "holiday_dwelling_beds", "tot_count", "tot_acc_beds", "tot_beds",
    "tot_arrivals_residents", "tot_arrivals_foreigners", "tot_arrivals","tot_nights_residents","tot_nights_foreigners","tot_nights"
]

df_merged = df_merged[new_order]

# delete rows with missing mun_name
df_merged = df_merged.dropna(subset = 'mun_name')

Impute 0s instead of nans in tourism cols (many municipalities have no tourism flow / infrastructures)

In [18]:
col = [
    'five_stars_count',      
    'four_stars_count',      
    'three_stars_count',     
    'two_stars_count',       
    'one_stars_count',       
    'holiday_dwelling_count', 
    'holiday_dwelling_beds', 
    'tot_count',             
    'tot_acc_beds',          
    'tot_beds',              
    'tot_arrivals_residents',
    'tot_arrivals_foreigners', 
    'tot_arrivals',          
    'tot_nights_residents',  
    'tot_nights_foreigners', 
    'tot_nights'
]

df_merged[col] = df_merged[col].replace({np.nan: 0})

In [19]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 87491 entries, 0 to 87558
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     87491 non-null  int64  
 1   region                   87491 non-null  object 
 2   mun_name                 87491 non-null  object 
 3   mun_istat                87491 non-null  object 
 4   five_stars_count         87491 non-null  int64  
 5   four_stars_count         87491 non-null  int64  
 6   three_stars_count        87491 non-null  int64  
 7   two_stars_count          87491 non-null  int64  
 8   one_stars_count          87491 non-null  int64  
 9   holiday_dwelling_count   87491 non-null  int64  
 10  holiday_dwelling_beds    87491 non-null  int64  
 11  tot_count                87491 non-null  int64  
 12  tot_acc_beds             87491 non-null  int64  
 13  tot_beds                 87491 non-null  int64  
 14  tot_arrivals_residents   87

Save the dataset

In [20]:
# Save in .parquet to preserve data types
df_merged.to_parquet('datasets/mun_tourism.parquet', index=False)