In [1]:
import pandas as pd
import numpy as np
import glob
import os
import gc
import hashlib
from functions import normalize_name

Load the dataset (https://esploradati.istat.it/databrowser/#/it/dw/categories/IT1,POP,1.0/POP_MIGRATIONS/DCIS_MIGRAZIONI/IT1,28_185_DF_DCIS_MIGRAZIONI_4,1.0)

In [2]:
folder = "C:/Users/HP/Desktop/Traineeship/data/immigration"

# Grab all CSV files
all_files = glob.glob(os.path.join(folder, "*.csv"))

Read .csv files and concatenate them in df_imm

In [3]:
dfs = []

for i,f in enumerate(all_files):
    
    # Read CSV, skip first title line
    df = pd.read_csv(f, sep=',')

    dfs.append(df)

In [4]:
df_imm = pd.concat(dfs, ignore_index=True)

In [5]:
df_imm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9103 entries, 0 to 9102
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   FREQ                        9103 non-null   object 
 1   Frequenza                   9103 non-null   object 
 2   REF_AREA                    9103 non-null   object 
 3   Territorio di origine       9103 non-null   object 
 4   DATA_TYPE                   9103 non-null   object 
 5   Indicatore                  9103 non-null   object 
 6   CHANGE_OF_RESIDENCE         9103 non-null   object 
 7   Tipo di trasferimento       9103 non-null   object 
 8   CITIZENSHIP                 9103 non-null   object 
 9   Cittadinanza                9103 non-null   object 
 10  SEX                         9103 non-null   int64  
 11  Sesso                       9103 non-null   object 
 12  AGE                         9103 non-null   object 
 13  Età                         9103 

In [6]:
# drop values for TERRITORY_NEXT_RESID = ITC2 (duplicate with Aosta province = ITC20)
df_imm = df_imm[df_imm['TERRITORY_NEXT_RESID'] != 'ITC2']

Select relavant columns and translate

In [7]:
keep = [
    'TIME_PERIOD',
    'Tipo di trasferimento',
    'Territorio di destinazione',
    'Osservazione'
]

df_imm = df_imm[keep]

new_name = {
    'TIME_PERIOD' : 'year',
    'Tipo di trasferimento' : 'from_where',
    'Territorio di destinazione' : 'prov',
    'Osservazione' : 'value'
}

df_imm = df_imm.rename(columns = new_name)

Convert province name to code

In [8]:
prov_to_keep = [
'\'L"\'Aquila\'',
'\'Reggio nell"\'Emilia\'',
'\'Valle d"\'Aosta / Vallée d"\'Aoste\'',
'Agrigento',
'Alessandria',
'Ancona',
'Arezzo',
'Ascoli Piceno',
'Asti',
'Avellino',
'Bari',
'Belluno',
'Benevento',
'Bergamo',
'Biella',
'Bologna',
'Bolzano / Bozen',
'Brescia',
'Brindisi',
'Cagliari',
'Caltanissetta',
'Campobasso',
'Caserta',
'Catania',
'Catanzaro',
'Chieti',
'Como',
'Cosenza',
'Cremona',
'Crotone',
'Cuneo',
'Enna',
'Ferrara',
'Firenze',
'Foggia',
'Forlì-Cesena',
'Frosinone',
'Genova',
'Gorizia',
'Grosseto',
'Imperia',
'Isernia',
'La Spezia',
'Latina',
'Lecce',
'Lecco',
'Livorno',
'Lodi',
'Lucca',
'Macerata',
'Mantova',
'Massa-Carrara',
'Matera',
'Messina',
'Milano',
'Modena',
'Napoli',
'Novara',
'Nuoro',
'Oristano',
'Padova',
'Palermo',
'Parma',
'Pavia',
'Perugia',
'Pesaro e Urbino',
'Pescara',
'Piacenza',
'Pisa',
'Pistoia',
'Pordenone',
'Potenza',
'Prato',
'Ragusa',
'Ravenna',
'Reggio di Calabria',
'Rieti',
'Rimini',
'Roma',
'Rovigo',
'Salerno',
'Sassari',
'Savona',
'Siena',
'Siracusa',
'Sondrio',
'Taranto',
'Teramo',
'Terni',
'Torino',
'Trapani',
'Trento',
'Treviso',
'Trieste',
'Udine',
'Varese',
'Venezia',
'Verbano-Cusio-Ossola',
'Vercelli',
'Verona',
'Vibo Valentia',
'Vicenza',
'Viterbo'
]

df_imm = df_imm[df_imm['prov'].isin(prov_to_keep)]

prov_new_name = {
 '\'L"\'Aquila\'' :  'AQ',
 '\'Reggio nell"\'Emilia\'' :  'RE',
 '\'Valle d"\'Aosta / Vallée d"\'Aoste\'' :  'AO',
 'Agrigento' : 'AG',
 'Alessandria' : 'AL',
 'Ancona' :  'AN',
 'Arezzo' : 'AR',
 'Ascoli Piceno' :  'AP',
 'Asti' : 'AT',
 'Avellino' : 'AV',
 'Bari' :  'BA',
 'Belluno' :  'BL',
 'Benevento' :  'BN',
 'Bergamo' :  'BG',
 'Biella' :  'BI',
 'Bologna' :  'BO',
 'Brescia' :  'BS',
 'Brindisi' :  'BR',
 'Bolzano / Bozen' : 'BZ',
 'Cagliari' :  'CA',
 'Caltanissetta' :  'CL',
 'Campobasso' :  'CB',
 'Caserta' :  'CE',
 'Catania' :  'CT',
 'Catanzaro' :  'CZ',
 'Chieti' : 'CH',
 'Como' : 'CO',
 'Cosenza' :  'CS',
 'Cremona' : 'CR',
 'Crotone' :  'KR',
 'Cuneo' :  'CN',
 'Enna' :  'EN',
 'Ferrara' : 'FE',
 'Firenze' : 'FI',
 'Foggia' : 'FG',
 'Forlì-Cesena' :  'FO',
 'Frosinone' :  'FR',
 'Genova' :  'GE',
 'Gorizia' : 'GO',
 'Grosseto' :  'GR',
 'Imperia' : 'IM',
 'Isernia' :  'IS',
 'La Spezia' : 'SP',
 'Latina' :  'LT',
 'Lecce' :  'LE',
 'Lecco' :  'LC',
 'Livorno' :  'LI',
 'Lodi' :  'LO',
 'Lucca' :  'LU',
 'Macerata' : 'MC',
 'Mantova' :  'MN',
 'Massa-Carrara' :  'MS',
 'Matera' : 'MT',
 'Messina' :  'ME',
 'Milano' :  'MI',
 'Modena' :  'MO',
 'Napoli' : 'NA',
 'Novara' : 'NO',
 'Nuoro' :  'NU',
 'Oristano' :  'OR',
 'Padova' :  'PD',
 'Palermo' : 'PA',
 'Parma' : 'PR',
 'Pavia' : 'PV',
 'Perugia' : 'PG',
 'Pesaro e Urbino' : 'PS',
 'Pescara' : 'PE',
 'Piacenza' :  'PC',
 'Pisa' :  'PI',
 'Pistoia' : 'PT',
 'Pordenone' :  'PN',
 'Potenza' :  'PZ',
 'Prato' :  'PO',
 'Ragusa' :  'RG',
 'Ravenna' :  'RA',
 'Reggio di Calabria' :  'RC',
 'Rieti' :  'RI',
 'Rimini' : 'RN',
 'Roma' :  'RM',
 'Rovigo' :  'RO',
 'Salerno' :  'SA',
 'Sassari' :  'SS',
 'Savona' : 'SV',
 'Siena' :  'SI',
 'Siracusa' :  'SR',
 'Sondrio' :  'SO',
 'Taranto' :  'TA',
 'Teramo' :  'TE',
 'Terni' :  'TR',
 'Torino' :  'TO',
 'Trapani' :  'TP',
 'Trento' : 'TN',
 'Treviso' : 'TV',
 'Trieste' :  'TS',
 'Udine' :  'UD',
 'Varese' :  'VA',
 'Venezia' :  'VE',
 'Verbano-Cusio-Ossola' :  'VB',
 'Vercelli' :  'VC',
 'Verona' :  'VR',
 'Vibo Valentia' :  'VV',
 'Vicenza' :  'VI',
 'Viterbo' :  'VT'
}

df_imm['prov'] = df_imm['prov'].replace(prov_new_name)

Select observations for out-of-province movements

In [9]:
where_to_keep = [
    'In altra provincia della stessa regione',
    'In altre regioni',
    'Estero'
]

df_imm = df_imm[df_imm['from_where'].isin(where_to_keep)]

where_new_name = {
    'In altra provincia della stessa regione' : 'region_in', # immigrants from same region but different province
    'In altre regioni' : 'nation_in', # immigrants from other regions
    'Estero' : 'abroad_in' # immigrants from abroad
}

df_imm['from_where'] = df_imm['from_where'].replace(where_new_name)

In [10]:
years = df_imm['year'].unique()

# Create a DataFrame with region_in = 0 for AO (only one province so no in region movement - across province of the same region)
ao_region_in = pd.DataFrame({
    'year': years,
    'prov': 'AO',
    'from_where': 'region_in',
    'value': 0
})

# Append to your main df_imm
df_imm = pd.concat([df_imm, ao_region_in], ignore_index=True)

Display internal and external movements for [prov, year]

In [11]:
df_piv_in = df_imm.pivot_table(
    index=['year','prov'],
    columns='from_where',
    values='value'
).reset_index()

# Combine internal migration
# Combine internal migration, treating missing region_in as 0
df_piv_in['nation_in'] = df_piv_in['nation_in'] + df_piv_in['region_in']

# Drop the region_in column if it exists
df_piv_in = df_piv_in.drop(columns=['region_in'], errors='ignore')

In [12]:
df_piv_in.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1133 entries, 0 to 1132
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   year       1133 non-null   int64  
 1   prov       1133 non-null   object 
 2   abroad_in  1133 non-null   float64
 3   nation_in  1133 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 35.5+ KB


Load emigration data (https://esploradati.istat.it/databrowser/#/it/dw/categories/IT1,POP,1.0/POP_MIGRATIONS/DCIS_MIGRAZIONI/IT1,28_185_DF_DCIS_MIGRAZIONI_7,1.0)

In [13]:
folder = "C:/Users/HP/Desktop/Traineeship/data/emigration"

# Grab all CSV files
all_files = glob.glob(os.path.join(folder, "*.csv"))

Read .csv files and concatenate them in df_imm

In [14]:
dfs = []

for i,f in enumerate(all_files):
    
    # Read CSV, skip first title line
    df = pd.read_csv(f, sep=',')

    dfs.append(df)

In [15]:
df_emi = pd.concat(dfs, ignore_index=True)

In [16]:
df_emi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9092 entries, 0 to 9091
Data columns (total 18 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   FREQ                             9092 non-null   object 
 1   Frequency                        9092 non-null   object 
 2   REF_AREA                         9092 non-null   object 
 3   Territory of previous residence  9092 non-null   object 
 4   DATA_TYPE                        9092 non-null   object 
 5   Indicator                        9092 non-null   object 
 6   CHANGE_OF_RESIDENCE              9092 non-null   object 
 7   Change of residence              9092 non-null   object 
 8   CITIZENSHIP                      9092 non-null   object 
 9   Citizenship (DESC)               9092 non-null   object 
 10  SEX                              9092 non-null   int64  
 11  Gender                           9092 non-null   object 
 12  AGE                 

In [17]:
# drop values for TERRITORY_NEXT_RESID = ITC2 (duplicate with Aosta province = ITC20)
df_emi = df_emi[df_emi['REF_AREA'] != 'ITC2']

In [18]:
sorted(df_emi['Territory of previous residence'].unique())

['\'L"\'Aquila\'',
 '\'Reggio nell"\'Emilia\'',
 '\'Valle d"\'Aosta / Vallée d"\'Aoste\'',
 'Abruzzo',
 'Agrigento',
 'Alessandria',
 'Ancona',
 'Arezzo',
 'Ascoli Piceno',
 'Asti',
 'Avellino',
 'Bari',
 'Barletta-Andria-Trani',
 'Basilicata',
 'Belluno',
 'Benevento',
 'Bergamo',
 'Biella',
 'Bologna',
 'Bolzano / Bozen',
 'Brescia',
 'Brindisi',
 'Cagliari',
 'Calabria',
 'Caltanissetta',
 'Campania',
 'Campobasso',
 'Carbonia-Iglesias',
 'Caserta',
 'Catania',
 'Catanzaro',
 'Centro (I)',
 'Chieti',
 'Como',
 'Cosenza',
 'Cremona',
 'Crotone',
 'Cuneo',
 'Emilia-Romagna',
 'Enna',
 'Fermo',
 'Ferrara',
 'Firenze',
 'Foggia',
 'Forlì-Cesena',
 'Friuli-Venezia Giulia',
 'Frosinone',
 'Genova',
 'Gorizia',
 'Grosseto',
 'Imperia',
 'Isernia',
 'Isole',
 'Italy',
 'La Spezia',
 'Latina',
 'Lazio',
 'Lecce',
 'Lecco',
 'Liguria',
 'Livorno',
 'Lodi',
 'Lombardia',
 'Lucca',
 'Macerata',
 'Mantova',
 'Marche',
 'Massa-Carrara',
 'Matera',
 'Medio Campidano',
 'Messina',
 'Mezzogiorno',
 

Select relevant columns and translate

In [19]:
to_keep = [
    'TIME_PERIOD',
    'Change of residence',
    'Territory of previous residence',
    'Observation'
]

df_emi = df_emi[to_keep]

new_name = {
    'TIME_PERIOD' : 'year',
    'Change of residence' : 'to_where',
    'Territory of previous residence' : 'prov',
    'Observation' : 'value'
}

df_emi = df_emi.rename(columns = new_name)

Convert province name to code

In [20]:
prov_new_name = {
 '\'L"\'Aquila\'' :  'AQ',
 '\'Reggio nell"\'Emilia\'' :  'RE',
 '\'Valle d"\'Aosta / Vallée d"\'Aoste\'' :  'AO',
 'Agrigento' : 'AG',
 'Alessandria' : 'AL',
 'Ancona' :  'AN',
 'Arezzo' : 'AR',
 'Ascoli Piceno' :  'AP',
 'Asti' : 'AT',
 'Avellino' : 'AV',
 'Bari' :  'BA',
 'Belluno' :  'BL',
 'Benevento' :  'BN',
 'Bergamo' :  'BG',
 'Biella' :  'BI',
 'Bologna' :  'BO',
 'Bolzano / Bozen' : 'BZ',
 'Brescia' :  'BS',
 'Brindisi' :  'BR',
 'Cagliari' :  'CA',
 'Caltanissetta' :  'CL',
 'Campobasso' :  'CB',
 'Caserta' :  'CE',
 'Catania' :  'CT',
 'Catanzaro' :  'CZ',
 'Chieti' : 'CH',
 'Como' : 'CO',
 'Cosenza' :  'CS',
 'Cremona' : 'CR',
 'Crotone' :  'KR',
 'Cuneo' :  'CN',
 'Enna' :  'EN',
 'Ferrara' : 'FE',
 'Firenze' : 'FI',
 'Foggia' : 'FG',
 'Forlì-Cesena' :  'FO',
 'Frosinone' :  'FR',
 'Genova' :  'GE',
 'Gorizia' : 'GO',
 'Grosseto' :  'GR',
 'Imperia' : 'IM',
 'Isernia' :  'IS',
 'La Spezia' : 'SP',
 'Latina' :  'LT',
 'Lecce' :  'LE',
 'Lecco' :  'LC',
 'Livorno' :  'LI',
 'Lodi' :  'LO',
 'Lucca' :  'LU',
 'Macerata' : 'MC',
 'Mantova' :  'MN',
 'Massa-Carrara' :  'MS',
 'Matera' : 'MT',
 'Messina' :  'ME',
 'Milano' :  'MI',
 'Modena' :  'MO',
 'Napoli' : 'NA',
 'Novara' : 'NO',
 'Nuoro' :  'NU',
 'Oristano' :  'OR',
 'Padova' :  'PD',
 'Palermo' : 'PA',
 'Parma' : 'PR',
 'Pavia' : 'PV',
 'Perugia' : 'PG',
 'Pesaro e Urbino' : 'PS',
 'Pescara' : 'PE',
 'Piacenza' :  'PC',
 'Pisa' :  'PI',
 'Pistoia' : 'PT',
 'Pordenone' :  'PN',
 'Potenza' :  'PZ',
 'Prato' :  'PO',
 'Ragusa' :  'RG',
 'Ravenna' :  'RA',
 'Reggio di Calabria' :  'RC',
 'Rieti' :  'RI',
 'Rimini' : 'RN',
 'Roma' :  'RM',
 'Rovigo' :  'RO',
 'Salerno' :  'SA',
 'Sassari' :  'SS',
 'Savona' : 'SV',
 'Siena' :  'SI',
 'Siracusa' :  'SR',
 'Sondrio' :  'SO',
 'Taranto' :  'TA',
 'Teramo' :  'TE',
 'Terni' :  'TR',
 'Torino' :  'TO',
 'Trapani' :  'TP',
 'Trento' : 'TN',
 'Treviso' : 'TV',
 'Trieste' :  'TS',
 'Udine' :  'UD',
 'Varese' :  'VA',
 'Venezia' :  'VE',
 'Verbano-Cusio-Ossola' :  'VB',
 'Vercelli' :  'VC',
 'Verona' :  'VR',
 'Vibo Valentia' :  'VV',
 'Vicenza' :  'VI',
 'Viterbo' :  'VT'
}

df_emi['prov'] = df_emi['prov'].replace(prov_new_name)

In [21]:
prov_to_keep = [
 'AQ',
 'RE',
 'AO',
 'AG',
 'AL',
 'AN',
 'AR',
 'AP',
 'AT',
 'AV',
 'BA',
 'BL',
 'BN',
 'BG',
 'BI',
 'BO',
 'BS',
 'BR',
 'BZ',
 'CA',
 'CL',
 'CB',
 'CE',
 'CT',
 'CZ',
 'CH',
 'CO',
 'CS',
 'CR',
 'KR',
 'CN',
 'EN',
 'FE',
 'FI',
 'FG',
 'FO',
 'FR',
 'GE',
 'GO',
 'GR',
 'IM',
 'IS',
 'SP',
 'LT',
 'LE',
 'LC',
 'LI',
 'LO',
 'LU',
 'MC',
 'MN',
 'MS',
 'MT',
 'ME',
 'MI',
 'MO',
 'NA',
 'NO',
 'NU',
 'OR',
 'PD',
 'PA',
 'PR',
 'PV',
 'PG',
 'PS',
 'PE',
 'PC',
 'PI',
 'PT',
 'PN',
 'PZ',
 'PO',
 'RG',
 'RA',
 'RC',
 'RI',
 'RN',
 'RM',
 'RO',
 'SA',
 'SS',
 'SV',
 'SI',
 'SR',
 'SO',
 'TA',
 'TE',
 'TN',
 'TR',
 'TO',
 'TP',
 'TV',
 'TS',
 'UD',
 'VA',
 'VE',
 'VB',
 'VC',
 'VR',
 'VV',
 'VI',
 'VT'
]

df_emi = df_emi[df_emi['prov'].isin(prov_to_keep)]

Select observations for out-of-province movements

In [22]:
where_to_keep = [
    'Different province of the same region',
    'Different regions',
    'Abroad'
]

df_emi = df_emi[df_emi['to_where'].isin(where_to_keep)]

where_new_name = {
    'Different province of the same region' : 'region_out', # emigrants to same region but different province
    'Different regions' : 'nation_out', # emigrants to other regions
    'Abroad' : 'abroad_out' # emigrants abroad
}

df_emi['to_where'] = df_emi['to_where'].replace(where_new_name)

In [23]:
years = df_emi['year'].unique()

# Create a DataFrame with region_out = 0 for AO
ao_region_out = pd.DataFrame({
    'year': years,
    'prov': 'AO',
    'to_where': 'region_out',
    'value': 0
})

# Append to your main df_imm
df_emi = pd.concat([df_emi, ao_region_out], ignore_index=True)

Display internal and external movements for [prov, year]

In [24]:
df_piv_out = (
    df_emi
        .pivot(index=['year', 'prov'], columns='to_where', values='value')
        .reset_index()          # <- removes BOTH 'year' and 'from_prov' from the index
)

# total number of emigrants within Italy
df_piv_out['nation_out'] = df_piv_out['nation_out'] + df_piv_out['region_out']

df_piv_out = df_piv_out.drop(columns = 'region_out')

In [25]:
df_piv_out.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1133 entries, 0 to 1132
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   year        1133 non-null   int64 
 1   prov        1133 non-null   object
 2   abroad_out  1133 non-null   int64 
 3   nation_out  1133 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 35.5+ KB


Merge in and out datasets

In [26]:
df_merged = pd.merge(df_piv_in, df_piv_out, on = ['year','prov'], how = 'inner')

In [27]:
df_merged['prov'].nunique()

103

Save the dataset

In [28]:
df_merged.to_parquet('datasets/macro/prov_lag_pop_movements.parquet', index = False)