In [1]:
import pandas as pd
import glob

In [2]:
# get data file names
path =r'./data/population'
filenames = glob.glob(path + "/*.csv")
print(filenames)

['./data/population/2020_padro_nacionalitat_per_sexe.csv', './data/population/2018_padro_nacionalitat_per_sexe.csv', './data/population/2017_padro_nacionalitat_per_sexe.csv', './data/population/2016_padro_nacionalitat_per_sexe.csv', './data/population/2019_padro_nacionalitat_per_sexe.csv', './data/population/2015_padro_nacionalitat_per_sexe.csv']


In [3]:
# read all the *.csv inside population/
all_pop_dfs = []
for file in filenames:
    df_per_year = pd.read_csv(file)
    all_pop_dfs.append(df_per_year)
    print(f'Size of {file}: {df_per_year.shape}')

Size of ./data/population/2020_padro_nacionalitat_per_sexe.csv: (8176, 8)
Size of ./data/population/2018_padro_nacionalitat_per_sexe.csv: (7446, 8)
Size of ./data/population/2017_padro_nacionalitat_per_sexe.csv: (7300, 8)
Size of ./data/population/2016_padro_nacionalitat_per_sexe.csv: (7154, 8)
Size of ./data/population/2019_padro_nacionalitat_per_sexe.csv: (7592, 8)
Size of ./data/population/2015_padro_nacionalitat_per_sexe.csv: (7252, 8)


In [4]:
# make a super-dataframe with all the years
pop_df = pd.concat(all_pop_dfs, axis=0, ignore_index=True)
print(pop_df.shape)
pop_df.head()

(44920, 8)


Unnamed: 0,Any,Codi_Districte,Nom_Districte,Codi_Barri,Nom_Barri,Sexe,Nacionalitat,Nombre
0,2020,1,Ciutat Vella,1,el Raval,Dona,Espanya,11405
1,2020,1,Ciutat Vella,2,el Barri Gòtic,Dona,Espanya,4141
2,2020,1,Ciutat Vella,3,la Barceloneta,Dona,Espanya,4730
3,2020,1,Ciutat Vella,4,"Sant Pere, Santa Caterina i la Ribera",Dona,Espanya,6468
4,2020,2,Eixample,5,el Fort Pienc,Dona,Espanya,12900


In [5]:
# rename the columns
pop_df = pop_df.rename(columns={'Any':'year', 
                   'Codi_Districte':'district_code', 
                   'Nom_Districte':'district_name', 
                   'Codi_Barri':'neighbourhood_code', 
                   'Nom_Barri':'neighbourhood_name', 
                   'Sexe':'sex', 
                   'Nacionalitat':'nationality',
                   'Nombre': 'number'})
pop_df.head()

Unnamed: 0,year,district_code,district_name,neighbourhood_code,neighbourhood_name,sex,nationality,number
0,2020,1,Ciutat Vella,1,el Raval,Dona,Espanya,11405
1,2020,1,Ciutat Vella,2,el Barri Gòtic,Dona,Espanya,4141
2,2020,1,Ciutat Vella,3,la Barceloneta,Dona,Espanya,4730
3,2020,1,Ciutat Vella,4,"Sant Pere, Santa Caterina i la Ribera",Dona,Espanya,6468
4,2020,2,Eixample,5,el Fort Pienc,Dona,Espanya,12900


In [6]:
pop_df = pop_df.assign(sex= lambda df: 
                         df['sex'].map({'Dona':'women','Home':'men'}))
pop_df.head()

Unnamed: 0,year,district_code,district_name,neighbourhood_code,neighbourhood_name,sex,nationality,number
0,2020,1,Ciutat Vella,1,el Raval,women,Espanya,11405
1,2020,1,Ciutat Vella,2,el Barri Gòtic,women,Espanya,4141
2,2020,1,Ciutat Vella,3,la Barceloneta,women,Espanya,4730
3,2020,1,Ciutat Vella,4,"Sant Pere, Santa Caterina i la Ribera",women,Espanya,6468
4,2020,2,Eixample,5,el Fort Pienc,women,Espanya,12900


In [7]:
nationalities_dict = {'Espanya':'Spain', 'Itàlia':'Italy', 'França':'France', 'Regne Unit':'United Kingdom', 'Rússia':'Russia', 'Alemanya':'Germany',
                      'Ucraïna':'Ukraine', 'Polònia':'Poland', 'Països Baixos':'the Netherlands', 'Bulgària':'Bulgaria', 'Suècia':'Sweden',
                      'Bèlgica':'Belgium', 'Irlanda':'Ireland', 'Grècia':'Greece', 'Suïssa':'Switzerland', 'Hongria':'Hungary', 
                      "Resta països d'Europa":'Other European countries', 'Marroc, el': 'Morocco', 'Algèria':'Argelia', 'Nigèria':'Nigeria',
                      "Resta països d'Àfrica":'Other African countries', 'Colòmbia':'Colombia', 'Hondures':'Honduras', 'Veneçuela':'Venezuela',
                      'Perú':'Peru', 'Bolívia':'Bolivia', 'Brasil':'Brazil', 'Equador':'Ecuador', 'República Dominicana':'Dominican Republic',
                      'Estats Units, els':'The United States', 'Mèxic':'Mexico', 'Xile':'Chile', 'Paraguai':'Paraguay', 'Uruguai':'Uruguay',
                      "Resta països d'Àmerica":'Other American countries', 'Xina':'China', 'Filipines':'the Philippines', 'Índia':'India',
                      'Geòrgia':'Georgia', 'Armènia':'Armenia', 'Japó':'Japan', 'Turquia':'Turkey', "Resta països d'Àsia":'Other Asian countries',
                      'Apàtrides / No consta':'stateless/not stated', "Resta països d'Amèrica":'Other American countries', 
                      'Resta països Europa':'Other European countries', 'Resta països Àfrica':'Other African countries', 
                      'Resta països Amèrica':'Other American countries', 'Resta països Àsia':'Other Asian countries','Apàtrides/No consta':'stateless/not stated',
                      'Països sense relació diplomàtica':'countries withouth diplomatic relationship', 'Apàtrides':'stateless/not stated'}
                      
pop_df = pop_df.assign(nationality= lambda df: 
                         df['nationality'].map(nationalities_dict))
pop_df.head()

Unnamed: 0,year,district_code,district_name,neighbourhood_code,neighbourhood_name,sex,nationality,number
0,2020,1,Ciutat Vella,1,el Raval,women,Spain,11405
1,2020,1,Ciutat Vella,2,el Barri Gòtic,women,Spain,4141
2,2020,1,Ciutat Vella,3,la Barceloneta,women,Spain,4730
3,2020,1,Ciutat Vella,4,"Sant Pere, Santa Caterina i la Ribera",women,Spain,6468
4,2020,2,Eixample,5,el Fort Pienc,women,Spain,12900


In [8]:
pop_by_neighbourhood_year = pop_df.groupby(by=['year', 'neighbourhood_name']).sum().reset_index().drop(columns=['district_code', 'neighbourhood_code'])
pop_by_neighbourhood_year

Unnamed: 0,year,neighbourhood_name,number
0,2015,Baró de Viver,2482
1,2015,Can Baró,8938
2,2015,Can Peguera,2267
3,2015,Canyelles,6946
4,2015,Ciutat Meridiana,10156
...,...,...,...
434,2020,la Vila Olímpica del Poblenou,9385
435,2020,la Vila de Gràcia,50926
436,2020,les Corts,46731
437,2020,les Roquetes,16417


In [9]:
districts_and_neighbourhoods_year = pop_df[['year', 'district_code', 'district_name', 'neighbourhood_code', 'neighbourhood_name']].drop_duplicates()
districts_and_neighbourhoods_year

Unnamed: 0,year,district_code,district_name,neighbourhood_code,neighbourhood_name
0,2020,1,Ciutat Vella,1,el Raval
1,2020,1,Ciutat Vella,2,el Barri Gòtic
2,2020,1,Ciutat Vella,3,la Barceloneta
3,2020,1,Ciutat Vella,4,"Sant Pere, Santa Caterina i la Ribera"
4,2020,2,Eixample,5,el Fort Pienc
...,...,...,...,...,...
37737,2015,10,Sant Martí,70,el Besòs i el Maresme
37738,2015,10,Sant Martí,71,Provençals del Poblenou
37739,2015,10,Sant Martí,72,Sant Martí de Provençals
37740,2015,10,Sant Martí,73,la Verneda i la Pau


In [10]:
pop_by_neighbourhood_year = pop_by_neighbourhood_year.merge(districts_and_neighbourhoods_year, on=('neighbourhood_name', 'year'), how='outer')
pop_by_neighbourhood_year

Unnamed: 0,year,neighbourhood_name,number,district_code,district_name,neighbourhood_code
0,2015,Baró de Viver,2482,9,Sant Andreu,58
1,2015,Can Baró,8938,7,Horta-Guinardó,34
2,2015,Can Peguera,2267,8,Nou Barris,47
3,2015,Canyelles,6946,8,Nou Barris,49
4,2015,Ciutat Meridiana,10156,8,Nou Barris,55
...,...,...,...,...,...,...
434,2020,la Vila Olímpica del Poblenou,9385,10,Sant Martí,67
435,2020,la Vila de Gràcia,50926,6,Gràcia,31
436,2020,les Corts,46731,4,Les Corts,19
437,2020,les Roquetes,16417,8,Nou Barris,50


In [11]:
pop_by_neighbourhood_year = pop_by_neighbourhood_year.rename(columns={'number':'population'})
pop_by_neighbourhood_year = pop_by_neighbourhood_year.reindex(columns=['year', 'district_code', 'district_name', 'neighbourhood_code', 'neighbourhood_name', 'population'])
pop_by_neighbourhood_year.head()

Unnamed: 0,year,district_code,district_name,neighbourhood_code,neighbourhood_name,population
0,2015,9,Sant Andreu,58,Baró de Viver,2482
1,2015,7,Horta-Guinardó,34,Can Baró,8938
2,2015,8,Nou Barris,47,Can Peguera,2267
3,2015,8,Nou Barris,49,Canyelles,6946
4,2015,8,Nou Barris,55,Ciutat Meridiana,10156


In [13]:
pop_by_neighbourhood_year.to_csv('./data/cleaned/population/population.csv', index=False)