In [2]:
import glob
import os

import pandas as pd

In [3]:
# Gather the path of every yearly CSV export stored in the academic-level folder.
# NOTE: glob returns matches in a file-system dependent order, so the position of a
# given year inside `filenames` is not guaranteed.
path = r'./data/academic_level/'
filenames = glob.glob(f"{path}/*.csv")
print(filenames)

['./data/academic_level/2018_padro_nivell_academic_per_sexe.csv', './data/academic_level/2015_padro_nivell_academic_per_sexe.csv', './data/academic_level/2020_padro_nivell_academic_per_sexe.csv', './data/academic_level/2017_padro_nivell_academic_per_sexe.csv', './data/academic_level/2019_padro_nivell_academic_per_sexe.csv', './data/academic_level/2016_padro_nivell_academic_per_sexe.csv']


In [4]:
# Load each CSV found in academic_level/ into its own DataFrame (same order as
# `filenames`), then report the shape and the column names of every file.
all_rent_dfs = [pd.read_csv(file) for file in filenames]
for file, df_per_year in zip(filenames, all_rent_dfs):
    print(f'Size of {file}: {df_per_year.shape}')
    print(f'Columns of {file}: {df_per_year.columns.to_list()}')

Size of ./data/academic_level/2018_padro_nivell_academic_per_sexe.csv: (876, 8)
Columns of ./data/academic_level/2018_padro_nivell_academic_per_sexe.csv: ['Any', 'Codi_Districte', 'Nom_Districte', 'Codi_Barri', 'Nom_Barri', 'Sexe', 'Nivell_academic', 'Nombre']
Size of ./data/academic_level/2015_padro_nivell_academic_per_sexe.csv: (888, 8)
Columns of ./data/academic_level/2015_padro_nivell_academic_per_sexe.csv: ['Any', 'Codi_Districte', 'Nom_Districte', 'Codi_Barri', 'Nom_Barri', 'Sexe', 'Nivell_academic', 'Nombre']
Size of ./data/academic_level/2020_padro_nivell_academic_per_sexe.csv: (876, 8)
Columns of ./data/academic_level/2020_padro_nivell_academic_per_sexe.csv: ['Any', 'Codi_Districte', 'Nom_Districte', 'Codi_Barri', 'Nom_Barri', 'Sexe', 'Nivell_academic', 'Nombre']
Size of ./data/academic_level/2017_padro_nivell_academic_per_sexe.csv: (876, 8)
Columns of ./data/academic_level/2017_padro_nivell_academic_per_sexe.csv: ['Any', 'Codi_Districte', 'Nom_Districte', 'Codi_Barri', 'Nom_B

In [5]:
# One of the yearly files uses the header 'Nivell acadèmic' instead of
# 'Nivell_academic'. glob gives no ordering guarantee, so instead of assuming the odd
# file is the last one loaded, apply the rename to every frame: `rename` leaves frames
# that do not contain the old label untouched.
all_rent_dfs = [
    df_per_year.rename(columns={'Nivell acadèmic': 'Nivell_academic'})
    for df_per_year in all_rent_dfs
]
# Every frame must now expose the harmonised column name.
assert all('Nivell_academic' in df_per_year.columns for df_per_year in all_rent_dfs)
all_rent_dfs[-1]

Unnamed: 0,Any,Codi_Districte,Nom_Districte,Codi_Barri,Nom_Barri,Sexe,Nivell_academic,Nombre
0,2016,1,Ciutat Vella,1,el Raval,Home,Sense estudis,301
1,2016,1,Ciutat Vella,2,el Barri Gòtic,Home,Sense estudis,100
2,2016,1,Ciutat Vella,3,la Barceloneta,Home,Sense estudis,181
3,2016,1,Ciutat Vella,4,"Sant Pere, Santa Caterina i la Ribera",Home,Sense estudis,120
4,2016,2,Eixample,5,el Fort Pienc,Home,Sense estudis,140
...,...,...,...,...,...,...,...,...
871,2016,10,Sant Martí,69,Diagonal Mar i el Front Marítim del Poblenou,Dona,No consta,71
872,2016,10,Sant Martí,70,el Besòs i el Maresme,Dona,No consta,170
873,2016,10,Sant Martí,71,Provençals del Poblenou,Dona,No consta,119
874,2016,10,Sant Martí,72,Sant Martí de Provençals,Dona,No consta,147


In [6]:
# Stack all yearly frames into a single long table; rows are renumbered from 0.
academic_level_df = pd.concat(all_rent_dfs, ignore_index=True)
print(academic_level_df.shape)
academic_level_df.head()

(5268, 8)


Unnamed: 0,Any,Codi_Districte,Nom_Districte,Codi_Barri,Nom_Barri,Sexe,Nivell_academic,Nombre
0,2018,1,Ciutat Vella,1,el Raval,Home,Sense estudis,235
1,2018,1,Ciutat Vella,2,el Barri Gòtic,Home,Sense estudis,84
2,2018,1,Ciutat Vella,3,la Barceloneta,Home,Sense estudis,132
3,2018,1,Ciutat Vella,4,"Sant Pere, Santa Caterina i la Ribera",Home,Sense estudis,93
4,2018,2,Eixample,5,el Fort Pienc,Home,Sense estudis,108


In [7]:
# Replace the Catalan column headers with their English equivalents.
academic_level_df = academic_level_df.rename(
    {
        'Any': 'year',
        'Codi_Districte': 'district_code',
        'Nom_Districte': 'district_name',
        'Codi_Barri': 'neighborhood_code',
        'Nom_Barri': 'neighborhood_name',
        'Sexe': 'sex',
        'Nivell_academic': 'academic_level',
        'Nombre': 'number',
    },
    axis='columns',
)
academic_level_df.head()

Unnamed: 0,year,district_code,district_name,neighborhood_code,neighborhood_name,sex,academic_level,number
0,2018,1,Ciutat Vella,1,el Raval,Home,Sense estudis,235
1,2018,1,Ciutat Vella,2,el Barri Gòtic,Home,Sense estudis,84
2,2018,1,Ciutat Vella,3,la Barceloneta,Home,Sense estudis,132
3,2018,1,Ciutat Vella,4,"Sant Pere, Santa Caterina i la Ribera",Home,Sense estudis,93
4,2018,2,Eixample,5,el Fort Pienc,Home,Sense estudis,108


In [8]:
# Inspect the raw category labels of the two columns that still need translating.
for label, column in (('sex', 'sex'), ('academic level', 'academic_level')):
    print(f"Unique values for {label}: {academic_level_df[column].unique().tolist()}")

Unique values for sex: ['Home', 'Dona']
Unique values for academic level: ['Sense estudis', "Estudis primaris / certificat d'escolaritat / EGB", 'Batxillerat elemental / graduat escolar / ESO / FPI', 'Batxillerat superior / BUP / COU / FPII / CFGM grau mitjà', 'Estudis universitaris / CFGS grau superior', 'No consta']


In [9]:
# Translate the sex labels to English. `map` turns any label that is missing from the
# dictionary into NaN, so the unique values are displayed right after as a check.
academic_level_df = academic_level_df.assign(
    sex=academic_level_df['sex'].map({'Home': 'man', 'Dona': 'woman'})
)
academic_level_df['sex'].unique()

array(['man', 'woman'], dtype=object)

In [10]:
# Translate the academic-level labels to short English identifiers. Labels absent
# from the dictionary would become NaN, hence the display of the unique values below.
# NOTE(review): "without education" contains a space while the other labels use
# snake_case; it is kept as-is because these labels end up in the exported column names.
academic_level_df = academic_level_df.assign(
    academic_level=academic_level_df['academic_level'].map(
        {
            "Sense estudis": "without education",
            "Estudis primaris / certificat d'escolaritat / EGB": "elementary_school",
            "Batxillerat elemental / graduat escolar / ESO / FPI": "junior_high_school",
            "Batxillerat superior / BUP / COU / FPII / CFGM grau mitjà": "senior_high_school",
            "Estudis universitaris / CFGS grau superior": "bachelors_degree",
            "No consta": "unknown",
        }
    )
)
academic_level_df['academic_level'].unique()

array(['without education', 'elementary_school', 'junior_high_school',
       'senior_high_school', 'bachelors_degree', 'unknown'], dtype=object)

In [11]:
# Count the missing values in every column (all zeros means the mappings were complete).
academic_level_df.isnull().sum()

year                 0
district_code        0
district_name        0
neighborhood_code    0
neighborhood_name    0
sex                  0
academic_level       0
number               0
dtype: int64

In [12]:
# Reshape to one row per year and neighborhood, with one column per
# (sex, academic_level) pair holding the head count.
# `aggfunc='sum'` is explicit on purpose: pivot_table defaults to the mean, which would
# silently average the counts if a (year, neighborhood, sex, level) key ever appeared
# more than once. With unique keys the result is the same as before.
academic_level_df = pd.pivot_table(
    academic_level_df,
    values='number',
    index=['year', 'district_code', 'district_name', 'neighborhood_code', 'neighborhood_name'],
    columns=['sex', 'academic_level'],
    aggfunc='sum',
)
academic_level_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sex,man,man,man,man,man,man,woman,woman,woman,woman,woman,woman
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,academic_level,bachelors_degree,elementary_school,junior_high_school,senior_high_school,unknown,without education,bachelors_degree,elementary_school,junior_high_school,senior_high_school,unknown,without education
year,district_code,district_name,neighborhood_code,neighborhood_name,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
2015,1,Ciutat Vella,1,el Raval,4455,9031,4966,3909,5,443,4563,5932,3732,3300,7,880
2015,1,Ciutat Vella,2,el Barri Gòtic,2534,1567,1323,1713,0,124,2721,1170,977,1515,1,268
2015,1,Ciutat Vella,3,la Barceloneta,1560,1782,1726,1438,2,244,1839,1786,1444,1259,1,553
2015,1,Ciutat Vella,4,"Sant Pere, Santa Caterina i la Ribera",3480,2225,1807,2263,2,190,4002,2020,1464,2067,5,470
2015,2,Eixample,5,el Fort Pienc,4637,1767,2341,3808,2,290,5462,2490,2690,3655,3,571


In [13]:
# Move the row MultiIndex levels (year, district, neighborhood) back into regular columns.
academic_level_df = academic_level_df.reset_index(drop=False)
academic_level_df.head()

sex,year,district_code,district_name,neighborhood_code,neighborhood_name,man,man,man,man,man,man,woman,woman,woman,woman,woman,woman
academic_level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,bachelors_degree,elementary_school,junior_high_school,senior_high_school,unknown,without education,bachelors_degree,elementary_school,junior_high_school,senior_high_school,unknown,without education
0,2015,1,Ciutat Vella,1,el Raval,4455,9031,4966,3909,5,443,4563,5932,3732,3300,7,880
1,2015,1,Ciutat Vella,2,el Barri Gòtic,2534,1567,1323,1713,0,124,2721,1170,977,1515,1,268
2,2015,1,Ciutat Vella,3,la Barceloneta,1560,1782,1726,1438,2,244,1839,1786,1444,1259,1,553
3,2015,1,Ciutat Vella,4,"Sant Pere, Santa Caterina i la Ribera",3480,2225,1807,2263,2,190,4002,2020,1464,2067,5,470
4,2015,2,Eixample,5,el Fort Pienc,4637,1767,2341,3808,2,290,5462,2490,2690,3655,3,571


In [14]:
# Flatten the two-level column labels into plain strings: "<level1>_<level2>" when the
# second level is non-empty, otherwise just the first level (the former index columns).
new_columns = [
    level1 + "_" + level2 if level2 else level1
    for level1, level2 in academic_level_df.columns
]
new_columns

['year',
 'district_code',
 'district_name',
 'neighborhood_code',
 'neighborhood_name',
 'man_bachelors_degree',
 'man_elementary_school',
 'man_junior_high_school',
 'man_senior_high_school',
 'man_unknown',
 'man_without education',
 'woman_bachelors_degree',
 'woman_elementary_school',
 'woman_junior_high_school',
 'woman_senior_high_school',
 'woman_unknown',
 'woman_without education']

In [15]:
# Drop the outer column level so that only a single level of column labels remains.
# NOTE: this cell is not re-runnable on its own; once the columns are single-level,
# there is no level left to drop.
academic_level_df = academic_level_df.droplevel(0, axis='columns')
academic_level_df.head()

academic_level,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,bachelors_degree,elementary_school,junior_high_school,senior_high_school,unknown,without education,bachelors_degree.1,elementary_school.1,junior_high_school.1,senior_high_school.1,unknown.1,without education.1
0,2015,1,Ciutat Vella,1,el Raval,4455,9031,4966,3909,5,443,4563,5932,3732,3300,7,880
1,2015,1,Ciutat Vella,2,el Barri Gòtic,2534,1567,1323,1713,0,124,2721,1170,977,1515,1,268
2,2015,1,Ciutat Vella,3,la Barceloneta,1560,1782,1726,1438,2,244,1839,1786,1444,1259,1,553
3,2015,1,Ciutat Vella,4,"Sant Pere, Santa Caterina i la Ribera",3480,2225,1807,2263,2,190,4002,2020,1464,2067,5,470
4,2015,2,Eixample,5,el Fort Pienc,4637,1767,2341,3808,2,290,5462,2490,2690,3655,3,571


In [16]:
# Install the flattened labels built earlier as the column index of the table.
academic_level_df.columns = pd.Index(new_columns)
academic_level_df.head()

Unnamed: 0,year,district_code,district_name,neighborhood_code,neighborhood_name,man_bachelors_degree,man_elementary_school,man_junior_high_school,man_senior_high_school,man_unknown,man_without education,woman_bachelors_degree,woman_elementary_school,woman_junior_high_school,woman_senior_high_school,woman_unknown,woman_without education
0,2015,1,Ciutat Vella,1,el Raval,4455,9031,4966,3909,5,443,4563,5932,3732,3300,7,880
1,2015,1,Ciutat Vella,2,el Barri Gòtic,2534,1567,1323,1713,0,124,2721,1170,977,1515,1,268
2,2015,1,Ciutat Vella,3,la Barceloneta,1560,1782,1726,1438,2,244,1839,1786,1444,1259,1,553
3,2015,1,Ciutat Vella,4,"Sant Pere, Santa Caterina i la Ribera",3480,2225,1807,2263,2,190,4002,2020,1464,2067,5,470
4,2015,2,Eixample,5,el Fort Pienc,4637,1767,2341,3808,2,290,5462,2490,2690,3655,3,571


In [17]:
# Persist the cleaned table as CSV without the row index. The target folder is created
# first so the export also works on a fresh checkout where ./data/cleaned/ is missing.
output_csv = './data/cleaned/academic_level.csv'
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
academic_level_df.to_csv(output_csv, index=False)