# Data cleaning

In [None]:
# Importing the necessary libraries
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import numpy as np

In [350]:
# Raw input tables, each reported per country per year:
#   database_1 - number of passenger vehicles
#   database_2 - number of zero-emission vehicles
#   database_3 - emissions
#   database_4 - population
database_1, database_2, database_3, database_4 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/road_eqs_carpda_page_linear.csv',
        'data/road_eqs_zev_page_linear.csv',
        'data/sdg_13_10_page_linear_2.csv',
        'data/population_1january.csv',
    )
)

In [351]:
# Reduce the population table to the join keys plus the observed value, and
# give the value column a descriptive name.
# The rename runs before the selection on purpose: `rename` ignores labels that
# are absent, so re-running this cell after `database_4` has already been
# trimmed is a no-op instead of a KeyError on 'OBS_VALUE'.
database_4 = (
    database_4
    .rename(columns={'OBS_VALUE': 'population'})
    .loc[:, ['geo', 'TIME_PERIOD', 'population']]
)


In [352]:
# Collect the distinct `geo` codes that occur in each of the four tables
set1, set2, set3, set4 = (
    set(table['geo'].unique())
    for table in (database_1, database_2, database_3, database_4)
)

In [353]:
# Country codes that appear in all four databases
countries = set1 & set2 & set3 & set4

In [354]:
# Combine the four tables on country and year.
# The first join is a right join, so the rows of database_2 determine which
# (geo, TIME_PERIOD) pairs exist; the two later left joins only add columns.
# Non-key columns that clash receive pandas' default `_x` / `_y` suffixes.
merge_keys = ['geo', 'TIME_PERIOD']
joined_df_1 = database_1.merge(database_2, how='right', on=merge_keys)
joined_df_2 = joined_df_1.merge(database_3, how='left', on=merge_keys)
joined_df = joined_df_2.merge(database_4, how='left', on=merge_keys)

In [355]:
# Give the merged value columns descriptive names.
# `_x` / `_y` are the suffixes pandas added in the first merge (left / right
# table); the unsuffixed OBS_VALUE presumably comes from database_3.
column_names = {
    'TIME_PERIOD': 'time_period',
    'OBS_VALUE_x': 'amount_passenger_cars',
    'OBS_VALUE_y': 'amount_zev',
    'OBS_VALUE': 'emission_per_capita',
}
joined_df = joined_df.rename(columns=column_names)

In [356]:
# Lookup table from the `geo` abbreviations to full country names
# (includes 'EU27_2020' for the European Union entry)
countries_map = {
    'AT': 'Austria',
    'BE': 'Belgium',
    'BG': 'Bulgaria',
    'CY': 'Cyprus',
    'CZ': 'Czech Republic',
    'DE': 'Germany',
    'DK': 'Denmark',
    'EE': 'Estonia',
    'EL': 'Greece',
    'ES': 'Spain',
    'EU27_2020': 'European Union',
    'FI': 'Finland',
    'FR': 'France',
    'HR': 'Croatia',
    'HU': 'Hungary',
    'IE': 'Ireland',
    'IT': 'Italy',
    'LT': 'Lithuania',
    'LU': 'Luxembourg',
    'LV': 'Latvia',
    'MT': 'Malta',
    'NL': 'Netherlands',
    'PL': 'Poland',
    'PT': 'Portugal',
    'RO': 'Romania',
    'SE': 'Sweden',
    'SI': 'Slovenia',
    'SK': 'Slovakia',
}

# Translate the country codes to full names.
# A plain `.map(countries_map)` yields NaN for every value that is not a key of
# the dict, which silently drops unknown codes and blanks the whole column if
# this cell is run a second time. Falling back to the existing value keeps
# unknown codes visible and makes the cell safe to re-run.
joined_df['geo'] = joined_df['geo'].map(countries_map).fillna(joined_df['geo'])

In [None]:
# Derive the share of zero-emission vehicles, the number of emitting vehicles
# (passenger cars that are not zero-emission), the per-capita vehicle counts
# and the total emissions.
# Callables are used so that `ev_per_capita` can read the `amount_ev` column
# created earlier in the same `assign` call.
joined_df = joined_df.assign(
    share_zev=lambda df: df['amount_zev'] / df['amount_passenger_cars'],
    amount_ev=lambda df: df['amount_passenger_cars'] - df['amount_zev'],
    zev_per_capita=lambda df: df['amount_zev'] / df['population'],
    ev_per_capita=lambda df: df['amount_ev'] / df['population'],
    total_emissions=lambda df: df['emission_per_capita'] * df['population'],
)


In [358]:
# Columns kept from the merged dataset, in their final order
relevant_columns = [
    'geo',
    'time_period',
    'amount_passenger_cars',
    'amount_zev',
    'amount_ev',
    'share_zev',
    'emission_per_capita',
    'zev_per_capita',
    'ev_per_capita',
    'total_emissions',
]
joined_df = joined_df[relevant_columns]
joined_df

Unnamed: 0,geo,time_period,amount_passenger_cars,amount_zev,amount_ev,share_zev,emission_per_capita,zev_per_capita,ev_per_capita,total_emissions
0,Austria,2014,4694921,3389,4691532,0.000722,8.4,0.000398,0.551440,71465402.4
1,Austria,2015,4748048,5038,4743010,0.001061,8.7,0.000587,0.552481,74688856.2
2,Austria,2016,4821557,9086,4812471,0.001884,8.6,0.001044,0.553128,74824050.6
3,Austria,2017,4898578,14637,4883941,0.002988,9.3,0.001668,0.556710,81587644.5
4,Austria,2018,4978852,20855,4957997,0.004189,9.8,0.002364,0.561987,86458216.6
...,...,...,...,...,...,...,...,...,...,...
275,Slovakia,2019,2393577,956,2392621,0.000399,6.4,0.000175,0.438979,34882694.4
276,Slovakia,2020,2439986,1863,2438123,0.000764,5.5,0.000341,0.446717,30018301.5
277,Slovakia,2021,2493183,3001,2490182,0.001204,6.3,0.000550,0.456096,34396620.3
278,Slovakia,2022,2555491,4531,2550960,0.001773,5.5,0.000834,0.469383,29890916.0


In [None]:
# Order the rows by country and period so that, within each country, the
# "previous row" used below is the previous recorded period.
joined_df = joined_df.sort_values(by=['geo', 'time_period'])

# Source column -> prefix of the derived change columns.
# The 'emmission' spelling is kept on purpose: it is part of the column names
# of the exported dataset.
change_columns = {
    'amount_zev': 'zev',
    'amount_ev': 'ev',
    'total_emissions': 'emmission',
    'amount_passenger_cars': 'all_cars',
}

# Absolute change with respect to the previous row of the same country.
# NOTE(review): this is a year-over-year change only if every country has one
# row per consecutive year; a missing year makes the difference span the gap.
for source, prefix in change_columns.items():
    joined_df[f'{prefix}_diff'] = joined_df.groupby('geo')[source].diff()

# Relative change: the absolute change divided by the previous row's value.
# Done in a second pass so all *_diff columns precede the *_norm columns.
for source, prefix in change_columns.items():
    previous = joined_df.groupby('geo')[source].shift(1)
    joined_df[f'{prefix}_norm'] = joined_df[f'{prefix}_diff'] / previous

In [361]:
joined_df

Unnamed: 0,geo,time_period,amount_passenger_cars,amount_zev,amount_ev,share_zev,emission_per_capita,zev_per_capita,ev_per_capita,total_emissions,zev_diff,ev_diff,emmission_diff,all_cars_diff
0,Austria,2014,4694921,3389,4691532,0.000722,8.4,0.000398,0.551440,71465402.4,,,,
1,Austria,2015,4748048,5038,4743010,0.001061,8.7,0.000587,0.552481,74688856.2,1649.0,51478.0,3223453.8,53127.0
2,Austria,2016,4821557,9086,4812471,0.001884,8.6,0.001044,0.553128,74824050.6,4048.0,69461.0,135194.4,73509.0
3,Austria,2017,4898578,14637,4883941,0.002988,9.3,0.001668,0.556710,81587644.5,5551.0,71470.0,6763593.9,77021.0
4,Austria,2018,4978852,20855,4957997,0.004189,9.8,0.002364,0.561987,86458216.6,6218.0,74056.0,4870572.1,80274.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255,Sweden,2019,4887116,30382,4856734,0.006217,1.2,0.002970,0.474745,12276222.0,13677.0,3460.0,-3916165.2,17137.0
256,Sweden,2020,4943293,55829,4887464,0.011294,0.4,0.005406,0.473243,4131035.6,25447.0,30730.0,-8145186.4,56177.0
257,Sweden,2021,4985979,110221,4875758,0.022106,0.5,0.010619,0.469758,5189647.5,54392.0,-11706.0,1058611.9,42686.0
258,Sweden,2022,4979761,197751,4782010,0.039711,0.6,0.018919,0.457507,6271395.6,87530.0,-93748.0,1081748.1,-6218.0


In [None]:
# Export the merged dataset. `index` is left at its default, so the DataFrame
# index is written as the first (unnamed) column of the CSV.
# NOTE(review): the inputs are read from 'data/' while this writes to
# '../data/' - confirm that the different folder is intended.
merged_csv_path = '../data/merged_dataset.csv'
joined_df.to_csv(merged_csv_path)