# Preprocessing Data

## Importing Helper Libraries

In [1]:
import pandas as pd
import numpy as np

## Importing Data

In [2]:
# Read the raw EV dataset; the path is relative to the notebook's working directory.
csv_path = 'Global EV Data 2024.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,region,category,parameter,mode,powertrain,year,unit,value,percentage
0,Austria,Historical,EV stock,Cars,BEV,2010,Vehicles,350,"35000,00%"
1,Austria,Historical,EV stock share,Cars,EV,2010,percent,789.999.961.853,"78999996185300,00%"
2,Belgium,Historical,EV stock,Buses,BEV,2010,Vehicles,3,"300,00%"
3,Belgium,Historical,EV sales,Vans,BEV,2010,Vehicles,7,"700,00%"
4,Belgium,Historical,EV stock,Vans,BEV,2010,Vehicles,62,"6200,00%"
...,...,...,...,...,...,...,...,...,...
12649,World,Projection-STEPS,EV sales share,Cars,EV,2035,percent,55,"5500,00%"
12650,World,Projection-STEPS,EV stock share,Cars,EV,2035,percent,31,"3100,00%"
12651,World,Projection-APS,EV charging points,EV,Publicly available fast,2035,charging points,9400000,"940000000,00%"
12652,World,Projection-APS,EV charging points,EV,Publicly available slow,2035,charging points,15000000,"1500000000,00%"


## Cleaning Data

In [3]:
# Normalize the header: first letter upper-case, remaining letters lower-case
# (e.g. 'region' -> 'Region').
df.columns = [column_name.capitalize() for column_name in df.columns]
df

Unnamed: 0,Region,Category,Parameter,Mode,Powertrain,Year,Unit,Value,Percentage
0,Austria,Historical,EV stock,Cars,BEV,2010,Vehicles,350,"35000,00%"
1,Austria,Historical,EV stock share,Cars,EV,2010,percent,789.999.961.853,"78999996185300,00%"
2,Belgium,Historical,EV stock,Buses,BEV,2010,Vehicles,3,"300,00%"
3,Belgium,Historical,EV sales,Vans,BEV,2010,Vehicles,7,"700,00%"
4,Belgium,Historical,EV stock,Vans,BEV,2010,Vehicles,62,"6200,00%"
...,...,...,...,...,...,...,...,...,...
12649,World,Projection-STEPS,EV sales share,Cars,EV,2035,percent,55,"5500,00%"
12650,World,Projection-STEPS,EV stock share,Cars,EV,2035,percent,31,"3100,00%"
12651,World,Projection-APS,EV charging points,EV,Publicly available fast,2035,charging points,9400000,"940000000,00%"
12652,World,Projection-APS,EV charging points,EV,Publicly available slow,2035,charging points,15000000,"1500000000,00%"


In [4]:
# Capitalize the unit labels so they read consistently
# (e.g. 'percent' -> 'Percent', 'charging points' -> 'Charging points').
unit_labels = df['Unit'].str.capitalize()
df['Unit'] = unit_labels
df

Unnamed: 0,Region,Category,Parameter,Mode,Powertrain,Year,Unit,Value,Percentage
0,Austria,Historical,EV stock,Cars,BEV,2010,Vehicles,350,"35000,00%"
1,Austria,Historical,EV stock share,Cars,EV,2010,Percent,789.999.961.853,"78999996185300,00%"
2,Belgium,Historical,EV stock,Buses,BEV,2010,Vehicles,3,"300,00%"
3,Belgium,Historical,EV sales,Vans,BEV,2010,Vehicles,7,"700,00%"
4,Belgium,Historical,EV stock,Vans,BEV,2010,Vehicles,62,"6200,00%"
...,...,...,...,...,...,...,...,...,...
12649,World,Projection-STEPS,EV sales share,Cars,EV,2035,Percent,55,"5500,00%"
12650,World,Projection-STEPS,EV stock share,Cars,EV,2035,Percent,31,"3100,00%"
12651,World,Projection-APS,EV charging points,EV,Publicly available fast,2035,Charging points,9400000,"940000000,00%"
12652,World,Projection-APS,EV charging points,EV,Publicly available slow,2035,Charging points,15000000,"1500000000,00%"


In [5]:
# Remove the 'Percentage' column from the frame.
# errors='ignore' keeps this cell safe to re-run: once the column is gone,
# a second execution is a no-op instead of raising KeyError.
df = df.drop(columns=['Percentage'], errors='ignore')
df

Unnamed: 0,Region,Category,Parameter,Mode,Powertrain,Year,Unit,Value
0,Austria,Historical,EV stock,Cars,BEV,2010,Vehicles,350
1,Austria,Historical,EV stock share,Cars,EV,2010,Percent,789.999.961.853
2,Belgium,Historical,EV stock,Buses,BEV,2010,Vehicles,3
3,Belgium,Historical,EV sales,Vans,BEV,2010,Vehicles,7
4,Belgium,Historical,EV stock,Vans,BEV,2010,Vehicles,62
...,...,...,...,...,...,...,...,...
12649,World,Projection-STEPS,EV sales share,Cars,EV,2035,Percent,55
12650,World,Projection-STEPS,EV stock share,Cars,EV,2035,Percent,31
12651,World,Projection-APS,EV charging points,EV,Publicly available fast,2035,Charging points,9400000
12652,World,Projection-APS,EV charging points,EV,Publicly available slow,2035,Charging points,15000000


In [6]:
# Remove every row whose 'Parameter' is 'Oil displacement, million lge'.
is_oil_lge = df['Parameter'] == 'Oil displacement, million lge'
oil_lge_labels = df.index[is_oil_lge]
df.drop(index=oil_lge_labels, inplace=True)
df

Unnamed: 0,Region,Category,Parameter,Mode,Powertrain,Year,Unit,Value
0,Austria,Historical,EV stock,Cars,BEV,2010,Vehicles,350
1,Austria,Historical,EV stock share,Cars,EV,2010,Percent,789.999.961.853
2,Belgium,Historical,EV stock,Buses,BEV,2010,Vehicles,3
3,Belgium,Historical,EV sales,Vans,BEV,2010,Vehicles,7
4,Belgium,Historical,EV stock,Vans,BEV,2010,Vehicles,62
...,...,...,...,...,...,...,...,...
12649,World,Projection-STEPS,EV sales share,Cars,EV,2035,Percent,55
12650,World,Projection-STEPS,EV stock share,Cars,EV,2035,Percent,31
12651,World,Projection-APS,EV charging points,EV,Publicly available fast,2035,Charging points,9400000
12652,World,Projection-APS,EV charging points,EV,Publicly available slow,2035,Charging points,15000000


In [7]:
# Remove every row whose 'Parameter' is 'Oil displacement Mbd'.
is_oil_mbd = df['Parameter'] == 'Oil displacement Mbd'
oil_mbd_labels = df.index[is_oil_mbd]
df.drop(index=oil_mbd_labels, inplace=True)
df

Unnamed: 0,Region,Category,Parameter,Mode,Powertrain,Year,Unit,Value
0,Austria,Historical,EV stock,Cars,BEV,2010,Vehicles,350
1,Austria,Historical,EV stock share,Cars,EV,2010,Percent,789.999.961.853
2,Belgium,Historical,EV stock,Buses,BEV,2010,Vehicles,3
3,Belgium,Historical,EV sales,Vans,BEV,2010,Vehicles,7
4,Belgium,Historical,EV stock,Vans,BEV,2010,Vehicles,62
...,...,...,...,...,...,...,...,...
12649,World,Projection-STEPS,EV sales share,Cars,EV,2035,Percent,55
12650,World,Projection-STEPS,EV stock share,Cars,EV,2035,Percent,31
12651,World,Projection-APS,EV charging points,EV,Publicly available fast,2035,Charging points,9400000
12652,World,Projection-APS,EV charging points,EV,Publicly available slow,2035,Charging points,15000000


In [8]:
# Relabel the unit of every 'EV stock' row as 'Shares'; all other rows keep
# their current 'Unit'. A boolean mask with .loc performs this as a single
# vectorized assignment instead of calling a Python lambda once per row.
# NOTE(review): the 'EV stock' values shown look like counts (e.g. 350,
# 390000000), so 'Shares' may have been intended for 'EV stock share' rows
# instead -- confirm before relying on this label.
is_ev_stock = df['Parameter'] == 'EV stock'
df.loc[is_ev_stock, 'Unit'] = 'Shares'
df[is_ev_stock]

Unnamed: 0,Region,Category,Parameter,Mode,Powertrain,Year,Unit,Value
0,Austria,Historical,EV stock,Cars,BEV,2010,Shares,350
2,Belgium,Historical,EV stock,Buses,BEV,2010,Shares,3
4,Belgium,Historical,EV stock,Vans,BEV,2010,Shares,62
6,Belgium,Historical,EV stock,Cars,BEV,2010,Shares,61
9,Belgium,Historical,EV stock,Cars,FCEV,2010,Shares,1
...,...,...,...,...,...,...,...,...
12631,World,Projection-STEPS,EV stock,Vans,FCEV,2035,Shares,980000
12632,World,Projection-STEPS,EV stock,Vans,PHEV,2035,Shares,1400000
12633,World,Projection-STEPS,EV stock,Cars,BEV,2035,Shares,390000000
12634,World,Projection-STEPS,EV stock,Cars,FCEV,2035,Shares,1200000


In [9]:
# Parse 'Value' as numbers. Entries that cannot be parsed (for example the
# string '789.999.961.853') are turned into NaN by errors='coerce' rather
# than raising.
numeric_values = pd.to_numeric(df['Value'], errors='coerce')
df['Value'] = numeric_values
df

Unnamed: 0,Region,Category,Parameter,Mode,Powertrain,Year,Unit,Value
0,Austria,Historical,EV stock,Cars,BEV,2010,Shares,350.0
1,Austria,Historical,EV stock share,Cars,EV,2010,Percent,
2,Belgium,Historical,EV stock,Buses,BEV,2010,Shares,3.0
3,Belgium,Historical,EV sales,Vans,BEV,2010,Vehicles,7.0
4,Belgium,Historical,EV stock,Vans,BEV,2010,Shares,62.0
...,...,...,...,...,...,...,...,...
12649,World,Projection-STEPS,EV sales share,Cars,EV,2035,Percent,55.0
12650,World,Projection-STEPS,EV stock share,Cars,EV,2035,Percent,31.0
12651,World,Projection-APS,EV charging points,EV,Publicly available fast,2035,Charging points,9400000.0
12652,World,Projection-APS,EV charging points,EV,Publicly available slow,2035,Charging points,15000000.0


In [10]:
# Discard every row that has a missing value in any column
# (this includes rows whose 'Value' is NaN).
df.dropna(axis=0, how='any', inplace=True)
df

Unnamed: 0,Region,Category,Parameter,Mode,Powertrain,Year,Unit,Value
0,Austria,Historical,EV stock,Cars,BEV,2010,Shares,350.0
2,Belgium,Historical,EV stock,Buses,BEV,2010,Shares,3.0
3,Belgium,Historical,EV sales,Vans,BEV,2010,Vehicles,7.0
4,Belgium,Historical,EV stock,Vans,BEV,2010,Shares,62.0
5,Belgium,Historical,EV sales,Cars,BEV,2010,Vehicles,48.0
...,...,...,...,...,...,...,...,...
12649,World,Projection-STEPS,EV sales share,Cars,EV,2035,Percent,55.0
12650,World,Projection-STEPS,EV stock share,Cars,EV,2035,Percent,31.0
12651,World,Projection-APS,EV charging points,EV,Publicly available fast,2035,Charging points,9400000.0
12652,World,Projection-APS,EV charging points,EV,Publicly available slow,2035,Charging points,15000000.0


In [11]:
# Renumber the rows 0..n-1; drop=True discards the old labels instead of
# keeping them as a new column.
df = df.reset_index(drop=True)
df

Unnamed: 0,Region,Category,Parameter,Mode,Powertrain,Year,Unit,Value
0,Austria,Historical,EV stock,Cars,BEV,2010,Shares,350.0
1,Belgium,Historical,EV stock,Buses,BEV,2010,Shares,3.0
2,Belgium,Historical,EV sales,Vans,BEV,2010,Vehicles,7.0
3,Belgium,Historical,EV stock,Vans,BEV,2010,Shares,62.0
4,Belgium,Historical,EV sales,Cars,BEV,2010,Vehicles,48.0
...,...,...,...,...,...,...,...,...
8890,World,Projection-STEPS,EV sales share,Cars,EV,2035,Percent,55.0
8891,World,Projection-STEPS,EV stock share,Cars,EV,2035,Percent,31.0
8892,World,Projection-APS,EV charging points,EV,Publicly available fast,2035,Charging points,9400000.0
8893,World,Projection-APS,EV charging points,EV,Publicly available slow,2035,Charging points,15000000.0
