# EDA - World EV Charging Stations

## Data Preparation 

### Importing Libraries & Setting Up Data

In [34]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
from matplotlib import rcParams
import warnings
warnings.filterwarnings("ignore")

In [4]:
address = 'C:/Users/west5/OneDrive - Deakin University/Desktop/Git_Repositories_Working/EVCFLO/Data Science - EV Stations/Data/IEA-EV-data.csv'

ev = pd.read_csv(address)

ev.columns = ['region', 'category', 'parameter', 'mode', 'powertrain', 'year', 'unit', 'value']
ev.head()

Unnamed: 0,region,category,parameter,mode,powertrain,year,unit,value
0,Australia,Historical,EV sales,Cars,BEV,2011,sales,49.0
1,Australia,Historical,EV stock,Cars,BEV,2011,stock,49.0
2,Australia,Historical,EV stock share,Cars,EV,2011,percent,0.00046
3,Australia,Historical,EV sales share,Cars,EV,2011,percent,0.0065
4,Australia,Historical,EV sales share,Cars,EV,2012,percent,0.03


### Treating Duplicates & Missing Values

In [22]:
# Count Missing Values
print('Missing Values by Attribute:')
print(ev.isnull().sum())

# Filter Out Missing Values
ev_no_nan = ev.dropna(axis=0)
ev_no_nan2 = ev.dropna(axis=1)
print('\nShape before filtering missing values: ' + str(ev.shape))
print('Shape after filtering missing values by row: ' + str(ev_no_nan.shape))
print('Shape after filtering missing values by column: ' + str(ev_no_nan2.shape))

Missing Values by Attribute:
region        0
category      0
parameter     0
mode          0
powertrain    0
year          0
unit          0
value         0
dtype: int64

Shape before filtering missing values: (7434, 8)
Shape after filtering missing values by row: (7434, 8)
Shape after filtering missing values by column: (7434, 8)


There are no missing values in the dataframe by either row or column. Therefore the shape before and after filtering is the same. No missing data is detected. 

In [29]:
# Counting and dropping duplicated Data Values
counted = ev.duplicated().sum()
print('Number of duplicated data values: ' + str(counted))
print('Shape before dropping duplicates: ' + str(ev.shape))
ev2 = ev.drop_duplicates()
print('Shape after dropping duplicates: ' + str(ev2.shape))

Number of duplicated data values: 0
Shape before dropping duplicates: (7434, 8)
Shape after dropping duplicates: (7434, 8)


There are no duplicated rows detected in the data frame. As such the shape is the before and after dropping duplicates. 

### Sorting, Selecting & Grouping Data 

In [59]:
ev.head()

Unnamed: 0,region,category,parameter,mode,powertrain,year,unit,value
0,Australia,Historical,EV sales,Cars,BEV,2011,sales,49.0
1,Australia,Historical,EV stock,Cars,BEV,2011,stock,49.0
2,Australia,Historical,EV stock share,Cars,EV,2011,percent,0.00046
3,Australia,Historical,EV sales share,Cars,EV,2011,percent,0.0065
4,Australia,Historical,EV sales share,Cars,EV,2012,percent,0.03


In [86]:
# Displaying Value Counts Per Attribute
with pd.option_context('display.max_rows', None,):
    print('5 largest regions:')
    print(ev['region'].value_counts().nlargest(5))
    print()

with pd.option_context('display.max_rows', None,):
    print('Value Counts by Category:')
    print(ev['category'].value_counts())
    print()    

with pd.option_context('display.max_rows', None,):
    print('Value Counts by Parameter:')
    print(ev['parameter'].value_counts())
    print()
with pd.option_context('display.max_rows', None,):
    print('Value Counts by mode:')
    print(ev['mode'].value_counts())
    print()
with pd.option_context('display.max_rows', None,):
    print('Value Counts by PowerTrain:')
    print(ev['powertrain'].value_counts())
    print()
with pd.option_context('display.max_rows', None,):
    print('Value Counts by Year:')
    print(ev['year'].value_counts())
    print()
with pd.option_context('display.max_rows', None,):
    print('Value Counts by Unit:')
    print(ev['unit'].value_counts())
    print()

5 largest regions:
World                763
China                751
Europe               584
Rest of the world    451
USA                  343
Name: region, dtype: int64

Value Counts by Category:
Historical          5686
Projection-APS       874
Projection-STEPS     874
Name: category, dtype: int64

Value Counts by Parameter:
EV stock                 1723
EV sales                 1609
EV stock share           1096
EV sales share           1059
EV charging points        623
Electricity demand        576
Oil displacement Mbd      374
Oil displacement Mlge     374
Name: parameter, dtype: int64

Value Counts by mode:
Cars         2684
Vans         1625
Buses        1436
Trucks       1058
EV            623
SSPACEHHP       8
Name: mode, dtype: int64

Value Counts by PowerTrain:
EV                         2903
BEV                        2373
PHEV                       1535
Publicly available slow     313
Publicly available fast     310
Name: powertrain, dtype: int64

Value Counts by Year:
2

From looking at the above information from the value counts, we can accertain the target row attributes and useful information. The largest regions for EV sales are China, USA and Europe. In terms of the parameter EV Stock, sales and charging points are of interest. EV charging points should be focused on. Unit should focus on the charging points, sales and stock. Most recent historical data comes from the end of 2021 since this report was published in May 2022.  

### Group Data for Insights

In [None]:
# Sorting & Grouping Data
# ev_sorted = ev.sort_values(by=(5), ascending=[False])
# DF_sorted

# Group Data by Region
ev_region = ev.groupby(['region', 'category'])['category'].count()
print(ev_region)

# Group Data by Region
ev2 = ev.groupby(['region', 'parameter', 'mode'])['category'].count()
print(ev2)

# # Groupby multiple columns and aggregate()
# result = ev_groups.groupby(['Courses','Fee'])['Duration'].aggregate('count')
# print(result)

In [118]:
# Select data by charging points in 2021
ev2 = ev[(ev['parameter'] == 'EV charging points') & (ev['year'] == 2021)]
print('Shape of new data frame: ' + str(ev2.shape))
ev2.head(50)

print('\nRegions with most EV Charging points in 2021:')
ev2['region'].value_counts().nlargest(5)
# ev2['year'].value_counts()

Shape of new data frame: (86, 8)

Regions with most EV Charging points in 2021:


China     6
USA       6
India     6
World     6
Europe    4
Name: region, dtype: int64

In [129]:
# Sorting & Grouping Data
ev_sorted = ev.sort_values(by=('region'), ascending=[False])
print(ev_sorted)

# Group Data by Region
ev_region = ev.groupby(['region', 'unit'])['region'].count()
print(ev_region)

# Group Data by Region
ev2 = ev.groupby(['region', 'parameter', 'mode'])['category'].count()
print(ev2)

# # Groupby multiple columns and aggregate()
# result = ev_groups.groupby(['Courses','Fee'])['Duration'].aggregate('count')
# print(result)

# ev_groups.mean()

         region          category              parameter   mode  \
7433      World  Projection-STEPS  Oil displacement Mlge   Cars   
6921      World        Historical               EV sales  Buses   
6930      World        Historical               EV stock  Buses   
6929      World        Historical               EV stock  Buses   
6928      World        Historical               EV sales   Cars   
...         ...               ...                    ...    ...   
51    Australia        Historical               EV sales   Cars   
52    Australia        Historical               EV sales   Cars   
53    Australia        Historical               EV stock   Cars   
54    Australia        Historical     EV charging points     EV   
0     Australia        Historical               EV sales   Cars   

                   powertrain  year                               unit  \
7433                       EV  2030  Milion litres gasoline equivalent   
6921                      BEV  2017            