In [86]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [87]:
data = pd.read_csv('ev_vs_petrol_dataset_v3.csv', usecols=['country', 'year', 'ev_sales', 'ev_market_share', 'urban_population_percent', 'is_ev_dominant'])
data.head()

Unnamed: 0,country,year,ev_sales,ev_market_share,urban_population_percent,is_ev_dominant
0,Australia,2010,5,0.0,88.8,0
1,Australia,2010,57,0.01,88.8,0
2,Australia,2010,37,0.01,88.8,0
3,Australia,2011,11,0.01,88.9,0
4,Australia,2011,129,0.02,88.9,0


In [88]:
data.isnull().sum()

country                     0
year                        0
ev_sales                    0
ev_market_share             0
urban_population_percent    0
is_ev_dominant              0
dtype: int64

In [89]:
data.duplicated().sum()

np.int64(0)

In [90]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   country                   1200 non-null   object 
 1   year                      1200 non-null   int64  
 2   ev_sales                  1200 non-null   int64  
 3   ev_market_share           1200 non-null   float64
 4   urban_population_percent  1200 non-null   float64
 5   is_ev_dominant            1200 non-null   int64  
dtypes: float64(2), int64(3), object(1)
memory usage: 56.4+ KB


In [91]:
data.describe()

Unnamed: 0,year,ev_sales,ev_market_share,urban_population_percent,is_ev_dominant
count,1200.0,1200.0,1200.0,1200.0,1200.0
mean,2017.5,63602.36,6.327992,74.609,0.018333
std,4.611694,407950.6,13.231723,14.858811,0.13421
min,2010.0,5.0,0.0,30.9,0.0
25%,2013.75,200.25,0.0775,64.175,0.0
50%,2017.5,2731.5,0.83,79.9,0.0
75%,2021.25,19905.0,5.8325,84.075,0.0
max,2025.0,7670056.0,95.0,98.5,1.0


In [92]:
data.shape

(1200, 6)

In [93]:
from ydata_profiling import ProfileReport 
df = ProfileReport(data) 
df.to_file(output_file='Ev_dataset.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|███████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 121.50it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [94]:
data.head()

Unnamed: 0,country,year,ev_sales,ev_market_share,urban_population_percent,is_ev_dominant
0,Australia,2010,5,0.0,88.8,0
1,Australia,2010,57,0.01,88.8,0
2,Australia,2010,37,0.01,88.8,0
3,Australia,2011,11,0.01,88.9,0
4,Australia,2011,129,0.02,88.9,0


In [95]:
# 1. Keep only 4 main countries
main_countries = ['Australia', 'China', 'India', 'USA']

data['country'] = data['country'].apply(
    lambda x: x if x in main_countries else 'Other'
)

# 2. Train-test split
from sklearn.model_selection import train_test_split

X = data.drop('is_ev_dominant', axis=1)
y = data['is_ev_dominant']

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# 3. OneHotEncoder for 'country'
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

transformer = ColumnTransformer(
    transformers=[
        ('country_ohe', OneHotEncoder(sparse_output=False), ['country'])
    ],
    remainder='passthrough'
)

x_train_enc = transformer.fit_transform(x_train)
x_test_enc = transformer.transform(x_test)

# 4. Column names
ohe = transformer.named_transformers_['country_ohe']
ohe_cols = ohe.get_feature_names_out(['country'])     # 4 columns

other_cols = [c for c in x_train.columns if c != 'country']

final_cols = list(ohe_cols) + other_cols

# 5. Convert to DataFrame
x_train_df = pd.DataFrame(x_train_enc, columns=final_cols)
x_test_df = pd.DataFrame(x_test_enc, columns=final_cols)

x_train_df.head()

Unnamed: 0,country_Australia,country_China,country_India,country_Other,year,ev_sales,ev_market_share,urban_population_percent
0,0.0,0.0,0.0,1.0,2018.0,11922.0,2.61,81.4
1,0.0,0.0,0.0,1.0,2010.0,170.0,0.06,83.0
2,0.0,0.0,0.0,1.0,2012.0,24.0,0.01,68.4
3,0.0,0.0,0.0,1.0,2024.0,24008.0,8.66,81.3
4,0.0,0.0,0.0,1.0,2017.0,157.0,0.47,64.5


In [97]:
x_train_df.to_csv('ev_dataset.csv', index=False)