# MODEL TRAINING

### 1.1 Installing the required model libraries

In [1]:
pip install catboost xgboost lightgbm

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


### 1.2 Importing the required Libraries

In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

### 1.3 Importing the required libraries and models of sklearn (scikit learn)

In [3]:
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [4]:
#Models
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

### 1.4 Loading the Processed Datasets

In [5]:
# Locations of the three processed datasets (relative to this notebook).
path_df1, path_df2, path_df3 = (
    "../data/processed/ev_spatial_preprocessed.csv.gz",
    "../data/processed/temporal_forecast_result.csv",
    "../data/processed/EV_specs.csv",
)

# df1 is stored gzipped, so request gzip decompression when the path ends in .gz.
df1 = pd.read_csv(
    path_df1,
    compression=('gzip' if path_df1.endswith('.gz') else None),
)
# df2 and df3 are plain CSV files.
df2, df3 = pd.read_csv(path_df2), pd.read_csv(path_df3)

In [6]:
# Report (rows, columns) for each loaded dataset, one per line.
print(
    f"Shapes of the datasets respectively :\ndf1: {df1.shape} \ndf2: {df2.shape} \ndf3: {df3.shape}"
)

Shapes of the datasets respectively :
df1: (177866, 99) 
df2: (1320, 49) 
df3: (360, 8)


In [7]:
# Helper used as a visual separator between printed sections.
def print_sep():
    """Print a rule of 80 '=' characters with a blank line before and after."""
    rule = "=" * 80
    print(f"\n{rule}\n")

### 2.1 Converting column names to lowercase with underscores for easier access

In [8]:
# Normalise column names in all three frames: lowercase, spaces -> underscores.
for _frame in (df1, df2, df3):
    _frame.columns = _frame.columns.str.lower().str.replace(' ', '_')

# Show the first 20 column names of each frame, with a separator between frames.
for _label, _frame in (("df1", df1), ("df2", df2), ("df3", df3)):
    if _label != "df1":
        print_sep()
    print(f"{_label} columns sample:", _frame.columns[:20].tolist())

df1 columns sample: ['country', 'city', 'postal_code', 'model_year', 'model', 'electric_vehicle_type', 'cafv_eligibility', 'electric_range', 'legislative_district', 'electric_utility', '2020_census_tract', 'state_ae', 'state_ak', 'state_al', 'state_ap', 'state_ar', 'state_az', 'state_bc', 'state_ca', 'state_co']


df2 columns sample: ['user_id', 'battery_capacity_(kwh)', 'charging_station_id', 'energy_consumed_(kwh)', 'charging_duration_(hours)', 'charging_rate_(kw)', 'charging_cost_(usd)', 'time_of_day', 'day_of_week', 'state_of_charge_(start_%)', 'state_of_charge_(end_%)', 'distance_driven_(since_last_charge)_(km)', 'temp(°c)', 'vehicle_age_(years)', 'charger_type', 'vehicle_model_bmw_i3', 'vehicle_model_chevy_bolt', 'vehicle_model_hyundai_kona', 'vehicle_model_nissan_leaf', 'vehicle_model_tesla_model3']


df3 columns sample: ['battery', 'efficiency', 'fast_charge', 'price.de.', 'range', 'top_speed', 'acceleration..0.100.', 'price_usd_estimated']


### 2.2 Displaying the Datasets Respectively

In [9]:
# Preview the first five rows of df1.
df1.head(n=5)

Unnamed: 0,country,city,postal_code,model_year,model,electric_vehicle_type,cafv_eligibility,electric_range,legislative_district,electric_utility,...,make_smart,make_subaru,make_tesla,make_th!nk,make_toyota,make_volkswagen,make_volvo,make_wheego_electric_cars,longitude,latitude
0,0.950721,2.185227,-0.020657,-0.172448,1.443448,0,0,2.523975,0.528607,-0.290798,...,0,0,1,0,0,0,0,0,-122,48
1,-0.777479,-0.144236,-0.062009,0.831106,1.443448,0,1,-0.639721,-1.891247,-0.104446,...,0,0,1,0,0,0,0,0,-122,48
2,0.950721,2.185227,-0.025979,-0.506966,-0.501228,0,0,2.295667,0.461389,-0.290798,...,0,0,1,0,0,0,0,0,-122,48
3,0.950721,-0.450334,-0.059553,-1.51052,-0.501228,0,0,1.643359,-1.622374,1.142097,...,0,0,1,0,0,0,0,0,-122,48
4,-1.142709,-0.716527,0.08989,0.16207,1.443448,0,1,-0.639721,-0.412447,-0.104446,...,0,0,1,0,0,0,0,0,-123,48


In [10]:
# Preview the first five rows of df2.
df2.head(n=5)

Unnamed: 0,user_id,battery_capacity_(kwh),charging_station_id,energy_consumed_(kwh),charging_duration_(hours),charging_rate_(kw),charging_cost_(usd),time_of_day,day_of_week,state_of_charge_(start_%),...,charging_duration_lag_1,charging_duration_lag_2,energy_consumed_roll_mean_3,energy_consumed_roll_std_3,charging_duration_roll_mean_3,charging_duration_roll_std_3,start_hour_sin,start_hour_cos,weekday_sin,weekday_cos
0,1,1.64548,391,0.827415,-1.582085,0.765064,-0.880549,2,2,-0.821044,...,-0.012008,-0.012829,0.031932,-0.015945,-0.011086,-0.049419,0.965926,0.258819,-0.972958,0.230983
1,2,1.235035,428,-1.387924,0.814865,0.346684,-0.132395,0,1,-1.621201,...,-1.582178,-0.012829,0.031932,-0.015945,-0.011086,-0.049419,0.866025,-0.5,-0.972958,0.230983
2,3,0.022567,181,-1.076981,0.172798,0.114901,1.220378,0,4,-1.756717,...,0.816522,-1.581946,-0.960906,0.708437,-0.348877,0.74662,0.258819,-0.965926,-0.972958,0.230983
3,4,-1.189901,327,1.6859,-0.94561,0.508216,-0.885339,2,6,1.412424,...,0.173986,0.816946,-0.448358,1.927954,0.025122,-0.00586,-0.5,-0.866025,-0.972958,0.230983
4,5,-1.189901,108,-1.054072,-0.235342,-1.15222,-1.152824,0,6,0.213128,...,-0.945239,0.174358,-0.249036,1.671788,-0.591991,-0.704745,-0.866025,-0.5,-0.972958,0.230983


In [11]:
# Preview the first five rows of df3.
df3.head(n=5)

Unnamed: 0,battery,efficiency,fast_charge,price.de.,range,top_speed,acceleration..0.100.,price_usd_estimated
0,0.187241,-0.727285,0.496939,-0.21123,0.609385,0.997205,-0.762526,-0.211238
1,-0.672224,-1.825666,0.624118,-0.609628,0.469416,0.554993,-0.396069,-0.609613
2,-0.524887,-0.38208,-0.774848,-0.659284,-0.3704,-0.578178,0.003702,-0.65928
3,-0.465953,-0.758668,0.327367,-0.803582,-0.090462,-0.578178,0.203587,-0.803584
4,0.187241,-1.449078,0.963261,-0.329439,1.262575,0.554993,-0.962411,-0.329424


### 2.3 Defining targets for the outputs generation

In [12]:
# Prediction targets: for each target name, the column to predict, the task
# type, and the name of the dataset that holds the column.
TARGETS = {
    target_name: {"column": column, "type": "regression", "dataset": dataset}
    for target_name, column, dataset in (
        ("charging_energy", "energy_consumed_(kwh)", "df2"),
        ("charging_duration", "charging_duration_(hours)", "df2"),
        ("vehicle_range", "range", "df3"),
    )
}

### 2.4 Candidate Models (Per Target)

This defines which models are allowed for each target.

In [13]:
# Estimators to compare for each target, keyed by the same names as TARGETS.
# Every entry is a (display name, estimator instance) pair seeded with 42.
CANDIDATE_MODELS = dict(
    charging_energy=[
        ("XGBoost", XGBRegressor(random_state=42)),
        ("LightGBM", LGBMRegressor(random_state=42)),
    ],
    charging_duration=[
        ("XGBoost", XGBRegressor(random_state=42)),
        ("LightGBM", LGBMRegressor(random_state=42)),
    ],
    vehicle_range=[
        ("LightGBM", LGBMRegressor(random_state=42)),
        ("RandomForest", RandomForestRegressor(random_state=42, n_jobs=-1)),
    ],
)

### 2.5 Evaluation Function

Regression models are evaluated with three metrics: MAE for interpretability, RMSE to penalize large errors, and $R^2$ to measure the proportion of variance explained.

In [14]:
def evaluate_regression(y_true, y_pred):
    """Score regression predictions against the true values.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    dict
        ``{"MAE": ..., "RMSE": ..., "R2": ...}``. Every value is a plain
        Python ``float`` so the metrics print uniformly instead of mixing
        floats with ``np.float64(...)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return {
        "MAE": float(mean_absolute_error(y_true, y_pred)),
        # RMSE is the square root of the MSE; np.sqrt returns np.float64.
        "RMSE": float(np.sqrt(mse)),
        "R2": float(r2_score(y_true, y_pred)),
    }

### 2.6 Storing results of Model performance

In [15]:
# Registry of training outcomes, filled in by train_target_models:
# target name -> model name -> {"model": estimator, "metrics": dict}.
MODEL_RESULTS = dict()

### 3. Creating the train/test split and training routine as a function so it can be reused across all the datasets

In [16]:
def train_target_models(df, target, test_size=0.2, random_state=42):
    """Fit and score every candidate model for one prediction target.

    The column named by ``TARGETS[target]["column"]`` is split off as ``y``
    and every remaining column of ``df`` is used as a feature. The rows are
    divided with a random hold-out split; each estimator listed in
    ``CANDIDATE_MODELS[target]`` is trained on the training part and scored
    on the test part with ``evaluate_regression``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset holding the target column together with the feature columns.
    target : str
        Key into ``TARGETS`` and ``CANDIDATE_MODELS``.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split``.

    Returns
    -------
    None
        Results are stored in ``MODEL_RESULTS[target]`` as
        ``{model_name: {"model": fitted_estimator, "metrics": dict}}`` and one
        summary line per model is printed. Existing results for ``target``
        are replaced.

    Raises
    ------
    KeyError
        If ``target`` is not a known target, or its column is not in ``df``.
    """
    target_column = TARGETS[target]["column"]
    if target_column not in df.columns:
        raise KeyError(
            f"Target column {target_column!r} for target {target!r} "
            "is not present in the dataframe"
        )

    X = df.drop(columns=[target_column])
    y = df[target_column]

    # NOTE(review): all non-target columns are used as features. df2 appears to
    # carry lag/rolling statistics and the other df2 target; confirm these are
    # known at prediction time and that a random (non-temporal) split is
    # appropriate for that data.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    MODEL_RESULTS[target] = {}

    # Train and evaluate each model
    for name, template in CANDIDATE_MODELS[target]:
        # Fit a fresh copy so the shared estimator objects in CANDIDATE_MODELS
        # stay unfitted and each stored model is independent of later calls.
        model = clone(template)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        metrics = evaluate_regression(y_test, y_pred)

        MODEL_RESULTS[target][name] = {
            "model": model,
            "metrics": metrics
        }

        print(f"{target} | {name} → {metrics}")


### 3.1 Train Charging Energy Models

In [17]:
# Train and score the charging-energy candidates on df2.
train_target_models(df=df2, target="charging_energy")

charging_energy | XGBoost → {'MAE': 0.2979042498744837, 'RMSE': np.float64(0.4326805586441739), 'R2': 0.820063621111181}
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000267 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4653
[LightGBM] [Info] Number of data points in the train set: 1056, number of used features: 47
[LightGBM] [Info] Start training from score -0.014847
charging_energy | LightGBM → {'MAE': 0.2615681329867987, 'RMSE': np.float64(0.38451210644735084), 'R2': 0.8578966719455492}


### 3.2 Train Charging Duration Models

In [18]:
# Train and score the charging-duration candidates on df2.
train_target_models(df=df2, target="charging_duration")

charging_duration | XGBoost → {'MAE': 0.19694272129159207, 'RMSE': np.float64(0.3685493746463126), 'R2': 0.8678762355022236}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000306 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4653
[LightGBM] [Info] Number of data points in the train set: 1056, number of used features: 47
[LightGBM] [Info] Start training from score -0.000353
charging_duration | LightGBM → {'MAE': 0.17648122998666746, 'RMSE': np.float64(0.3197479203867138), 'R2': 0.9005499423897476}


### 3.3 Train Vehicle Range Models

In [19]:
# List df3's column names (the vehicle_range target uses the 'range' column).
df3.columns

Index(['battery', 'efficiency', 'fast_charge', 'price.de.', 'range',
       'top_speed', 'acceleration..0.100.', 'price_usd_estimated'],
      dtype='object')

In [20]:
# Train and score the vehicle-range candidates on df3.
train_target_models(df=df3, target="vehicle_range")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000109 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 414
[LightGBM] [Info] Number of data points in the train set: 288, number of used features: 7
[LightGBM] [Info] Start training from score 0.026827
vehicle_range | LightGBM → {'MAE': 0.1040678828906939, 'RMSE': np.float64(0.14657441615949315), 'R2': 0.9765343559153432}
vehicle_range | RandomForest → {'MAE': 0.1011666963351609, 'RMSE': np.float64(0.1572278487328232), 'R2': 0.9729992966260431}


### 4. Model Selection Function

### 4.1 Function to pick the best model

In [21]:
def select_best_model(target_name, metric="RMSE"):
    """Pick the best-scoring trained model for a target.

    Parameters
    ----------
    target_name : str
        Key into ``MODEL_RESULTS``.
    metric : str, default "RMSE"
        Metric key to rank by. Error metrics ("MAE", "RMSE") are minimised,
        while "R2" is maximised because a higher value is better.

    Returns
    -------
    tuple
        ``(best_model_name, entry)`` where ``entry`` is the stored
        ``{"model": ..., "metrics": ...}`` dict of the winning model.

    Raises
    ------
    KeyError
        If ``target_name`` has no entry in ``MODEL_RESULTS`` or ``metric`` is
        missing from a model's metrics.
    ValueError
        If no models have been recorded for ``target_name``.
    """
    # Metrics for which a larger value means a better model.
    higher_is_better = {"R2"}

    results = MODEL_RESULTS[target_name]
    if not results:
        raise ValueError(f"No trained models recorded for target {target_name!r}")

    pick = max if metric in higher_is_better else min
    best_model_name = pick(
        results,
        key=lambda model_name: results[model_name]["metrics"][metric]
    )
    return best_model_name, results[best_model_name]


### 4.2 Select Final Models

In [22]:
# Pick the winner (lowest RMSE, the default metric) for each target.
# Each result is a (model name, stored entry) tuple.
best_energy_model, best_duration_model, best_range_model = (
    select_best_model(target_key)
    for target_key in ("charging_energy", "charging_duration", "vehicle_range")
)

# Report only the winning model's name for each target.
for _title, _choice in (
    ("Charging Energy", best_energy_model),
    ("Charging Duration", best_duration_model),
    ("Vehicle Range", best_range_model),
):
    print(f"Best {_title} Model:", _choice[0])

Best Charging Energy Model: LightGBM
Best Charging Duration Model: LightGBM
Best Vehicle Range Model: LightGBM
