# EV Range — preprocessing and model training
Run cells sequentially. This notebook loads data, cleans it, creates features, trains a RandomForest baseline and saves the model.

In [1]:
# 1) imports
from pathlib import Path
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Every project path hangs off the directory the kernel was started in.
BASE = Path.cwd()
DATA_PATH = BASE.joinpath("data", "ev_dataset.csv")
MODEL_DIR = BASE.joinpath("models")
# Make sure the model output folder is there before any later cell writes to it.
MODEL_DIR.mkdir(exist_ok=True)
print("BASE", BASE)


BASE c:\Users\divya\OneDrive\Documents\GitHub\Enter-Week-1\notebooks


In [4]:
# 2) cleaning function
def _find_column(df, candidates):
    cols_lower = {c.lower(): c for c in df.columns}
    for name in candidates:
        key = name.lower()
        if key in cols_lower:
            return cols_lower[key]
    return None

def clean_data(df):
    """Reduce a raw EV table to the canonical numeric columns used for modelling.

    Steps:
      1. Rename recognised column aliases to their canonical names.
      2. Keep only the canonical columns that are present.
      3. Coerce every kept column to numeric (unparseable values become NaN).
      4. Drop rows containing NaN, drop duplicate rows, and reset the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input table. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. If none of the canonical columns can be identified,
        a copy of the input is returned without any cleaning.
    """
    # The canonical name is listed first in each alias list. An input column
    # already carrying the canonical label is kept as-is and never shadowed by
    # an alias, so the rename below cannot produce duplicate column labels.
    # NOTE(review): 'Power_kW' is relabelled as 'Power_hp' without any unit
    # conversion; confirm the source units before relying on that column.
    aliases = {
        'Battery_Capacity_kWh': ['Battery_Capacity_kWh', 'Battery', 'battery_kwh'],
        'Power_hp': ['Power_hp', 'Power', 'Power_kW', 'Motor_Power'],
        'Efficiency_WhPerKm': ['Efficiency_WhPerKm', 'Efficiency', 'Energy_Consumption'],
        'Weight_kg': ['Weight_kg', 'Weight', 'Vehicle_Weight'],
        'Range_km': ['Range_km', 'Range', 'range_km', 'range'],
    }

    mapping = {}
    for canonical, candidates in aliases.items():
        if canonical in df.columns:
            # Already present under the exact canonical label: nothing to rename.
            continue
        found = _find_column(df, candidates)
        if found is not None:
            mapping[found] = canonical
    if mapping:
        df = df.rename(columns=mapping)

    wanted = [name for name in aliases if name in df.columns]
    if not wanted:
        return df.copy()

    cleaned = df[wanted].apply(pd.to_numeric, errors='coerce')
    return cleaned.dropna().drop_duplicates().reset_index(drop=True)


In [6]:
# 3) load & clean
if not DATA_PATH.exists():
    # try to locate the file elsewhere in the project
    # sorted() keeps the choice deterministic; rglob order depends on the filesystem.
    candidates = sorted(BASE.rglob('ev_dataset.csv'))
    # When the kernel starts inside a sub-folder (e.g. notebooks/), the dataset
    # may sit in the sibling ../data directory, which rglob under BASE cannot see.
    sibling_path = BASE.parent / 'data' / 'ev_dataset.csv'
    if not candidates and sibling_path.exists():
        candidates = [sibling_path]
    if candidates:
        DATA_PATH = candidates[0]
        print(f"Found dataset at {DATA_PATH}")
    else:
        # create a small example dataset so the notebook can continue
        print(f"Dataset not found at {DATA_PATH}. Creating a small example dataset to proceed.")
        DATA_PATH.parent.mkdir(parents=True, exist_ok=True)
        sample = pd.DataFrame({
            'Battery_Capacity_kWh': [50, 75, 100],
            'Power_hp': [200, 250, 300],
            'Efficiency_WhPerKm': [150, 180, 170],
            'Weight_kg': [1500, 1700, 1800],
            'Range_km': [300, 400, 500],
        })
        sample.to_csv(DATA_PATH, index=False)
        print(f"Saved example dataset to {DATA_PATH}")
        # Make it obvious that anything trained from here on is a smoke test only.
        print("WARNING: the example dataset is synthetic (3 rows); metrics and models built from it are not meaningful.")

raw = pd.read_csv(DATA_PATH)
print('raw shape', raw.shape)
df = clean_data(raw)
print('cleaned shape', df.shape)
df.head()

Dataset not found at c:\Users\divya\OneDrive\Documents\GitHub\Enter-Week-1\notebooks\data\ev_dataset.csv. Creating a small example dataset to proceed.
Saved example dataset to c:\Users\divya\OneDrive\Documents\GitHub\Enter-Week-1\notebooks\data\ev_dataset.csv
raw shape (3, 5)
cleaned shape (3, 5)


Unnamed: 0,Battery_Capacity_kWh,Power_hp,Efficiency_WhPerKm,Weight_kg,Range_km
0,50,200,150,1500,300
1,75,250,180,1700,400
2,100,300,170,1800,500


In [7]:
# 4) feature engineering
# Derived columns are added to `df` only when their source columns exist.
if 'Battery_Capacity_kWh' in df.columns:
    # Same quantity expressed in Wh so it shares units with Efficiency_WhPerKm.
    df['battery_Wh'] = df['Battery_Capacity_kWh'] * 1000.0
if 'Efficiency_WhPerKm' in df.columns and 'battery_Wh' in df.columns:
    # A zero efficiency would produce +/-inf, which the downstream sklearn
    # preprocessing cannot handle; map it to NaN so it is imputed instead
    # (same guard as the weight division below).
    df['battery_over_eff'] = df['battery_Wh'] / df['Efficiency_WhPerKm'].replace(0, np.nan)
if 'Battery_Capacity_kWh' in df.columns and 'Weight_kg' in df.columns:
    df['energy_density_kWh_per_kg'] = df['Battery_Capacity_kWh'] / df['Weight_kg'].replace(0, np.nan)
df.shape

(3, 8)

In [8]:
# 5) prepare train/test and preprocess numeric features
if 'Range_km' not in df.columns:
    raise SystemExit("Missing target Range_km")
feature_cols = [c for c in df.columns if c != 'Range_km']
numeric_cols = df[feature_cols].select_dtypes(include=[np.number]).columns.tolist()
print('numeric features:', numeric_cols)

# Split first, so that every statistic below (clip bounds, medians, scaling)
# is learned from the training rows only and the test rows stay unseen.
X = df[feature_cols].copy()
y = df['Range_km'].astype(float).copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Clip outliers to the 1st/99th percentiles of the training data. The clipping
# is applied to copies, so `df` is left untouched and re-running this cell
# gives the same result.
clip_lower = X_train[numeric_cols].quantile(0.01)
clip_upper = X_train[numeric_cols].quantile(0.99)
X_train_clipped = X_train[numeric_cols].clip(lower=clip_lower, upper=clip_upper, axis=1)
X_test_clipped = X_test[numeric_cols].clip(lower=clip_lower, upper=clip_upper, axis=1)

imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
X_train_num = scaler.fit_transform(imputer.fit_transform(X_train_clipped))
X_test_num = scaler.transform(imputer.transform(X_test_clipped))

# Keep the original row labels so the feature frames stay aligned with y_train / y_test.
X_train_p = pd.DataFrame(X_train_num, columns=numeric_cols, index=X_train.index)
X_test_p = pd.DataFrame(X_test_num, columns=numeric_cols, index=X_test.index)

out_dir = BASE / 'data'
out_dir.mkdir(exist_ok=True)
X_train_p.to_csv(out_dir / 'X_train_preprocessed.csv', index=False)
X_test_p.to_csv(out_dir / 'X_test_preprocessed.csv', index=False)
y_train.to_csv(out_dir / 'y_train.csv', index=False)
y_test.to_csv(out_dir / 'y_test.csv', index=False)

# The clip bounds are stored with the fitted transformers so that new data can
# be preprocessed exactly like the training data.
joblib.dump(
    {
        'imputer': imputer,
        'scaler': scaler,
        'numeric_cols': numeric_cols,
        'clip_lower': clip_lower.to_dict(),
        'clip_upper': clip_upper.to_dict(),
    },
    MODEL_DIR / 'preprocessor.pkl',
)
print('Saved preprocessed data and preprocessor')

numeric features: ['Battery_Capacity_kWh', 'Power_hp', 'Efficiency_WhPerKm', 'Weight_kg', 'battery_Wh', 'battery_over_eff', 'energy_density_kWh_per_kg']
Saved preprocessed data and preprocessor


In [11]:
# 6) train baseline RandomForest (small grid)
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
param_grid = {'n_estimators':[100,300], 'max_depth':[8,16,None], 'min_samples_leaf':[1,3]}

# ensure cv is not larger than number of training samples
# K-fold needs at least 2 samples; fail with a clear message instead of deep inside sklearn.
n_train = len(y_train)
if n_train < 2:
    raise ValueError(f"Need at least 2 training samples for cross-validation, got {n_train}")
cv_folds = min(5, n_train)

gs = GridSearchCV(rf, param_grid, scoring='neg_mean_absolute_error', cv=cv_folds, n_jobs=-1, verbose=1)
gs.fit(X_train_p, y_train)
best = gs.best_estimator_
print('best params:', gs.best_params_)
# best_score_ is the mean cross-validated score of the winning parameters on
# these same folds, so the search result is reused rather than re-running CV.
cv_mae = -gs.best_score_
print('Train CV MAE:', cv_mae)
y_pred = best.predict(X_test_p)
print('Test MAE:', mean_absolute_error(y_test, y_pred))
# older sklearn versions may not support the `squared` kwarg; compute RMSE manually
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Test RMSE:', rmse)
# R^2 is not defined for fewer than two samples (sklearn returns nan with a warning).
if len(y_test) >= 2:
    print('Test R2:', r2_score(y_test, y_pred))
else:
    print('Test R2: undefined (needs at least 2 test samples)')
joblib.dump(best, MODEL_DIR / 'ev_range_model.pkl')
print('Saved model to', MODEL_DIR / 'ev_range_model.pkl')

Fitting 2 folds for each of 12 candidates, totalling 24 fits
best params: {'max_depth': 8, 'min_samples_leaf': 1, 'n_estimators': 100}
Train CV MAE: 100.0
Test MAE: 131.0
Test RMSE: 131.0
Test R2: nan
Saved model to c:\Users\divya\OneDrive\Documents\GitHub\Enter-Week-1\notebooks\models\ev_range_model.pkl


