# Clean Model Code

In [1]:
# Libraries
import pandas as pd
import numpy as np
import joblib

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor


In [2]:
# 1. Load Data
def load_data(base_path="/content/Data/"):
    """Read the five raw consumption CSV files into DataFrames.

    Parameters
    ----------
    base_path : str or os.PathLike, default "/content/Data/"
        Directory that contains the CSV files. The default keeps the
        original hard-coded location; pass another directory to run the
        notebook elsewhere. A trailing slash is optional.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(ev, diesel, petrol91, petrol95, petrol98)``, in that order.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pandas.read_csv`` when a file is missing.
    """
    # Local import so this cell does not depend on edits to the import cell.
    from pathlib import Path

    data_dir = Path(base_path)

    # File names in the same order as the returned tuple:
    # EV first, then diesel, then the three petrol grades.
    file_names = (
        "Pure electric consumption.csv",
        "Diesel consumption.csv",
        "petrol91RON consumption.csv",
        "petrol95RON consumption.csv",
        "petrol98RON consumption.csv",
    )

    return tuple(pd.read_csv(data_dir / file_name) for file_name in file_names)

In [3]:
# 2. Preprocess Data
def prepare_data(sample_size=4000, random_state=42):
    """Build the training matrix ``X`` and target ``y`` from the raw CSV data.

    Steps:
      1. Load the EV and ICE tables via ``load_data()``.
      2. Drop every column that contains at least one missing value.
      3. Derive EV emissions (``EV_gCO2_per_km``) and an ICE baseline
         (``ICE_CO2_Baseline``) for each vehicle.
      4. Pair every EV with every ICE vehicle (cross join) and draw a
         random sample of the pairs.
      5. Derive ``YearDiff`` and the target ``CO2_saving``.

    Parameters
    ----------
    sample_size : int or None, default 4000
        Maximum number of EV/ICE pairs to keep. It is clamped to the number
        of available pairs, so small datasets no longer raise from
        ``DataFrame.sample``. ``None`` keeps every pair (still shuffled).
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` for reproducible sampling.

    Returns
    -------
    X : pandas.DataFrame
        Columns ``Make_EV, Make_ICE, BodyStyle_EV, BodyStyle_ICE,
        FuelType_ICE, YearDiff, ICE_CO2_Baseline``.
    y : pandas.Series
        ``CO2_saving`` = ``ICE_CO2_Baseline`` - ``EV_gCO2_per_km``.

    Raises
    ------
    KeyError
        If a required column is absent, including when it was removed in
        step 2 because it contained missing values.
    """
    ev, diesel, petrol91, petrol95, petrol98 = load_data()

    # Drop missing-value columns (dropna returns new frames, so the column
    # assignments below never touch the objects returned by load_data()).
    ev, diesel, petrol91, petrol95, petrol98 = (
        frame.dropna(axis=1)
        for frame in (ev, diesel, petrol91, petrol95, petrol98)
    )

    # Calculate EV CO₂ emissions: Wh/km -> kWh/km, times the emission factor
    # (kg/kWh), then kg -> g. The arithmetic order is kept as in the original
    # so results stay bit-for-bit reproducible.
    emission_factor = 0.18  # kg/kWh
    ev["EV_gCO2_per_km"] = (
        ev["EnergyConsumptionWhkm"] / 1000 * emission_factor * 1000
    )

    # Add ICE baselines
    def add_baseline(df, fuel):
        """Add ``ICE_CO2_Baseline`` and ``FuelType`` columns to ``df`` in place."""
        # NOTE(review): 23.2 / 26.5 look like g CO2 per km for each L/100 km
        # of petrol / diesel -- confirm the source of these factors.
        factor = 23.2 if fuel.lower().startswith("petrol") else 26.5
        df["ICE_CO2_Baseline"] = df["FuelConsumptionCombined"] * factor
        df["FuelType"] = fuel
        return df

    petrol91 = add_baseline(petrol91, "Petrol91")
    petrol95 = add_baseline(petrol95, "Petrol95")
    petrol98 = add_baseline(petrol98, "Petrol98")
    diesel = add_baseline(diesel, "Diesel")

    # Combine ICE datasets
    ice_all = pd.concat([petrol91, petrol95, petrol98, diesel], ignore_index=True)

    # Keep only the columns the model needs and name them explicitly.
    # Relying on merge suffixes would only produce e.g. "FuelType_ICE" when
    # the EV table happens to contain a "FuelType" column as well, and
    # carrying every raw column through the cross join wastes memory.
    shared_cols = ["Make", "BodyStyle", "ModelReleaseYear"]
    ev_side = ev[shared_cols + ["EV_gCO2_per_km"]].rename(
        columns={col: f"{col}_EV" for col in shared_cols}
    )
    ice_renames = {col: f"{col}_ICE" for col in shared_cols}
    ice_renames["FuelType"] = "FuelType_ICE"
    ice_side = ice_all[shared_cols + ["FuelType", "ICE_CO2_Baseline"]].rename(
        columns=ice_renames
    )

    # Cartesian join: every EV paired with every ICE vehicle.
    pairs = ev_side.merge(ice_side, how="cross")

    # Smaller sample for training, clamped so that a product smaller than
    # sample_size does not raise "Cannot take a larger sample than population".
    n_rows = len(pairs) if sample_size is None else min(sample_size, len(pairs))
    df = pairs.sample(n=n_rows, random_state=random_state)

    # Create final features
    df["YearDiff"] = df["ModelReleaseYear_EV"] - df["ModelReleaseYear_ICE"]
    df["CO2_saving"] = df["ICE_CO2_Baseline"] - df["EV_gCO2_per_km"]

    X = df[[
        "Make_EV", "Make_ICE",
        "BodyStyle_EV", "BodyStyle_ICE",
        "FuelType_ICE", "YearDiff", "ICE_CO2_Baseline"
    ]]

    y = df["CO2_saving"]

    return X, y


In [4]:
# 3. Build Preprocessing Pipeline
def build_preprocessor():
    """Create the feature-encoding step used in front of the regressor.

    The five categorical columns are one-hot encoded; categories not seen
    during fitting are ignored at transform time (encoded as all zeros) and
    the encoder emits a dense array. Every other input column is passed
    through unchanged.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        An unfitted transformer.
    """
    one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

    one_hot_columns = [
        "Make_EV",
        "Make_ICE",
        "BodyStyle_EV",
        "BodyStyle_ICE",
        "FuelType_ICE",
    ]

    return ColumnTransformer(
        transformers=[("cat", one_hot, one_hot_columns)],
        remainder="passthrough",
    )

In [5]:
# 4. Train and Save the Model
def train_model(model_path="co2_savings_model.pkl"):
    """Fit the preprocessing + gradient-boosting pipeline and save it to disk.

    The pipeline is fitted on all rows returned by ``prepare_data()``; no
    hold-out split or evaluation is performed here.

    Parameters
    ----------
    model_path : str or os.PathLike, default "co2_savings_model.pkl"
        Where the fitted pipeline is written with ``joblib.dump``. The
        default matches the file name the rest of the notebook expects.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline (steps ``"preprocessor"`` and ``"model"``).
    """
    print("Loading and preparing data...")
    X, y = prepare_data()

    print("Building model pipeline...")
    pipeline = Pipeline([
        ("preprocessor", build_preprocessor()),
        # Fixed seed so repeated training runs give the same model.
        ("model", GradientBoostingRegressor(random_state=42)),
    ])

    print("Training model...")
    pipeline.fit(X, y)

    # The log line and the dump share one path so they cannot drift apart.
    print(f"Saving model to {model_path}...")
    joblib.dump(pipeline, model_path)

    print("Training complete!")
    return pipeline



In [6]:
# 5. Prediction Function (Used by FastAPI)
def load_model(model_path="co2_savings_model.pkl"):
    """Load a previously saved model pipeline from disk.

    Parameters
    ----------
    model_path : str or os.PathLike, default "co2_savings_model.pkl"
        File to load with ``joblib.load``.

    Returns
    -------
    object
        Whatever object was stored in the file.

    Raises
    ------
    FileNotFoundError
        If ``model_path`` does not exist.
    """
    # SECURITY: joblib.load unpickles the file, which can execute arbitrary
    # code. Only load model files that come from a trusted source.
    return joblib.load(model_path)


def predict_savings(input_dict, model=None):
    """Predict the CO2 saving for one EV/ICE vehicle pair.

    input_dict example:
    {
      "Make_EV": "Tesla",
      "Make_ICE": "Toyota",
      "BodyStyle_EV": "SUV",
      "BodyStyle_ICE": "SUV",
      "FuelType_ICE": "Petrol95",
      "YearDiff": 5,
      "ICE_CO2_Baseline": 220.4
    }

    Parameters
    ----------
    input_dict : dict
        Feature name -> value for a single observation (see example above).
        Keys that are not model features are ignored.
    model : object with a ``predict`` method, optional
        An already loaded model. When omitted the model is read from disk
        with ``load_model()`` on every call; long-running callers (e.g. an
        API server) should load it once and pass it in.

    Returns
    -------
    dict
        ``{"Predicted_CO2_Savings": <float>}``.

    Raises
    ------
    ValueError
        If ``input_dict`` lacks one or more required features.
    """
    if model is None:
        model = load_model()

    # Prefer the column names recorded on the fitted model, when it exposes
    # them; otherwise fall back to the feature list used at training time.
    default_features = [
        "Make_EV", "Make_ICE",
        "BodyStyle_EV", "BodyStyle_ICE",
        "FuelType_ICE", "YearDiff", "ICE_CO2_Baseline",
    ]
    fitted_names = getattr(model, "feature_names_in_", None)
    if fitted_names is None:
        feature_order = default_features
    else:
        feature_order = [str(name) for name in fitted_names]

    missing = [name for name in feature_order if name not in input_dict]
    if missing:
        raise ValueError(f"input_dict is missing required feature(s): {missing}")

    # Build a one-row frame in training column order so the result does not
    # depend on the key order of input_dict or on extra keys.
    row = {name: input_dict[name] for name in feature_order}
    input_df = pd.DataFrame([row], columns=feature_order)

    prediction = model.predict(input_df)[0]

    return {"Predicted_CO2_Savings": float(prediction)}


# Execute Training If Run Directly
# The guard is true when this code runs as the top-level program; the cell
# output below shows it is also true in this notebook, so executing the cell
# trains the model and writes the model file. Importing this code as a
# module skips training.

if __name__ == "__main__":
    train_model()

Loading and preparing data...
Building model pipeline...
Training model...
Saving model to co2_savings_model.pkl...
Training complete!


In [7]:
# Example prediction: a smoke test of predict_savings() with one hand-written
# EV/ICE pair. Keys match the feature columns selected in prepare_data().
sample_input = {
    "Make_EV": "Tesla",
    "Make_ICE": "Toyota",
    "BodyStyle_EV": "SUV",
    "BodyStyle_ICE": "SUV",
    "FuelType_ICE": "Petrol95",
    "YearDiff": 5,
    "ICE_CO2_Baseline": 220.4
}

# Last expression of the cell, so the returned dict is shown as cell output.
predict_savings(sample_input)


{'Predicted_CO2_Savings': 191.07520862997606}

In [8]:
# Colab-only step: hand the saved model file to the browser for download.
# NOTE(review): `google.colab` is presumably only importable inside Google
# Colab, so this cell would fail elsewhere -- skip it when running locally.
from google.colab import files
files.download("co2_savings_model.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>