In [61]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from xgboost import DMatrix, cv
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression



Pre-Processing

In [62]:
def process_data(df):
    df = df.drop(columns=['Region'])


    
    unknown_mapping = {
        'T1': '1', 'T2': '1',
        'T3': '2', 'T4': '2',
        'T5': '3', 'T6': '4',
        'T7': '8',
    }

    df.loc[df['GVWR Class'] == 'Unknown', 'GVWR Class'] = df.loc[df['GVWR Class'] == 'Unknown', 'Vehicle Category'].map(unknown_mapping)
    df["GVWR Class"] = df["GVWR Class"].replace("Not Applicable", 0)
    df["GVWR Class"] = df["GVWR Class"].astype(int)

    df['Number of Vehicles Registered at the Same Address'] = df['Number of Vehicles Registered at the Same Address'].replace('≥4', '4')
    mode_value = df["Number of Vehicles Registered at the Same Address"].mode()[0]
    df["Number of Vehicles Registered at the Same Address"] = df["Number of Vehicles Registered at the Same Address"].replace("Unknown", mode_value)
    df["Number of Vehicles Registered at the Same Address"] = df["Number of Vehicles Registered at the Same Address"].astype(int)

    df["Date"] = df["Date"].astype(int)

    median_year = df['Model Year'].median(skipna=True)
    df['Model Year'] = df['Model Year'].fillna(median_year)
    df["Model Year"] = df["Model Year"].astype(int)

    df["vehicle_age"] = df["Date"] - df["Model Year"]

    df.loc[(df["Fuel Type"] == "Unknown") & (df["GVWR Class"] == 0), "Fuel Type"] = "Electric"
    df.loc[(df["Fuel Type"] == "Unknown") & (df["GVWR Class"] != 0), "Fuel Type"] = "Gasoline"
    
   
    fuel_type_weights = {0: 0.5, 1: 0.8, 2: 1.0, 3: 1.5}  # Electric, Hybrid, Gasoline, Diesel
    vehicle_category_weights = {0: 1.0, 1: 1.2, 2: 1.5, 3: 2.0}  # Passenger Car, Light Truck, Medium Truck, Heavy Truck

    df["Fuel Type Weight"] = df["Fuel Type"].map(fuel_type_weights)
    df["Vehicle Category Weight"] = df["Vehicle Category"].map(vehicle_category_weights)
    df["Vehicle Maintenance"] = (df["vehicle_age"] * df["Fuel Type Weight"]) + df["Vehicle Category Weight"]

    df.drop(columns=["Fuel Type Weight", "Vehicle Category Weight"], inplace=True)
    df = pd.get_dummies(df, columns=['Fuel Type', 'Fuel Technology'], prefix='', prefix_sep='')
    df = df.drop(columns=["Date", "Model Year"])

    df.loc[(df["Electric Mile Range"] == "Unknown") & (df["GVWR Class"] == 0), "Electric Mile Range"] = "Not Applicable"

    electric_mile_range_mapping = {
        "Not Applicable": 0,
        "0 to 50 miles": 1,
        "51 to 100 miles": 2,
        "101 to 150 miles": 3,
        ">150 miles": 4
    }


    df["Electric Mile Range"] = df["Electric Mile Range"].map(electric_mile_range_mapping)

    mode_value_encoded = df.loc[df["Electric Mile Range"] > 0, "Electric Mile Range"].mode()[0]

    df.loc[
        (df["Electric Mile Range"] == "Unknown") & (df["GVWR Class"] != 0),
        "Electric Mile Range"
    ] = mode_value_encoded

    df = pd.get_dummies(df, columns=["Vehicle Category"], prefix="Vehicle")

    
    return df



Training

In [63]:
df = pd.read_excel("Train.xlsx")
df = process_data(df)
df.to_excel("processed_train.xlsx", index=False)

In [64]:
X = df.drop(columns=['Vehicle Population'])
y = df['Vehicle Population']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
################################################# 
def train_polynomial_regression(X_train, y_train, degree=4):
    poly = PolynomialFeatures(degree=degree)
    vehicle_age = X_train['vehicle_age'].values.reshape(-1, 1)
    poly_features = poly.fit_transform(vehicle_age)

    poly_model = LinearRegression()
    poly_model.fit(poly_features, y_train)
    
    return poly, poly_model

def add_polynomial_feature(X, poly, poly_model):
    vehicle_age = X['vehicle_age'].values.reshape(-1, 1)
    poly_features = poly.transform(vehicle_age)
    return poly_model.predict(poly_features)

poly, poly_model = train_polynomial_regression(X_train, y_train)
X_train['poly_vehicle_population'] = add_polynomial_feature(X_train, poly, poly_model)
X_test['poly_vehicle_population'] = add_polynomial_feature(X_test, poly, poly_model)


#################################################

bool_cols = X_train.select_dtypes(include=['bool', 'int']).columns
num_cols = X_train.select_dtypes(include=['float', 'int']).columns.difference(bool_cols)

scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_train_scaled[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test_scaled = X_test.copy()
X_test_scaled[num_cols] = scaler.transform(X_test[num_cols])

with open("scaler.pkl", "wb") as file:  
    pickle.dump(scaler, file)

param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}



random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_grid,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train_scaled, y_train)

best_model = random_search.best_estimator_

y_pred = best_model.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Best Parameters: {random_search.best_params_}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")

with open("model.pkl", "wb") as file:
    pickle.dump(best_model, file)


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 20}
Mean Squared Error: 64695718.148840465
Root Mean Squared Error: 8043.364852401044
R-squared: 0.8129250130413539


Testing

In [65]:
df_test = pd.read_excel("Test.xlsx")
true_values = df_test['Vehicle Population']
df_test = process_data(df_test)

df_test.to_excel("processed_test.xlsx", index=False)

df_test = df_test.drop(columns=['Vehicle Population'])

df_test['poly_vehicle_population'] = add_polynomial_feature(df_test, poly, poly_model)

with open("model.pkl", "rb") as file:
    loaded_model = pickle.load(file)
with open("scaler.pkl", "rb") as file:
    scaler1 = pickle.load(file)

df_test[num_cols] = scaler1.transform(df_test[num_cols])

df_pred = loaded_model.predict(df_test)

mse = mean_squared_error(true_values, df_pred)
rmse = np.sqrt(mse)
r2 = r2_score(true_values, df_pred)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")


Mean Squared Error: 72340313.79568544
Root Mean Squared Error: 8505.310917049737
R-squared: 0.8090991211228695
