In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_error, r2_score
import pickle

In [2]:
# use pandas to make a df
base_path = os.path.dirname(os.getcwd())
df = pd.read_csv(os.path.join(base_path,r'data\processed\Encoded_data.csv'),index_col="Unnamed: 0")
df.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,traffic_volume,Day,Month,Year,time_category,x0_Clear,x0_Cloudy,x0_Rainy
0,0.0,15.13,0.0,0.0,40.0,5545.0,4.0,2.0,2012.0,1.0,0.0,1.0,0.0
1,0.0,16.21,0.0,0.0,75.0,4516.0,4.0,2.0,2012.0,1.0,0.0,1.0,0.0
2,0.0,16.43,0.0,0.0,90.0,4767.0,4.0,2.0,2012.0,1.0,0.0,1.0,0.0
3,0.0,16.98,0.0,0.0,90.0,5026.0,4.0,2.0,2012.0,2.0,0.0,1.0,0.0
4,0.0,17.99,0.0,0.0,75.0,4918.0,4.0,2.0,2012.0,2.0,0.0,1.0,0.0


In [3]:
x = df.drop(columns=["traffic_volume",],axis = 1)
y =df["traffic_volume"]

In [7]:
 x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [8]:
def train_test_various_models(models, x_train, x_test, y_train, y_test):
    best_model = None
    best_r2_score = -float('inf')
    
    for model_name, model in models.items():
        model.fit(x_train, y_train)
        pred = model.predict(x_test)
        
        r2 = r2_score(y_test, pred)
        mse = np.sqrt(mean_squared_error(y_test, pred))
        
        print(f"R2 score of data using {model_name}: {r2}")
        print(f"Root Mean Squared Error of data using {model_name}: {mse}\n")
        
        if r2 > best_r2_score:
            best_r2_score = r2
            best_model = model
    
    return best_model

In [9]:
models = {
    'linear_regression': LinearRegression(),
    'decision_tree_regressor': DecisionTreeRegressor(),
    'random_forest_regressor': RandomForestRegressor(),
    'Gradient_Boosting_regressor' :  GradientBoostingRegressor()
}

In [10]:
best_model = train_test_various_models(models, x_train, x_test, y_train, y_test)
print("Best Model:", best_model)

R2 score of data using linear_regression: 0.25702566012213035
Root Mean Squared Error of data using linear_regression: 1709.5704379604117

R2 score of data using decision_tree_regressor: 0.6270488964741856
Root Mean Squared Error of data using decision_tree_regressor: 1211.2283858866863

R2 score of data using random_forest_regressor: 0.7760746352848
Root Mean Squared Error of data using random_forest_regressor: 938.537731691811

R2 score of data using Gradient_Boosting_regressor: 0.7095725038067416
Root Mean Squared Error of data using Gradient_Boosting_regressor: 1068.8554040788385

Best Model: RandomForestRegressor()


In [None]:
pickle_file = os.path.join(base_path, 'saved_models', 'best_model.pickle')
with open(pickle_file, 'wb') as f:
    pickle.dump(best_model, f, protocol=2)