# Importing Libraries

In [5]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

# Load the encoded car data from the Excel workbook into a DataFrame.
encoded_data_path = '../Data Preprocessing & Cleaning/encoded_car_data.xlsx'
df = pd.read_excel(encoded_data_path)

# Normalizing Numerical Features

In [7]:
# Separate the target column from the predictors.
y = df['price']                      # target
X = df.drop(columns='price')         # features: every column except the target

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Min-max scaling: the scaler learns each column's range from the training
# rows only, and that same fitted scaler is then applied to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)

# Persist the fitted scaler so the same scaling can be re-applied later.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# Scale the test rows with the ranges learned from the training rows.
X_test = scaler.transform(X_test)

df.head(2)  # show the first 2 rows of the (unscaled) dataframe

Unnamed: 0,Fuel type,Body type,Kilometers driven,transmission,ownerNo,model,modelYear,variantName,price,Registration Year,Insurance Validity,Mileage(kmpl),Engine(CC),Max Power(bhp),Torque(Nm),City
0,4,3,120000,1,3,135,2015,1616,400000.0,2015,5,23.1,998,67.04,90.0,0
1,4,7,32706,1,2,41,2018,279,811000.0,2018,2,17.0,1497,121.31,150.0,0


# Model Training

In [8]:
# Candidate regressors to compare, keyed by display name.
# The tree-based estimators are seeded so that re-running the notebook gives
# the same scores; 42 is the same seed used for the train/test split.
# LinearRegression takes no seed.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}

# 5-fold cross-validation on the training split; keep the mean R^2 per model.
# NOTE(review): X_train was already scaled with a scaler fit on the whole
# training split, so each fold's validation rows contributed to the scaling
# ranges - likely a minor effect here, but worth confirming.
results = {}
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    results[name] = scores.mean()

print("Cross-Validation Results:")
for model_name, cv_score in results.items():
    print(f"{model_name}: {cv_score:.4f} (R^2)")

Cross-Validation Results:
Linear Regression: 0.7131 (R^2)
Decision Tree: 0.8446 (R^2)
Random Forest: 0.9154 (R^2)
Gradient Boosting: 0.8900 (R^2)


# Model Comparison

In [9]:
# Compare the models on the held-out test split to select the best performer.
# Note: this rebinds `models` from the estimator dict used in the
# cross-validation cell to a list of display names for the comparison table.
models = ['Linear Regression', 'Decision Tree Regressor', 'Random Forest Regressor', 'Gradient Boosting Regressor']

# Train each model on the full training split.
# The tree-based estimators are seeded (same seed as the train/test split) so
# the metrics below - and the `rf` model saved later - are reproducible.
lr = LinearRegression().fit(X_train, y_train)
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
gb = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)

# Predict once per model and reuse the predictions for every metric,
# instead of calling predict() again for each metric.
# The order here must match the order of the names in `models`.
test_predictions = [fitted.predict(X_test) for fitted in (lr, dt, rf, gb)]

# Evaluate the models: one entry per model, in the same order as `models`.
mae = [mean_absolute_error(y_test, y_pred) for y_pred in test_predictions]
mse = [mean_squared_error(y_test, y_pred) for y_pred in test_predictions]
r2 = [r2_score(y_test, y_pred) for y_pred in test_predictions]

comparison_df = pd.DataFrame({'Model': models, 'MAE': mae, 'MSE': mse, 'R2 Score': r2})
comparison_df

Unnamed: 0,Model,MAE,MSE,R2 Score
0,Linear Regression,226931.317456,131876700000.0,0.721803
1,Decision Tree Regressor,130825.185749,63402640000.0,0.866251
2,Random Forest Regressor,98562.405698,32741890000.0,0.93093
3,Gradient Boosting Regressor,121511.324253,43715630000.0,0.907781


### After comparing the models, we can see that the Random Forest Regressor performs best on every evaluation metric in the table above: it has the lowest MAE and MSE and the highest R² score.

In [10]:
# Save the fitted Random Forest model (`rf`, the best performer in the
# comparison) to a pickle file.
# `pickle` is already imported in the notebook's import cell, so it is not
# re-imported here.
with open('model.pkl', 'wb') as f:
    pickle.dump(rf, f)