# Importing Libraries

In [1]:
import pickle
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load the encoded car dataset into `df`.
# The path is assembled with pathlib so the separator is correct on every OS;
# hard-coded Windows backslashes are treated as part of the file name elsewhere.
# NOTE(review): the path is relative to the working directory - confirm the
# notebook is launched from the folder containing "Data Preprocessing & Cleaning".
DATA_PATH = Path('Data Preprocessing & Cleaning') / 'encoded_car_data.xlsx'
df = pd.read_excel(DATA_PATH)

# Model Selection

In [3]:
# One seed shared by the train/test split and every stochastic estimator, so a
# fresh "Restart & Run All" reproduces the same scores.
RANDOM_STATE = 42

X = df.drop('price', axis=1)        # features: every column except the target
y = df['price']                     # target

# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Candidate regressors keyed by display name.
# LinearRegression is deterministic; the tree-based estimators are seeded
# because their fitting involves randomness and would otherwise give
# different scores on every run.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'Random Forest': RandomForestRegressor(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingRegressor(random_state=RANDOM_STATE),
}

# Model Training

In [4]:
# Score every candidate with 5-fold cross-validation on the training split and
# keep the mean R^2 per model; the test split is not touched in this cell.
results = {
    name: cross_val_score(model, X_train, y_train, cv=5, scoring='r2').mean()
    for name, model in models.items()
}

# Report one line per model, rounded to four decimals.
print("Cross-Validation Results:")
for model_name, cv_score in results.items():
    print(f"{model_name}: {cv_score:.4f} (R^2)")

Cross-Validation Results:
Linear Regression: 0.7230 (R^2)
Decision Tree: 0.8460 (R^2)
Random Forest: 0.9198 (R^2)
Gradient Boosting: 0.9018 (R^2)


# Model Comparison

In [8]:
# Fit each candidate on the training split and score it on the held-out test
# split, so the best-performing model can be selected.

# The stochastic estimators get a fixed seed (the same value the train/test
# split uses) so this table, and the `rf` model that is pickled afterwards,
# can be reproduced on a fresh run.
lr = LinearRegression().fit(X_train, y_train)
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
gb = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)

# Display names are kept in their own variables. Rebinding `models` to a list
# here would clobber the name -> estimator dict that the cross-validation cell
# iterates with `.items()`, so that cell would fail when re-run.
fitted_models = {
    'Linear Regression': lr,
    'Decision Tree Regressor': dt,
    'Random Forest Regressor': rf,
    'Gradient Boosting Regressor': gb,
}
model_names = list(fitted_models)

# Predict once per model and reuse the predictions for all three metrics.
test_predictions = [fitted.predict(X_test) for fitted in fitted_models.values()]

mae = [mean_absolute_error(y_test, pred) for pred in test_predictions]
mse = [mean_squared_error(y_test, pred) for pred in test_predictions]
r2 = [r2_score(y_test, pred) for pred in test_predictions]

# One row per model, in the same order as `model_names`.
comparison_df = pd.DataFrame({'Model': model_names, 'MAE': mae, 'MSE': mse, 'R2 Score': r2})
comparison_df


Unnamed: 0,Model,MAE,MSE,R2 Score
0,Linear Regression,0.171969,0.082537,0.678972
1,Decision Tree Regressor,0.101547,0.039113,0.847869
2,Random Forest Regressor,0.076937,0.022575,0.912193
3,Gradient Boosting Regressor,0.096465,0.032107,0.875121


### After comparing the models, the Random Forest Regressor performs best on the test set: it has the lowest MAE and MSE and the highest R2 score, so it is the model we save.

In [9]:
# Persist the fitted Random Forest model (`rf`) as a pickle file.
# The destination is relative to the working directory instead of an absolute
# path on one machine's D: drive, so the notebook also runs elsewhere.
# NOTE(review): assumes the notebook runs from the project root (the relative
# input-data path makes the same assumption) - confirm the intended location.
MODEL_PATH = Path('Model') / 'model.pkl'
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)  # create Model/ if it is missing
with MODEL_PATH.open('wb') as f:
    pickle.dump(rf, f)