# Model Selection
**Reading file from Github**

In [57]:
import pandas as pd

In [59]:
url = 'https://raw.githubusercontent.com/AutoTrend-Dynamics/COMP3610-Final-Project/refs/heads/main/data/Cleaned_and_Encoded_Cars.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,Title,Dealer,Rating,Exterior Color,Interior Color,Engine,Mileage,Price,Year,Car Age,...,Safety Features_Backup Camera,Safety Features_Blind Spot Monitor,Safety Features_Brake Assist,Safety Features_LED Headlights,Safety Features_Lane Departure Warning,Safety Features_Rain Sensing Wipers,Safety Features_Rear Cross Traffic Alert,Safety Features_Stability Control,Safety Features_Unknown,Safety Features_Missing
0,Chrysler F-58,Gateway Classic Cars,2.4,Red,White,4 Cylinder,-0.225739,-0.632087,-12.620677,99,...,0,0,0,0,0,0,0,0,1,0
1,Ford Model A Base,Skyway Classics,2.9,Magenta,Tan,350 V8,-0.448202,-0.128626,-12.08321,95,...,0,0,0,0,0,0,0,0,1,0
2,Ford Pickup Truck Base,Greenwood Automotive,4.9,–,–,–,2.191959,0.981138,-10.739543,85,...,0,0,0,0,0,0,0,0,1,0
3,Chevrolet Fleetline Deluxe,California Cars,4.6,Atomic Orange Metallic,Gray,283 V8,-0.778631,-0.253931,-9.261509,74,...,0,0,0,0,0,0,0,0,1,0
4,GMC Suburban,Exotic Motorsports of Oklahoma,4.6,Black,Tan,5.7L V8,-0.775992,0.158074,-9.127142,73,...,0,0,0,0,0,0,0,0,1,0


In [61]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

### Separate features and target

In [63]:
X = df.drop('Price', axis=1)
y = df['Price']# Separate features and target

### Identify categorical and numerical columns

In [65]:
categorical_cols = [col for col in X.columns if X[col].dtype == 'object']
numerical_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]

## Preprocessing

### Numerical Data

In [67]:
numerical_transformer = SimpleImputer(strategy='mean')

### Categorical data

In [69]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

### Bundle preprocessing for numerical and categorical data

In [71]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Test, Train and Split

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [75]:
models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'Lasso': Lasso(random_state=42),
    'Linear Regression': LinearRegression()
}

### Dictionary to store results
*For ease of visual comparison*

In [77]:
results = {}

# Train and evaluate each model
for name, model in models.items():
    # Create pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    # Fit the model
    pipeline.fit(X_train, y_train)
    
    # Make predictions
    y_pred = pipeline.predict(X_test)
    
    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Store results
    results[name] = {
    'MSE': mse,
    'R2': r2,
    'Model': pipeline
    }
    
    print(f"{name}:")
    print(f"  MSE: {mse:.4f}")
    print(f"  R2: {r2:.4f}\n")

Random Forest:
  MSE: 0.1744
  R2: 0.8423

XGBoost:
  MSE: 0.1749
  R2: 0.8419

Lasso:
  MSE: 1.0764
  R2: 0.0270

Linear Regression:
  MSE: 0.4394
  R2: 0.6029



### Determining best model 
*Based on R2 value*

In [79]:
# The best model based on R2 score
best_model_name = max(results, key=lambda x: results[x]['R2'])
print(f"\nBest model is {best_model_name} with R2 score of {results[best_model_name]['R2']:.4f}")


Best model is Random Forest with R2 score of 0.8423


**This indicates that 84.23% of the variability of the Price is explained by the features of the model. Furhtermore with a Mean Squared Error od 0.1744, it further confirms that the Random Forest model is the best model for this study**

**However, the XGBoost indicates a good R2 score and low MSE, thus a viable option alongside Random Forest**