In [91]:
#importing all neccesities
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer 
from sklearn.metrics import make_scorer, mean_squared_error, r2_score

#making data useable
data = pd.read_csv("car-sales-extended-missing-data.csv")
data.dropna(subset=["Price"], inplace=True)

categorical_features = ["Make", "Colour"]
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))])

door_feature = ["Doors"]
door_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=4))])

numeric_features = ["Odometer (KM)"]
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(missing_values=np.nan, strategy="mean"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ("door", door_transformer, door_feature),
        ("num", numeric_transformer, numeric_features)])
#fitting it to a model
model = Pipeline(steps=[("preprocessor", preprocessor),
                 ("model", RandomForestRegressor(n_jobs=-1))])
X = data.drop("Price", axis=1)
y = data["Price"]

np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2)


model.fit(X_train, y_train)

                                           
                                          


In [117]:
#base model score
model.score(X_test, y_test)

score : 0.2927872358019916
MSE : 59738742.73947007
MAE : 6145.343478618422
R^2: 0.20234207803323734


In [92]:
#using gridsearch cv to improve the accuracy
pipe_grid = {
    "preprocessor__num__imputer__strategy": ["mean", "median"],
    "model__n_estimators": [100, 1000],
    "model__max_depth": [None, 5],
    "model__max_features": ["sqrt"],
    "model__min_samples_split": [2,4]
}
gs_model = GridSearchCV(model, pipe_grid, cv=5, verbose=2)
gs_model.fit(X_train, y_train)


Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   2.8s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_sampl

[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time=   0.0s
[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time=   0.0s
[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time=   0.0s
[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time=   0.7s
[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time=   0.7s
[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strat

In [114]:
gs_model.best_params_  #gets the best parameters used for better accuracy

{'model__max_depth': 5,
 'model__max_features': 'sqrt',
 'model__min_samples_split': 4,
 'model__n_estimators': 100,
 'preprocessor__num__imputer__strategy': 'mean'}

In [130]:
# Creating a score function and scoring the model updated with suitable parameters for better accuracy overall
def scoring(y_test : np.array,
           y_pred : np.array) -> dict :
    score = gs_model.score(X_test, y_test)
    MAE = mean_absolute_error(y_test, y_pred)
    MSE = mean_squared_error(y_test,y_pred)
    r2 = r2_score(y_test, y_pred)
    metric_dict = {"score":round(score,3),
        "MAE": round(MAE, 3),
                  "MSE": round(MSE, 3),
                  "r2": round(r2, 3)}
    print(f"score : {score}\nMSE : {MSE}\nMAE : {MAE}\nR^2: {r2}")
    
    return metric_dict

scoring(y_test, y_pred)

score : 0.2927872358019916
MSE : 52965062.113749996
MAE : 5849.985169875542
R^2: 0.2927872358019916


{'score': 0.293, 'MAE': 5849.985, 'MSE': 52965062.114, 'r2': 0.293}