# Regression Model Selection


# For Small Datasets

In [1]:
# Required packages! 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score


In [2]:
# Load the preprocessed dataset (swap in the file name of any other preprocessed CSV).
# Every column except the last is treated as a feature; the last column is the target.
ds = pd.read_csv("Salary_Data.csv")
X = ds.iloc[:, :-1].to_numpy()
y = ds.iloc[:, -1].to_numpy()

In [3]:
# Column-vector copy of the target, shape (n_samples, 1).
# StandardScaler only accepts 2-D input, so models that need a scaled
# target (e.g. support vector regression) start from this array.
y1 = ds.iloc[:, -1].to_numpy().reshape(-1, 1)


In [4]:
# Standardize features and target (zero mean, unit variance) for the
# scale-sensitive models such as support vector regression.
# NOTE(review): both scalers are fit on the full dataset before any
# cross-validation split, so validation folds influence the scaling
# statistics — confirm this is acceptable for the comparison below.
sc_x = StandardScaler().fit(X)
sc_y = StandardScaler().fit(y1)

SX = sc_x.transform(X)
SY = sc_y.transform(y1)


In [5]:
# Polynomial feature expansion for polynomial regression.
# The same transformer is applied first to the raw features and then to the
# standardized ones, so `x_poly` ends up fitted on SX.
x_poly = PolynomialFeatures(degree=2)  # the degree can be changed
X_poly, SX_poly = (x_poly.fit_transform(features) for features in (X, SX))

# Without Feature Scaling

In [None]:
# Hyperparameter tuning with GridSearchCV, then collecting the best
# cross-validated score of every candidate model into one DataFrame (`df`).
#
# Changes relative to the earlier version of this cell:
#   * tree/forest `criterion` values use the current scikit-learn names
#     ("squared_error", "absolute_error"); the old "mse"/"mae" aliases were
#     removed in scikit-learn 1.2 and make every fit fail there.
#   * LinearRegression no longer has a `normalize` parameter (removed in 1.2),
#     so the grid searches `fit_intercept` instead.
#   * the scaled target is flattened to 1-D before being passed to SVR.
#   * stochastic models receive a fixed seed so reruns give the same scores.
RANDOM_STATE = 42

# Models evaluated on the raw (unscaled) features.
model_params = {
    "Random_Forest_Reg": {
        "model": RandomForestRegressor(random_state=RANDOM_STATE),
        "params": {
            "n_estimators": [10, 20, 30],
            "criterion": ["squared_error", "absolute_error"],
        },
    },
    "Linear_Reg": {
        "model": LinearRegression(),
        "params": {"fit_intercept": [True, False]},
    },
    "DecisionTree_Reg": {
        "model": DecisionTreeRegressor(random_state=RANDOM_STATE),
        "params": {
            "criterion": ["squared_error", "friedman_mse", "absolute_error"],
        },
    },
}

# Support vector regression: evaluated on standardized features and target.
model_params1 = {
    "svr": {
        "model": SVR(),
        "params": {
            "kernel": ["linear", "poly", "rbf"],
            "C": [1.0, 3.0, 6.0, 10.0],
        },
    },
}

# Polynomial regression: a linear model on the polynomial-expanded features.
model_params2 = {
    "Poly_Reg": {
        "model": LinearRegression(),
        "params": {"fit_intercept": [True, False]},
    },
}

# Each entry pairs a model family with the data it must be trained on.
# SY has shape (n_samples, 1); SVR expects a 1-D target, hence ravel().
search_runs = [
    (model_params, X, y),
    (model_params1, SX, SY.ravel()),
    (model_params2, X_poly, y),
]

score = []
for param_sets, features, target in search_runs:
    for model_name, mp in param_sets.items():
        # The name `classifier` is kept for compatibility with any later cell
        # that reads it, even though every estimator here is a regressor.
        classifier = GridSearchCV(mp["model"], mp["params"],
                                  cv=5, return_train_score=False)
        classifier.fit(features, target)
        score.append({"model": model_name,
                      "best_score": classifier.best_score_,
                      "best_params": classifier.best_params_})

df = pd.DataFrame(score, columns=["model", "best_score", "best_params"])



# Best model

In [None]:

# Row(s) achieving the highest cross-validated score; ties are all displayed.
df.loc[df["best_score"] == df["best_score"].max(), ["model", "best_params", "best_score"]]


# Performance of all models

In [None]:
# Full results table: one row per model with its best CV score and parameters.
df  

# With Feature Scaling

In [None]:
# Repeat the grid search on standardized data for the models that were
# previously trained on raw features. SVR is not repeated here because it
# was already evaluated on the scaled data in the unscaled section.
#
# SY has shape (n_samples, 1); the estimators expect a 1-D target (a column
# vector triggers a DataConversionWarning for RandomForestRegressor), so it
# is flattened once up front.
scaled_target = SY.ravel()

# Each entry pairs a model family with the scaled feature matrix it uses:
# the general models on SX, polynomial regression on the expanded SX_poly.
scaled_runs = [
    (model_params, SX),
    (model_params2, SX_poly),
]

score1 = []
for param_sets, features in scaled_runs:
    for model_name, mp in param_sets.items():
        classifier1 = GridSearchCV(mp["model"], mp["params"],
                                   cv=5, return_train_score=False)
        classifier1.fit(features, scaled_target)
        score1.append({"model": model_name,
                       "best_score": classifier1.best_score_,
                       "best_params": classifier1.best_params_})

df1 = pd.DataFrame(score1, columns=["model", "best_score", "best_params"])


# Best model (scaled data)

In [None]:

# Row(s) achieving the highest cross-validated score on the scaled data; ties are all displayed.
df1.loc[df1["best_score"] == df1["best_score"].max(), ["model", "best_params", "best_score"]]


# Performance of all models (scaled data)

In [None]:
# Full results table for the scaled-data runs: one row per model.
df1

In [None]:
# Baseline check: mean cross-validated score of an untuned random forest on
# the standardized data. The target is flattened because SY is a column
# vector and the forest expects a 1-D y; the seed makes the score repeatable.
cross_val_score(RandomForestRegressor(random_state=42), SX, SY.ravel()).mean()