### regression algo
   - simple linear regression
   - multiple
   - polynomial
   - decision tree
   - random forest
   - Support Vector Regression
### performance -> step wise modelling
   - OLS -> check p-value
   - Correlation between feature (X) and output(Y)
   - cross validation & grid search

## Grid Search

In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
import pickle

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [4]:
df = pd.read_csv('../../datasets/diamond_pricing.csv')
df

Unnamed: 0,Paleonium,Pressure,Price
0,17,6567,2810.280298
1,59,5253,1986.967089
2,123,9715,2083.132087
3,182,2073,2236.340285
4,133,6400,1903.323339
...,...,...,...
995,90,15039,1633.608018
996,49,10592,2316.189728
997,42,18435,1189.359047
998,10,13675,2922.682785


In [5]:
x = df[['Paleonium','Pressure']]
y = df['Price']

In [7]:
trainx, testx, trainy, testy = train_test_split(x,y,test_size=.2, random_state=0)

In [8]:
model1 = LinearRegression()
model1.fit(trainx,trainy)
model1.score(testx,testy)* 100

11.918023623707574

In [9]:
model2 = RandomForestRegressor()
model2.fit(trainx,trainy)
model2.score(testx,testy)* 100

98.29244257304369

## checking the model for performance using cross validation


In [10]:
from sklearn.model_selection import cross_val_score

In [12]:
scores = cross_val_score(model2,x,y,cv=4)
scores

array([0.98269939, 0.97924785, 0.98286363, 0.9805104 ])

## testing cross validation on another model

In [17]:
from sklearn.svm import SVR

In [40]:
df = pd.read_csv('../../datasets/house_pricing.csv')
X = df[['Beds','Baths','SquareFeet']]
scaler = StandardScaler()
scaledX = scaler.fit_transform(X)
y = df['Price']
trainx, testx, trainy, testy = train_test_split(scaledX,y,test_size=.2, random_state=0)
model3 = RandomForestRegressor()
model3.fit(trainx,trainy)
print('score:',model3.score(testx,testy) * 100)
scores = cross_val_score(model3,scaledX,y,cv=5)
print('cross validation:',scores)
print('validation score',scores.mean() * 100)

score: 82.28894886681472
cross validation: [0.79750838 0.72761543 0.67354716 0.49552279 0.62474557]
validation score 66.37878664090478


In [None]:
df = pd.read_csv('../../datasets/house_pricing.csv')
X = df[['Beds','Baths','SquareFeet']]
y = df['Price']

In [29]:
df = pd.read_csv('../../datasets/house_pricing.csv')
X = df[['Beds','Baths','SquareFeet']]
y = df['Price']
trainx, testx, trainy, testy = train_test_split(X,y,test_size=.2,random_state=0)
model4 = SVR(kernel='linear')
model4.fit(trainx,trainy)
print('score: ', model4.score(testx,testy)*100)
scores = cross_val_score(model4,X,y,cv=5)
print('cross validation: ',scores)
print('validated score: ',scores.mean()*100)

score:  88.2129083768925
cross validation:  [0.8977449  0.75262472 0.71825867 0.62053769 0.52122559]
validated score:  70.20783152313051


## Grid Search for model preformance tuning

In [25]:
from sklearn.model_selection import GridSearchCV

In [None]:
# GRID SEARCH will create multiple combination for the model parameters
# RandomForestRegressor(n_estimators=10,criterion='mse',max_depth=5)
# RandomForestRegressor(n_estimators=10,criterion='mse',max_depth=10)
# RandomForestRegressor(n_estimators=10,criterion='mse',max_depth=25)
# RandomForestRegressor(n_estimators=10,criterion='mae',max_depth=5)
# RandomForestRegressor(n_estimators=10,criterion='mae',max_depth=10)
# RandomForestRegressor(n_estimators=10,criterion='mae',max_depth=25)
# RandomForestRegressor(n_estimators=50,criterion='mse',max_depth=5)
# RandomForestRegressor(n_estimators=50,criterion='mse',max_depth=10)
# RandomForestRegressor(n_estimators=50,criterion='mse',max_depth=25)

In [52]:
model_grid_options = {
    'n_estimators' : [10, 50, 100, 200, 500],
    'criterion' : ['mse','mae'],
    'max_depth' : [5, 10, 25]
}

In [53]:
#preforming grid search on model3(RamdomForestRegression model)
grid = GridSearchCV(model3, model_grid_options, cv=3, n_jobs=-1)   

In [54]:
grid.fit(X,y)

GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'criterion': ['mse', 'mae'], 'max_depth': [5, 10, 25],
                         'n_estimators': [10, 50, 100, 200, 500]})

In [55]:
grid.cv_results_

{'mean_fit_time': array([0.06455358, 0.23638256, 0.47015119, 1.18137089, 2.42587741,
        0.06398741, 0.26284607, 0.45779991, 0.91879972, 2.30726846,
        0.05536596, 0.24844797, 0.50433183, 0.99726605, 2.47704387,
        0.10403538, 0.50308609, 1.00652687, 1.97827101, 4.91996606,
        0.13474735, 0.66224162, 1.33144403, 2.64854153, 6.92456253,
        0.15097562, 0.72792959, 1.5165422 , 2.91097752, 7.26914295]),
 'std_fit_time': array([0.00359577, 0.00935498, 0.00731334, 0.08814758, 0.31796104,
        0.01176822, 0.03455176, 0.00637714, 0.01127063, 0.00237882,
        0.00238248, 0.00443985, 0.00341058, 0.00473243, 0.02145381,
        0.00311514, 0.01145711, 0.01046742, 0.01104204, 0.05354218,
        0.00451103, 0.00257424, 0.03321079, 0.01827388, 0.17921908,
        0.00306119, 0.00566196, 0.08439895, 0.02885117, 0.09033878]),
 'mean_score_time': array([0.00871301, 0.01813491, 0.0259428 , 0.06693506, 0.1055402 ,
        0.01375842, 0.02055772, 0.02892081, 0.05273779, 0.12

In [56]:
grid.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_criterion', 'param_max_depth', 'param_n_estimators', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [57]:
grid.cv_results_.get('rank_test_score')

array([10,  5,  8,  6,  7, 30, 20, 15, 14, 17, 29, 27, 26, 21, 25,  9,  1,
        4,  3,  2, 24, 16, 13, 11, 12, 28, 19, 23, 22, 18])

In [58]:
grid.best_estimator_

RandomForestRegressor(criterion='mae', max_depth=5, n_estimators=50)

In [61]:
final_model = RandomForestRegressor(criterion='mae', max_depth=5, n_estimators=50)
final_model.fit(trainx,trainy)

RandomForestRegressor(criterion='mae', max_depth=5, n_estimators=50)

In [62]:
final_model.score(testx,testy)*100

88.37713516492529