# Train - Test Split


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from pathlib import Path

# Load the Advertising dataset into `df`.
# Prefer a copy of the CSV in the current working directory so the notebook can run on
# other machines; fall back to the original absolute path when no local copy exists.
# NOTE(review): assumes a local `Advertising.csv`, if present, is the same dataset - confirm.
_csv_local = Path("Advertising.csv")
_csv_original = Path("/Users/burcusenol/Documents/Machine-Learning-Algorithms/Machine-Learning-Algorithms/Advertising.csv")
df = pd.read_csv(_csv_local if _csv_local.exists() else _csv_original)

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


# Train | Test Split Procedure
* Clean and adjust data as necessary for X and y
* Split Data in Train/Test for both X and y
* Fit/Train Scaler on Training X Data
* Scale X Test Data
* Create Model
* Fit/Train Model on X Train Data
* Evaluate Model on X Test Data (by creating predictions and comparing to y_test)
* Adjust Parameters as Necessary and repeat steps 5 through 7

In [5]:
X=df.drop("sales",axis=1)

In [6]:
y=df["sales"]

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [9]:
from sklearn.preprocessing import StandardScaler

In [10]:
scaler=StandardScaler()

In [11]:
scaler.fit(X_train)

StandardScaler()

In [12]:
X_train=scaler.transform(X_train)

In [13]:
X_test=scaler.transform(X_test)

In [14]:
from sklearn.linear_model import Ridge

In [15]:
model=Ridge(alpha=100)

In [16]:
model.fit(X_train,y_train)

Ridge(alpha=100)

In [17]:
y_pred=model.predict(X_test)

In [18]:
from sklearn.metrics import mean_squared_error

In [19]:
mean_squared_error(y_test,y_pred)

7.34177578903413

In [20]:
model_two=Ridge(alpha=1)

In [21]:
model_two.fit(X_train,y_train)

Ridge(alpha=1)

In [22]:
y_pred_two=model_two.predict(X_test)

In [23]:
mean_squared_error(y_test,y_pred_two)

2.3190215794287514

# Train | Validation | Test Split
* Clean and adjust data as necessary for X and y
* Split Data in Train/Validation/Test for both X and y
* Fit/Train Scaler on Training X Data
* Scale X Eval Data
* Create Model
* Fit/Train Model on X Train Data
* Evaluate Model on X Evaluation Data (by creating predictions and comparing to y_eval)
* Adjust Parameters as Necessary and repeat steps 5 through 7
* Get final metrics on Test set (not allowed to go back and adjust after this!)

In [24]:
X=df.drop("sales",axis=1)

In [25]:
y=df["sales"]

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
X_train, X_other, y_train, y_other = train_test_split(
X, y, test_size=0.3, random_state=101)

In [28]:
#test_size=0.5 (50% of 30% other --> test=15% of all data)
X_eval,X_test,y_eval,y_test=train_test_split(X_other,y_other,test_size=0.5,random_state=101)

In [29]:
len(df)

200

In [30]:
len(X_train)

140

In [31]:
len(X_eval)

30

In [32]:
len(X_test)

30

In [33]:
scaler=StandardScaler()

In [34]:
scaler.fit(X_train)

StandardScaler()

In [35]:
X_train=scaler.transform(X_train)

In [36]:
X_test=scaler.transform(X_test)

In [37]:
X_eval=scaler.transform(X_eval)

In [38]:
model_one=Ridge(alpha=100)

In [39]:
model_one.fit(X_train,y_train)

Ridge(alpha=100)

In [40]:
# Fix: predict with `model_one`, the estimator fitted in this section. The previous code
# called `model`, a variable left over from the earlier train/test section, so this cell
# only worked through hidden kernel state.
y_eval_pred = model_one.predict(X_eval)

In [41]:
# Mean squared error on the evaluation split for the alpha=100 predictions.
mean_squared_error(y_true=y_eval, y_pred=y_eval_pred)

7.320101458823871

In [42]:
model_two=Ridge(alpha=1)

In [43]:
model_two.fit(X_train,y_train)

Ridge(alpha=1)

In [44]:
new_pred_eval=model_two.predict(X_eval)

In [45]:
mean_squared_error(y_eval,new_pred_eval)

2.3837830750569853

In [46]:
y_final_test_pred=model_two.predict(X_test)

In [47]:
mean_squared_error(y_test,y_final_test_pred)

2.2542600838005176

# Using the cross_val_score function

* The cross_val_score function uses a model and training set (along with a K and a chosen metric) to perform all of this for us automatically.
* This allows for K-Fold cross-validation to be performed on any model.

In [48]:
# Preview the first five rows of the dataset again before this section.
df.head(n=5)

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


![image](./grid_search_cross_validation.png)



In [50]:
X=df.drop("sales",axis=1)

In [51]:
y=df["sales"]

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [54]:
from sklearn.preprocessing import StandardScaler

In [55]:
scaler=StandardScaler()

In [56]:
scaler.fit(X_train)

StandardScaler()

In [57]:
X_train=scaler.transform(X_train)

In [58]:
X_test=scaler.transform(X_test)

In [59]:
model=Ridge(alpha=100)

In [60]:
from sklearn.model_selection import cross_val_score

In [61]:
scores=cross_val_score(model,X_train,y_train,
scoring="neg_mean_squared_error",cv=5)

In [62]:
scores

array([ -9.32552967,  -4.9449624 , -11.39665242,  -7.0242106 ,
        -8.38562723])

In [63]:
# Average the fold scores and drop the sign to display the mean as a positive MSE.
np.abs(scores.mean())

8.215396464543606

In [64]:
model=Ridge(alpha=1)

In [65]:
scores=cross_val_score(model,X_train,y_train,
scoring="neg_mean_squared_error",cv=5)

In [66]:
abs(scores.mean())

3.344839296530695

In [67]:
model.fit(X_train,y_train)

Ridge(alpha=1)

In [68]:
y_final_test_pred=model.predict(X_test)

In [69]:
mean_squared_error(y_test,y_final_test_pred)

2.3190215794287514

## cross_validate

* The cross_validate function allows us to view multiple performance metrics from cross-validation on a model and explore how much time fitting and testing took.

In [70]:
# Rebuild features and target from the raw frame for this section.
X = df.drop("sales", axis=1)
y = df["sales"]

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# Fix: the new scaler used to be bound to the misspelled name `sclaer`, so the `fit`
# call below reused whatever `scaler` object was left over from an earlier cell.
scaler = StandardScaler()
scaler.fit(X_train)

# Both splits are transformed with the scaler fitted on the training features.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)



In [71]:
from sklearn.model_selection import cross_validate

In [72]:
model=Ridge(alpha=100)

In [73]:
scores=cross_validate(model,X_train,y_train,
scoring=["neg_mean_squared_error","neg_mean_absolute_error"],cv=10)

In [74]:
scores

{'fit_time': array([0.00096321, 0.00085402, 0.00133491, 0.00148487, 0.00097203,
        0.00077605, 0.00064516, 0.00062084, 0.00061488, 0.00060487]),
 'score_time': array([0.0005939 , 0.00067115, 0.00114512, 0.00138307, 0.00051212,
        0.00045896, 0.00045085, 0.00043797, 0.00044608, 0.00043392]),
 'test_neg_mean_squared_error': array([ -6.06067062, -10.62703078,  -3.99342608,  -5.00949402,
         -9.14179955, -13.08625636,  -3.83940454,  -9.05878567,
         -9.05545685,  -5.77888211]),
 'test_neg_mean_absolute_error': array([-1.8102116 , -2.54195751, -1.46959386, -1.86276886, -2.52069737,
        -2.45999491, -1.45197069, -2.37739501, -2.44334397, -1.89979708])}

In [75]:
scores=pd.DataFrame(scores)

In [76]:
scores

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error
0,0.000963,0.000594,-6.060671,-1.810212
1,0.000854,0.000671,-10.627031,-2.541958
2,0.001335,0.001145,-3.993426,-1.469594
3,0.001485,0.001383,-5.009494,-1.862769
4,0.000972,0.000512,-9.1418,-2.520697
5,0.000776,0.000459,-13.086256,-2.459995
6,0.000645,0.000451,-3.839405,-1.451971
7,0.000621,0.000438,-9.058786,-2.377395
8,0.000615,0.000446,-9.055457,-2.443344
9,0.000605,0.000434,-5.778882,-1.899797


In [77]:
# Column-wise mean across the folds.
scores.mean(axis=0)

fit_time                        0.000887
score_time                      0.000653
test_neg_mean_squared_error    -7.565121
test_neg_mean_absolute_error   -2.083773
dtype: float64

In [78]:
model=Ridge(alpha=1)

In [79]:
scores=cross_validate(model,X_train,y_train,
scoring=["neg_mean_squared_error","neg_mean_absolute_error"],cv=10)

In [80]:
scores=pd.DataFrame(scores)

In [82]:
scores

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error
0,0.001126,0.000555,-2.962508,-1.457174
1,0.00096,0.000476,-3.057378,-1.555308
2,0.000657,0.000427,-2.17374,-1.23877
3,0.000613,0.000413,-0.833034,-0.768938
4,0.000643,0.000424,-3.464018,-1.434489
5,0.000596,0.000412,-8.232647,-1.494316
6,0.00059,0.00039,-1.905864,-1.081362
7,0.00054,0.000388,-2.765048,-1.250011
8,0.000534,0.00038,-4.989505,-1.580971
9,0.000523,0.000397,-2.846438,-1.223326


In [83]:
# Column-wise mean across the folds for the alpha=1 run.
scores.mean(axis=0)

fit_time                        0.000678
score_time                      0.000426
test_neg_mean_squared_error    -3.323018
test_neg_mean_absolute_error   -1.308467
dtype: float64

In [84]:
model.fit(X_train,y_train)

Ridge(alpha=1)

In [85]:
y_final_pred=model.predict(X_test)

In [86]:
mean_squared_error(y_test,y_final_pred)

2.3190215794287514

# Grid Search

* A grid search is a way of training and validating a model on every possible combination of multiple hyperparameter options.
* This allows for both cross-validation and a grid search to be performed in a generalized way for any model.

In [87]:
# Preview the first five rows of the dataset before the grid-search section.
df.head(n=5)

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [88]:
# Rebuild features and target from the raw frame for the grid-search section.
X = df.drop("sales", axis=1)
y = df["sales"]

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# Fix: the new scaler used to be bound to the misspelled name `sclaer`, so the `fit`
# call below reused whatever `scaler` object was left over from an earlier cell.
scaler = StandardScaler()
scaler.fit(X_train)

# Both splits are transformed with the scaler fitted on the training features.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [89]:
from sklearn.linear_model import ElasticNet

In [90]:
base_elastic_net_model=ElasticNet()

In [91]:
param_grid={"alpha":[0.1,1.5,10,50,100],
"l1_ratio":[.1,.5,.7,.95,.99,1]}

In [92]:
from sklearn.model_selection import GridSearchCV

In [95]:
grid_model=GridSearchCV(estimator=base_elastic_net_model,
 param_grid=param_grid,scoring="neg_mean_squared_error",
 cv=5,verbose=1)

In [96]:
grid_model.fit(X_train,y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


GridSearchCV(cv=5, estimator=ElasticNet(),
             param_grid={'alpha': [0.1, 1.5, 10, 50, 100],
                         'l1_ratio': [0.1, 0.5, 0.7, 0.95, 0.99, 1]},
             scoring='neg_mean_squared_error', verbose=1)

In [97]:
grid_model.best_estimator_

ElasticNet(alpha=0.1, l1_ratio=1)

In [98]:
grid_model.best_params_

{'alpha': 0.1, 'l1_ratio': 1}

In [100]:
pd.DataFrame(grid_model.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_l1_ratio,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000993,0.00025,0.000293,4.8e-05,0.1,0.1,"{'alpha': 0.1, 'l1_ratio': 0.1}",-3.453021,-1.40519,-5.789125,-2.187302,-4.645576,-3.496043,1.591601,6
1,0.000543,6e-06,0.000228,6e-06,0.1,0.5,"{'alpha': 0.1, 'l1_ratio': 0.5}",-3.32544,-1.427522,-5.59561,-2.163089,-4.451679,-3.392668,1.506827,5
2,0.000533,6e-06,0.000223,5e-06,0.1,0.7,"{'alpha': 0.1, 'l1_ratio': 0.7}",-3.26988,-1.442432,-5.502437,-2.16395,-4.356738,-3.347088,1.462765,4
3,0.000526,5e-06,0.000219,5e-06,0.1,0.95,"{'alpha': 0.1, 'l1_ratio': 0.95}",-3.213052,-1.472417,-5.396258,-2.177452,-4.24108,-3.300052,1.406248,3
4,0.000632,0.000143,0.000244,2.9e-05,0.1,0.99,"{'alpha': 0.1, 'l1_ratio': 0.99}",-3.208124,-1.478489,-5.380242,-2.181097,-4.222968,-3.294184,1.396953,2
5,0.000625,3.5e-05,0.000245,3e-06,0.1,1.0,"{'alpha': 0.1, 'l1_ratio': 1}",-3.206943,-1.480065,-5.376257,-2.182076,-4.21846,-3.29276,1.394613,1
6,0.000659,5.5e-05,0.000246,9e-06,1.5,0.1,"{'alpha': 1.5, 'l1_ratio': 0.1}",-12.859602,-7.539015,-14.606112,-10.28203,-10.293937,-11.116139,2.424341,12
7,0.000575,3.9e-05,0.000233,1.3e-05,1.5,0.5,"{'alpha': 1.5, 'l1_ratio': 0.5}",-12.209001,-6.625225,-13.884804,-9.306878,-8.98328,-10.201837,2.556206,11
8,0.000536,1.4e-05,0.000226,6e-06,1.5,0.7,"{'alpha': 1.5, 'l1_ratio': 0.7}",-11.549347,-5.887566,-13.072439,-8.46982,-8.176765,-9.431187,2.561264,10
9,0.000539,6e-06,0.000223,4e-06,1.5,0.95,"{'alpha': 1.5, 'l1_ratio': 0.95}",-10.413587,-4.59322,-11.616932,-6.968736,-6.861456,-8.090786,2.562512,9


In [101]:
y_pred=grid_model.predict(X_test)

In [102]:
mean_squared_error(y_test,y_pred)

2.3873426420874737