# Simple train test split

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the advertising dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; adjust it for your environment.
csv_path = 'C:\\Users\\User\\jupyter-noutbooks\\DATA\\Advertising.csv'
df = pd.read_csv(csv_path)

In [4]:
# Preview the first rows to confirm the expected columns (TV, radio, newspaper, sales) loaded.
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [5]:
# Feature matrix: every column of df except the 'sales' target.
X = df.drop(columns='sales')

In [6]:
# Display the feature matrix to verify that 'sales' was removed.
X

Unnamed: 0,TV,radio,newspaper
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3
3,151.5,41.3,58.5
4,180.8,10.8,58.4
...,...,...,...
195,38.2,3.7,13.8
196,94.2,4.9,8.1
197,177.0,9.3,6.4
198,283.6,42.0,66.2


In [7]:
# Target vector: the 'sales' column that the models below learn to predict.
y = df['sales']

In [8]:
# Display the target series.
y

0      22.1
1      10.4
2       9.3
3      18.5
4      12.9
       ... 
195     7.6
196     9.7
197    12.8
198    25.5
199    13.4
Name: sales, Length: 200, dtype: float64

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 30% of the rows as the test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [11]:
# scale data
from sklearn.preprocessing import StandardScaler

In [12]:
# Create an unfitted StandardScaler with default settings.
scaler = StandardScaler()

In [13]:
# Fit the scaler on the training split only, so the test rows do not influence the scaling.
scaler.fit(X_train)

StandardScaler()

In [14]:
x_train_scaled = scaler.transform(X_train)

In [15]:
X_test_scaled = scaler.transform(X_test)

In [16]:
# Creating model part
from sklearn.linear_model import Ridge

In [17]:
model = Ridge(alpha=100)

In [18]:
model.fit(X_train,y_train)

Ridge(alpha=100)

In [19]:
y_pred = model.predict(X_test)

In [20]:
# Measure error metrics
from sklearn.metrics import mean_squared_error

In [21]:
# Mean squared error of the test-set predictions against the true sales values.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

# Train / validation / test split

In [22]:
# Train / validation / test workflow: tune on the validation split, report the final metric on the test split.
# NOTE(review): this reload only feeds len(df) below; the splits that follow reuse the X and y
# built in the first section.
df = pd.read_csv('C:\\Users\\User\\jupyter-noutbooks\\DATA\\Advertising.csv')

In [23]:
# First cut: keep 70% for training and set aside 30% to be divided into validation and test.
X_train, X_other, y_train, y_other = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [24]:
# Second cut: halve the held-out 30% into a validation set and a test set (15% of the data each).
X_val, X_test, y_val, y_test = train_test_split(
    X_other, y_other, test_size=0.5, random_state=42
)

In [25]:
# Total number of rows, for comparison with the split sizes below.
len(df)

200

In [26]:
# Size of the training split.
len(X_train)

140

In [27]:
# Size of the validation split.
len(X_val)

30

In [28]:
# Size of the test split.
len(X_test)

30

In [29]:
from sklearn.preprocessing import StandardScaler

In [30]:
# Fresh scaler for this section; rebinding the name discards the scaler fitted earlier.
scaler = StandardScaler()

In [31]:
# Fit on the training split only (not on the validation or test rows).
scaler.fit(X_train)

StandardScaler()

In [32]:
# Scaled version of the training features.
# NOTE(review): the model.fit call further down uses X_train, not this array.
X_train_scaled = scaler.transform(X_train)

In [33]:
# Apply the training-set scaling to the test split.
X_test_scaled = scaler.transform(X_test)

In [34]:
# Apply the training-set scaling to the validation split.
X_val_scaled = scaler.transform(X_val)

In [35]:
from sklearn.linear_model import Ridge

In [36]:
# Ridge regression with regularization strength alpha=100.
model = Ridge(alpha=100)

In [37]:
# Train on the 70% training split.
# NOTE(review): this fits on the unscaled X_train; the X_train_scaled array built above is
# never used. The ridge penalty is scale-sensitive, so confirm whether the scaled features
# were intended here and in the predict calls below.
model.fit(X_train,y_train)

Ridge(alpha=100)

In [38]:
# Predictions for the validation split (unscaled features, matching the fit above).
y_val_predictions = model.predict(X_val)

In [39]:
from sklearn.metrics import mean_squared_error

In [40]:
# Validation MSE: the number to compare when trying different hyperparameters.
mean_squared_error(y_true=y_val, y_pred=y_val_predictions)

2.541031529267014

In [41]:
# Predictions for the held-out test split, which neither the fit nor the validation step above touched.
true_predictions = model.predict(X_test)

In [42]:
# Test MSE: the final performance figure for the validated model.
mean_squared_error(y_true=y_test, y_pred=true_predictions)

5.031779528956173

# K-fold cross-validation with scikit-learn

In [43]:
# Reload the dataset.
# NOTE(review): df is not referenced again in this section; the split below reuses the
# X and y created in the first section.
df = pd.read_csv('C:\\Users\\User\\jupyter-noutbooks\\DATA\\Advertising.csv')

In [44]:
# 70/30 train/test split; cross-validation below runs inside the training portion only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [45]:
# Fresh scaler for this section.
scaler = StandardScaler()

In [46]:
# Fit on the training split only.
scaler.fit(X_train)

StandardScaler()

In [47]:
# Scaled version of the training features.
# NOTE(review): the cross_val_score and fit calls below use X_train, not this array.
X_train_scaled = scaler.transform(X_train)

In [48]:
# Apply the training-set scaling to the test split.
# NOTE(review): the final predict call below uses X_test, not this array.
X_test_scaled = scaler.transform(X_test)

In [49]:
from sklearn.model_selection import cross_val_score

In [50]:
# First candidate: Ridge with alpha=100.
model = Ridge(alpha=100)

In [51]:
# 5-fold cross-validation on the training split; each score is a negated MSE (closer to 0 is better).
# NOTE(review): X_train here is unscaled; X_train_scaled from the cell above is unused.
# Confirm whether the scaled features (or a scaler+model pipeline) were intended.
scores = cross_val_score(model,X_train,y_train,scoring='neg_mean_squared_error',cv=5)

In [52]:
# Average the five fold scores and drop the sign to report a positive mean MSE.
abs(scores.mean())

2.769926167820609

In [53]:
# Per-fold negated MSE values for the alpha=100 model.
scores

array([-2.77256718, -1.56959186, -2.43645543, -2.21767666, -4.85333971])

In [54]:
# Second candidate: Ridge with a much weaker penalty (alpha=1).
model_2 = Ridge(alpha=1)

In [55]:
# Same 5-fold cross-validation for the alpha=1 model, on the same unscaled X_train as above.
scores1 = cross_val_score(model_2,X_train,y_train,scoring='neg_mean_squared_error',cv=5)

In [56]:
# Per-fold negated MSE values for the alpha=1 model.
scores1

array([-2.77309694, -1.57706489, -2.41062918, -2.21679814, -4.87376048])

In [57]:
# Mean cross-validated MSE for alpha=1.
# NOTE(review): this is almost identical to the alpha=100 result (2.7703 vs 2.7699), possibly
# because the features are unscaled and the penalty barely affects the fit. Verify.
abs(scores1.mean())

2.770269927348527

In [58]:
# Fit the alpha=1 model on the whole training split before the final test evaluation.
# NOTE(review): this fits on the unscaled X_train; X_train_scaled is never used in this section.
model_2.fit(X_train,y_train)

Ridge(alpha=1)

In [59]:
# Predict the test split with the fitted model (unscaled features, matching the fit above).
y_final_pred = model_2.predict(X_test)

In [60]:
# Test MSE of the alpha=1 Ridge model.
mean_squared_error(y_true=y_test, y_pred=y_final_pred)

3.796691873092173

# Grid Search

In [61]:
# Reload the dataset; the next cell rebuilds X and y from this frame.
df = pd.read_csv('C:\\Users\\User\\jupyter-noutbooks\\DATA\\Advertising.csv')

In [62]:
# Build the target ('sales') and the feature matrix (all remaining columns).
y = df['sales']
X = df.drop(columns='sales')

# Split before scaling so the scaler never sees the test rows.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

# Standardize with training-set statistics; X_train and X_test are overwritten
# with their scaled versions, so everything below works on scaled features.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [64]:
from sklearn.linear_model import ElasticNet

In [66]:
# ElasticNet with default hyperparameters; the grid search below supplies alpha and l1_ratio.
base_model = ElasticNet()

In [71]:
# Hyperparameter grid: 6 penalty strengths x 6 L1/L2 mixing ratios = 36 combinations.
param_grid = {
    'alpha': [0.1, 1, 5, 10, 50, 100],
    'l1_ratio': [0.1, 0.5, 0.7, 0.95, 0.99, 1],
}

In [68]:
from sklearn.model_selection import GridSearchCV

In [72]:
# Exhaustive search over param_grid, scored by negated MSE with 5-fold cross-validation.
grid_model = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [73]:
# Run the search on the scaled training split: every parameter combination is cross-validated.
grid_model.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=ElasticNet(),
             param_grid={'alpha': [0.1, 1, 5, 10, 50, 100],
                         'l1_ratio': [0.1, 0.5, 0.7, 0.95, 0.99, 1]},
             scoring='neg_mean_squared_error')

In [74]:
# Estimator configured with the best-scoring parameter combination.
grid_model.best_estimator_

ElasticNet(alpha=0.1, l1_ratio=1)

In [75]:
# The winning parameter values as a dict.
grid_model.best_params_

{'alpha': 0.1, 'l1_ratio': 1}

In [76]:
# Predict the test split (already scaled in the preparation cell) with the searched model.
y_pred = grid_model.predict(X_test)

In [77]:
# Test MSE of the tuned ElasticNet.
mean_squared_error(y_true=y_test, y_pred=y_pred)

2.3873426420874737