In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('../DATA/Advertising.csv')

In [4]:
X = df.drop('sales', axis=1)
y = df['sales']

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
from sklearn.linear_model import LinearRegression

In [8]:
model = LinearRegression()

In [9]:
model.fit(X_train, y_train)

In [10]:
y_pred = model.predict(X_test)

In [11]:
from sklearn.metrics import mean_squared_error

In [12]:
mean_squared_error(y_test, y_pred)

3.79679723671522

In [13]:
from sklearn.tree import DecisionTreeRegressor

In [14]:
model = DecisionTreeRegressor()

In [15]:
model.fit(X_train, y_train)

In [16]:
y_pred = model.predict(X_test)

In [17]:
mean_squared_error(y_test, y_pred)

1.763833333333334

In [19]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [20]:
scaler = StandardScaler()

In [21]:
X_std_scaler = scaler.fit_transform(X)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_std_scaler, y, test_size=0.3, random_state=42)

In [23]:
model = DecisionTreeRegressor()

In [24]:
model.fit(X_train, y_train)

In [25]:
y_pred = model.predict(X_test)

In [26]:
mean_squared_error(y_test, y_pred)

1.4945000000000013

In [27]:
X

Unnamed: 0,TV,radio,newspaper
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3
3,151.5,41.3,58.5
4,180.8,10.8,58.4
...,...,...,...
195,38.2,3.7,13.8
196,94.2,4.9,8.1
197,177.0,9.3,6.4
198,283.6,42.0,66.2


In [28]:
scaler = MinMaxScaler()

In [29]:
X_minmax_scaler = scaler.fit_transform(X)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_minmax_scaler, y, test_size=0.3, random_state=42)

In [31]:
model = DecisionTreeRegressor()

In [32]:
model.fit(X_train, y_train)

In [33]:
y_pred = model.predict(X_test)

In [34]:
mean_squared_error(y_test, y_pred)

2.0420000000000003

# no pipeline

In [48]:
df = pd.read_csv('../DATA/Advertising.csv')

In [49]:
X = df.drop('sales', axis=1)
y = df['sales']

In [50]:
from sklearn.model_selection import train_test_split

In [51]:
from sklearn.preprocessing import StandardScaler

In [52]:
scaler = StandardScaler()

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [54]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [55]:
model = DecisionTreeRegressor()

In [57]:
model.fit(X_train, y_train)

In [58]:
y_pred = model.predict(X_test)

In [59]:
mean_squared_error(y_test, y_pred)

1.6856666666666669

In [60]:
new_data = np.array([[230, 78, 63]])

In [63]:
new_data = scaler.transform(new_data)



In [65]:
new_data

array([[0.88051455, 3.73545887, 1.57831749]])

In [64]:
model.predict(new_data)

array([25.4])

# pipeline

In [66]:
df = pd.read_csv('../DATA/Advertising.csv')

In [67]:
X = df.drop('sales', axis=1)
y = df['sales']

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [69]:
model = DecisionTreeRegressor()
scaler = StandardScaler()

In [70]:
from sklearn.pipeline import Pipeline

In [71]:
pipe = Pipeline([('scaler', scaler), ('model', model)])

In [72]:
pipe.fit(X_train, y_train)

In [73]:
y_pred = pipe.predict(X_test)

In [74]:
mean_squared_error(y_test, y_pred)

1.8668333333333338

In [75]:
new_data = np.array([[230, 175, 68]])

In [76]:
pipe.predict(new_data)



array([25.4])

# pipeline + gridsearch + cross validation

In [77]:
df = pd.read_csv('../DATA/Advertising.csv')

In [78]:
X = df.drop('sales', axis=1)
y = df['sales']

In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [85]:
model = DecisionTreeRegressor()
scaler = StandardScaler()

In [86]:
pipe = Pipeline([('scaler', scaler), ('model', model)])

In [91]:
model_params = {
    "model__criterion" : ["squared_error", "friedman_mse", "absolute_error", "poisson"],
    "model__splitter" : ["best", "random"],
    "model__max_depth": [None, 2, 6, 10]
}

In [92]:
from sklearn.model_selection import GridSearchCV

In [93]:
final_model = GridSearchCV(pipe, param_grid=model_params, cv=5, verbose=2)

In [94]:
final_model.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END model__criterion=squared_error, model__max_depth=None, model__splitter=best; total time=   0.0s
[CV] END model__criterion=squared_error, model__max_depth=None, model__splitter=best; total time=   0.0s
[CV] END model__criterion=squared_error, model__max_depth=None, model__splitter=best; total time=   0.0s
[CV] END model__criterion=squared_error, model__max_depth=None, model__splitter=best; total time=   0.0s
[CV] END model__criterion=squared_error, model__max_depth=None, model__splitter=best; total time=   0.0s
[CV] END model__criterion=squared_error, model__max_depth=None, model__splitter=random; total time=   0.0s
[CV] END model__criterion=squared_error, model__max_depth=None, model__splitter=random; total time=   0.0s
[CV] END model__criterion=squared_error, model__max_depth=None, model__splitter=random; total time=   0.0s
[CV] END model__criterion=squared_error, model__max_depth=None, model__splitter=random; tota

In [95]:
final_model.best_score_

0.9368280615912326

In [96]:
final_model.best_estimator_