In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.model_selection import train_test_split, cross_validate, KFold,RepeatedKFold, LeaveOneOut, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [6]:
df = sns.load_dataset('tips')
y_col = 'tip'
X = df.drop(columns=[y_col])

numeric_cols = X.select_dtypes(include=np.number).columns.to_list()
X = pd.get_dummies(df, drop_first=True)
y = df[y_col]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

In [7]:
print(len(X_train))
print(len(X_test))

170
74


In [8]:
scaler = StandardScaler()
X_train_scaled = X_train.copy()

X_train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test_scaled = X_test.copy()
X_test_scaled[numeric_cols] = scaler.fit_transform(X_test[numeric_cols])

In [9]:
model = LinearRegression()
model.fit(X_train_scaled,y_train)
y_pred = model.predict(X_test_scaled)

In [10]:
mse = mean_squared_error(y_test, y_pred)
mse

1.1786274842364611e-30

## LOOCV

In [12]:
X = df['total_bill'].values.reshape(-1,1)
y = df['tip']
loo =  LeaveOneOut()
list(loo.split(X))

[(array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
          14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
          27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
          40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
          53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
          66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
          79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
          92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
         105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
         118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
         131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
         144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
         157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
         170, 171, 172, 173, 174, 175,

In [13]:
model = LinearRegression()
mse_list = []
for train_idx,test_idx in loo.split(X):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train,y_test = y[train_idx], y[test_idx]

    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test,y_pred)
    mse_list.append(mse)

In [14]:
mse_list

[2.897838258616484,
 0.12199147113328179,
 0.14042273022222176,
 0.009550694936171571,
 0.011609678564894612,
 1.2998883775715193,
 0.025706592492901047,
 0.39380695083631817,
 0.2945202967072801,
 0.5800044496603817,
 0.08493502852157918,
 0.14660920197932653,
 0.9500539483266901,
 0.020948372820339643,
 0.2971844385557556,
 0.5423476873486901,
 0.11433091165193825,
 1.175081134557344,
 0.6417247888928548,
 0.0686745374046344,
 1.6465559530878566,
 0.09148244748691699,
 0.12126066578074082,
 6.666352057083047,
 0.031997184135237944,
 0.20494035785394718,
 0.10659134805438578,
 0.06489287202120543,
 1.221995691465723,
 0.00025806730123078213,
 0.22830383271644533,
 0.12175698553699285,
 0.25069432748604326,
 0.41718865336384836,
 0.23472210330905421,
 0.023597808760119673,
 0.4047836824072319,
 0.1393928872163391,
 0.3312838690001625,
 0.647090343417676,
 0.13441946379811676,
 0.04619875835547508,
 0.4619653287847869,
 0.38783229608135605,
 2.2557831144302423,
 0.02544159100320264,
 3.

In [15]:
print(f'MSE(LOOCV): {np.mean(mse_list)}')
print(f'std: {np.std(mse)}')

MSE(LOOCV): 1.0675673489857436
std: 0.0


In [17]:
cv = LeaveOneOut()
scores = cross_val_score(model,X,y,cv=cv,scoring='neg_mean_squared_error')
print(f'MSE(LOOCV): {-np.mean(scores)}')
print(f'std: {np.std(scores)}')


MSE(LOOCV): 1.0675673489857436
std: 2.099794455177631


## KFoldcv

In [20]:
mse_list = []
k = 5
cv = KFold(n_splits=k, shuffle=True, random_state=0)
for train_idx, test_idx in cv.split(X):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test,y_pred)
    mse_list.append(mse)


In [21]:
mse_list

[0.8213090642766288,
 1.0745842125927976,
 1.0880123892600384,
 1.3323867714930204,
 1.084763004349474]

In [22]:
print(f'MSE{k}FoldCV: {np.mean(mse_list)}')
print(f'std: {np.std(mse_list)}')

MSE5FoldCV: 1.0802110883943918
std: 0.16170100507039512


In [23]:
scores = cross_val_score(model,X,y,cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)

In [24]:
scores

array([-0.82130906, -1.07458421, -1.08801239, -1.33238677, -1.084763  ])

In [25]:
print(f'MSE{k}FoldCV: {np.mean(scores)}')
print(f'std: {np.std(scores)}')

MSE5FoldCV: -1.0802110883943918
std: 0.16170100507039514


## RepeatedKFold

In [27]:
mse_list = []
k = 5
n_repeats = 3
cv = RepeatedKFold(n_splits=k,n_repeats=n_repeats,random_state=0)
model = LinearRegression()

for train_idx, test_idx in cv.split(X):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test,y_pred)
    mse_list.append(mse)


In [30]:
print(f'MSE({k}FoldCV): {np.mean(mse_list)}')
print(f'std: {np.std(mse_list)}')

MSE(5FoldCV): 1.0746387233165984
std: 0.26517178540898434


## Pipeline

In [32]:
from sklearn.pipeline import make_pipeline, Pipeline

In [35]:
Pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('model', LinearRegression())])

In [37]:
cv = KFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(Pipeline,X,y,scoring='neg_mean_squared_error',cv=cv)
scores

array([-0.82130906, -1.07458421, -1.08801239, -1.33238677, -1.084763  ])

In [38]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

In [40]:
model = LinearRegression()
scaler = StandardScaler()
X_scaled_train = scaler.fit_transform(X_train)
X_scaled_test = scaler.transform(X_test) 
model.fit(X_train_scaled,y_train)
y_pred = model.predict(X_scaled_test)



ValueError: X has 1 features, but LinearRegression is expecting 9 features as input.

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)
Pipeline = Pipeline(steps= [('scaler', StandardScaler()), ('model', LinearRegression())])
Pipeline.fit(X_train,y_train)
y_pred = Pipeline.predict(X_test)