In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.model_selection import KFold

In [None]:
# Read "Ecommerce.csv" (path is relative to the notebook's working directory)
# into a DataFrame, then show its first rows as the cell output.
data = pd.read_csv("Ecommerce.csv")
data.head()

In [None]:
# Predictor columns used by the regression (presumably all numeric, going by
# the variable name — confirm against the CSV).
data_num = data.loc[:, [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]]
# Target column, selected with a list so it stays a one-column DataFrame
# rather than a Series.
Y = data.loc[:, ['Yearly Amount Spent']]

In [None]:
# Hold out 20% of the rows as a final test set.
# The target is selected from `data` directly (not from `Y`) so that re-running
# this cell always splits the full dataset, even though `Y` is rebound to the
# training target here.
# `random_state` pins the shuffle so the split, and every score computed from
# it, is reproducible across kernel restarts.
X, X_test, Y, Y_test = train_test_split(
    data_num,
    data[['Yearly Amount Spent']],
    test_size=0.2,
    random_state=42,
)

In [None]:
class K_Fold():
    """Minimal k-fold cross-validation index splitter.

    The sample indices are (optionally shuffled and then) cut into
    ``n_splits`` consecutive folds. Each fold serves once as the validation
    set while all remaining indices form the training set. When the number of
    samples is not divisible by ``n_splits``, the first
    ``n_samples % n_splits`` folds receive one extra sample.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds to produce.
    shuffle : bool, default=False
        Whether to shuffle the indices before cutting them into folds.
    random_state : int or None, default=None
        Seed for the shuffle. Must stay ``None`` when ``shuffle`` is false.

    Raises
    ------
    ValueError
        If ``random_state`` is set while ``shuffle`` is false.
    """

    def __init__(self, n_splits=5, shuffle=False, random_state=None):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state
        if not self.shuffle and self.random_state is not None:
            raise ValueError("Setting a random_state has no effect since shuffle is False. You should leave random_state to its default (None), or set shuffle=True.")

    def split(self, X):
        """Compute the train/validation index arrays for every fold.

        Parameters
        ----------
        X : sized
            Any object supporting ``len``; only its length is used.

        Returns
        -------
        list of (ndarray, ndarray)
            One ``(train_indices, validation_indices)`` pair per fold, in
            fold order.

        Raises
        ------
        ValueError
            If ``n_splits`` exceeds the number of samples.
        """
        n_samples = len(X)
        if self.n_splits > n_samples:
            raise ValueError(f"Cannot have number of splits n_splits={self.n_splits} greater than the number of samples: n_samples={n_samples}.")

        indices = np.arange(n_samples)
        if self.shuffle:
            # A private RandomState yields the same permutation as seeding the
            # global generator with the same seed, but leaves the global
            # NumPy random state untouched.
            np.random.RandomState(self.random_state).shuffle(indices)

        # The first `n_larger` folds hold `base_size + 1` samples, the rest
        # hold `base_size`.
        base_size, n_larger = divmod(n_samples, self.n_splits)

        result = []
        start = 0
        # Iterate with a local counter: `self.n_splits` must not be modified,
        # otherwise the splitter could only be used once.
        for fold in range(self.n_splits):
            end = start + base_size + (1 if fold < n_larger else 0)
            result.append((np.append(indices[:start], indices[end:]), indices[start:end]))
            start = end
        return result

Evaluating the model with K-fold cross-validation

In [None]:
# Cross-validate a linear regression on the training split with
# scikit-learn's KFold, collecting one validation score per fold.
scores = []
reg = linear_model.LinearRegression()
# random_state fixes the shuffled fold assignment so the scores are
# reproducible from run to run.
cv = KFold(n_splits=10, shuffle=True, random_state=42)
for train_index, test_index in cv.split(X):
    # The splitter yields positional indices, hence .iloc.
    X_train, y_train = X.iloc[train_index], Y.iloc[train_index]
    X_val, y_val = X.iloc[test_index], Y.iloc[test_index]
    reg.fit(X_train, y_train)
    scores.append(reg.score(X_val, y_val))

In [None]:
# Collapse the per-fold scores into their average and display it.
# Note: `scores` is rebound from a list to a single number here.
scores = np.asarray(scores).mean()
scores

In [None]:
# Refit on the whole training split, then display the score on the held-out
# test set (fit returns the estimator, so the calls can be chained).
reg.fit(X, Y).score(X_test, Y_test)

In [None]:
# Repeat the cross-validation using the notebook's own K_Fold splitter,
# collecting one validation score per fold.
scores = []
reg = linear_model.LinearRegression()
# random_state fixes the shuffled fold assignment so the scores are
# reproducible from run to run.
my_cv = K_Fold(n_splits=10, shuffle=True, random_state=42)
for train_index, test_index in my_cv.split(X):
    # The splitter returns positional indices, hence .iloc.
    X_train, y_train = X.iloc[train_index], Y.iloc[train_index]
    X_val, y_val = X.iloc[test_index], Y.iloc[test_index]
    reg.fit(X_train, y_train)
    scores.append(reg.score(X_val, y_val))

In [None]:
# Average the per-fold scores from the custom splitter and display the result.
# Note: `scores` is rebound from a list to a single number here.
scores = np.asarray(scores).mean()
scores

In [None]:
# Refit on the whole training split, then display the score on the held-out
# test set (fit returns the estimator, so the calls can be chained).
reg.fit(X, Y).score(X_test, Y_test)