In [1]:
import pandas as pd
import numpy as np

### get and preprocess data

In [2]:
# Load the abalone dataset. The column names are supplied explicitly through
# `names`, so pandas treats every row of the file as data.
df = pd.read_csv(
    "abalone.data",
    names=[
        "Sex",
        "Length",
        "Diameter",
        "Height",
        "Whole weight",
        "Shucked weight",
        "Viscera weight",
        "Shell weight",
        "Rings",
    ],
)

In [3]:
# data preprocessing:
# replace the nominal "Sex" attribute by three 0/1 indicator (one-hot) columns
df["Male"] = df["Sex"].eq("M").astype("int64")
df["Female"] = df["Sex"].eq("F").astype("int64")
df["Infant"] = df["Sex"].eq("I").astype("int64")
del df["Sex"]
# target vector y = age = rings + 1.5; pop() also removes "Rings" from the
# table so that it does not end up among the features
y = np.array(df.pop("Rings") + 1.5)
# the remaining columns are the attribute matrix used for training
X = np.array(df)

### ordinary least squares functions

In [4]:
# ordinary least squares from the lecture
def OLS(X,y):
    """Fit a linear model by ordinary least squares.

    Finds the weight vector ``w`` that minimises ``||X w - y||^2``. For a
    design matrix with full column rank this is the lecture's closed form
    ``(X^T X)^{-1} X^T y``.

    The system is solved with ``np.linalg.lstsq`` instead of inverting
    ``X^T X`` explicitly: forming and inverting the Gram matrix loses
    precision on ill-conditioned data and raises ``LinAlgError`` when the
    columns of ``X`` are linearly dependent. ``lstsq`` returns the
    minimum-norm least-squares solution in that case.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Attribute matrix.
    y : array-like, shape (n_samples,) or (n_samples, n_targets)
        Target values.

    Returns
    -------
    numpy.ndarray
        Weights of shape (n_features,) or (n_features, n_targets).
    """
    # lstsq also returns residuals, rank and singular values; only the
    # solution is needed here. rcond=None selects the machine-precision cutoff.
    w, _residuals, _rank, _singular_values = np.linalg.lstsq(X, y, rcond=None)
    return w

In [5]:
# mean squared error of the linear hypothesis w on the data (X, y)
def mean_squared_error(w, X, y):
    """Return the mean squared error of the predictions ``X w`` against ``y``.

    Parameters
    ----------
    w : weight vector of the linear model
    X : attribute matrix, one row per sample
    y : true target values, one per row of ``X``

    Returns
    -------
    The sum of squared residuals divided by ``len(y)``.
    """
    predictions = np.matmul(X, w)
    residuals = np.subtract(y, predictions)
    squared_error_total = np.power(residuals, 2).sum()
    return squared_error_total / len(y)

### cross validation functions

In [6]:
# split training and test data for cross validation
def cv_split(X,y,k,data_part):
    """Split (X, y) into the training and test part of one cross-validation fold.

    The data is cut into ``k`` consecutive blocks (no shuffling). Block number
    ``data_part`` becomes the test set and all remaining rows become the
    training set.

    Parameters
    ----------
    X : array-like, one row per sample
    y : array-like, one target per sample
    k : int
        Number of folds, at least 1.
    data_part : int
        Zero-based index of the fold used for testing, ``0 <= data_part < k``.

    Returns
    -------
    train_X, train_y, test_X, test_y
        The training arrays are always newly concatenated arrays; the test
        parts are slices of the inputs.

    Raises
    ------
    ValueError
        If ``k < 1`` or ``data_part`` is not in ``[0, k)``. Previously an
        out-of-range ``data_part`` fell through every branch and failed with an
        ``UnboundLocalError`` at the return statement.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got " + str(k))
    if not 0 <= data_part < k:
        raise ValueError("data_part must satisfy 0 <= data_part < k, got " + str(data_part))

    # Fold boundaries. X and y get their own bounds, each computed from its
    # own length.
    X_start = int(data_part*len(X)/k)
    X_stop = int((data_part + 1)*len(X)/k)
    y_start = int(data_part*len(y)/k)
    y_stop = int((data_part + 1)*len(y)/k)

    test_X = X[X_start:X_stop]
    test_y = y[y_start:y_stop]

    # One code path for all folds: for the first (last) fold the leading
    # (trailing) slice is simply empty, so no special cases are needed.
    train_X = np.concatenate((X[:X_start], X[X_stop:]))
    train_y = np.concatenate((y[:y_start], y[y_stop:]))
    return train_X, train_y, test_X, test_y

In [7]:
# cross validation
def cross_validation(k, X, y, train_function, loss_function):
    """Run k-fold cross validation and report the loss of every fold.

    For each of the ``k`` folds the data is split with ``cv_split``, a model
    is trained on the training part with ``train_function`` and evaluated on
    the test part with ``loss_function``. The loss of every fold and the mean
    over all folds are printed.

    Parameters
    ----------
    k : int
        Number of folds; must be between 1 and the number of samples so that
        no fold has an empty test set.
    X : array-like, one row per sample
    y : array-like, one target per sample
    train_function : callable
        Called as ``train_function(train_X, train_y)``; returns the hypothesis.
    loss_function : callable
        Called as ``loss_function(hypothesis, test_X, test_y)``; returns the
        loss of the hypothesis on the test part.

    Returns
    -------
    The mean of the fold losses. It is returned in addition to being printed
    so that callers can compare models programmatically.
    """
    assert len(X) == len(y), "Attribute matrix and age vector must have the same size\n" # DEBUG
    assert callable(train_function), "train_function must be a function which trains the model\n" # DEBUG
    assert callable(loss_function), "loss_function must be a function which computes the loss of the model\n" # DEBUG
    assert 1 <= k <= len(X), "k must be between 1 and the number of samples\n" # DEBUG
    
    mean_loss = 0
    for data_part in range(k):
        # split training and test data
        train_X, train_y, test_X, test_y = cv_split(X, y, k, data_part) 
        # train model
        hypothesis = train_function(train_X, train_y)
        # test model
        loss = loss_function(hypothesis, test_X, test_y)
        print("loss for split " + str(data_part + 1) + " is " + str(loss))
        # accumulate the running mean fold by fold
        mean_loss += loss/k
    print("mean loss is " + str(mean_loss))
    return mean_loss

### test program with 10-fold cross validation

In [8]:
# evaluate ordinary least squares on the abalone data with k folds
k = 10
cross_validation(
    k=k,
    X=X,
    y=y,
    train_function=OLS,
    loss_function=mean_squared_error,
)

loss for split 1 is 6.624242254023692
loss for split 2 is 11.619128634853663
loss for split 3 is 2.941502616152837
loss for split 4 is 2.7061406629099105
loss for split 5 is 4.029795139704605
loss for split 6 is 8.45006870865274
loss for split 7 is 2.4186703082904057
loss for split 8 is 5.1465175470661535
loss for split 9 is 3.6215640786669105
loss for split 10 is 4.26266147999485
mean loss is 5.182029143031577
