In [327]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [328]:
# Read in the file
data = pd.read_csv("Fish.csv")

In [329]:
# Take a look at the counts of different species
data

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.5200,4.0200
1,Bream,290.0,24.0,26.3,31.2,12.4800,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.7300,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.4440,5.1340
...,...,...,...,...,...,...,...
154,Smelt,12.2,11.5,12.2,13.4,2.0904,1.3936
155,Smelt,13.4,11.7,12.4,13.5,2.4300,1.2690
156,Smelt,12.2,12.1,13.0,13.8,2.2770,1.2558
157,Smelt,19.7,13.2,14.3,15.2,2.8728,2.0672


In [330]:
# Observe the correlation using seaborn pairplot
#fig = sns.pairplot(data)
#fig.savefig("./pair_plot.png", dpi = 1000)

In [331]:
# We will make use of the species for regression
# by one-hot encoding.
# Bream: 001, Roach: 010, Whitefish: 011, Parkki: 100, Perch: 101, Pike: 110, Smelt: 111
types = {
    "Bream": 0b00100,
    "Roach": 0b01000,
    "Whitefish": 0b01100,
    "Parkki": 0b10000,
    "Perch": 0b10000,
    "Pike": 0b11000,
    "Smelt": 0b11100,
}
types = {
    "Bream": 0,
    "Roach": 1,
    "Whitefish": 2,
    "Parkki": 3,
    "Perch": 4,
    "Pike": 5,
    "Smelt": 6,
}

def get_type(name):
    return types.get(name, "Invalid Season")

s_type = []

for index, row in data.iterrows():
    species = row['Species']
    s_type.append(get_type(species))

data['SpeciesCode'] = s_type

data

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width,SpeciesCode
0,Bream,242.0,23.2,25.4,30.0,11.5200,4.0200,4
1,Bream,290.0,24.0,26.3,31.2,12.4800,4.3056,4
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961,4
3,Bream,363.0,26.3,29.0,33.5,12.7300,4.4555,4
4,Bream,430.0,26.5,29.0,34.0,12.4440,5.1340,4
...,...,...,...,...,...,...,...,...
154,Smelt,12.2,11.5,12.2,13.4,2.0904,1.3936,28
155,Smelt,13.4,11.7,12.4,13.5,2.4300,1.2690,28
156,Smelt,12.2,12.1,13.0,13.8,2.2770,1.2558,28
157,Smelt,19.7,13.2,14.3,15.2,2.8728,2.0672,28


In [332]:
# Read in the random train/test split we did for you.
import pickle as pkl
with open('experiments.pkl', 'rb') as f:
    indices = pkl.load(f)
# train/test split with first 80% indices as training sample

In [333]:
# use the rest of the features to predict the Weight of the fish
data['Index'] = indices
data = data.sort_values("Index", ascending=True)
train_count = round(data.shape[0] * 0.8)
test_count = data.shape[0] - train_count
train_set = data.head(train_count)
test_set = data.tail(test_count)
train_set

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width,SpeciesCode,Index
42,Roach,120.0,19.4,21.0,23.7,6.1146,3.2943,8,0
26,Bream,720.0,32.0,35.0,40.6,16.3618,6.0900,4,1
39,Roach,120.0,18.6,20.0,22.2,6.2160,3.5742,8,2
139,Pike,770.0,44.8,48.0,51.2,7.6800,5.3760,24,3
143,Pike,1550.0,56.0,60.0,64.0,9.6000,6.1440,24,4
...,...,...,...,...,...,...,...,...,...
25,Bream,725.0,31.8,35.0,40.9,16.3600,6.0532,4,122
3,Bream,363.0,26.3,29.0,33.5,12.7300,4.4555,4,123
122,Perch,820.0,37.1,40.0,42.5,11.1350,6.6300,16,124
56,Whitefish,270.0,24.1,26.5,29.3,8.1454,4.2485,12,125


In [334]:
# standardize the data using the training set
# no sklearn method allowed here

import copy

train_mean = []
train_std = []
# col_names = ["Length1", "Length2", "Length3", "Height", "Width", "SpeciesCode"]
col_names = ["Length1", "Length2", "Length3", "Height", "Width"]


def get_mean_and_std(col):
    mean = col.mean(axis=0)
    std = col.std(axis=0)
    return mean, std

def standardize_col(col, mean, std):
    standardized_col = []
    for cell in col:
        standardized_col.append((cell - mean) / std)
    return standardized_col


def standardize_sets(trains, tests):
    for col_name in col_names:
        mean, std = get_mean_and_std(trains[col_name])
        trains[col_name] = standardize_col(trains[col_name], mean, std)
        tests[col_name] = standardize_col(tests[col_name], mean, std)


standardized_train_set = copy.deepcopy(train_set)
standardized_test_set = copy.deepcopy(test_set)
standardize_sets(standardized_train_set, standardized_test_set)

print(standardized_train_set)
print(train_set)

       Species  Weight   Length1   Length2   Length3    Height     Width  \
42       Roach   120.0 -0.669864 -0.676812 -0.636856 -0.655727 -0.638593   
26       Bream   720.0  0.538474  0.574355  0.758778  1.659945  0.974855   
39       Roach   120.0 -0.746584 -0.766181 -0.760729 -0.632812 -0.477057   
139       Pike   770.0  1.765991  1.736153  1.634146 -0.301976  0.562793   
143       Pike  1550.0  2.840069  2.808581  2.691193  0.131907  1.006020   
..         ...     ...       ...       ...       ...       ...       ...   
25       Bream   725.0  0.519294  0.574355  0.783552  1.659538  0.953617   
3        Bream   363.0 -0.008155  0.038140  0.172447  0.839227  0.031557   
122      Perch   820.0  1.027563  1.021200  0.915683  0.478788  1.286499   
56   Whitefish   270.0 -0.219135 -0.185282 -0.174397 -0.196805 -0.087907   
45       Roach   160.0 -0.564374 -0.542759 -0.504725 -0.448096 -0.335029   

     SpeciesCode  Index  
42             8      0  
26             4      1  
39       

In [335]:
# use the linear_regression model to solve this problem
# from linear_regression import LinearReg
class LinearReg(object):
    def __init__(self, indim=1, outdim=1):
        # initialize the parameters first.
        self.W = np.zeros(shape=(indim+1, outdim))

    def fit(self, X, T):
        # implement the .fit() using the simple least-square closed-form solution:
        X = np.hstack([X, np.ones(shape=[X.shape[0], 1])])
        self.W = np.linalg.inv(X.T@X) @ X.T @ T
        # HINT:
        #   extend the input features before fitting to it.
        #   compute the weight matrix of shape [indim+1, outdim]

    def predict(self, X):
        # implement the .predict() using the parameters learned by .fit()
        X = np.hstack([X, np.ones(shape=[X.shape[0], 1])])
        return X @ self.W

# create the model
model = LinearReg()
model2 = LinearReg()

std_param = standardized_train_set.iloc[:, 2:7]
std_target = standardized_train_set.iloc[:, 1:2]

std_test_param = standardized_test_set.iloc[:, 2:7]
std_test_target = standardized_test_set.iloc[:, 1:2]

# fit the model
model.fit(std_param, std_target)
# predict
predict_std_train = model.predict(std_param)
predict_std_test = model.predict(std_test_param)

# get_mse_loss(W, std_param, std_target)
def get_mse_loss(p, t):
    loss_sum = []
    p_val = np.array(p).T
    t_val = np.array(t).T
    for i in range(0, p_val.shape[0]):
        loss_sum.append(pow(p_val[i] - t_val[i], 2))
    return np.mean(loss_sum)

# compute the loss on the training set
print("loss for standardized train_set is: ")
print(get_mse_loss(predict_std_train, std_target))
# compute the loss on the test set
print("loss for standardized test_set is: ")
print(get_mse_loss(predict_std_test, std_test_target))
# report the losses

loss for standardized train_set is: 
14842.877511737306
loss for standardized test_set is: 
14522.561541809704


In [336]:
# Bonus: Further Analysis
# Can you further analyze the result yielded and make improvement?