In [1]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Path of the raw input CSV; cleanup() reads it with header=None (no header row).
fileName = "Datasets/Dataset_3.csv"

def cleanup(path=None, seed=None):
    """Load the raw dataset and impute its missing values.

    The first five columns are dropped as non-predictive. In every remaining
    column that contains missing values, each gap is filled with an
    independent uniform draw from (mean - std, mean + std) of that column;
    the lower bound is clamped to 0 when mean - std is negative so the
    filled values stay non-negative. Filling with the plain mean would
    produce many identical values and skew the distribution.

    Parameters
    ----------
    path : str, optional
        CSV file to read (no header row). Defaults to the module-level
        ``fileName``.
    seed : int, optional
        Seed for the imputation draws. When omitted, the global NumPy random
        state is used, so the filled values differ from call to call.

    Returns
    -------
    pandas.DataFrame
        The data without its first five columns and without missing values.
    """
    if path is None:
        path = fileName
    # A private RandomState keeps seeded runs reproducible without touching
    # the global generator; unseeded calls keep using np.random as before.
    rng = np.random if seed is None else np.random.RandomState(seed)

    data = pd.read_csv(path, header=None)
    # Remove first 5 columns as they are non-predictive
    data = data.drop(list(range(5)), axis=1)

    # Labels of the columns that contain at least one missing value.
    naColIndex = list(data.columns[data.isnull().any()])
    for col in naColIndex:
        missing = data[col].isnull()
        mean = data[col].mean()
        std = data[col].std()
        low = 0 if mean - std < 0 else mean - std
        # Single .loc assignment: chained indexing (data[col][mask] = ...)
        # may write to a temporary copy and leave `data` unchanged.
        data.loc[missing, col] = rng.uniform(low, mean + std, size=missing.sum())

    return data

In [None]:
# Build the imputed dataset and persist it as the "complete" deliverable.
# NOTE: to_csv is called with its defaults, so the row index and the column
# labels are written to the file alongside the values.
data = cleanup()
data.to_csv("Datasets/Dataset_3_complete.csv") # Turning in complete data set

def addOnes(data):
    """Prepend a constant column of ones (bias term) and renumber the columns.

    The frame is modified in place and also returned for convenience.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature/target table; must not already contain a column labelled ''.

    Returns
    -------
    pandas.DataFrame
        The same object, now with a leading column of ones and column labels
        0 .. n_columns - 1.
    """
    data.insert(0, '', 1)
    # Assign to `columns` (plural): setting any other attribute name only
    # creates a plain Python attribute and leaves the labels untouched.
    data.columns = np.arange(data.shape[1])
    return data

In [None]:
# Using functions from Q2 and slightly modifying them

def predictedY(row, weights):
    """Return the linear model output for one sample.

    Computes the dot product of `row` and `weights`; only the first
    len(weights) entries of `row` are read.
    """
    total = 0
    for idx, weight in enumerate(weights):
        total = total + row[idx] * weight
    return total

def updateWeights(row, y, w, a): # a is learning rate
    """Perform one stochastic-gradient step on a single sample.

    Each weight is moved against the squared-error gradient:
    w_i <- w_i - a * (prediction - y) * row_i.

    Returns a new list of weights; the list passed in is not modified.
    """
    # The scaled residual is the same for every weight, so compute it once.
    scaledError = a * (predictedY(row, w) - y)
    updated = []
    for idx, weight in enumerate(w):
        updated.append(weight - scaledError * row[idx])
    return updated

def epoch(x, y, w, a):
    """Run one full pass over the samples, updating the weights after each one.

    `x` holds the feature rows, `y` the matching targets, `w` the starting
    weights and `a` the learning rate. Returns the weights after the pass.
    """
    for idx in range(len(x)):
        features, target = x[idx], y[idx]
        w = updateWeights(features, target, w, a)

    return w

def calcMse(pred_y, y):
    """Return the mean squared error between predictions and targets.

    Iterates over the indices of `y`, so any entries of `pred_y` beyond
    len(y) are ignored. Raises ZeroDivisionError when `y` is empty.
    """
    squaredErrors = [(pred_y[k] - y[k])**2 for k in range(len(y))]
    return sum(squaredErrors) / len(y)

def runEpochs(x, y, epochsNb, stepSize, convergenceRate=1e-5):
    """Train the linear model by SGD until the training MSE stops changing.

    Parameters
    ----------
    x : sequence of rows
        Feature rows; every row must have the same length as x[0].
    y : sequence
        Target value for each row of `x`.
    epochsNb : int
        Maximum number of passes over the data.
    stepSize : float
        Learning rate handed to every weight update.
    convergenceRate : float, optional
        Training stops once the absolute change in MSE between two
        consecutive epochs falls below this threshold.

    Returns
    -------
    (w, mses, itr)
        Final weights, the training MSE recorded after each epoch, and the
        number of epochs actually run.
    """
    mses = []
    w = [1 for _ in range(len(x[0]))]  # all weights start at 1
    itr = 0
    converged = False
    while not converged and itr < epochsNb: # sweeps
        w = epoch(x, y, w, stepSize)
        pred_y = [predictedY(x[i], w) for i in range(len(y))]
        mses.append(calcMse(pred_y, y))
        # Compare as soon as two MSE values exist. Gating on the epoch
        # counter instead would skip the very first possible comparison.
        if len(mses) >= 2 and abs(mses[-1] - mses[-2]) < convergenceRate:
            converged = True
        itr = itr + 1
    return w, mses, itr

In [None]:
def shuffleData(randomSeed):
    """Produce one random 80/20 train/test split of the cleaned dataset.

    The data is reloaded and re-imputed through cleanup(), given a bias
    column by addOnes(), and then sampled with `randomSeed` so the split
    itself is repeatable.

    Parameters
    ----------
    randomSeed : int
        Seed passed to DataFrame.sample for choosing the training rows.

    Returns
    -------
    list
        [train, test] DataFrames, each re-indexed from 0.
    """
    data = addOnes(cleanup())
    train = data.sample(frac=0.8, random_state=randomSeed)
    test = data.drop(train.index)

    # drop=True discards the old row labels. Without it reset_index() puts
    # them back as a leading "index" column, which would then be picked up
    # as a feature by code that treats every column but the last as input.
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)

    return [train, test]

# Build five independent 80/20 train/test splits, one per random seed.
sets = [shuffleData(seed) for seed in (100, 125, 150, 175, 200)]

# Persist every split so the exact train/test partitions can be handed in.
# Iterate over the indices of `sets`: range() needs an integer, not the list.
for i in range(len(sets)):
    sets[i][0].to_csv("Datasets/Dataset_3_train_split"+str(i)+".csv")
    sets[i][1].to_csv("Datasets/Dataset_3_test_split"+str(i)+".csv")

# For every split: fit on the training part, then score on the held-out part.
testMsesAndWeights = []
for i in range(len(sets)):
    # .values works on old and new pandas alike (as_matrix() was removed).
    trainRows = sets[i][0].values
    testRows = sets[i][1].values

    x = [row[:-1] for row in trainRows] # list of rows
    y = [row[-1] for row in trainRows] # list of output
    w, mses, epochsRan = runEpochs(x, y, 5000, 1e-3)
    print(epochsRan, mses[-1])

    # The test error must come from predictions on the *test* features;
    # predicting on the training rows would pair unrelated samples.
    test_x = [row[:-1] for row in testRows]
    test_y = [row[-1] for row in testRows]
    pred_y = [predictedY(test_x[j], w) for j in range(len(test_y))]
    testMse = calcMse(pred_y, test_y)
    testMsesAndWeights.append([testMse, w])

In [None]:
# Report the test MSE of each split; every entry is a [testMse, weights] pair.
for testMseAndW in testMsesAndWeights:
    print(testMseAndW[0])