In [181]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [182]:
fileName = "Datasets/Dataset_3.csv"
def cleanup():
    data = pd.read_csv(fileName, header=None)
    # Remove first 5 columns as they are non-predictive
    data = data.drop(range(5), axis=1)
    
    naColIndex = [col for col in data.columns[data.isnull().any()]] # list of column indexes with missing values
    # Filling missing values with random values between (mean-std, mean+std) because simply filling it with the mean would have too many similar values and would create skew
    for col in naColIndex:
        mean = data[col].mean()
        std = data[col].std()
        if (mean-std < 0):
            randomList = np.random.uniform(0, mean+std, size=data[col].isnull().sum()) # values are all positive
        else:
            randomList = np.random.uniform(mean-std, mean+std, size=data[col].isnull().sum())

        data[col][np.isnan(data[col])] = randomList
    
    return data

In [218]:
data = cleanup()
data.to_csv("Datasets/Dataset_3_complete.csv") # Turning in complete data set

def addOnes(data):
    data.insert(0,'',1)
    data.column = [np.arange(0, data.shape[1])]
    return data

In [194]:
# Using functions from Q2 and slightly modifying them

def predictedY(row, weights):
    return sum([row[i]*w for i,w in enumerate(weights)])

def updateWeights(row, y, w, a): # a is learning rate
    pred_y = predictedY(row, w)
    w = [wi - a*(pred_y-y)*row[i] for i,wi in enumerate(w)]
    return w

def epoch(x, y, w, a):
    for i in range(len(x)):
        w = updateWeights(x[i], y[i], w, a) # Issue here with weights diverging to NaN unless step size <= 1e-6
    return w

def calcMse(pred_y, y):
    error = 0
    for i in range(len(y)):
        error += (pred_y[i] - y[i])**2
        
    error /= len(y)
    return error

def runEpochs(x, y, epochsNb, stepSize):
    converged = False
    convergenceRate = 1e-5
    w = [1 for i in range(len(x[0]))]
    currentMse = 10000
    lastMse = 0
    itr = 0
    while(not converged and itr < epochsNb): # sweeps
        w = epoch(x, y, w, stepSize)
        pred_y = [predictedY(x[i], w) for i in range(len(y))]
        currentMse = calcMse(pred_y, y)
        if (itr>1 and abs(currentMse-lastMse)<convergenceRate):
            converged = True
        itr = itr+1
        lastMse = currentMse
    return w, currentMse, itr

In [224]:
def shuffleData(randomSeed):
    data = cleanup()
    data = addOnes(data)
    train = data.sample(frac=0.8, random_state=randomSeed)
    test = data.drop(train.index)
    
    train = train.reset_index()
    test = test.reset_index()
    
    return [train, test]

sets = []
sets.append(shuffleData(100))
sets.append(shuffleData(125))
sets.append(shuffleData(150))
sets.append(shuffleData(175))
sets.append(shuffleData(200))
for i in range(len(sets)):
    sets[i][0].to_csv("Datasets/Dataset_3_train_split"+str(i)+".csv")
    sets[i][1].to_csv("Datasets/Dataset_3_test_split"+str(i)+".csv")

testMsesAndWeights = []
for i in range(len(sets)):
    x = [row[:-1] for row in sets[i][0].as_matrix()] # list of rows
    y = [row[-1] for row in sets[i][0].as_matrix()] # list of output
    w, mse, epochsRan = runEpochs(x, y, 5, 1e-9)
    print(epochsRan, mse)
    
    x = [row[:-1] for row in sets[i][1].as_matrix()]
    test_y = [row[-1] for row in sets[i][1].as_matrix()]
    pred_y = [predictedY(x[i], w) for i in range(len(test_y))]
    
    testMse = calcMse(pred_y, test_y)
    testMsesAndWeights.append([testMse, w])

5 508.993850897
5 517.549237407
5 496.784667266
5 516.02825298
5 506.056626092


In [225]:
for testMseAndW in testMsesAndWeights:
    print("Test MSE:")
    print(testMseAndW[0])
    print("Weight params:")
    print(testMseAndW[1])
    print()
    
print("Average MSE:")
print(sum([testMseAndW[0] for testMseAndW in testMsesAndWeights])/len(testMsesAndWeights))

Test MSE:
511.382143253
Weight params:
[-0.033370573412133203, 0.99913848987471543, 0.99994713334560281, 0.999596004699422, 0.9998386883756315, 0.99936095512451417, 0.99985989956579813, 0.99986965297619324, 0.99963622343690306, 0.99957540034443904, 0.99971139253352903, 0.99963786607938865, 0.99994118035316804, 0.99938974764438826, 0.99968260066958992, 0.99951557697973203, 0.99974864136331631, 0.99957147555930559, 0.99959731354528469, 0.99972542090675498, 0.99958724227265527, 0.99967183044787511, 0.99969366559209127, 0.99967718648060944, 0.99974214993284261, 0.99982199829236629, 0.9997137118608963, 0.99975043083167914, 0.99966706095137947, 0.99994906102637882, 0.99974210506521943, 0.99972640217232822, 0.99966975778664569, 0.99968523485720984, 0.99968508961284963, 0.99956599727508155, 0.99966305968328495, 0.99961971758253576, 0.99966963542222576, 0.99961587135601015, 0.99960653258741905, 0.99962428762690969, 0.99957987904408707, 0.99957607392350423, 0.99957566261073094, 0.999473414502582