# Question 1

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
meanNegativeDS1 = pd.read_csv("Datasets/DS1_m_0.txt", header=None).drop([20], axis=1).as_matrix().flatten()
meanPositiveDS1 = pd.read_csv("Datasets/DS1_m_1.txt", header=None).drop([20], axis=1).as_matrix().flatten()
covDS1 = pd.read_csv("Datasets/DS1_Cov.txt", header=None).drop([20], axis=1).as_matrix()

In [7]:
dataNeg = np.random.multivariate_normal(meanNegativeDS1, covDS1, 2000)
dataPos = np.random.multivariate_normal(meanPositiveDS1, covDS1, 2000)

In [8]:
dataNegLabelled = np.append(dataNeg, np.zeros((2000, 1)), axis=1)
dataPosLabelled = np.append(dataPos, np.ones((2000, 1)), axis=1)

np.random.shuffle(dataNegLabelled)
np.random.shuffle(dataPosLabelled)

In [9]:
dataTrain = np.concatenate((dataNegLabelled[:1400], dataPosLabelled[:1400])) # 1400 = 70% of 2000
dataTest = np.concatenate((dataNegLabelled[1400:], dataPosLabelled[1400:]))

In [10]:
np.random.shuffle(dataTrain)
np.random.shuffle(dataTest)

In [65]:
# save dataset
dataTestTrain = np.concatenate((dataTrain, dataTest))
pd.DataFrame(data=dataTestTrain).to_csv("Datasets/DS1.csv")
pd.DataFrame(data=dataTest).to_csv("Datasets/DS1_test.csv")
pd.DataFrame(data=dataTrain).to_csv("Datasets/DS1_train.csv")

# Question 2

In [12]:
def splitNegPos(dataTrain):
    neg = []
    pos = []
    for row in dataTrain:
        if (row[20] == 0):
            neg.append(row)
        elif (row[20] == 1):
            pos.append(row)
        else:
            print "problem here"
    return np.asmatrix(neg), np.asmatrix(pos)

In [13]:
def calcMeanVectors(neg, pos):
    return np.mean(neg, axis=0).T, np.mean(pos, axis=0).T

In [14]:
def calcCovMatrix(neg, pos, negMean, posMean):
    negS = np.zeros((20,20))
    for row in neg:
        m = np.subtract(row, negMean.T)
        negS = negS + np.outer(m,m)
    
    posS = np.zeros((20,20))
    for row in pos:
        m = np.subtract(row, posMean.T)
        posS = posS + np.outer(m,m)
    
    return np.divide(np.add(negS, posS), (len(neg)+len(pos)))

In [15]:
# Compututation for training
neg, pos = splitNegPos(dataTrain)
negMean, posMean = calcMeanVectors(neg[:, :-1], pos[:, :-1])
print negMean
print posMean

[[1.2614427 ]
 [1.22836197]
 [1.20484808]
 [1.19626626]
 [1.23064767]
 [1.22655065]
 [1.22331304]
 [1.22607009]
 [1.21404533]
 [1.19369645]
 [1.28752386]
 [1.24790739]
 [1.21437172]
 [1.24192373]
 [1.20263893]
 [1.18126648]
 [1.22389339]
 [1.25489079]
 [1.27608961]
 [1.23021587]]
[[2.02015181]
 [1.94693069]
 [2.05893388]
 [2.048659  ]
 [2.02830185]
 [2.0332283 ]
 [1.96369367]
 [1.98964359]
 [2.03352687]
 [1.98035308]
 [1.9902823 ]
 [2.00264695]
 [2.09793453]
 [1.97610669]
 [2.0142644 ]
 [1.99950455]
 [2.02826798]
 [1.96814903]
 [1.95431869]
 [2.124221  ]]


In [16]:
cov = calcCovMatrix(neg[:, :-1], pos[:, :-1], negMean, posMean)

In [17]:
def trainLDA(negMean, posMean, cov, priorProb):
    w = np.dot(np.linalg.inv(cov), np.subtract(negMean, posMean))
    w0 = np.divide(np.dot(np.dot(negMean.T, np.linalg.inv(cov)), negMean), -2) + np.divide(np.dot(np.dot(posMean.T, np.linalg.inv(cov)), posMean), 2) + np.log(priorProb/(1-priorProb))
    return w, w0[0,0]

In [18]:
w, w0 = trainLDA(negMean, posMean, cov, 0.5) # Prior probability for negative class and positive class is 0.5
print w
print w0

[[ 14.26748566]
 [ -8.56458378]
 [ -5.2998211 ]
 [ -2.805305  ]
 [ -9.61078056]
 [ -4.24652139]
 [ 16.28326505]
 [-23.82975939]
 [-28.57535375]
 [  9.27593724]
 [-12.75239082]
 [-11.79404412]
 [ 14.92196022]
 [ 12.39637746]
 [ -5.49517254]
 [ 12.78870123]
 [ 28.54886293]
 [ -6.36933361]
 [ -0.11832129]
 [ -5.08081374]]
26.47968814971185


In [19]:
def sigmoid(a):
    return 1/(1+np.exp(-a))

In [20]:
def prediction(dataTest, w, w0):
    truePos, falsePos, falseNeg = 0, 0, 0
    for row in dataTest:
        label = row[-1]
        row = row[:-1]
        val = w0 + np.dot(w.T, row)
        if (sigmoid(val) < 0.5):
            if (label == 1):
                truePos += 1
            elif (label == 0):
                falsePos += 1
            else:
                print "problem here"
        else:
            if (label == 1):
                falseNeg += 1
                
    return truePos, falsePos, falseNeg

In [21]:
truePos, falsePos, falseNeg = prediction(dataTest, w, w0)

In [64]:
print truePos
print falsePos
print falseNeg
print (len(dataTest)-falsePos-falseNeg-truePos) # trueNeg

def evaluateConfusion(truePos, falsePos, falseNeg):
    accuracy = (float(len(dataTest)-falsePos-falseNeg))/len(dataTest)
    precision = float(truePos)/(truePos+falsePos)
    recall = float(truePos)/(truePos+falseNeg)
    fMeasure = 2*precision*recall/(precision+recall)
    return accuracy, precision, recall, fMeasure

accuracy, precision, recall, fMeasure = evaluateConfusion(truePos, falsePos, falseNeg)

print "Accuracy " + str(accuracy)
print "Precision " + str(precision)
print "Recall " + str(recall)
print "fMeasure " + str(fMeasure)

569
32
31
568
Accuracy 0.9475
Precision 0.946755407654
Recall 0.948333333333
fMeasure 0.947543713572


# Question 3

In [86]:
def kNN(k, dataTrain, index, distances):
    indices = np.argpartition(distances[index], k)[:k]
    closestLabel = [dataTrain[i, -1] for i in indices]
    return np.argmax(np.bincount(closestLabel))

In [87]:
def getPredictions(k, dataTrain, distances):
    return [kNN(k, dataTrain, row, distances) for row in distances]

In [88]:
def getConfusion(predictedLabels, trueLabels):
    truePos, falsePos, falseNeg = 0, 0, 0
    for i in range(len(predictedLabels)):
        if predictedLabels[i] == 1:
            if (trueLabels[i] == 1):
                truePos += 1
            else:
                falsePos += 1
        else:
            if (trueLabels[i] == 1):
                falseNeg += 1
                
    return truePos, falsePos, falseNeg

In [89]:
def findBestK(dataTest, dataTrain, distances):
    f1 = []
    for k in range(1, 100):
        predictedLabels = getPredictions(k, dataTrain, distances)
        a, p, r, f = evaluateConfusion(getConfusion(predictedLabels, dataTest[:, -1]))
        f1.append(f)
    return np.argmax(f1)+1

In [103]:
distances = [[np.linalg.norm(np.subtract(x, row)) for row in dataTrain[:, :-1]] for x in dataTest[:, :-1]]

In [104]:
bestK = findBestK(dataTest, dataTrain, distances)
print bestK

TypeError: list indices must be integers, not list

In [None]:
predictedLabels = getPredictions(bestK, dataTest, dataTrain)
truePos, falsePos, falseNeg = getConfusion(predictedLabels, dataTest[:, -1])

print truePos
print falsePos
print falseNeg
print (len(dataTest)-falsePos-falseNeg-truePos) # trueNeg

accuracy, precision, recall, fMeasure = evaluateConfusion(truePos, falsePos, falseNeg)

print "Accuracy " + str(accuracy)
print "Precision " + str(precision)
print "Recall " + str(recall)
print "fMeasure " + str(fMeasure)

KeyboardInterrupt: 

In [102]:
distances[0]

[19.671512663301186,
 18.53995179175899,
 11.248133104030925,
 13.29332439367692,
 5.485732563610275,
 11.747354314798569,
 7.958228108884618,
 14.820688905989739,
 11.946120050051597,
 4.8742894011943205,
 14.942565507104767,
 8.603806443721709,
 9.372449859456907,
 27.35856368198283,
 7.905371654279479,
 7.856349698041809,
 7.179660790611796,
 21.84227338752726,
 16.96136522418242,
 32.886869034475495,
 8.862401056950603,
 11.859793016627684,
 15.108687251997456,
 15.700330738545665,
 10.467052085799853,
 8.072162148614604,
 12.43832870956614,
 9.902510188198532,
 5.085781053070923,
 14.190753125476997,
 18.479372215609672,
 10.930208400832443,
 9.831445077688675,
 8.097474119298205,
 9.754791737044961,
 8.255198442388904,
 6.054567327979866,
 28.946068202867465,
 11.64414202524881,
 22.80294678313621,
 32.41043380117874,
 5.588542929896,
 11.669553987780253,
 11.498939414515958,
 14.033867385036666,
 8.694986516939393,
 8.390452428330518,
 12.194659449132443,
 8.589460806778295,
 29