# Question 1

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Load the DS1 generator parameters: one mean vector per class and one
# covariance matrix.
# Column 20 is dropped from every file -- presumably an empty column produced
# by a trailing delimiter; TODO confirm against the raw files.
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
meanNegativeDS1 = pd.read_csv("Datasets/DS1_m_0.txt", header=None).drop([20], axis=1).values.flatten()
meanPositiveDS1 = pd.read_csv("Datasets/DS1_m_1.txt", header=None).drop([20], axis=1).values.flatten()
covDS1 = pd.read_csv("Datasets/DS1_Cov.txt", header=None).drop([20], axis=1).values

In [7]:
# Draw 2000 samples per class; both classes use the same covariance matrix.
# Seeding the global NumPy generator first makes a fresh "Restart & Run All"
# produce the same dataset (and therefore the same metrics) every time.
np.random.seed(0)
dataNeg = np.random.multivariate_normal(meanNegativeDS1, covDS1, 2000)
dataPos = np.random.multivariate_normal(meanPositiveDS1, covDS1, 2000)

In [8]:
# Append the class label as the last column: 0 for negative, 1 for positive.
# The label column's height is taken from the data itself rather than a
# hard-coded 2000, so this cell stays correct if the sample count changes.
dataNegLabelled = np.append(dataNeg, np.zeros((len(dataNeg), 1)), axis=1)
dataPosLabelled = np.append(dataPos, np.ones((len(dataPos), 1)), axis=1)

# Shuffle the rows of each class in place before splitting.
np.random.shuffle(dataNegLabelled)
np.random.shuffle(dataPosLabelled)

In [9]:
# 70/30 train/test split, taken per class so both splits stay balanced.
# The cut points are computed with integer arithmetic from the actual class
# sizes (2000 rows -> 1400 train / 600 test) instead of a hard-coded 1400.
negTrainCount = len(dataNegLabelled) * 7 // 10
posTrainCount = len(dataPosLabelled) * 7 // 10
dataTrain = np.concatenate((dataNegLabelled[:negTrainCount], dataPosLabelled[:posTrainCount]))
dataTest = np.concatenate((dataNegLabelled[negTrainCount:], dataPosLabelled[posTrainCount:]))

In [10]:
# The concatenation above leaves all negative rows ahead of all positive rows;
# shuffle each split in place (train first, then test) to interleave them.
for labelledSplit in (dataTrain, dataTest):
    np.random.shuffle(labelledSplit)

In [65]:
# Save dataset: the full set (train rows followed by test rows) plus each
# split on its own. Files are written with pandas' default index and header.
dataTestTrain = np.concatenate((dataTrain, dataTest))
for csvPath, rows in (
    ("Datasets/DS1.csv", dataTestTrain),
    ("Datasets/DS1_test.csv", dataTest),
    ("Datasets/DS1_train.csv", dataTrain),
):
    pd.DataFrame(data=rows).to_csv(csvPath)

# Question 2

In [12]:
def splitNegPos(dataTrain):
    """Split labelled rows into a negative (label 0) and a positive (label 1) group.

    The label is read from the last column of each row, so any number of
    feature columns is supported (the original hard-coded column 20).

    Parameters
    ----------
    dataTrain : iterable of 1-D rows
        Each row holds the features followed by the class label.

    Returns
    -------
    (neg, pos) : tuple of numpy.matrix
        Rows labelled 0 and rows labelled 1, in their original order and with
        the label column still attached. Rows carrying any other label are
        reported by printing "problem here" and are left out of both groups.
    """
    neg = []
    pos = []
    for row in dataTrain:
        label = row[-1]
        if label == 0:
            neg.append(row)
        elif label == 1:
            pos.append(row)
        else:
            # Single-argument call form prints identically on Python 2 and 3.
            print("problem here")
    return np.asmatrix(neg), np.asmatrix(pos)

In [13]:
def calcMeanVectors(neg, pos):
    """Return the transposed column-wise means of `neg` and `pos`.

    Each mean is taken over axis 0 (across rows) and then transposed, so a
    `numpy.matrix` input with d columns yields a (d, 1) column vector.

    Returns
    -------
    (negMean, posMean) : tuple
    """
    return tuple(np.mean(group, axis=0).T for group in (neg, pos))

In [14]:
def calcCovMatrix(neg, pos, negMean, posMean):
    """Estimate the covariance matrix shared by both classes.

    Each class's rows are centred on that class's mean, the two resulting
    scatter matrices are added, and the sum is divided by the total number of
    rows (len(neg) + len(pos)).

    The feature dimension is taken from the inputs (the original hard-coded
    20x20), and each scatter matrix is computed with one matrix product
    instead of a Python loop of per-row outer products.

    Parameters
    ----------
    neg, pos : array-like, shape (n_rows, d)
        Feature rows of each class (no label column).
    negMean, posMean : array-like with d elements
        Class means; column vectors, row vectors and 1-D arrays are accepted.

    Returns
    -------
    numpy.ndarray, shape (d, d)
    """
    def _scatter(rows, mean):
        # Sum over rows of (x - mean)(x - mean)^T, written as C^T C.
        centred = np.asarray(rows) - np.asarray(mean).reshape(1, -1)
        return np.dot(centred.T, centred)

    totalRows = len(neg) + len(pos)
    return np.divide(np.add(_scatter(neg, negMean), _scatter(pos, posMean)), totalRows)

In [15]:
# Computation for training: split the training rows by class, then estimate
# each class mean from the feature columns only (the trailing label column is
# sliced off). Single-argument print(...) behaves the same on Python 2 and 3.
neg, pos = splitNegPos(dataTrain)
negMean, posMean = calcMeanVectors(neg[:, :-1], pos[:, :-1])
print(negMean)
print(posMean)

[[1.2614427 ]
 [1.22836197]
 [1.20484808]
 [1.19626626]
 [1.23064767]
 [1.22655065]
 [1.22331304]
 [1.22607009]
 [1.21404533]
 [1.19369645]
 [1.28752386]
 [1.24790739]
 [1.21437172]
 [1.24192373]
 [1.20263893]
 [1.18126648]
 [1.22389339]
 [1.25489079]
 [1.27608961]
 [1.23021587]]
[[2.02015181]
 [1.94693069]
 [2.05893388]
 [2.048659  ]
 [2.02830185]
 [2.0332283 ]
 [1.96369367]
 [1.98964359]
 [2.03352687]
 [1.98035308]
 [1.9902823 ]
 [2.00264695]
 [2.09793453]
 [1.97610669]
 [2.0142644 ]
 [1.99950455]
 [2.02826798]
 [1.96814903]
 [1.95431869]
 [2.124221  ]]


In [16]:
# Shared covariance estimate from the feature columns of both classes
# (the trailing label column is sliced off before the call).
cov = calcCovMatrix(neg=neg[:, :-1], pos=pos[:, :-1], negMean=negMean, posMean=posMean)

In [17]:
def trainLDA(negMean, posMean, cov, priorProb):
    """Compute the weights and bias of the linear discriminant.

    With S = inverse of `cov`:
        w  = S (negMean - posMean)
        w0 = -1/2 negMean^T S negMean + 1/2 posMean^T S posMean
             + log(priorProb / (1 - priorProb))

    Parameters
    ----------
    negMean, posMean : 2-D column vectors, shape (d, 1)
    cov : array-like, shape (d, d)
    priorProb : float
        Class prior; enters only through log(priorProb / (1 - priorProb)).
        NOTE(review): given the signs above this looks like the prior of the
        negative class -- confirm.

    Returns
    -------
    (w, w0) : the (d, 1) weight vector and the scalar bias.
    """
    covInv = np.linalg.inv(cov)
    w = np.dot(covInv, np.subtract(negMean, posMean))

    negQuad = np.dot(np.dot(negMean.T, covInv), negMean)
    posQuad = np.dot(np.dot(posMean.T, covInv), posMean)
    logPriorOdds = np.log(priorProb/(1-priorProb))

    w0 = np.divide(negQuad, -2) + np.divide(posQuad, 2) + logPriorOdds
    # w0 is a 1x1 result at this point; unwrap it to a scalar.
    return w, w0[0,0]

In [18]:
# Fit the linear discriminant and show its parameters.
# Single-argument print(...) behaves the same on Python 2 and 3.
w, w0 = trainLDA(negMean, posMean, cov, 0.5) # Prior probability for negative class and positive class is 0.5
print(w)
print(w0)

[[ 14.26748566]
 [ -8.56458378]
 [ -5.2998211 ]
 [ -2.805305  ]
 [ -9.61078056]
 [ -4.24652139]
 [ 16.28326505]
 [-23.82975939]
 [-28.57535375]
 [  9.27593724]
 [-12.75239082]
 [-11.79404412]
 [ 14.92196022]
 [ 12.39637746]
 [ -5.49517254]
 [ 12.78870123]
 [ 28.54886293]
 [ -6.36933361]
 [ -0.11832129]
 [ -5.08081374]]
26.47968814971185


In [19]:
def sigmoid(a):
    """Logistic function 1 / (1 + exp(-a)); elementwise for NumPy array input."""
    expNegA = np.exp(-a)
    return 1 / (1 + expNegA)

In [20]:
def prediction(dataTest, w, w0):
    """Classify labelled rows with a linear discriminant and count the outcomes.

    For a row with features x the score is a = w0 + w . x. The row is predicted
    positive (label 1) when a < 0 and negative otherwise. This is the same rule
    as the original `sigmoid(a) < 0.5` test, but comparing the sign directly
    avoids evaluating exp() and the overflow warnings it raises for large |a|.

    Parameters
    ----------
    dataTest : iterable of 1-D rows
        Features followed by the true label (0 or 1) in the last position.
    w : array-like with one weight per feature
        Column vectors, row vectors and 1-D arrays are accepted.
    w0 : float
        Bias term.

    Returns
    -------
    (truePos, falsePos, falseNeg) : tuple of int
        Rows whose label is neither 0 nor 1 are reported by printing
        "problem here" and are not counted, whichever way they were predicted
        (the original only reported them when the prediction was positive).
    """
    weights = np.asarray(w, dtype=float).ravel()
    truePos, falsePos, falseNeg = 0, 0, 0
    for row in dataTest:
        label = row[-1]
        if label != 0 and label != 1:
            # Single-argument call form prints identically on Python 2 and 3.
            print("problem here")
            continue
        score = w0 + np.dot(weights, row[:-1])
        if score < 0:
            # Predicted positive.
            if label == 1:
                truePos += 1
            else:
                falsePos += 1
        elif label == 1:
            # Predicted negative but actually positive.
            falseNeg += 1

    return truePos, falsePos, falseNeg

In [21]:
# Score the held-out rows with the fitted discriminant.
truePos, falsePos, falseNeg = prediction(dataTest=dataTest, w=w, w0=w0)

In [64]:
# Confusion counts in the order TP, FP, FN, TN.
# Single-argument print(...) behaves the same on Python 2 and 3.
print(truePos)
print(falsePos)
print(falseNeg)
print(len(dataTest)-falsePos-falseNeg-truePos) # trueNeg

def evaluateConfusion(truePos, falsePos, falseNeg, total=None):
    """Compute accuracy, precision, recall and F-measure from confusion counts.

    Parameters
    ----------
    truePos, falsePos, falseNeg : int
        Confusion counts for the positive class.
    total : int, optional
        Number of evaluated rows, used for accuracy. When omitted, the length
        of the notebook-level `dataTest` is used, which is what the original
        three-argument version always did. Pass it explicitly to avoid
        depending on whichever `dataTest` is currently bound in the kernel.

    Returns
    -------
    (accuracy, precision, recall, fMeasure) : tuple of float
        A ratio whose denominator is zero (no predicted positives, no actual
        positives, or precision + recall == 0) is reported as 0.0 instead of
        raising ZeroDivisionError.
    """
    if total is None:
        # Backward-compatible fallback to the global test set.
        total = len(dataTest)

    accuracy = float(total - falsePos - falseNeg) / total

    predictedPos = truePos + falsePos
    actualPos = truePos + falseNeg
    precision = float(truePos) / predictedPos if predictedPos else 0.0
    recall = float(truePos) / actualPos if actualPos else 0.0

    prSum = precision + recall
    fMeasure = 2 * precision * recall / prSum if prSum else 0.0
    return accuracy, precision, recall, fMeasure

# Summary metrics for the linear discriminant on the test split.
# Single-argument print(...) behaves the same on Python 2 and 3.
accuracy, precision, recall, fMeasure = evaluateConfusion(truePos, falsePos, falseNeg)

print("Accuracy " + str(accuracy))
print("Precision " + str(precision))
print("Recall " + str(recall))
print("fMeasure " + str(fMeasure))

569
32
31
568
Accuracy 0.9475
Precision 0.946755407654
Recall 0.948333333333
fMeasure 0.947543713572


# Question 3

In [111]:
def kNN(k, dataTrain, index, distances):
    """Predict a label by majority vote among the k nearest training rows.

    Parameters
    ----------
    k : int
        Number of neighbours; must satisfy 1 <= k <= number of training rows.
    dataTrain : 2-D array-like
        Training rows with the class label in the last column. Labels must be
        non-negative integers (stored as floats is fine).
    index : int
        Which row of `distances` (i.e. which query point) to classify.
    distances : sequence of sequences
        distances[index][j] is the distance from the query to training row j.

    Returns
    -------
    The most frequent label among the k nearest rows; ties go to the smaller
    label because argmax returns the first maximum.

    Raises
    ------
    ValueError
        If k is outside 1..number of training rows.
    """
    rowDistances = np.asarray(distances[index])
    if not 1 <= k <= len(rowDistances):
        raise ValueError("k must be between 1 and the number of training rows")

    # Partitioning around position k-1 puts the k smallest distances first and,
    # unlike partitioning around k, also works when k equals the row length.
    indices = np.argpartition(rowDistances, k - 1)[:k]

    # bincount only accepts integers; the labels are stored as floats.
    closestLabel = np.asarray(dataTrain)[indices, -1].astype(int)
    return np.argmax(np.bincount(closestLabel))

In [112]:
def getPredictions(k, dataTest, dataTrain, distances):
    """Return the kNN prediction for every row position of `dataTest`, in order.

    `dataTest` only determines how many predictions are made; the i-th
    prediction is produced by kNN from row i of `distances`.
    """
    predictions = []
    for testIndex in range(len(dataTest)):
        predictions.append(kNN(k, dataTrain, testIndex, distances))
    return predictions

In [113]:
def getConfusion(predictedLabels, trueLabels):
    """Count true positives, false positives and false negatives.

    A prediction equal to 1 is treated as positive and anything else as
    negative; likewise a true label equal to 1 is positive.

    Returns
    -------
    (truePos, falsePos, falseNeg) : tuple of int
    """
    truePos = falsePos = falseNeg = 0
    for position, predicted in enumerate(predictedLabels):
        actualPositive = trueLabels[position] == 1
        if predicted != 1:
            # Predicted negative: only a missed positive is counted.
            if actualPositive:
                falseNeg += 1
            continue
        if actualPositive:
            truePos += 1
        else:
            falsePos += 1

    return truePos, falsePos, falseNeg

In [117]:
def findBestK(dataTest, dataTrain, distances, maxK=99):
    """Return the k in 1..maxK whose kNN predictions give the highest F-measure.

    Parameters
    ----------
    dataTest : 2-D array
        Rows to classify, true label in the last column.
    dataTrain : 2-D array
        Training rows, label in the last column.
    distances : sequence of sequences
        distances[i][j] is the distance from test row i to training row j.
    maxK : int, optional
        Largest k to try. The default of 99 reproduces the original
        hard-coded range(1, 100).

    Returns
    -------
    The best k; when several k tie, the smallest is returned because argmax
    picks the first maximum.

    NOTE(review): in this notebook k is chosen on the same rows that are later
    used to report the final metrics, so those metrics are optimistic.
    NOTE(review): evaluateConfusion is called with the three counts only; only
    its F-measure is used here, which is computed from those counts alone.
    """
    f1 = []
    for k in range(1, maxK + 1):
        predictedLabels = getPredictions(k, dataTest, dataTrain, distances)
        tp, fp, fn = getConfusion(predictedLabels, dataTest[:, -1])
        f1.append(evaluateConfusion(tp, fp, fn)[3])
    # f1[0] belongs to k = 1, hence the +1.
    return np.argmax(f1)+1

In [118]:
# Euclidean distance from every test row to every training row, features only
# (the trailing label column is sliced off). distances[i][j] is the distance
# between test row i and training row j.
# Each test row is compared against the whole training matrix in one
# vectorised call instead of one np.linalg.norm call per (test, train) pair.
distances = [np.linalg.norm(dataTrain[:, :-1] - x, axis=1) for x in dataTest[:, :-1]]

In [119]:
# Pick the k with the highest F-measure.
# Single-argument print(...) behaves the same on Python 2 and 3.
bestK = findBestK(dataTest, dataTrain, distances)
print(bestK)

65


In [120]:
# Evaluate kNN with the selected k on the test split.
# Single-argument print(...) behaves the same on Python 2 and 3.
predictedLabels = getPredictions(bestK, dataTest, dataTrain, distances)
truePos, falsePos, falseNeg = getConfusion(predictedLabels, dataTest[:, -1])

# Confusion counts in the order TP, FP, FN, TN.
print(truePos)
print(falsePos)
print(falseNeg)
print(len(dataTest)-falsePos-falseNeg-truePos) # trueNeg

accuracy, precision, recall, fMeasure = evaluateConfusion(truePos, falsePos, falseNeg)

print("Accuracy " + str(accuracy))
print("Precision " + str(precision))
print("Recall " + str(recall))
print("fMeasure " + str(fMeasure))

379
267
221
333
Accuracy 0.593333333333
Precision 0.586687306502
Recall 0.631666666667
fMeasure 0.60834670947


# Question 4

In [149]:
# Load the DS2 generator parameters: three component means per class (files
# c1_* are assigned to the positive class, c2_* to the negative class) and
# three covariance matrices.
# Column 20 is dropped from every file -- presumably an empty column produced
# by a trailing delimiter; TODO confirm against the raw files.
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
mean1PosDS2 = pd.read_csv('Datasets/DS2_c1_m1.txt', header = None).drop([20], axis=1).values.flatten()
mean2PosDS2 = pd.read_csv('Datasets/DS2_c1_m2.txt', header = None).drop([20], axis=1).values.flatten()
mean3PosDS2 = pd.read_csv('Datasets/DS2_c1_m3.txt', header = None).drop([20], axis=1).values.flatten()

mean1NegDS2 = pd.read_csv('Datasets/DS2_c2_m1.txt', header = None).drop([20], axis=1).values.flatten()
mean2NegDS2 = pd.read_csv('Datasets/DS2_c2_m2.txt', header = None).drop([20], axis=1).values.flatten()
mean3NegDS2 = pd.read_csv('Datasets/DS2_c2_m3.txt', header = None).drop([20], axis=1).values.flatten()

cov1 = pd.read_csv("Datasets/DS2_Cov1.txt", header=None).drop([20], axis=1).values
cov2 = pd.read_csv("Datasets/DS2_Cov2.txt", header=None).drop([20], axis=1).values
cov3 = pd.read_csv("Datasets/DS2_Cov3.txt", header=None).drop([20], axis=1).values

In [150]:
# Build each class as a mixture of three Gaussians: draw 2000 points from every
# component, then keep a random 10% / 42% / 48% of them, giving
# 200 + 840 + 960 = 2000 rows per class.
def _sampleComponent(mean, cov, frac):
    """Draw 2000 Gaussian samples and keep a random `frac` share of the rows."""
    drawn = np.random.multivariate_normal(mean, cov, 2000)
    return pd.DataFrame(data = drawn).sample(frac=frac)

dataNeg1 = _sampleComponent(mean1NegDS2, cov1, 0.1)
dataNeg2 = _sampleComponent(mean2NegDS2, cov2, 0.42)
dataNeg3 = _sampleComponent(mean3NegDS2, cov3, 0.48)

dataNeg = np.concatenate([np.array(part) for part in (dataNeg1, dataNeg2, dataNeg3)])

dataPos1 = _sampleComponent(mean1PosDS2, cov1, 0.1)
dataPos2 = _sampleComponent(mean2PosDS2, cov2, 0.42)
dataPos3 = _sampleComponent(mean3PosDS2, cov3, 0.48)

dataPos = np.concatenate([np.array(part) for part in (dataPos1, dataPos2, dataPos3)])

In [151]:
# Append the class label as the last column: 0 for negative, 1 for positive.
# The label column's height is taken from the data itself rather than a
# hard-coded 2000.
# NOTE: this rebinds dataNeg/dataPos, so re-running the cell without
# regenerating the data appends a second label column.
dataNeg = np.append(dataNeg, np.zeros((len(dataNeg), 1)), axis=1)
dataPos = np.append(dataPos, np.ones((len(dataPos), 1)), axis=1)

# Shuffle the rows of each class in place so the split below mixes the three
# mixture components.
np.random.shuffle(dataNeg)
np.random.shuffle(dataPos)

In [152]:
# 70/30 train/test split, taken per class so both splits stay balanced.
# The cut points are computed with integer arithmetic from the actual class
# sizes (2000 rows -> 1400 train / 600 test) instead of a hard-coded 1400.
negTrainCount = len(dataNeg) * 7 // 10
posTrainCount = len(dataPos) * 7 // 10
dataTrain = np.concatenate((dataNeg[:negTrainCount], dataPos[:posTrainCount]))
dataTest = np.concatenate((dataNeg[negTrainCount:], dataPos[posTrainCount:]))

# Interleave the two classes within each split.
np.random.shuffle(dataTrain)
np.random.shuffle(dataTest)

In [153]:
# Save dataset: the full set (train rows followed by test rows) plus each
# split on its own. Files are written with pandas' default index and header.
dataTestTrain = np.concatenate((dataTrain, dataTest))
for csvPath, rows in (
    ("Datasets/DS2.csv", dataTestTrain),
    ("Datasets/DS2_test.csv", dataTest),
    ("Datasets/DS2_train.csv", dataTrain),
):
    pd.DataFrame(data=rows).to_csv(csvPath)

# Question 5.1

In [154]:
# Computation for training: split the training rows by class, then estimate
# each class mean from the feature columns only (the trailing label column is
# sliced off). Single-argument print(...) behaves the same on Python 2 and 3.
neg, pos = splitNegPos(dataTrain)
negMean, posMean = calcMeanVectors(neg[:, :-1], pos[:, :-1])
print(negMean)
print(posMean)

[[1.23159085]
 [1.21440199]
 [1.22689487]
 [1.25150563]
 [1.22008269]
 [1.20869373]
 [1.23334347]
 [1.22760068]
 [1.23644895]
 [1.14081003]
 [1.21692361]
 [1.26056693]
 [1.15851963]
 [1.20855883]
 [1.20449102]
 [1.21191718]
 [1.22553093]
 [1.2748046 ]
 [1.20158003]
 [1.28470286]]
[[0.90399085]
 [0.92570532]
 [0.90304828]
 [0.90641961]
 [0.93223767]
 [0.94906293]
 [0.95772628]
 [0.90850007]
 [0.96578773]
 [0.89749443]
 [0.89876744]
 [0.88778945]
 [0.94711165]
 [0.92193447]
 [0.95454802]
 [0.87710736]
 [0.84973135]
 [0.93040217]
 [0.87877994]
 [0.96662294]]


In [155]:
# Shared covariance estimate from the feature columns of both classes
# (the trailing label column is sliced off before the call).
cov = calcCovMatrix(neg=neg[:, :-1], pos=pos[:, :-1], negMean=negMean, posMean=posMean)

In [156]:
# Fit the linear discriminant and show its parameters.
# Single-argument print(...) behaves the same on Python 2 and 3.
w, w0 = trainLDA(negMean, posMean, cov, 0.5) # Prior probability for negative class and positive class is 0.5
print(w)
print(w0)

[[-0.00946916]
 [-0.01071975]
 [ 0.00348337]
 [ 0.04809125]
 [ 0.04067397]
 [-0.01343135]
 [-0.07591526]
 [-0.0011989 ]
 [ 0.0334194 ]
 [-0.0148552 ]
 [ 0.03488377]
 [ 0.03138586]
 [-0.05492472]
 [ 0.00450722]
 [-0.04654164]
 [ 0.02846735]
 [ 0.05301648]
 [ 0.0214602 ]
 [ 0.02288127]
 [-0.01627267]]
-0.08174814948270964


In [157]:
# Score the held-out rows with the fitted discriminant.
truePos, falsePos, falseNeg = prediction(dataTest=dataTest, w=w, w0=w0)

In [158]:
# Confusion counts in the order TP, FP, FN, TN, followed by summary metrics.
# Single-argument print(...) behaves the same on Python 2 and 3.
print(truePos)
print(falsePos)
print(falseNeg)
print(len(dataTest)-falsePos-falseNeg-truePos) # trueNeg

accuracy, precision, recall, fMeasure = evaluateConfusion(truePos, falsePos, falseNeg)

print("Accuracy " + str(accuracy))
print("Precision " + str(precision))
print("Recall " + str(recall))
print("fMeasure " + str(fMeasure))

304
287
296
313
Accuracy 0.514166666667
Precision 0.514382402707
Recall 0.506666666667
fMeasure 0.510495382032


# Question 5.2

In [159]:
# Euclidean distance from every test row to every training row, features only
# (the trailing label column is sliced off). distances[i][j] is the distance
# between test row i and training row j.
# Each test row is compared against the whole training matrix in one
# vectorised call instead of one np.linalg.norm call per (test, train) pair.
distances = [np.linalg.norm(dataTrain[:, :-1] - x, axis=1) for x in dataTest[:, :-1]]

In [160]:
# Pick the k with the highest F-measure.
# Single-argument print(...) behaves the same on Python 2 and 3.
bestK = findBestK(dataTest, dataTrain, distances)
print(bestK)

29


In [161]:
# Evaluate kNN with the selected k on the test split.
# Single-argument print(...) behaves the same on Python 2 and 3.
predictedLabels = getPredictions(bestK, dataTest, dataTrain, distances)
truePos, falsePos, falseNeg = getConfusion(predictedLabels, dataTest[:, -1])

# Confusion counts in the order TP, FP, FN, TN.
print(truePos)
print(falsePos)
print(falseNeg)
print(len(dataTest)-falsePos-falseNeg-truePos) # trueNeg

accuracy, precision, recall, fMeasure = evaluateConfusion(truePos, falsePos, falseNeg)

print("Accuracy " + str(accuracy))
print("Precision " + str(precision))
print("Recall " + str(recall))
print("fMeasure " + str(fMeasure))

326
287
274
313
Accuracy 0.5325
Precision 0.531810766721
Recall 0.543333333333
fMeasure 0.537510305029
