# Question 1

In [367]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [368]:
# Load the DS1 generative parameters: one mean vector per class and the
# covariance matrix shared by both classes.
# Column 20 is dropped from every file before use (presumably an empty column
# produced by a trailing delimiter on each line -- verify against the raw files).
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in pandas
# 0.23 and removed in pandas 1.0; both return the frame as a NumPy ndarray.
meanNegativeDS1 = pd.read_csv("Datasets/DS1_m_0.txt", header=None).drop([20], axis=1).values.flatten()
meanPositiveDS1 = pd.read_csv("Datasets/DS1_m_1.txt", header=None).drop([20], axis=1).values.flatten()
covDS1 = pd.read_csv("Datasets/DS1_Cov.txt", header=None).drop([20], axis=1).values

In [369]:
# Seed NumPy's global generator so that the sampled dataset, and every
# np.random call made after this cell, is the same on each
# "Restart Kernel -> Run All". Without a seed the numbers below change per run.
np.random.seed(0)

# Draw 2000 samples per class from Gaussians that differ only in their mean
# (both classes use the same covariance matrix).
dataNeg = np.random.multivariate_normal(meanNegativeDS1, covDS1, 2000)
dataPos = np.random.multivariate_normal(meanPositiveDS1, covDS1, 2000)

In [370]:
# Attach the class label as an extra trailing column:
# 0 for the negative class, 1 for the positive class.
negLabelColumn = np.zeros((2000, 1))
posLabelColumn = np.ones((2000, 1))
dataNegLabelled = np.concatenate((dataNeg, negLabelColumn), axis=1)
dataPosLabelled = np.concatenate((dataPos, posLabelColumn), axis=1)

# Shuffle the rows of each class in place (negative class first, then positive).
for _classRows in (dataNegLabelled, dataPosLabelled):
    np.random.shuffle(_classRows)

In [371]:
# Per-class split: the first 1400 rows of each class (1400 = 70% of 2000)
# form the training set, the rows from index 1400 onwards form the test set.
trainNegRows, testNegRows = dataNegLabelled[:1400], dataNegLabelled[1400:]
trainPosRows, testPosRows = dataPosLabelled[:1400], dataPosLabelled[1400:]

# Each set is stacked as negatives followed by positives.
dataTrain = np.vstack((trainNegRows, trainPosRows))
dataTest = np.vstack((testNegRows, testPosRows))

In [372]:
# Shuffle the rows of both sets in place (training set first, then test set).
for _dataSplit in (dataTrain, dataTest):
    np.random.shuffle(_dataSplit)

In [373]:
# Persist the generated dataset as one CSV: training rows first, test rows after.
# to_csv is called with its defaults, so the file also carries the row index
# and a header row of column numbers.
dataTestTrain = np.vstack((dataTrain, dataTest))
ds1Frame = pd.DataFrame(dataTestTrain)
ds1Frame.to_csv("Datasets/DS1.csv")

# Question 2

In [374]:
def splitNegPos(dataTrain):
    """Split labelled samples into the negative and the positive class.

    Parameters
    ----------
    dataTrain : 2-D array-like
        One sample per row; column 20 holds the class label
        (0 = negative, 1 = positive).

    Returns
    -------
    (neg, pos) : tuple of np.matrix
        The rows labelled 0 and the rows labelled 1, each in their original
        order and with the label column still attached.

    Rows whose label is neither 0 nor 1 are left out of both results and
    "problem here" is printed once for each such row.
    """
    data = np.asarray(dataTrain)
    labels = data[:, 20]
    isNeg = labels == 0
    isPos = labels == 1

    # One message per row with an unexpected label. The call form of print
    # works on both Python 2 and Python 3.
    for _ in range(int(np.count_nonzero(~(isNeg | isPos)))):
        print("problem here")

    # Boolean-mask selection keeps row order and yields a (0, n_columns)
    # matrix when a class has no rows.
    return np.asmatrix(data[isNeg]), np.asmatrix(data[isPos])

In [375]:
def calcMeanVectors(neg, pos):
    """Return the per-feature mean of each class, transposed.

    Both inputs hold one sample per row. The mean is taken over the rows
    (axis 0) and then transposed, so an np.matrix input of shape (n, d)
    yields a (d, 1) column matrix. The result is the pair
    (negative-class mean, positive-class mean).
    """
    return tuple(np.mean(classRows, axis=0).T for classRows in (neg, pos))

In [376]:
def calcCovMatrix(neg, pos, negMean, posMean):
    """Estimate the covariance matrix shared by both classes.

    Every sample is centred on the mean of its own class; the outer products
    of the centred samples are summed over both classes and divided by the
    total number of samples.

    Parameters
    ----------
    neg, pos : 2-D array-like, shape (n_neg, d) and (n_pos, d)
        Feature rows of each class (no label column).
    negMean, posMean : array-like with d entries
        Class means, given as a (d, 1) column or as a flat vector.

    Returns
    -------
    np.ndarray of shape (d, d)

    The feature count d is taken from the data rather than fixed at 20.
    """
    negCentred = np.asarray(neg) - np.asarray(negMean).reshape(1, -1)
    posCentred = np.asarray(pos) - np.asarray(posMean).reshape(1, -1)

    # X.T . X equals the sum over rows of outer(row, row), so one matrix
    # product per class replaces the row-by-row accumulation.
    scatter = np.dot(negCentred.T, negCentred) + np.dot(posCentred.T, posCentred)

    sampleCount = negCentred.shape[0] + posCentred.shape[0]
    return scatter / float(sampleCount)

In [377]:
# Computation for training: split the training rows by class, then estimate
# each class mean from the features only (the last column, which holds the
# label, is sliced off).
neg, pos = splitNegPos(dataTrain)
negMean, posMean = calcMeanVectors(neg[:, :-1], pos[:, :-1])
# print is written in call form so the cell runs on both Python 2 and Python 3.
print(negMean)
print(posMean)

[[1.31916849]
 [1.30853257]
 [1.33174387]
 [1.3483205 ]
 [1.35201642]
 [1.30836494]
 [1.35246805]
 [1.39879473]
 [1.31133484]
 [1.39600972]
 [1.34161014]
 [1.35252831]
 [1.42742351]
 [1.36792756]
 [1.32101946]
 [1.32451578]
 [1.31137938]
 [1.3427056 ]
 [1.38906403]
 [1.35067034]]
[[2.00872173]
 [2.06309333]
 [2.03663006]
 [2.07240087]
 [2.04642643]
 [2.03388472]
 [2.02019483]
 [2.00964887]
 [1.99295291]
 [2.05032542]
 [2.02250818]
 [2.02547634]
 [2.04431684]
 [1.99736076]
 [2.00977926]
 [2.00570895]
 [2.03473557]
 [2.01085793]
 [1.9816536 ]
 [2.04454743]]


In [379]:
# Covariance estimate from the training features; the last (label) column of
# each class matrix is sliced off before the call.
cov = calcCovMatrix(
    neg=neg[:, :-1],
    pos=pos[:, :-1],
    negMean=negMean,
    posMean=posMean,
)

In [380]:
def trainLDA(negMean, posMean, cov, priorProb):
    """Compute the weights of the linear discriminant a(x) = w.T x + w0.

    With covInv the inverse of `cov`:

        w  = covInv (negMean - posMean)
        w0 = -1/2 negMean.T covInv negMean
             +1/2 posMean.T covInv posMean
             + log(priorProb / (1 - priorProb))

    Parameters
    ----------
    negMean, posMean : (d, 1) column vectors (np.matrix or 2-D ndarray)
        Class means.
    cov : (d, d) array-like
        Covariance matrix used for both classes; must be invertible.
    priorProb : float
        Prior probability used in the log-ratio term. Given the sign of the
        other terms this reads as the prior of the negative class; it must
        lie strictly between 0 and 1.

    Returns
    -------
    (w, w0) : the (d, 1) weight vector and the scalar bias.
    """
    # Invert once and reuse: the inverse appears in w and in both quadratic
    # terms of w0.
    covInv = np.linalg.inv(cov)

    w = np.dot(covInv, np.subtract(negMean, posMean))

    negQuad = np.dot(np.dot(negMean.T, covInv), negMean)
    posQuad = np.dot(np.dot(posMean.T, covInv), posMean)
    w0 = np.divide(negQuad, -2) + np.divide(posQuad, 2) + np.log(priorProb/(1-priorProb))

    # w0 is a 1x1 array at this point; hand back the scalar inside it.
    return w, w0[0,0]

In [381]:
w, w0 = trainLDA(negMean, posMean, cov, 0.5) # Prior probability for negative class and positive class is 0.5
# print is written in call form so the cell runs on both Python 2 and Python 3.
print(w)
print(w0)

[[ 14.93675871]
 [ -8.86931375]
 [ -5.48121307]
 [ -3.1654574 ]
 [ -9.833837  ]
 [ -4.71443446]
 [ 17.19991687]
 [-24.75762283]
 [-29.83742153]
 [  9.63363883]
 [-13.58011467]
 [-12.60408564]
 [ 15.71441283]
 [ 12.89633034]
 [ -5.85897691]
 [ 13.444109  ]
 [ 29.98599611]
 [ -6.78883467]
 [ -0.12883234]
 [ -5.13971329]]
27.89665885551857


In [382]:
def sigmoid(a):
    """Logistic function 1 / (1 + exp(-a)).

    `a` may be a scalar or a NumPy array/matrix; the function is applied
    element-wise and the result has the shape of the input.
    """
    expNegA = np.exp(-a)
    denominator = 1 + expNegA
    return 1 / denominator

In [386]:
def prediction(dataTest, w, w0):
    """Classify every labelled row and count the outcomes for the positive class.

    Parameters
    ----------
    dataTest : iterable of 1-D rows
        Each row holds the features followed by the class label in its last
        position (0 = negative, 1 = positive).
    w : column vector of weights, one entry per feature.
    w0 : scalar bias.

    Returns
    -------
    (truePos, falsePos, falseNeg) : tuple of int
        True negatives are not counted here.

    A row is predicted positive when sigmoid(w0 + w.T x) < 0.5. Rows whose
    label is neither 0 nor 1 are not counted and "problem here" is printed
    for each of them, whichever way they were predicted.
    """
    truePos, falsePos, falseNeg = 0, 0, 0
    for row in dataTest:
        label = row[-1]
        features = row[:-1]
        val = w0 + np.dot(w.T, features)
        predictedPositive = sigmoid(val) < 0.5

        if label == 1:
            if predictedPositive:
                truePos += 1
            else:
                falseNeg += 1
        elif label == 0:
            if predictedPositive:
                falsePos += 1
        else:
            # Call form of print: valid on both Python 2 and Python 3.
            print("problem here")

    return truePos, falsePos, falseNeg

In [387]:
# Evaluate the trained discriminant on the held-out test set.
truePos, falsePos, falseNeg = prediction(dataTest=dataTest, w=w, w0=w0)

In [388]:
# Confusion-matrix counts. True negatives are the test rows that are not a
# true positive, false positive or false negative.
# print is written in call form so the cell runs on both Python 2 and Python 3.
trueNeg = len(dataTest)-falsePos-falseNeg-truePos
print(truePos)
print(falsePos)
print(falseNeg)
print(trueNeg)

# float(...) keeps the divisions from truncating under Python 2.
accuracy = (float(len(dataTest)-falsePos-falseNeg))/len(dataTest)
precision = float(truePos)/(truePos+falsePos)
recall = float(truePos)/(truePos+falseNeg)
print("Accuracy " + str(accuracy))
print("Precision " + str(precision))
print("Recall " + str(recall))
# F-measure: harmonic mean of precision and recall.
fMeasure = 2*precision*recall/(precision+recall)
print("fMeasure " + str(fMeasure))

581
23
19
577
Accuracy 0.965
Precision 0.961920529801
Recall 0.968333333333
fMeasure 0.96511627907
