# Question 1

In [282]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [283]:
meanNegativeDS1 = pd.read_csv("Datasets/DS1_m_0.txt", header=None).drop([20], axis=1).as_matrix().flatten()
meanPositiveDS1 = pd.read_csv("Datasets/DS1_m_1.txt", header=None).drop([20], axis=1).as_matrix().flatten()
covDS1 = pd.read_csv("Datasets/DS1_Cov.txt", header=None).drop([20], axis=1).as_matrix()

In [284]:
dataNeg = np.random.multivariate_normal(meanNegativeDS1, covDS1, 2000)
dataPos = np.random.multivariate_normal(meanPositiveDS1, covDS1, 2000)

In [285]:
dataNegLabelled = np.append(dataNeg, np.zeros((2000, 1)), axis=1)
dataPosLabelled = np.append(dataPos, np.ones((2000, 1)), axis=1)

np.random.shuffle(dataNegLabelled)
np.random.shuffle(dataPosLabelled)

In [286]:
dataTrain = np.concatenate((dataNegLabelled[:1400], dataPosLabelled[:1400])) # 1400 = 70% of 2000
dataTest = np.concatenate((dataNegLabelled[1400:], dataPosLabelled[1400:]))

In [287]:
np.random.shuffle(dataTrain)
np.random.shuffle(dataTest)

In [288]:
# save dataset
dataTestTrain = np.concatenate((dataTrain, dataTest))
pd.DataFrame(data=dataTestTrain).to_csv("Datasets/DS1.csv")

# Question 2

In [289]:
def splitNegPos(dataTrain):
    neg = []
    pos = []
    for row in dataTrain:
        if (row[20] == 0):
            neg.append(row)
        elif (row[20] == 1):
            pos.append(row)
        else:
            print "problem here"
    return np.asmatrix(neg), np.asmatrix(pos)

In [290]:
def calcMeanVectors(neg, pos):
    return np.mean(neg, axis=0).T, np.mean(pos, axis=0).T

In [313]:
def calcCovMatrix(neg, pos, negMean, posMean):
    negS = np.zeros((20,20))
    for row in neg:
        m = np.subtract(row, negMean.T)
        negS = negS + np.outer(m,m)
    
    posS = np.zeros((20,20))
    for row in pos:
        m = np.subtract(row, posMean.T)
        posS = posS + np.outer(m,m)
    
    return np.divide(np.add(negS, posS), (len(neg)+len(pos)))

In [314]:
# Compututation for training
neg, pos = splitNegPos(dataTrain)
negMean, posMean = calcMeanVectors(neg[:, :-1], pos[:, :-1])
print negMean
print posMean

[[1.33126829]
 [1.38596144]
 [1.37539477]
 [1.31716526]
 [1.35736297]
 [1.34953626]
 [1.35501262]
 [1.27776788]
 [1.37365803]
 [1.31589585]
 [1.29469983]
 [1.36015141]
 [1.31206035]
 [1.33626164]
 [1.33021523]
 [1.28602542]
 [1.39002543]
 [1.34643201]
 [1.32058212]
 [1.2614043 ]]
[[1.994739  ]
 [2.00537787]
 [1.96970676]
 [2.0606416 ]
 [2.03666841]
 [1.97490732]
 [2.04122152]
 [2.0479414 ]
 [2.01163251]
 [2.07168997]
 [1.98983422]
 [2.06960246]
 [2.06917206]
 [2.08131506]
 [2.00150679]
 [2.00959351]
 [1.98966016]
 [2.07065483]
 [2.04076271]
 [2.04086404]]


In [315]:
cov = calcCovMatrix(neg[:, :-1], pos[:, :-1], negMean, posMean)
print cov

[[7.78425269 5.54716073 6.02641399 5.1526124  5.819713   6.05297804
  4.60649086 5.35610719 4.89258653 5.1875738  3.88036486 5.26272869
  7.10883653 5.87596609 5.93848204 5.89895999 5.78817795 5.65446781
  5.53482787 5.94967779]
 [5.54716073 6.94981781 5.39073072 4.45950202 5.46046707 5.58895395
  4.357526   4.05295184 4.13172002 5.10178635 3.34551902 4.71158226
  6.06617693 5.1021412  5.42167477 5.29934646 5.65360031 5.18659246
  5.36368285 5.43604445]
 [6.02641399 5.39073072 7.16874916 4.87058432 5.69030842 6.48783799
  4.53258319 4.76587927 4.77023318 5.06903775 3.19696831 4.64095439
  6.3303123  5.07546101 5.94148376 5.92796667 6.1454117  4.97700805
  4.61196503 5.06604189]
 [5.1526124  4.45950202 4.87058432 5.76249551 5.28192282 4.4972729
  3.71640352 4.41584601 3.3687244  4.29157981 2.74153069 4.226079
  5.92633061 4.7787965  4.73446338 5.11227162 4.69930183 4.64773785
  3.92977004 5.75494878]
 [5.819713   5.46046707 5.69030842 5.28192282 6.97783321 5.32109263
  4.97255845 4.4323

In [322]:
def trainLDA(negMean, posMean, cov, priorProb):
    w = np.dot(np.linalg.inv(cov), np.subtract(negMean, posMean))
    w0 = np.divide(np.dot(np.dot(negMean.T, np.linalg.inv(cov)), negMean), -2) + np.divide(np.dot(np.dot(posMean.T, np.linalg.inv(cov)), posMean), 2) + np.log(priorProb/(1-priorProb))
    return w, w0[0][0]

In [323]:
w, w0 = trainLDA(negMean, posMean, cov, 0.5) # Prior probability for negative class and positive class is 0.5
print w
print w0

[[ 14.24602477]
 [ -8.35235933]
 [ -4.77546836]
 [ -2.92193936]
 [ -9.4400175 ]
 [ -4.68824974]
 [ 15.86315722]
 [-23.7404523 ]
 [-28.3945684 ]
 [  9.33237041]
 [-12.9128893 ]
 [-11.29811389]
 [ 14.85657967]
 [ 12.37991497]
 [ -5.59126103]
 [ 12.85365609]
 [ 28.38231065]
 [ -6.85586918]
 [ -0.16023563]
 [ -4.96191482]]
[[26.65850261]]


In [324]:
def sigmoid(a):
    return 1/(1+np.exp(-a))

In [325]:
def prediction(dataTest, w, w0):
    truePos, falsePos, falseNeg = 0, 0, 0
    for row in dataTest:
        posLabel = row[-1]
        row = row[:-1]
        val = w0 + np.dot(w.T, row)
        if (sigmoid(val) >= 0.5):
            if (posLabel == 1):
                truePos += 1
            elif (posLabel == 0):
                falsePos += 1
            else:
                print "problem here"
        else:
            if (posLabel == 1):
                falseNeg += 1
                
    return truePos, falsePos, falseNeg

In [326]:
truePos, falsePos, falseNeg = prediction(dataTest, w, w0)

In [327]:
print truePos
print falsePos
print falseNeg
print (len(dataTest)-falsePos-falseNeg-truePos) # trueNeg

accuracy = (float(len(dataTest)-falsePos-falseNeg))/len(dataTest)
precision = float(truePos)/(truePos+falsePos)
recall = float(truePos)/(truePos+falseNeg)
print "Accuracy " + str(accuracy)
print "Precision " + str(precision)
print "Recall " + str(recall)
fMeasure = 2*precision*recall/(precision+recall)
print "fMeasure " + str(fMeasure)

25
566
575
34
Accuracy 0.0491666666667
Precision 0.0423011844332
Recall 0.0416666666667
fMeasure 0.0419815281276
