# Question 1

In [220]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [221]:
meanNegativeDS1 = pd.read_csv("Datasets/DS1_m_0.txt", header=None).drop([20], axis=1).as_matrix().flatten()
meanPositiveDS1 = pd.read_csv("Datasets/DS1_m_1.txt", header=None).drop([20], axis=1).as_matrix().flatten()
covDS1 = pd.read_csv("Datasets/DS1_Cov.txt", header=None).drop([20], axis=1).as_matrix()

In [222]:
dataNeg = np.random.multivariate_normal(meanNegativeDS1, covDS1, 2000)
dataPos = np.random.multivariate_normal(meanPositiveDS1, covDS1, 2000)

In [223]:
dataNegLabelled = np.append(dataNeg, np.zeros((2000, 1)), axis=1)
dataPosLabelled = np.append(dataPos, np.ones((2000, 1)), axis=1)

In [224]:
dataTrain = np.concatenate((dataNegLabelled[:1400], dataPosLabelled[:1400])) # 1400 = 70% of 2000
dataTest = np.concatenate((dataNegLabelled[1400:], dataPosLabelled[1400:]))

In [225]:
dataTrainNeg = dataTrain[:1400, :-1]
dataTrainPos = dataTrain[1400:, :-1]

np.random.shuffle(dataTrain)
np.random.shuffle(dataTest)

In [226]:
# save dataset
dataTestTrain = np.concatenate((dataTrain, dataTest))
pd.DataFrame(data=dataTestTrain).to_csv("Datasets/DS1.csv")

# Question 2

In [227]:
def calcMeanVectors(neg, pos):
    return np.mean(dataTrainNeg, axis=0), np.mean(dataTrainPos, axis=0)

In [228]:
def calcCovMatrix(neg, pos, negMean, posMean):
    negS = np.zeros((20,20))
    for row in neg:
        m = np.subtract(row, negMean)
        negS = negS + np.outer(m,m)
    
    posS = np.zeros((20,20))
    for row in pos:
        m = np.subtract(row, posMean)
        posS = posS + np.outer(m,m)
    
    return np.divide(np.add(negS, posS), (len(neg)+len(pos)))

In [229]:
def trainLDA(negMean, posMean, cov, priorProb):
    w = np.dot(np.linalg.inv(cov), np.subtract(negMean, posMean))
    w0 = np.divide(np.dot(np.dot(negMean.T, np.linalg.inv(cov)), negMean), -2) + np.divide(np.dot(np.dot(posMean.T, np.linalg.inv(cov)), posMean), 2) + np.log(priorProb/(1-priorProb))
    return w, w0

In [230]:
# Compututation for training
negMean, posMean = calcMeanVectors(dataTrainNeg, dataTrainPos)
cov = calcCovMatrix(dataTrainNeg, dataTrainPos, negMean, posMean)
w, w0 = trainLDA(negMean, posMean, cov, 0.5) # Prior probability for negative class and positive class is 0.5
print w
print w0

[-0.03810805 -0.00851562 -0.06887418  0.11082698 -0.09479853  0.04853718
  0.05714717  0.05075715  0.0290923   0.04355267  0.06962879 -0.1308975
 -0.05653003 -0.07335942  0.00302298 -0.05494477  0.05740252  0.04451387
  0.08225349  0.00882607]
-0.12803078964177672


In [231]:
def prediction(dataTest, w, w0):
    truePos, falsePos, falseNeg = 0, 0, 0
    for row in dataTest:
        posLabel = row[-1]
        row = row[:-1]
        val = w0 + np.dot(w, row)
        if (val >= 0.5):
            if (posLabel):
                truePos += 1
            else:
                falsePos += 1
        else:
            if (posLabel):
                falseNeg += 1
                
    return truePos, falsePos, falseNeg

In [232]:
truePos, falsePos, falseNeg = prediction(dataTest, w, w0)

In [234]:
print truePos
print falsePos
print falseNeg
print (len(dataTest)-falsePos-falseNeg-truePos) # trueNeg

accuracy = (float(len(dataTest)-falsePos-falseNeg))/len(dataTest)
precision = float(truePos)/(truePos+falsePos)
recall = float(truePos)/(truePos+falseNeg)
print accuracy
print precision
print recall
fMeasure = 2*precision*recall/(precision+recall)
print fMeasure

5
1
595
599
0.503333333333
0.833333333333
0.00833333333333
0.016501650165
