# COMP 551: Applied Machine Learning
## Assignment 2: Linear  Classification  and  Nearest  Neighbor  Classification
### Author: Antonios Valkanas

In [1]:
# Import some useful libraries
import numpy as np
import pandas as pd
import operator
%matplotlib inline

### Question 1: 
#### You will use a synthetic data set for the classification task that you’ll generate yourself. Generate two classes with 20 features each. Each class is given by a multivariate Gaussian distribution,  with both classes sharing the same covariance matrix.  You are provided with the mean vectors (DS1-m0 for mean vector of negative class and DS1-m1 for mean vector of positive class) and the covariance matrix (DS1-cov).  Generate 2000 examples for each class,  and label the data to be positive if they came from the Gaussian with mean m1 and negative if they came from the Gaussian with mean m0.  Randomly pick (without replacement) 20% of each class (i.e., 400 data points per class) as test set, 20% of each class (i.e., 400 data points per class) as validation set set and train the classifiers on the remaining 60% data.  When you report performance results, it should be on the test set.  Call this dataset as DS1, and submit it with your code.  Follow the instructions from Assignment 1 for data submission format.


In [2]:
# Read in the mean data from file.
mean_0 = np.loadtxt("Datasets/DS1_m_0.txt", delimiter = ',', usecols=range(20))
mean_1 = np.loadtxt("Datasets/DS1_m_1.txt", delimiter = ',', usecols=range(20))
cov = np.loadtxt("Datasets/DS1_Cov.txt", delimiter = ',', usecols= range(20))

# Generate the multivariate gaussian distributions and sample each 2000 times.
negative_class = np.random.multivariate_normal(mean_0,cov,2000)
positive_class = np.random.multivariate_normal(mean_1,cov,2000)

# Add labels to the data (add a 0 or a 1 on the 21-st column for labeling)
negative_class = np.insert(negative_class, 20, 0,axis=1)
positive_class = np.insert(positive_class, 20, 1,axis=1)

# Create train, validation and test sets from both sample distributions.
test = np.append(negative_class[:400], positive_class[:400],axis=0)
validation = np.append(negative_class[400:800], positive_class[400:800],axis=0)
train = np.append(negative_class[800:], positive_class[800:],axis=0)

# Shuffle datasets to randomize train, validation and test set selection.
np.random.shuffle(test)
np.random.shuffle(validation)
np.random.shuffle(train)

# Save data
np.savetxt('DS1_train.csv', train, delimiter = ',')
np.savetxt('DS1_validation.csv', validation, delimiter = ',')
np.savetxt('DS1_test.csv', test, delimiter = ',')

### Question 2 
#### We first consider the GDA model as seen in class:  given the class variable, the data are assumed to be Gaussians with different means for different classes but with the same covariance matrix.  This model can formally be specified as follows:
#### Y∼Bernoulli(π),  X|Y = j ∼ N(μj,Σ)
#### Estimate  the  parameters  of  the  GDA  model  using  the  maximum  likelihood  approach.
1.  For DS1, report the best fit accuracy, precision, recall and F-measure achieved by the classifier.
2.  Report the coefficients learnt.

In [3]:
# Load train, validation and test sets.
test = np.loadtxt('DS1_test.csv',delimiter = ',')
validation = np.loadtxt('DS1_validation.csv',delimiter = ',')
train = np.loadtxt('DS1_train.csv',delimiter = ',')

def GDA(train):
    # Get N1(negatives) and N2(positives) from the total N examples.
    # Also get the sample means for μ1(m1) and μ2 (m2).
    N = len(train)
    N1 = 0.0
    N2 = 0.0
    m1 = np.zeros(20) #len(train[0] - 1)
    m2 = np.zeros(20)

    for row in train:
        if row[-1] == 0:
            N1 += 1
            m1 += row[:-1]
        else:
            N2 += 1
            m2 += row[:-1]     

    m1 /= N1
    m2 /= N2
    p1 = N1/N
    p2 = 1 - p1

    #To get covariance matrix we need S1 and S2.
    row_mean = 0
    S1 = 0
    S2 = 0
    for row in train:
        if row[-1]==0:
            row_mean = np.array(row[:-1]) - m1
            row_mean = np.reshape(row_mean,(20,1))       
            S1 += row_mean.dot(row_mean.T)
        else:
            row_mean = np.array(row[:-1]) - m2
            row_mean = np.reshape(row_mean,(20,1))       
            S2 += row_mean.dot(row_mean.T)

    S1 /= N1
    S2 /= N2

    # Use S1, S2 to get covariance matrix
    cov_matrix = p1*S1 + p2*S2

    # Use formulae form lecture to obtain w0 and w1 decision boundary parameters.
    # ω1 = cov^-1(μ1 - μ2)
    # ω0 = -0.5*μ1^T*cov^-1*μ1 + 0.5*μ2^T*cov^-1*μ2 + ln(N1/N2)
    inverse_cov = np.linalg.inv(cov_matrix)
    w = inverse_cov.dot(m1-m2)
    w0 = -0.5 * (m1.T).dot(inverse_cov).dot(m1) + \
    0.5 * (m2.T).dot(inverse_cov).dot(m2) + np.log(N1/N2)

    print("Coefficients learned:")
    print('w0:', w0)
    print('w:', w)
    return w0, w

w0, w = GDA(train)

Coefficients learned:
w0: 25.873030313005195
w: [ 13.73068895  -8.39076669  -5.75281745  -2.86687715  -9.40174385
  -3.9064478   16.32372139 -22.91392594 -27.90111292   8.83033948
 -12.50578729 -11.98214711  14.9072422   12.22579561  -5.07489982
  12.29990028  28.31986351  -6.31844394  -0.55656195  -4.84331305]


In [4]:
# Helper functions for calulations of model prediction.

# Activation function is a sigmoid.
def sigmoid(x):
    return 1/(1+np.exp(-x))

# Evaluate linear model and pass through activation function.
def probability_negative(x, w0, w):
    linear_model = w0 + w.dot(x)
    return sigmoid(linear_model)

In [5]:
# Get model precision, accuracy, recall and F1.
def test_GDA(test, w0, w):
    true_positives = 0
    true_negatives = 0
    false_positives = 0
    false_negatives = 0
    total_instances = 0

    for row in test:
        # True negative case
        if row[-1] == 0:
            if probability_negative(row[:-1], w0, w) >= 0.5:
                true_negatives += 1
            else:
                false_positives += 1
        # True positive case
        else:
            if probability_negative(row[:-1], w0, w) < 0.5:
                true_positives += 1
            else:
                false_negatives += 1

    precision = true_positives/(true_positives + false_positives)
    recall = true_positives/(true_positives + false_negatives)
    accuracy = (true_positives + true_negatives)/(true_positives + false_negatives + true_negatives + false_positives)
    f = 2*precision*recall/(precision+recall)
    print('Accuracy:',accuracy)
    print('Precision:',precision)
    print('Recall:',recall)
    print('F measure:',f)
    
test_GDA(test, w0, w)

Accuracy: 0.96375
Precision: 0.9557739557739557
Recall: 0.9725
F measure: 0.9640644361833953


### Question 3
#### For DS1, use k-NN to learn a classifier.  Repeat the experiment for different values of k and report the performance for each value.  We will compare this non-linear classifier to the linear approach, and find out how powerful linear classifiers can be. 
1.  Does this classifier perform better than GDA or worse?  Are there particular values of k which perform better?  Why does this happen ?  Use F1-Measure for model selection.
2. Report the best fit accuracy, precision, recall and f-measure achieved by this classifier

In [6]:
def kNN(x,k):
    neighbors = []
    prob_positive = 0.0
    for i, row in enumerate(train):
        distance = np.linalg.norm(x - row[:-1])
        if len(neighbors) < k:
            neighbors.append([distance, i])
        else:
            neighbors = sorted(neighbors, key = lambda x: x[0])
            if distance < neighbors[k-1][0]:
                neighbors[-1] = [distance,i]       
    for neighbor in neighbors:
        prob_positive += train[neighbor[1]][-1]
    prob_positive /= k 
    return prob_positive

In [7]:
# Get model precision, accuracy, recall and F1.
def test_knn(k, test):
    true_positives = 0
    true_negatives = 0
    false_positives = 0
    false_negatives = 0
    total_instances = 0

    for row in test:
        # True negative case
        if row[-1] == 0:
            if kNN(row[:-1],k) <= 0.5:
                true_negatives += 1
            else:
                false_positives += 1
        # True positive case
        else:
            if kNN(row[:-1],k) > 0.5:
                true_positives += 1
            else:
                false_negatives += 1

    precision = true_positives/(true_positives + false_positives)
    recall = true_positives/(true_positives + false_negatives)
    accuracy = (true_positives + true_negatives)/(true_positives + false_negatives + true_negatives + false_positives)
    f = 2 * precision * recall/(precision + recall)
    print('For k =',k)
    print('Accuracy:',accuracy)
    print('Precision:',precision)
    print('Recall:',recall)
    print('F measure:',f)
    print()
    return 

for k in range(1, 10):
    test_knn(k, test)
for k in range(20, 100, 10):
    test_knn(k, test)

For k = 1
Accuracy: 0.50875
Precision: 0.5084745762711864
Recall: 0.525
F measure: 0.5166051660516606

For k = 2
Accuracy: 0.51
Precision: 0.5185185185185185
Recall: 0.28
F measure: 0.36363636363636365

For k = 3
Accuracy: 0.53375
Precision: 0.5338345864661654
Recall: 0.5325
F measure: 0.5331664580725908

For k = 4
Accuracy: 0.54125
Precision: 0.56
Recall: 0.385
F measure: 0.4562962962962963

For k = 5
Accuracy: 0.525
Precision: 0.524390243902439
Recall: 0.5375
F measure: 0.5308641975308641

For k = 6
Accuracy: 0.53875
Precision: 0.5508196721311476
Recall: 0.42
F measure: 0.4765957446808511

For k = 7
Accuracy: 0.5425
Precision: 0.541871921182266
Recall: 0.55
F measure: 0.5459057071960298

For k = 8
Accuracy: 0.52625
Precision: 0.5329153605015674
Recall: 0.425
F measure: 0.4728789986091794

For k = 9
Accuracy: 0.54375
Precision: 0.543424317617866
Recall: 0.5475
F measure: 0.5454545454545454

For k = 20
Accuracy: 0.535
Precision: 0.5362694300518135
Recall: 0.5175
F measure: 0.5267175572

### Question 4
#### Now instead of having a single multivariate Gaussian distribution per class, each class is  going  to  be  generated  by  a  mixture  of  3  Gaussians.   For  each  class,  we’ll  define 3  Gaussians,  with  the  first  Gaussian  of  the  first  class  sharing  the  covariance  matrix with  the  first  Gaussian  of  the  second  class  and  so  on.   For  both  the  classes,  fix  the mixture probability as (0.1,0.42,0.48) i.e.  the sample has arisen from first Gaussian with probability 0.1, second with probability 0.42 and so on.  Mean for three Gaussians in the positive class are given as DS2-c1-m1, DS2-c1-m2, DS2-c1-m3.  Mean for three Gaussians in the negative class are gives as DS2-c2-m1, DS2-c2-m2, DS2-c2-m3.  Corresponding 3 covariance  matrices  are  given  as  DS2-cov-1,  DS2-cov-2  and  DS2-cov-3.   Now  sample from this distribution and generate the dataset similar to question 1.  Call this dataset as DS2, and submit it with your code.  Follow the instructions from Assignment 1 for data submission format.

In [8]:
# Read in the mean and covariance data from file.
ds2_c1_mean_1 = np.loadtxt("Datasets/DS2_c1_m1.txt", delimiter = ',', usecols=range(20))
ds2_c1_mean_2 = np.loadtxt("Datasets/DS2_c1_m2.txt", delimiter = ',', usecols=range(20))
ds2_c1_mean_3 = np.loadtxt("Datasets/DS2_c1_m3.txt", delimiter = ',', usecols=range(20))

ds2_c2_mean_1 = np.loadtxt("Datasets/DS2_c2_m1.txt", delimiter = ',', usecols=range(20))
ds2_c2_mean_2 = np.loadtxt("Datasets/DS2_c2_m2.txt", delimiter = ',', usecols=range(20))
ds2_c2_mean_3 = np.loadtxt("Datasets/DS2_c2_m3.txt", delimiter = ',', usecols=range(20))

ds2_cov1 = np.loadtxt("Datasets/DS2_Cov1.txt", delimiter = ',', usecols= range(20))
ds2_cov2 = np.loadtxt("Datasets/DS2_Cov2.txt", delimiter = ',', usecols= range(20))
ds2_cov3 = np.loadtxt("Datasets/DS2_Cov3.txt", delimiter = ',', usecols= range(20))

# Generate the multivariate gaussian distributions and sample each 2000 times.
num = 2000 # number of total examples to generate
pop_dist1 = int(0.1 * num)
pop_dist2 = int(0.42 * num)
pop_dist3 = int(0.48 * num)

negative_class1 = np.random.multivariate_normal(ds2_c1_mean_1,ds2_cov1,pop_dist1)
negative_class2 = np.random.multivariate_normal(ds2_c1_mean_2,ds2_cov2,pop_dist2)
negative_class3 = np.random.multivariate_normal(ds2_c1_mean_3,ds2_cov3,pop_dist3)

positive_class1 = np.random.multivariate_normal(ds2_c2_mean_1,ds2_cov1,pop_dist1)
positive_class2 = np.random.multivariate_normal(ds2_c2_mean_2,ds2_cov2,pop_dist2)
positive_class3 = np.random.multivariate_normal(ds2_c2_mean_3,ds2_cov3,pop_dist3)

# Stack the three sample arrays for each class
positive_class = np.append(positive_class1,positive_class2,axis=0)
positive_class = np.append(positive_class,positive_class3,axis=0)

negative_class = np.append(negative_class1, negative_class2,axis=0)
negative_class = np.append(negative_class,negative_class3,axis=0)

# Add labels to the data (add a 0 or a 1 on the 21-st column for labeling)
negative_class = np.insert(negative_class, 20, 0,axis=1)
positive_class = np.insert(positive_class, 20, 1,axis=1)

# Sheffle datasets
np.random.shuffle(negative_class)
np.random.shuffle(positive_class)

# Create train, validation and test sets from both sample distributions.
test_2 = np.append(negative_class[:400], positive_class[:400],axis=0)
validation_2 = np.append(negative_class[400:800], positive_class[400:800],axis=0)
train_2 = np.append(negative_class[800:], positive_class[800:],axis=0)

# Save data
np.savetxt('DS2_train.csv', train_2, delimiter = ',')
np.savetxt('DS2_validation.csv', validation_2, delimiter = ',')
np.savetxt('DS2_test.csv', test_2, delimiter = ',')

### Question 5
####  Now perform the experiments in questions 2 and 3 again, but now using DS2.
1.  Estimate  the  parameters  of  the  GDA  model  using  the  maximum  likelihood  approach.
    1.  For DS1, report the best fit accuracy, precision, recall and F-measure achieved by the classifier.
    2.  Report the coefficients learnt.
2.  Does k-NN  classifier  perform  better  than  GDA  or  worse?   Are  there  particular values of k which perform better?  Why does this happen ?
3.  Report the best fit accuracy, precision, recall and f-measure achieved by this classifier.

In [9]:
# Load datasets from file
test_2 = np.loadtxt('DS2_test.csv',delimiter = ',')
validation_2 = np.loadtxt('DS2_validation.csv',delimiter = ',')
train_2 = np.loadtxt('DS2_train.csv',delimiter = ',')

# Test GDA
print('GDA Model Test\n')
w0_2, w_2 = GDA(test_2)
print()
test_GDA(test_2, w0_2, w_2)
print('\n')

# Test k-NN
print('k-NN Model Test\n')
for k in range(1, 10):
    test_knn(k, test_2)
for k in range(20, 50, 10):
    test_knn(k, test_2)

GDA Model Test

Coefficients learned:
w0: 0.09192696230167563
w: [ 0.08120049  0.03562738 -0.0804858   0.02237502  0.10118552 -0.01249639
  0.08101135 -0.0393756  -0.05386362 -0.03594021 -0.13171094  0.06628183
 -0.02801044 -0.0168307  -0.02945508 -0.11210141 -0.00057707  0.0367228
  0.08563488 -0.05488231]

Accuracy: 0.57125
Precision: 0.571072319201995
Recall: 0.5725
F measure: 0.571785268414482


k-NN Model Test

For k = 1
Accuracy: 0.5025
Precision: 0.5024154589371981
Recall: 0.52
F measure: 0.5110565110565112

For k = 2
Accuracy: 0.51
Precision: 0.5185185185185185
Recall: 0.28
F measure: 0.36363636363636365

For k = 3
Accuracy: 0.51375
Precision: 0.5141388174807198
Recall: 0.5
F measure: 0.5069708491761724

For k = 4
Accuracy: 0.525
Precision: 0.5352112676056338
Recall: 0.38
F measure: 0.4444444444444444

For k = 5
Accuracy: 0.52375
Precision: 0.5223529411764706
Recall: 0.555
F measure: 0.5381818181818182

For k = 6
Accuracy: 0.52125
Precision: 0.5271565495207667
Recall: 0.4125
F 