In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt
from numpy.linalg import norm
from scipy import stats
import math
from sklearn.metrics import confusion_matrix

I tackle the task of classifying posts by topic. The posts come from six different internet newsgroups –
comp.windows.x, rec.sport.baseball, sci.med, misc.forsale, talk.politics.mideast and talk.religion.misc – which
correspond to labels 1, …, 6 respectively.

Each line of the training set (pa3train.txt) or test set (pa3test.txt) is a feature vector of length 819, followed by a label (1, …, 6).
The dictionary file (pa3dictionary.txt) maps coordinates to words: its first line is the word
that corresponds to the first coordinate, its second line to the second coordinate, and so on.

In [3]:
# Function to parse the txt files and separate the labels from the actual vectors
def clean_txt(path):
    """Read a data file and split every row into a feature vector and a label.

    Each non-blank line is expected to hold whitespace-separated integers:
    the feature values followed by the class label as the last token.

    Parameters
    ----------
    path : str
        Location of the text file to read.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        The feature rows (one row per line) and the matching integer labels.

    Raises
    ------
    ValueError
        If a token cannot be parsed as an integer.
    """
    vectors = []
    labels = []
    # The context manager closes the handle even if parsing fails.
    with open(path, 'r') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank lines carry no example; skipping them (instead of
                # always dropping the last line) keeps the final row even
                # when the file has no trailing newline.
                continue
            # The label is the whole last token, not just its last character.
            *features, label = map(int, tokens)
            vectors.append(features)
            labels.append(label)
    return np.array(vectors), np.array(labels)

In [4]:
# Load both splits; each call yields the feature matrix and the label vector
train_vectors, train_labels = clean_txt("pa3train.txt")
test_vectors, test_labels = clean_txt("pa3test.txt")

In [5]:
test_vectors

array([[ 0,  1,  0, ...,  0,  0,  0],
       [ 0,  0,  3, ...,  0,  0,  0],
       [ 0,  0, 45, ...,  0,  0,  0],
       ...,
       [ 0,  0,  1, ...,  0,  0,  0],
       [ 0,  0,  4, ...,  0,  0,  0],
       [ 0,  0,  4, ...,  0,  0,  0]])

In [6]:
# Select the training examples labelled 1 or 2 with a boolean mask over the
# arrays already in memory (the file is not read again and nothing is
# appended row by row).
train_mask = np.isin(train_labels, [1, 2])
# Row positions of the selected examples within train_vectors
subset_train_index = np.flatnonzero(train_mask)
subset_train_vectors = train_vectors[train_mask]
# Kept as floats: the subset labels were historically a float array
subset_train_labels = train_labels[train_mask].astype(float)

In [7]:
# Select the test examples labelled 1 or 2 with a boolean mask over the
# arrays already in memory (the file is not read again and nothing is
# appended row by row).
test_mask = np.isin(test_labels, [1, 2])
# Row positions of the selected examples within test_vectors
subset_test_index = np.flatnonzero(test_mask)
subset_test_vectors = test_vectors[test_mask]
# Kept as floats: the subset labels were historically a float array
subset_test_labels = test_labels[test_mask].astype(float)

In [8]:
# Recode the binary labels in place: class 2 becomes -1, class 1 stays +1
for label_array in (subset_train_labels, subset_test_labels):
    label_array[label_array == 2] = -1

### Perceptron 

In [9]:
#Perceptron Algorithm

def perceptron(data, labels, iterations):
    """Train a perceptron with a fixed number of passes over the data.

    The weight vector starts at zero. Whenever an example is misclassified
    or lies exactly on the decision boundary (y * <w, x> <= 0), y * x is
    added to the weights.

    Parameters
    ----------
    data : array-like, shape (n_samples, n_features)
        Feature vectors, one per row.
    labels : sequence
        Labels aligned with the rows of ``data``; expected to be -1 or +1.
    iterations : int
        Number of full passes over the data; a value <= 0 performs no pass.

    Returns
    -------
    np.ndarray, shape (n_features,)
        The learned weight vector.

    Raises
    ------
    ValueError
        If ``data`` is not two-dimensional.
    """
    data = np.asarray(data)
    if data.ndim != 2:
        raise ValueError("data must be a 2-D array of shape (n_samples, n_features)")
    # Size the weights from the data rather than assuming 819 features.
    w = np.zeros(data.shape[1], dtype=int)
    # range() guarantees termination, including for non-positive pass counts.
    for _ in range(iterations):
        for x, y in zip(data, labels):
            if y * np.dot(w, x) <= 0:
                w = w + (y * x)
    return w
# Weights after a single pass over the binary training subset
w = perceptron(subset_train_vectors, subset_train_labels, iterations=1)

In [10]:
# Training error of the single-pass perceptron: fraction of sign mismatches
predicted_labels = [np.dot(row, w) for row in subset_train_vectors]
train_error_1iter = np.mean(np.sign(predicted_labels) != subset_train_labels)
print("Train error after 1 iteration = " + str(train_error_1iter))

Train error after 1 iteration = 0.04128440366972477


In [11]:
# Repeat the perceptron for 2, 3 and 4 passes and record train / test error
iterations = [2,3,4]
train_errors = []
test_errors = []

for n_passes in iterations:
    # Fit on the training subset only; the test subset is used purely for
    # evaluation, so no model is trained on it.
    train_w = perceptron(subset_train_vectors, subset_train_labels, n_passes)

    predicted_train_labels = [np.dot(row, train_w) for row in subset_train_vectors]
    predicted_test_labels = [np.dot(row, train_w) for row in subset_test_vectors]

    # Error = fraction of examples whose predicted sign disagrees with the label.
    # The results go into fresh names so the single-pass error computed
    # earlier is not overwritten.
    train_error = np.mean(np.sign(predicted_train_labels) != subset_train_labels)
    test_error = np.mean(np.sign(predicted_test_labels) != subset_test_labels)

    train_errors.append(train_error)
    test_errors.append(test_error)

In [12]:
pd.DataFrame({'TrainError':train_errors,'TestError':test_errors}, index = iterations)

Unnamed: 0,TrainError,TestError
2,0.040367,0.061008
3,0.021101,0.045093
4,0.019266,0.047745


Implementing a linear classification algorithm using logistic regression.

In [13]:
learning_rate = 0.001
def perceptron2(data, labels, iterations):
    """Fit a linear classifier with logistic regression by batch gradient steps.

    Every iteration adds ``learning_rate * sum_i y_i * x_i / (1 + exp(y_i * <w, x_i>))``
    to the weights, starting from the zero vector.

    Parameters
    ----------
    data : array-like, shape (n_samples, n_features)
        Feature vectors, one per row.
    labels : array-like, shape (n_samples,)
        Labels aligned with the rows of ``data``; expected to be -1 or +1.
    iterations : int
        Number of gradient steps; a value <= 0 performs no step.

    Returns
    -------
    np.ndarray, shape (n_features,)
        The learned weight vector.

    Raises
    ------
    ValueError
        If ``data`` is not two-dimensional.
    """
    data = np.asarray(data, dtype=float)
    labels = np.asarray(labels, dtype=float)
    if data.ndim != 2:
        raise ValueError("data must be a 2-D array of shape (n_samples, n_features)")
    # Size the weights from the data rather than assuming 819 features.
    w = np.zeros(data.shape[1])
    for _ in range(iterations):
        margins = labels * np.dot(data, w)
        # 1 / (1 + exp(margin)) evaluated without overflow: only
        # exp(-|margin|), which lies in [0, 1], is ever computed.
        decay = np.exp(-np.abs(margins))
        coeffs = np.where(margins >= 0, decay / (1 + decay), 1 / (1 + decay))
        # Sum over all examples of y_i * coeff_i * x_i in one product.
        phrase = np.dot(labels * coeffs, data)
        w = w + learning_rate * phrase
    return w

In [14]:
def sigmoid(w,x):
    """Logistic function 1 / (1 + exp(-<w, x>)), evaluated without overflow.

    Parameters
    ----------
    w : array-like
        Weight vector.
    x : array-like
        Feature vector (or anything ``np.dot(w, x)`` accepts).

    Returns
    -------
    float or np.ndarray
        The sigmoid of the score; a scalar when ``np.dot(w, x)`` is a scalar.
    """
    z = np.dot(w, x)
    # Only exp(-|z|) is computed; it lies in [0, 1], so very negative scores
    # no longer trigger an overflow RuntimeWarning.
    decay = np.exp(-np.abs(z))
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d array back into a scalar and leaves
    # higher-dimensional results untouched.
    return result[()]

In [15]:
# Logistic regression after 2 gradient steps, scored on the training subset
w2 = perceptron2(subset_train_vectors, subset_train_labels, 2)

# Probability of the positive class per example, thresholded at 0.5 into +1 / -1
predicted_labels2 = np.array([sigmoid(w2, row) for row in subset_train_vectors])
predicted_labels2 = np.where(predicted_labels2 >= 0.5, 1.0, -1.0)

train_error_2iter = np.mean(predicted_labels2 != subset_train_labels)
print("Train error after 2 iterations = " + str(train_error_2iter))

Train error after 2 iterations = 0.4944954128440367


  return 1 / (1 + np.exp(-np.dot(w,x)))


In [16]:
# Repeat the logistic-regression fit for 10, 50 and 100 gradient steps and
# record train / test error
iterations = [10,50,100]
train_errors = []
test_errors = []

for n_steps in iterations:
    # Fit on the training subset only; the test subset is used purely for
    # evaluation, so no model is trained on it.
    train_w = perceptron2(subset_train_vectors, subset_train_labels, n_steps)

    train_probabilities = np.array([sigmoid(train_w, row) for row in subset_train_vectors])
    test_probabilities = np.array([sigmoid(train_w, row) for row in subset_test_vectors])

    # Threshold the probabilities at 0.5 into +1 / -1 predictions
    predicted_train_labels = np.where(train_probabilities >= 0.5, 1.0, -1.0)
    predicted_test_labels = np.where(test_probabilities >= 0.5, 1.0, -1.0)

    train_error = np.mean(predicted_train_labels != subset_train_labels)
    test_error = np.mean(predicted_test_labels != subset_test_labels)

    train_errors.append(train_error)
    test_errors.append(test_error)

  phrase += (y*x)/(1 + np.exp(y * np.dot(w,x)))
  return 1 / (1 + np.exp(-np.dot(w,x)))


In [17]:
pd.DataFrame({'TrainError':train_errors,'TestError':test_errors}, index = iterations)

Unnamed: 0,TrainError,TestError
10,0.294495,0.297082
50,0.036697,0.061008
100,0.017431,0.045093


#### Three coordinates in w with the highest and lowest values. Using the dictionary text file, these are the words most strongly associated with label 1 (largest positive weights) and with label 2 (most negative weights) under the perceptron trained for 3 passes.

In [18]:
# Dictionary words, one per line; line k is the word for feature coordinate k.
# The context manager closes the file, and stripping each line avoids relying
# on an exact ' \n' separator after every word.
with open('pa3dictionary.txt') as dictionary_file:
    words = [line.strip() for line in dictionary_file if line.strip()]

# Perceptron weights after 3 passes, paired with their words
w3 = perceptron(subset_train_vectors, subset_train_labels, 3)
pd3 = pd.DataFrame({"Values": w3, "Words": words})

# Three most negative and three most positive weights
smallest = pd3.sort_values('Values', ascending=True).head(3)
largest = pd3.sort_values('Values', ascending=False).head(3)

In [19]:
smallest

Unnamed: 0,Values,Words
78,-72.0,he
469,-43.0,team
393,-40.0,game


In [20]:
largest

Unnamed: 0,Values,Words
438,130.0,file
466,77.0,program
203,46.0,line


#### Same as above but done on the logistic regression implementation of the linear classifier.

In [21]:
# Logistic-regression weights after 50 gradient steps, paired with their words
w4 = perceptron2(subset_train_vectors, subset_train_labels, 50)
pd4 = pd.DataFrame({"Values": w4, "Words": words})

# Three most negative weights
smallest4 = pd4.sort_values(by='Values', ascending=True).head(3)
# Three most positive weights
largest4 = pd4.sort_values(by='Values', ascending=False).head(3)

  phrase += (y*x)/(1 + np.exp(y * np.dot(w,x)))


In [22]:
smallest4

Unnamed: 0,Values,Words
78,-5.053053,he
393,-2.313363,game
58,-2.182689,they


In [23]:
largest4

Unnamed: 0,Values,Words
617,3.042452,window
438,2.090256,file
72,2.049261,use


## One-vs-all multi-class classifier with a Don’t Know Option


In [24]:
# Slightly modified perceptron
def perceptron5(data, labels, iterations,i):
    """Train a one-vs-all perceptron that separates class ``i`` from the rest.

    Examples whose label equals ``i`` are treated as +1 and all others as -1.
    The weight vector starts at zero; whenever y * <w, x> <= 0 for the
    recoded label y, y * x is added to the weights.

    Parameters
    ----------
    data : array-like, shape (n_samples, n_features)
        Feature vectors, one per row.
    labels : sequence
        Class labels aligned with the rows of ``data``.
    iterations : int
        Number of full passes over the data; a value <= 0 performs no pass.
    i : label
        The class treated as the positive class.

    Returns
    -------
    np.ndarray, shape (n_features,)
        The learned weight vector.

    Raises
    ------
    ValueError
        If ``data`` is not two-dimensional.
    """
    data = np.asarray(data)
    if data.ndim != 2:
        raise ValueError("data must be a 2-D array of shape (n_samples, n_features)")
    # Size the weights from the data rather than assuming 819 features.
    w = np.zeros(data.shape[1], dtype=int)
    # range() guarantees termination, including for non-positive pass counts.
    for _ in range(iterations):
        for x, label in zip(data, labels):
            y = 1 if label == i else -1
            if y * np.dot(w, x) <= 0:
                w = w + (y * x)
    return w

In [25]:
def labeler(arr):
    """Turn the per-classifier signs into a class label.

    Returns the 1-based position of the single entry equal to 1; if no
    entry or more than one entry equals 1, returns 'Dont Know'.
    """
    positives = [position for position, value in enumerate(arr) if value == 1]
    if len(positives) != 1:
        return 'Dont Know'
    return positives[0] + 1

In [26]:
# One single-pass one-vs-all perceptron per class label 1..6
classifiers = [
    perceptron5(train_vectors, train_labels, 1, class_label)
    for class_label in range(1, 7)
]

In [27]:
# Score each test vector with every classifier and map the signs to a label
predicted_labels5 = [
    labeler(np.sign(np.dot(classifiers, vector))) for vector in test_vectors
]

#### Confusion Matrix 

In [28]:
# Creating conf_matrix: rows = predicted label (including 'Dont Know'),
# columns = true label.
# The label order is passed explicitly so the matrix is always 7x7 with
# 'Dont Know' last, even if no example happens to be predicted 'Dont Know'.
class_names = [str(label) for label in range(1, 7)] + ['Dont Know']
# Predictions are passed first because confusion_matrix indexes rows by its
# first argument and columns by its second.
temp = confusion_matrix(
    list(map(str, predicted_labels5)),
    list(map(str, test_labels)),
    labels=class_names,
)
# Drop the last column: a true label is never 'Dont Know'
conf_matrix = [list(row[:-1]) for row in temp]

In [29]:
conf_matrix

[[133, 1, 6, 4, 0, 0],
 [2, 126, 6, 5, 2, 2],
 [0, 3, 65, 0, 0, 3],
 [3, 1, 0, 127, 0, 0],
 [3, 6, 13, 1, 125, 13],
 [1, 2, 6, 0, 11, 53],
 [43, 53, 79, 47, 18, 37]]

In [30]:
# Column totals: number of test examples carrying each true label 1..6
col_sums = [sum(row[col] for row in conf_matrix) for col in range(6)]

# Divide every entry by its column total, so entry [i][j] becomes the
# fraction of examples with true label j that were classified as i
for row_idx in range(7):
    conf_matrix[row_idx][:6] = [
        conf_matrix[row_idx][col] / col_sums[col] for col in range(6)
    ]

In [31]:
# Show the normalised confusion matrix as a 2-D array
normalised_matrix = np.array([np.array(row) for row in conf_matrix])
print(normalised_matrix)

[[0.71891892 0.00520833 0.03428571 0.02173913 0.         0.        ]
 [0.01081081 0.65625    0.03428571 0.02717391 0.01282051 0.01851852]
 [0.         0.015625   0.37142857 0.         0.         0.02777778]
 [0.01621622 0.00520833 0.         0.69021739 0.         0.        ]
 [0.01621622 0.03125    0.07428571 0.00543478 0.80128205 0.12037037]
 [0.00540541 0.01041667 0.03428571 0.         0.07051282 0.49074074]
 [0.23243243 0.27604167 0.45142857 0.25543478 0.11538462 0.34259259]]


In [32]:
# For each of the first six rows: row total minus the diagonal entry,
# i.e. the off-diagonal mass of that predicted class
row_sums = [sum(conf_matrix[k]) - conf_matrix[k][k] for k in range(6)]

In [33]:
row_sums

[0.06123317805383022,
 0.1036094694790346,
 0.04340277777777779,
 0.021424549549549532,
 0.2475570834809966,
 0.12062060687060688]

a: i = 4

b: i = 5

c: [5][6] (row i = 5, column j = 6, counting from 1) with a value of 0.12037037