In [2]:
import os
import pandas as pd
from string import ascii_lowercase
import numpy as np
import random
import math


training_data = [] # each item in data is a list of characters from a document
training_labels = [] # each item in labels is the corresponding label 'e', 'j', or 's' of that document
test_data = []
test_labels = []

alphabet = list(ascii_lowercase) + [' '] #alphabet plus the space

# read in text data
data_dir = 'languageID'
for f in os.listdir(data_dir):
    num = int(f[1:-4])
    if num <= 9:
        #print('train', f)
        path = os.path.join(data_dir, f)
        chars = []
        for c in open(path).read():
            if c in alphabet:
                chars.append(c)

        label = f[0]

        training_data.append(chars)
        training_labels.append(label)
    else:
        #print('test', f)
        path = os.path.join(data_dir, f)
        chars = []
        for c in open(path).read():
            if c in alphabet:
                chars.append(c)

        label = f[0]

        test_data.append(chars)
        test_labels.append(label)

classes = sorted(list(set(training_labels)))


# get character counts of each document
char_counts = []
for d in training_data:
    counts = {c:d.count(c) for c in alphabet}
    char_counts.append(counts)

char_counts = pd.DataFrame(char_counts)
char_counts['label'] = training_labels

# calculate the prior probabilities of each class
smoothing_parameter = 0.5
sum_counts = char_counts.groupby('label').sum()
total_counts = sum_counts.sum(axis=1)
prior_probs = (total_counts + smoothing_parameter) / (total_counts.sum() + smoothing_parameter*len(classes))
print(list(prior_probs))
prior_probs = {}
for y in classes:
    count_y = training_labels.count(y)
    total_count = len(training_labels)
    p_y = (count_y + smoothing_parameter) / (total_count + smoothing_parameter * len(classes))
    prior_probs[y] = p_y
prior_probs = pd.DataFrame.from_dict(prior_probs, orient='index')


smoothing_parameter = 0.5
sum_counts = char_counts.groupby('label').sum()
total_counts = sum_counts.sum(axis=1)
class_cond_probs = (sum_counts + smoothing_parameter).divide(
    total_counts + smoothing_parameter * len(alphabet), axis='rows')
print(class_cond_probs.loc['e'])
print(class_cond_probs.loc['j'])
print(class_cond_probs.loc['s'])

# function to predict the label of a test input x
def predict(x):
    char_counts_x = {c: x.count(c) for c in alphabet}
    char_counts_x = pd.Series(char_counts_x)
    
    log_prob_x_given_y = {}
    for y in classes:
        log_prob_x_given_y[y] = (char_counts_x * np.log(class_cond_probs.loc[y])).sum()
    log_prob_x_given_y = pd.Series(log_prob_x_given_y)

    log_prob_y_given_x = {}
    for y in classes:
        log_prob_y_given_x[y] = log_prob_x_given_y.loc[y].item() + np.log(prior_probs.loc[y].item())
    log_prob_y_given_x = pd.Series(log_prob_y_given_x)

    y_pred = log_prob_y_given_x.idxmax()
    return y_pred

# prediction on test input from e10.txt
x = [c for c in open(os.path.join(data_dir, 'e10.txt')).read() if c in alphabet]
e_10_word_count = pd.Series({c: x.count(c) for c in alphabet})
y_pred = predict(x)
print(y_pred)
print(e_10_word_count)

r = 0
i=0
for c in alphabet:
    r = r + e_10_word_count[c]*math.log(class_cond_probs.loc['e'][i])
    i = i+1
print(r)
r = 0
i=0
for c in alphabet:
    r = r + e_10_word_count[c]*math.log(class_cond_probs.loc['j'][i])
    i = i+1
print(r)
r = 0
i=0
for c in alphabet:
    r = r + e_10_word_count[c]*math.log(class_cond_probs.loc['s'][i])
    i = i+1
print(r)

    
    
# shuffle x and predict again
x_shuffled = random.sample(x, len(x))
y_pred_shuffled = predict(x_shuffled)

#evaluate predictions on all test data
y_preds_test = [predict(x) for x in test_data]

def make_confusion_matrix(y_true, y_pred):
    if len(y_true) != len(y_pred):
        print('error: y_true and y_pred are different lengths')
        exit()
    confusion_matrix = pd.DataFrame(data=np.zeros((len(classes), len(classes))),
                                    index=classes,
                                    columns=classes)
    for y_t, y_p in zip(y_true, y_pred):
        confusion_matrix.loc[y_p][y_t] = confusion_matrix.loc[y_p][y_t] + 1

    return confusion_matrix

confusion_matrix_original = make_confusion_matrix(y_true=test_labels, y_pred=y_preds_test)
print(confusion_matrix_original )


#shuffling 
y_preds_test_shuffled = [predict(random.sample(x, len(x))) for x in test_data]
confusion_matrix_shuffled = make_confusion_matrix(y_true=test_labels, y_pred=y_preds_test_shuffled)
print(confusion_matrix_shuffled)

[0.3314335192958997, 0.3133925929173471, 0.3551738877867532]
a    0.060169
b    0.011135
c    0.021510
d    0.021973
e    0.105369
f    0.018933
g    0.017479
h    0.047216
i    0.055411
j    0.001421
k    0.003734
l    0.028977
m    0.020519
n    0.057922
o    0.064464
p    0.016752
q    0.000562
r    0.053825
s    0.066182
t    0.080126
u    0.026664
v    0.009285
w    0.015496
x    0.001156
y    0.013844
z    0.000628
     0.179250
Name: e, dtype: float64
a    0.131766
b    0.010867
c    0.005486
d    0.017226
e    0.060205
f    0.003879
g    0.014012
h    0.031762
i    0.097033
j    0.002341
k    0.057409
l    0.001433
m    0.039799
n    0.056711
o    0.091163
p    0.000874
q    0.000105
r    0.042804
s    0.042175
t    0.056990
u    0.070617
v    0.000245
w    0.019742
x    0.000035
y    0.014151
z    0.007722
     0.123449
Name: j, dtype: float64
a    0.104560
b    0.008233
c    0.037526
d    0.039746
e    0.113811
f    0.008603
g    0.007184
h    0.004533
i    0.049860
j    0.00