# Multinomial Models 

In [92]:
# Imports
import sklearn.datasets
import numpy as np
import matplotlib.pyplot as plt
import scipy.linalg as linalg
import sklearn as sk
from Data.load import load_data, split_data

In [93]:
# Utilities
def vcol(v):
    """Return `v` as a column vector of shape (size, 1).

    Parameters
    ----------
    v : array-like
        NumPy array, list, tuple or scalar. Multi-dimensional input is
        flattened in row-major order.

    Returns
    -------
    numpy.ndarray
        A new array (never a view of `v`) with shape (size, 1).
    """
    # Convert first: plain sequences have no `.size`, only the array does.
    arr = np.array(v)
    return arr.reshape(arr.size, 1)
def vrow(v):
    """Return `v` as a row vector of shape (1, size).

    Parameters
    ----------
    v : array-like
        NumPy array, list, tuple or scalar. Multi-dimensional input is
        flattened in row-major order.

    Returns
    -------
    numpy.ndarray
        A new array (never a view of `v`) with shape (1, size).
    """
    # Convert first: plain sequences have no `.size`, only the array does.
    arr = np.array(v)
    return arr.reshape(1, arr.size)


In [94]:
# Load the three collections returned by load_data, then split each one with
# the same second argument (25), in the order I, Pu, Pa.
# NOTE(review): presumably the first element of each pair is the training part
# and the second the evaluation part -- confirm against Data.load.split_data.
I, Pu, Pa = load_data()
(ITR, ITE), (PuTR, PuTE), (PaTR, PaTE) = [
    split_data(collection, 25) for collection in (I, Pu, Pa)
]

In [95]:
# Build the shared vocabulary and the per-class smoothed word counts.
#
# Names defined for later cells:
#   xi          pseudo-count given to every vocabulary word in every class
#   dic         {word: xi} for every lower-cased token seen in ITR, PuTR or PaTR
#   inf_count   {word: xi + occurrences in ITR}
#   purg_count  {word: xi + occurrences in PuTR}
#   par_count   {word: xi + occurrences in PaTR}
#   n_i, n_pu, n_pa   normalisers: the sum of the matching count table
xi = 0.001


def _tokens(tercets):
    """Yield every whitespace-separated token of `tercets`, lower-cased."""
    for tercet in tercets:
        for word in tercet.split():
            yield word.lower()


def _smoothed_counts(tercets, vocabulary, pseudo_count):
    """Return {word: pseudo_count + occurrences in `tercets`} over `vocabulary`.

    Every token of `tercets` must already be a key of `vocabulary`.
    """
    counts = dict.fromkeys(vocabulary, pseudo_count)
    for token in _tokens(tercets):
        counts[token] += 1
    return counts


# Vocabulary: union of the tokens of the three training sets.
dic = {}
for training_set in (ITR, PuTR, PaTR):
    for token in _tokens(training_set):
        dic[token] = xi

# Count occurrences of every word, per class.
inf_count = _smoothed_counts(ITR, dic, xi)      # Inferno
purg_count = _smoothed_counts(PuTR, dic, xi)    # Purgatorio
par_count = _smoothed_counts(PaTR, dic, xi)     # Paradiso

# Each normaliser is the total mass of its table (tokens + xi * len(dic)),
# not just the raw token count: the tables include the pseudo-counts, so
# dividing by the raw token count would give word probabilities that sum to
# more than one, by a different amount for each class.
n_i = sum(inf_count.values())
n_pu = sum(purg_count.values())
n_pa = sum(par_count.values())


In [96]:
# compute logl
def calc_logl(tercet):
    """Return the per-class log-likelihood of one text as a 3-tuple.

    The text is split on whitespace and each token is lower-cased. For every
    token that is a key of a class count table, log(count / normaliser) is
    added to that class's total; tokens missing from a table contribute
    nothing to it.

    Reads the module-level tables `inf_count`, `purg_count`, `par_count` and
    their normalisers `n_i`, `n_pu`, `n_pa`.

    Returns
    -------
    tuple
        (logl_inf, logl_pur, logl_par), in that order. A total is the integer
        0 when no token of the text was found in the corresponding table.
    """
    tables = ((inf_count, n_i), (purg_count, n_pu), (par_count, n_pa))
    totals = [0, 0, 0]

    for token in map(str.lower, tercet.split()):
        for k, (counts, normaliser) in enumerate(tables):
            if token in counts:
                totals[k] += np.log(counts[token] / normaliser)

    return tuple(totals)

In [101]:
# compute scores
def _loglikelihood_matrix(tercets):
    """Return a (3, len(tercets)) array; column i holds calc_logl(tercets[i])."""
    scores = np.zeros([3, len(tercets)])
    for i, tercet in enumerate(tercets):
        scores[:, i] = calc_logl(tercet)
    return scores


def _class_posteriors(log_likelihoods, prior=1/3):
    """Turn class log-likelihoods into class posteriors, column by column.

    Parameters
    ----------
    log_likelihoods : numpy.ndarray
        One row per class, one column per sample.
    prior : float
        Prior probability applied to every class (uniform over 3 by default).

    Returns
    -------
    numpy.ndarray
        Same shape as the input; each column sums to 1.
    """
    log_joint = log_likelihoods + np.log(prior)
    # Normalise in the log domain. Exponentiating the raw log-likelihoods
    # first can underflow to 0 for every class, giving 0/0 = NaN posteriors,
    # and argmax over a NaN column silently returns class 0.
    log_marginal = np.logaddexp.reduce(log_joint, axis=0)
    return np.exp(log_joint - log_marginal)


# Row index of S / S_post doubles as the class label:
# 0 = Inferno, 1 = Purgatorio, 2 = Paradiso (order returned by calc_logl).
for label, (name, test_tercets) in enumerate(
    [("Inferno", ITE), ("Purgatorio", PuTE), ("Paradiso", PaTE)]
):
    S = _loglikelihood_matrix(test_tercets)
    S_post = _class_posteriors(S)
    L_pred = S_post.argmax(0)
    # Every sample of this set has true label `label`, so accuracy is the
    # fraction of predictions equal to it.
    acc = np.sum((L_pred == label))/len(test_tercets)
    print(f"Accuracy for {name} is: {acc*100}%")


Accuracy for Inferno is: 59.375%
Accuracy for Purgatorio is: 49.23076923076923%
Accuracy for Paradiso is: 58.46153846153847%


In [100]:
# NOTE(review): this cell carries execution count In [100] but sits after the
# scoring cell In [101], so it was run out of order. On a top-to-bottom run
# `acc` holds the last accuracy assigned by the scoring cell (Paradiso), not
# the value captured in the stored output. Re-run or remove before sharing.
acc

np.float64(0.59375)