In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import matplotlib.pyplot as plt

# Class labels come from scikit-learn's 20 Newsgroups loader; only the
# `target` arrays of the returned objects are used in this cell.
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, remove = ('headers', 'footers'))
targets_train = twenty_train['target']

twenty_test = fetch_20newsgroups(subset = 'test', remove = ('headers', 'footers'))
targets_test = twenty_test['target']

# Class indices 0..19 and the empirical prior of each class: the fraction of
# training labels equal to that index (one-vs-all comparison via broadcasting).
groups = np.arange(20)
priori = np.mean(targets_train == groups.reshape(-1, 1), axis = 1)

# The document text is read from local files.
# NOTE(review): assumes one document per line, in the same order as the
# label arrays above -- confirm against whatever produced these files.
with open('train_data.txt', 'rt') as file:
    train_data = file.read().splitlines()
with open('test_data.txt', 'rt') as file:
    test_data = file.read().splitlines()

# str.splitlines() breaks on more than '\n' (form feed, vertical tab,
# U+2028, ...), so a stray control character would silently shift every
# following document against its label. Fail here instead of downstream.
assert len(train_data) == targets_train.size, (
    f'train_data.txt has {len(train_data)} lines but there are '
    f'{targets_train.size} training labels'
)
assert len(test_data) == targets_test.size, (
    f'test_data.txt has {len(test_data)} lines but there are '
    f'{targets_test.size} test labels'
)

In [2]:
def attempt(smoothing, max_weight, **kwargs):
    """Fit a multinomial Naive Bayes text classifier and score it on the test set.

    Relies on the notebook-level globals ``train_data``, ``targets_train``,
    ``test_data``, ``targets_test``, ``groups`` and ``priori``.

    Parameters
    ----------
    smoothing : int or float
        Constant added to every per-class word count. Should be positive:
        with 0, a word never seen in a class gets probability 0 and
        ``np.log`` of it is ``-inf``.
    max_weight : int or float
        Factor applied to the largest smoothed count of each class (every
        entry tied for the maximum) before the counts are normalised.
    **kwargs
        Forwarded to ``CountVectorizer`` (e.g. ``min_df``, ``max_df``).
        ``stop_words`` defaults to ``'english'`` and may be overridden.

    Returns
    -------
    dict
        ``'Test'``: test accuracy in percent, rounded to two decimals.
        ``'Pred'``: predicted class index for each test document.
    """
    # Default first, caller's kwargs second, so an explicit stop_words wins
    # instead of raising a duplicate-keyword TypeError.
    cv = CountVectorizer(**{'stop_words': 'english', **kwargs})

    fitted = cv.fit_transform(train_data)

    # One row of word probabilities per class, one column per vocabulary term.
    probabilities = np.zeros((groups.size, fitted.shape[1]))

    for group in groups:
        # Sparse .sum(axis=0) returns an integer (1, V) np.matrix; convert to a
        # flat float array so the in-place scaling below also works when
        # `smoothing` is an int (an integer row cannot be multiplied in place
        # by a float factor).
        counts = np.asarray(fitted[targets_train == group].sum(axis = 0), dtype = float).ravel()
        counts += smoothing
        counts[counts == counts.max()] *= max_weight
        probabilities[group] = counts / counts.sum()

    fitted_test = cv.transform(test_data)

    # Log-score of each (document, class) pair: term counts times the log
    # word probabilities, plus the log class prior.
    probs_test = fitted_test.dot(np.log(probabilities).T) + np.log(priori)
    pred_test = np.argmax(probs_test, axis = 1)

    acc_test = np.mean(pred_test == targets_test)

    return {'Test': (acc_test * 100).round(2), 'Pred': pred_test}

In [4]:
# Grid search: rows of `res` are candidate max_df values, columns are
# candidate smoothing constants.
# NOTE(review): the quantity maximised is attempt(...)['Test'], i.e. accuracy
# on the test set, so the "best" score below is an optimistic estimate.
# Consider selecting on a validation split instead.

# 18 evenly spaced values 0.100, 0.105, ..., 0.185. linspace with an explicit
# count avoids the float-step pitfall of np.arange (length depends on rounding
# of (stop - start) / step); rounding removes noise such as 0.16000000000000006.
d = np.linspace(.1, .185, 18).round(3)
s = np.array([1e-15, .09, .1])

res = np.zeros((d.size, s.size))

for i, df in enumerate(d):
    for j, smooth in enumerate(s):
        print(f'{i * s.size + j + 1} of {res.size}')
        res[i, j] = attempt(min_df = 1, max_df = df, smoothing = smooth, max_weight = 2.5)['Test']

best = res.max()
# (row, column) of the first maximum in row-major order.
argmax = np.unravel_index(res.argmax(), res.shape)
print(f'Best max_df: {d[argmax[0]]}')
print(f'Best smoothing: {s[argmax[1]]}')
print(f'Best result: {best}')

1 of 54
2 of 54
3 of 54
4 of 54
5 of 54
6 of 54
7 of 54
8 of 54
9 of 54
10 of 54
11 of 54
12 of 54
13 of 54
14 of 54
15 of 54
16 of 54
17 of 54
18 of 54
19 of 54
20 of 54
21 of 54
22 of 54
23 of 54
24 of 54
25 of 54
26 of 54
27 of 54
28 of 54
29 of 54
30 of 54
31 of 54
32 of 54
33 of 54
34 of 54
35 of 54
36 of 54
37 of 54
38 of 54
39 of 54
40 of 54
41 of 54
42 of 54
43 of 54
44 of 54
45 of 54
46 of 54
47 of 54
48 of 54
49 of 54
50 of 54
51 of 54
52 of 54
53 of 54
54 of 54
Best max_df: 0.16000000000000006
Best smoothing: 0.09
Best result: 76.93


In [None]:
# Per-class one-vs-rest metrics on the test set, macro-averaged over classes.
#
# NOTE(review): `pred` is not assigned in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel. It must hold the predicted
# class index of every test document, aligned with `targets_test`.

def _safe_ratio(num, den):
    """Return num / den, or 0.0 when den is zero (instead of a NaN from 0/0)."""
    return num / den if den else 0.0

pred = np.asarray(pred)
assert pred.shape == targets_test.shape, 'pred must be aligned with targets_test'

sens_tot = []
especif_tot = []
vp_tot = []
vn_tot = []

for i in range(20):
    targets_ok = targets_test == i
    pred_ok = pred == i
    # Confusion counts for "class i" vs "any other class"
    # (VP/VN/FP/FN = true positive / true negative / false positive / false negative).
    VP = np.sum(targets_ok & pred_ok)
    VN = np.sum(~targets_ok & ~pred_ok)
    FP = np.sum(~targets_ok & pred_ok)
    FN = np.sum(targets_ok & ~pred_ok)

    # A class that is never predicted (VP + FP == 0) or never present would
    # otherwise yield NaN and turn the averages below into NaN.
    sens_tot.append(_safe_ratio(VP, VP + FN))
    especif_tot.append(_safe_ratio(VN, VN + FP))
    vp_tot.append(_safe_ratio(VP, VP + FP))
    vn_tot.append(_safe_ratio(VN, VN + FN))

print(f'Sensibilidad: {np.round(np.mean(sens_tot) * 100, 2)}%')

# Not meaningful here: with 20 classes this is bound to come out high.
print(f'Especificidad: {np.round(np.mean(especif_tot) * 100, 2)}%')

print(f'Valor predictivo positivo: {np.round(np.mean(vp_tot) * 100, 2)}%')

# Not meaningful here: with 20 classes this is bound to come out high.
print(f'Valor predictivo negativo: {np.round(np.mean(vn_tot) * 100, 2)}%')

## Preguntas
- $\textbf{Parte 1}$:
    - ¿Va bien 79.3%?
    - ¿Qué más podemos hacer para subirlo?
    - ¿Está bien usar accuracy?
    - ¿Para las otras métricas está bien sacar el promedio?
    
- $\textbf{Parte 2}$:
    - ¿Podemos sacar el priori si da peores resultados?
    - ¿Está bien elegida la sensibilidad como métrica?
    - ¿Está bien reemplazar los NaN por la media según la clase?

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Reference baseline: scikit-learn's MultinomialNB on token counts from a
# CountVectorizer with default settings.
cv = CountVectorizer()
train_counts = cv.fit_transform(train_data)
test_counts = cv.transform(test_data)

mnb = MultinomialNB(alpha = .05)
mnb.fit(train_counts, targets_train)

# Test-set score; left as the last expression so the notebook displays it.
mnb.score(test_counts, targets_test)
