In [11]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import matplotlib.pyplot as plt

# Training split, shuffled by the loader, with headers and footers removed.
twenty_train = fetch_20newsgroups(
    subset='train',
    shuffle=True,
    remove=('headers', 'footers'),
)
targets_train = twenty_train.target

# Test split with the same parts removed.
twenty_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers'),
)
targets_test = twenty_test.target

# Class ids 0..19 and, for each one, the fraction of training labels equal to it.
groups = np.arange(20)
priori = np.array([np.mean(targets_train == class_id) for class_id in groups])

# Every test document is used for evaluation; no validation subset is held out.
test_idx = np.arange(targets_test.size)

def attempt(smoothing, **kwargs):
    """Fit a multinomial naive Bayes classifier on bag-of-words counts and score it.

    The vocabulary and per-class word counts come from ``twenty_train``; the
    accuracy is measured on ``twenty_test``. Both, together with ``groups``,
    ``priori``, ``targets_train`` and ``targets_test``, are read from module
    scope.

    Parameters
    ----------
    smoothing : float
        Constant added to every per-class word count before normalising.
        Must be non-negative. With ``0`` any word unseen in a class gets
        log-probability ``-inf`` for that class.
    **kwargs
        Forwarded to ``CountVectorizer`` (for example ``min_df``, ``max_df``).
        ``stop_words`` defaults to ``'english'`` and may be overridden.

    Returns
    -------
    dict
        ``{'Test': accuracy}`` where accuracy is a percentage rounded to two
        decimals.

    Raises
    ------
    ValueError
        If ``smoothing`` is negative.
    """
    if smoothing < 0:
        raise ValueError(f'smoothing must be non-negative, got {smoothing!r}')

    # Same default as before, but a caller-supplied stop_words no longer
    # collides with a hard-coded keyword argument.
    kwargs.setdefault('stop_words', 'english')
    cv = CountVectorizer(**kwargs)
    fitted = cv.fit_transform(twenty_train['data'])

    # log P(word | class): one row per class, one column per vocabulary term.
    log_likelihood = np.empty((groups.size, fitted.shape[1]))
    for group in groups:
        # Summing a sparse matrix yields a 1 x V np.matrix; flatten it to 1-D.
        counts = np.asarray(fitted[targets_train == group].sum(axis=0)).ravel() + smoothing
        log_likelihood[group] = np.log(counts / counts.sum())

    # Keep the test counts sparse: the sparse x dense product gives the same
    # class scores without materialising an (n_test x vocabulary) dense array.
    fitted_test = cv.transform(twenty_test['data'])
    scores = np.asarray(fitted_test @ log_likelihood.T) + np.log(priori)

    # scores is (n_test, n_classes); pick the best class for each document.
    pred_test = np.argmax(scores, axis=1)
    acc_test = np.mean(pred_test == targets_test)

    return {'Test': (acc_test * 100).round(2)}

In [12]:
# Settings for this run: `d` is forwarded to CountVectorizer as max_df and
# `s` is the additive smoothing constant.
d = 700
s = 0.1
print(attempt(smoothing=s, min_df=1, max_df=d))

{'Test': 76.95}


In [None]:
# One-vs-rest confusion counts for all 20 classes at once: row k of each mask
# marks the test documents whose true / predicted label is k.
# NOTE(review): `pred` is not assigned in any visible cell (`attempt` returns
# only the accuracy), so it has to come from existing kernel state -- confirm
# it holds one predicted class id per test document before running this cell.
class_ids = np.arange(20).reshape(-1, 1)
is_class = targets_test == class_ids
predicted_class = pred == class_ids

true_pos = np.sum(is_class & predicted_class, axis=1)
true_neg = np.sum(~is_class & ~predicted_class, axis=1)
false_pos = np.sum(~is_class & predicted_class, axis=1)
false_neg = np.sum(is_class & ~predicted_class, axis=1)

# Per-class rates, stored as lists with one entry per class.
sens_tot = list(true_pos / (true_pos + false_neg))
especif_tot = list(true_neg / (true_neg + false_pos))
vp_tot = list(true_pos / (true_pos + false_pos))
vn_tot = list(true_neg / (true_neg + false_neg))

# Each printed figure is the unweighted mean of the per-class values.
print(f'Sensibilidad: {np.round(np.mean(sens_tot) * 100, 2)}%')

# Not meaningful here: with 20 classes this is bound to come out high.
print(f'Especificidad: {np.round(np.mean(especif_tot) * 100, 2)}%')

print(f'Valor predictivo positivo: {np.round(np.mean(vp_tot) * 100, 2)}%')

# Not meaningful here: with 20 classes this is bound to come out high.
print(f'Valor predictivo negativo: {np.round(np.mean(vn_tot) * 100, 2)}%')

## Preguntas
- **Parte 1**:
    - ¿Es un buen resultado una exactitud (accuracy) de 79.3%?
    - ¿Qué más podemos hacer para mejorarla?
    - ¿Es adecuado usar accuracy como métrica?
    - ¿Para las otras métricas está bien promediar sobre las clases?
    
- **Parte 2**:
    - ¿Podemos quitar el priori si da peores resultados?
    - ¿Está bien elegida la sensibilidad como métrica?
    - ¿Está bien reemplazar los NaN por la media de la clase correspondiente?

In [None]:
# Show the description text stored with the loaded training split.
print(twenty_train.DESCR)

In [4]:
# List the newsgroup names; the integer targets index into this list.
print(twenty_train.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
