In [26]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
import matplotlib.pyplot as plt

# Fetch both 20 Newsgroups splits with headers and footers stripped, and keep
# the integer class label of every document.
stripped_parts = ('headers', 'footers')

twenty_train = fetch_20newsgroups(subset='train', shuffle=True, remove=stripped_parts)
targets_train = twenty_train['target']

twenty_test = fetch_20newsgroups(subset='test', remove=stripped_parts)
targets_test = twenty_test['target']

# Empirical prior of each class id 0..19: the fraction of training labels
# equal to that id (one row of the comparison grid per class).
groups = np.arange(20)
priori = (targets_train == groups[:, np.newaxis]).mean(axis=1)

# The texts actually used downstream come from local files; every line of a
# file is treated as one document.
# NOTE(review): presumably these files hold a preprocessed copy of the fetched
# documents in the same order as the targets above -- confirm.
with open('train_data.txt', 'rt') as file:
    train_data = file.read().splitlines()
with open('test_data.txt', 'rt') as file:
    test_data = file.read().splitlines()

# Alternative source: the raw documents bundled with the fetched datasets.
#train_data = twenty_train['data']
#test_data = twenty_test['data']

In [32]:

class NewsClassifier:
    """Naive-Bayes-style text classifier over TF-IDF features.

    Documents are vectorised with a ``TfidfVectorizer`` (English stop words
    removed). For every class, the term weights of its training documents are
    summed, smoothed and normalised into a distribution over the vocabulary.
    Prediction picks the class that maximises
    ``features . log(P(term | class)) + log(P(class))``.

    Parameters
    ----------
    smoothing : float
        Smoothing divisor. For each class, the smallest positive summed term
        weight divided by ``smoothing`` is added to every term, so larger
        values mean weaker smoothing. Must be positive for the resulting
        probabilities to be valid.
    **kwargs
        Forwarded unchanged to ``TfidfVectorizer``. ``stop_words`` is always
        set to ``'english'`` and therefore must not be passed here.

    Attributes (set by ``train``)
    -----------------------------
    groups : sorted array of the distinct class labels seen in training.
    priori : fraction of training documents in each class, aligned with ``groups``.
    probabilities : array of shape (len(groups), vocabulary size); row ``i``
        is the smoothed term distribution of ``groups[i]``.
    """

    def __init__(self, smoothing, **kwargs):
        self.smoothing = smoothing
        self.tfidf = TfidfVectorizer(**kwargs, stop_words = 'english')

    def train(self, train_set, targets, verbose = True):
        """Fit the vectoriser and estimate class priors and term distributions.

        Parameters
        ----------
        train_set : iterable of str
            Training documents.
        targets : array-like
            Class label of each training document, in the same order.
        verbose : bool, default True
            When true, print the fitted probability matrix (the historical
            behaviour of this method). Pass ``False`` to train silently.
        """
        targets = np.asarray(targets)
        # np.unique already returns the labels sorted.
        self.groups = np.unique(targets)

        fitted = self.tfidf.fit_transform(train_set)
        self.priori = np.mean(targets == self.groups.reshape(-1, 1), axis = 1)

        # One row per class actually present, not a fixed number of classes.
        self.probabilities = np.zeros((self.groups.size, fitted.shape[1]))

        for i, group in enumerate(self.groups):
            # Select rows with the labels passed to this call. The sparse sum
            # comes back as a 1 x V matrix, so flatten it to a plain vector.
            weights = np.asarray(fitted[targets == group].sum(axis = 0), dtype = float).ravel()
            positive = weights[weights > 0]
            if positive.size:
                weights = weights + positive.min() / self.smoothing
            else:
                # No term survived vectorisation for this class: any constant
                # smoothing normalises to the uniform distribution.
                weights = np.ones_like(weights)
            self.probabilities[i] = weights / weights.sum()

        if verbose:
            print(self.probabilities)

    def score(self, test_set, targets):
        """Return the accuracy of ``predict(test_set)`` against ``targets``."""
        pred_test = self.predict(test_set)

        return np.mean(pred_test == np.asarray(targets))

    def predict(self, input_data):
        """Return the most probable class label for each document.

        Uses the vocabulary and statistics fitted by ``train``; the result is
        an array of labels taken from ``self.groups``.
        """
        fitted_test = self.tfidf.transform(input_data)
        log_likelihood = fitted_test.dot(np.log(self.probabilities).T)
        # Force a plain 2-D ndarray so argmax yields a flat vector of indices.
        scores = np.asarray(log_likelihood) + np.log(self.priori)

        return self.groups[np.argmax(scores, axis = 1)]

In [None]:
# Exhaustive search over two hyper-parameters: the max_df cut-off (rows of
# `res`) and the smoothing divisor (columns of `res`).
# NOTE(review): np.arange with a float step can gain or lose its last point to
# rounding; np.linspace would pin the number of candidates.
d = np.arange(.1, 0.19, .005)
s = np.array([1e-15, .09, .1])

res = np.zeros((d.size, s.size))

# np.ndindex walks the grid row by row, the same order as two nested loops.
for i, j in np.ndindex(res.shape):
    df, smooth = d[i], s[j]
    print(f'{i * s.size + j + 1} of {res.size}')
    # `attempt` is defined elsewhere in the notebook; only its 'Test' entry is kept.
    # NOTE(review): presumably that entry is the test-set score, in which case
    # the hyper-parameters are being tuned on the evaluation data -- confirm.
    res[i, j] = attempt(min_df = 1, max_df = df, smoothing = smooth, max_weight = 2.5)['Test']

# Report the first grid cell (row-major order) that reaches the best value.
best = res.max()
argmax = np.argwhere(res == best)[0]
print(f'Best max_df: {d[argmax[0]]}')
print(f'Best smoothing: {s[argmax[1]]}')
print(f'Best result: {best}')

In [None]:
# One-vs-rest evaluation: each of the 20 class ids is treated in turn as the
# positive class, and four ratios are collected per class.
sens_tot = []      # sensitivity: VP / (VP + FN)
especif_tot = []   # specificity: VN / (VN + FP)
vp_tot = []        # positive predictive value: VP / (VP + FP)
vn_tot = []        # negative predictive value: VN / (VN + FN)

for i in range(20):
    targets_ok = targets_test == i   # documents that really belong to class i
    pred_ok = pred == i              # documents predicted as class i
    targets_no = ~targets_ok
    pred_no = ~pred_ok

    # Confusion counts: VP/VN = true positives/negatives, FP/FN = false ones.
    VP = np.sum(targets_ok & pred_ok)
    FN = np.sum(targets_ok & pred_no)
    FP = np.sum(targets_no & pred_ok)
    VN = np.sum(targets_no & pred_no)

    sens_tot += [VP / (VP + FN)]
    especif_tot += [VN / (VN + FP)]
    vp_tot += [VP / (VP + FP)]
    vn_tot += [VN / (VN + FN)]

# Each figure below is the unweighted mean over the 20 classes, as a percentage.
print(f'Sensibilidad: {np.round(np.mean(sens_tot) * 100, 2)}%')

# Not meaningful with 20 classes: it is bound to come out high.
print(f'Especificidad: {np.round(np.mean(especif_tot) * 100, 2)}%') 

print(f'Valor predictivo positivo: {np.round(np.mean(vp_tot) * 100, 2)}%')

# Not meaningful with 20 classes: it is bound to come out high.
print(f'Valor predictivo negativo: {np.round(np.mean(vn_tot) * 100, 2)}%')

## Preguntas
- $\textbf{Parte 1}$:
    - ¿Va bien 79.3%?
    - ¿Qué más podemos hacer para subirlo?
    - ¿Está bien usar accuracy?
    - ¿Para las otras métricas está bien sacar el promedio?
    
- $\textbf{Parte 2}$:
    - ¿Podemos sacar el priori si da peores resultados?
    - ¿Está bien elegida la sensibilidad como métrica?
    - ¿Está bien reemplazar a los NaN por la media según la clase?    

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

# Reference baseline: scikit-learn's MultinomialNB trained on TF-IDF features
# of the same documents, for comparison with the hand-written classifier.
mnb = MultinomialNB(alpha = .1)

# Despite its name, `cv` holds a TfidfVectorizer.
cv = TfidfVectorizer(stop_words = 'english')

train_features = cv.fit_transform(train_data)
mnb.fit(train_features, targets_train)

# Reuse the vocabulary fitted on the training set; the accuracy on the test
# documents is the value displayed by this cell.
test_features = cv.transform(test_data)
mnb.score(test_features, targets_test)


In [33]:
# Vectoriser settings forwarded by NewsClassifier to TfidfVectorizer:
# sublinear term frequency, no IDF weighting, and max_df = .1.
tfidf_options = dict(max_df = .1, smooth_idf = False, use_idf = False, sublinear_tf = True)

classifier = NewsClassifier(smoothing = 1, **tfidf_options)
classifier.train(train_data, targets_train)

# Accuracy on the full test set.
test_accuracy = classifier.score(test_data, targets_test)
print(test_accuracy)

[[2.75331650e-05 3.07203371e-04 3.03517382e-06 ... 3.03517382e-06
  3.03517382e-06 3.03517382e-06]
 [4.82645635e-04 1.82063515e-04 2.55565246e-06 ... 2.55565246e-06
  2.55565246e-06 2.55565246e-06]
 [4.26147198e-04 9.07757830e-05 2.02241271e-06 ... 2.02241271e-06
  2.02241271e-06 2.02241271e-06]
 ...
 [8.90668984e-05 8.37701662e-04 9.58630002e-06 ... 2.35517625e-06
  2.35517625e-06 2.35517625e-06]
 [2.04739459e-04 3.43062018e-04 2.81882261e-06 ... 2.81882261e-06
  2.81882261e-06 2.81882261e-06]
 [6.43572696e-05 1.46951001e-04 4.19589796e-06 ... 4.19589796e-06
  4.19589796e-06 4.19589796e-06]]
0.8017790759426447


In [19]:
# Accuracy on the second half of the test set only; each sequence is cut at
# its own midpoint.
docs_mid = len(test_data) // 2
labels_mid = len(targets_test) // 2
classifier.score(test_data[docs_mid:], targets_test[labels_mid:])

0.8011152416356877