# Clasificación de 20newsgroups

In [1]:
import os, re, csv, math, codecs, logging
from collections import Counter
from pathlib import Path
from io import StringIO
import pickle
import gdown

import numpy as np
from matplotlib import pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# from keras.metrics import F1Score

In [2]:
# cargamos los datos (ya separados de forma predeterminada en train y test)
newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True, remove = ['headers', 'footers', 'quotes'])
newsgroups_test = fetch_20newsgroups(subset='test', shuffle=True, remove = ['headers', 'footers', 'quotes'])
class_num = 20

In [3]:
# descargamos los embeddings de palabras de Fasttext para inglés y descomprimimos el archivo.
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
!unzip wiki-news-300d-1M.vec.zip

--2024-06-26 18:14:26--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 108.157.254.121, 108.157.254.102, 108.157.254.124, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|108.157.254.121|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 681808098 (650M) [application/zip]
Saving to: ‘wiki-news-300d-1M.vec.zip’


2024-06-26 18:14:29 (267 MB/s) - ‘wiki-news-300d-1M.vec.zip’ saved [681808098/681808098]

Archive:  wiki-news-300d-1M.vec.zip
  inflating: wiki-news-300d-1M.vec   


In [4]:
# cargamos los embeddings de palabras
print('loading word embeddings...')
embeddings_index = {}
f = codecs.open('wiki-news-300d-1M.vec', encoding='utf-8')

for line in f:
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print(f'found {len(embeddings_index)} word vectors')

loading word embeddings...
found 999995 word vectors


In [5]:
# instanciamos el tokenizador
token = Tokenizer(num_words=30000,
                filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                lower=True,
                split=' ',
                char_level=False,
                oov_token="UNK",
                document_count=0)

In [6]:
# fiteamos el tokenizador

token.fit_on_texts(newsgroups_train.data)

In [7]:
# obtenemos los diccionarios idx2word y word2idx
reverse_dictionary = token.index_word
dictionary = dict([(value, key) for (key, value) in reverse_dictionary.items()])
# CHECK QUE EMPIEZA POR 0

In [8]:
# cargamos en una matriz los embeddings de las palabras
# presentes en el vocabulario
embed_dim=300
num_words=len(dictionary)+1
embedding_matrix=np.zeros([num_words,embed_dim])
for word, idx in dictionary.items():
  if idx <= num_words and word in embeddings_index:
    embedding_matrix[idx,:]=embeddings_index[word]

In [9]:
embedding_matrix.shape

(105374, 300)

In [10]:
# se tokenizan los textos
train_sequences=token.texts_to_sequences(newsgroups_train.data)
test_sequences=token.texts_to_sequences(newsgroups_test.data)

In [32]:
# En este punto seleccionamos el tamaño de contexto a procesar en la variable `max_len`
max_len=2000
train_sequences=pad_sequences(train_sequences,maxlen=max_len)
test_sequences=pad_sequences(test_sequences,maxlen=max_len)

In [33]:
from keras.layers import Bidirectional, LSTM, Dense, Embedding, Dropout, GRU
from keras.models import Sequential
from keras.losses import SparseCategoricalCrossentropy
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.optimizers import SGD, Adam, RMSprop

## Modelos probados
* Dos capas de LSTM con 100 y cuatro capas densas (128,64,32,20), con trainable = False y max_len = 500. Eso da un procentaje de 61.01% en val_accuracy. 
* Tres capas de LSTM con 100 y cuatro capas densas (128,64,32,20), con trainable = True y max_len = 500. Eso da un procentaje de 57.4% en val_accuracy. 
* Tres capas de LSTM con 100 con Bidirectional y tres capas densas (64,32,20), con trainable = False y max_len = 1200. Eso da un procentaje de 65.8% en val_accuracy. 
* Tres capas de GRU con 100 con Bidirectional y tres capas densas (64,32,20), con trainable = False y max_len = 1200. Eso da un procentaje de 64.7% en val_accuracy.
* Tres capas de GRU con 120 con Bidirectional y tres capas densas (64,32,20), con trainable = False y max_len = 2000. Eso da un procentaje de 67.21% en val_accuracy.

In [49]:

model = Sequential()

# la primera capa es de embedding entrenable. Recordar que se puede variar el tamaño
# del embedding a entrenar
model.add(Embedding(input_dim=num_words, output_dim=embed_dim, weights=[embedding_matrix], input_shape=(None,), trainable = False))

model.add(((Bidirectional(GRU(120, return_sequences=True)))))
model.add(Dropout(0.2))

model.add((((Bidirectional(GRU(120, return_sequences=True))))))
model.add(Dropout(0.2))

model.add((((Bidirectional(GRU(120))))))
model.add(Dropout(0.2))

model.add(Dense(64, activation='swish'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='swish'))

# Predicción de clasificación con softmax
# La salida es del tamaño del vocabulario
model.add(Dense(class_num, activation='softmax'))


# Clasificación multiple categórica --> loss = categorical_crossentropy
# notar que usamos la versión Sparse para utilizar sólo índices en lugar de OHE
model.compile(loss=SparseCategoricalCrossentropy(), optimizer = Adam(learning_rate=0.001), metrics=['accuracy'])


model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, None, 300)         31612200  
                                                                 
 bidirectional_12 (Bidirect  (None, None, 240)         303840    
 ional)                                                          
                                                                 
 dropout_20 (Dropout)        (None, None, 240)         0         
                                                                 
 bidirectional_13 (Bidirect  (None, None, 240)         260640    
 ional)                                                          
                                                                 
 dropout_21 (Dropout)        (None, None, 240)         0         
                                                                 
 bidirectional_14 (Bidirect  (None, 240)              

In [50]:
early_stopping = EarlyStopping(monitor="val_accuracy",
    min_delta=0,
    patience=5,
    verbose=0,
    mode="max",
    baseline=None,
    restore_best_weights=True,
    start_from_epoch=0,
)

rlrop = ReduceLROnPlateau(
    monitor = "val_accuracy",
    factor = 0.25,
    patience = 3,
    verbose = 1,
    min_lr = 0.5e-5
)
history = model.fit(train_sequences, newsgroups_train.target,
                    batch_size=128,
                    epochs=100,
                    validation_split=0.2,
                    callbacks=[early_stopping, rlrop]
                    )


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 24: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 28: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 29/100
Epoch 30/100


In [51]:
import numpy as np

predictions = model.predict(test_sequences)
predictions = predictions.argmax(axis=1)
test_accuracy = np.sum(predictions == newsgroups_test.target) / len(newsgroups_test.target)




In [52]:
print(test_accuracy)

0.6123207647371216


In [53]:
from sklearn.metrics import f1_score
#F1
predict_f1 = f1_score(newsgroups_test.target, predictions, average= "macro")
print(predict_f1)

0.6016668593920433


No llegó a mejor que el primer TP, pero es el mejor porcentaje que llegué. En el primer TP, se logró aproximadamente un 67% en test. En este TP, se logró aproximadamente un 60% en test. 