In [None]:
%pip install numpy scikit-learn



### Vectorización de texto y modelo de clasificación Naïve Bayes con el dataset 20 newsgroups

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics import f1_score

# 20newsgroups por ser un dataset clásico de NLP ya viene incluido y formateado
# en sklearn
from sklearn.datasets import fetch_20newsgroups
import numpy as np

## Carga de datos

In [None]:
# Load the dataset using its predefined train/test split, dropping headers,
# footers and quoted replies from every post.
stripped_parts = ('headers', 'footers', 'quotes')
newsgroups_train = fetch_20newsgroups(subset='train', remove=stripped_parts)
newsgroups_test = fetch_20newsgroups(subset='test', remove=stripped_parts)

## Vectorización

In [None]:
# Instantiate a TF-IDF vectorizer with its default settings.
# See the scikit-learn documentation for the available constructor parameters: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
tfidfvect = TfidfVectorizer()

In [None]:
# The `data` attribute gives access to the raw text of each document.
print(newsgroups_train.data[0])

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


In [None]:
# Using the usual sklearn interface we fit the vectorizer
# (learn the vocabulary and compute the IDF vector)
# and transform the data in a single call.
X_train = tfidfvect.fit_transform(newsgroups_train.data)
# `X_train` can be referred to as the document-term matrix.

In [None]:
# Count-based vectorizations are sparse, so sklearn conveniently returns the
# document vectors as a sparse matrix.
n_documents, vocabulary_size = X_train.shape
print(type(X_train))
print(f'shape: {X_train.shape}')
print(f'Cantidad de documentos: {n_documents}')
print(f'Tamaño del vocabulario (dimensionalidad de los vectores): {vocabulary_size}')

<class 'scipy.sparse._csr.csr_matrix'>
shape: (11314, 101631)
Cantidad de documentos: 11314
Tamaño del vocabulario (dimensionalidad de los vectores): 101631


In [None]:
# Once the vectorizer is fitted we can access attributes such as the learned
# vocabulary. It is a dictionary mapping terms to indices.
# The index is the term's position in the document vector.
tfidfvect.vocabulary_['car']

25775

In [None]:
# Inverse lookup table: column index -> term.
idx2word = {index: term for term, index in tfidfvect.vocabulary_.items()}

In [None]:
# `y_train` holds the targets, which are integer class ids.
y_train = newsgroups_train.target
# Display the first ten labels as the cell output.
y_train[:10]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

In [None]:
# There are 20 classes, one per newsgroup.
# The ids and names are read from the test split here.
print(f'clases {np.unique(newsgroups_test.target)}')
newsgroups_test.target_names

clases [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

## Similaridad de documentos

In [None]:
# Let's look at document similarity. Pick some training document by index
# and print its text.
idx = 4811
print(newsgroups_train.data[idx])

THE WHITE HOUSE

                  Office of the Press Secretary
                   (Pittsburgh, Pennslyvania)
______________________________________________________________
For Immediate Release                         April 17, 1993     

             
                  RADIO ADDRESS TO THE NATION 
                        BY THE PRESIDENT
             
                Pittsburgh International Airport
                    Pittsburgh, Pennsylvania
             
             
10:06 A.M. EDT
             
             
             THE PRESIDENT:  Good morning.  My voice is coming to
you this morning through the facilities of the oldest radio
station in America, KDKA in Pittsburgh.  I'm visiting the city to
meet personally with citizens here to discuss my plans for jobs,
health care and the economy.  But I wanted first to do my weekly
broadcast with the American people. 
             
             I'm told this station first broadcast in 1920 when
it reported that year's presidential elec

In [None]:
# Measure the cosine similarity between the chosen document and every
# training document; `[0]` takes the single result row.
cossim = cosine_similarity(X_train[idx], X_train)[0]

In [None]:
# We can look at the similarity values sorted from highest to lowest.
np.sort(cossim)[::-1]

array([1.        , 0.70930477, 0.67474953, ..., 0.        , 0.        ,
       0.        ])

In [None]:
# ...and the document indices they correspond to, in the same order.
np.argsort(cossim)[::-1]

array([4811, 6635, 4253, ..., 9019, 9016, 8748])

In [None]:
# The 5 most similar documents, excluding the query document itself.
# The query is removed by index rather than by dropping the first-ranked
# entry: with an exact duplicate (similarity 1.0) or an all-zero query vector
# (similarity 0 to every document, itself included) the query is not
# guaranteed to sort first.
ranking = np.argsort(cossim)[::-1]
mostsim = ranking[ranking != idx][:5]

In [None]:
# The original (query) document belongs to this class:
newsgroups_train.target_names[y_train[idx]]

'talk.politics.misc'

In [None]:
# ...and its 5 nearest neighbours belong to these classes:
for class_id in y_train[mostsim]:
    print(newsgroups_train.target_names[class_id])

talk.politics.misc
talk.politics.misc
talk.politics.misc
talk.politics.misc
talk.politics.misc


### Modelo de clasificación Naïve Bayes

In [None]:
# Instantiating and training a Naive Bayes classifier with sklearn is
# straightforward. This is the baseline: MultinomialNB with default settings,
# trained on the TF-IDF document-term matrix.
clf = MultinomialNB()
clf.fit(X_train, y_train)

In [None]:
# With the vectorizer already fitted on train, vectorize the texts of the
# test set (transform only, no re-fitting) and predict their classes.
X_test = tfidfvect.transform(newsgroups_test.data)
y_test = newsgroups_test.target
y_pred = clf.predict(X_test)

In [None]:
# The F1-score is a suitable metric for reporting classifier performance and
# is robust to class imbalance. 'macro' averaging is the mean of the per-class
# F1-scores. 'micro' averaging is equivalent to accuracy, which is not a good
# metric when datasets are imbalanced.
f1_score(y_test, y_pred, average='macro')

0.5854345727938506

### Desafío 1

## **1**.
Vectorizar documentos. Tomar 5 documentos al azar y medir similaridad con el resto de los documentos.
Estudiar los 5 documentos más similares de cada uno y analizar si tiene sentido
la similaridad según el contenido del texto y la etiqueta de clasificación.

In [None]:
import random

# Fixed seed so that Restart & Run All always samples the same five documents.
# A private Random instance is used so the global RNG state is left untouched.
# NOTE(review): the written analysis in this section came from an unseeded run
# (document indices 677, 1858, 7264, 7350, 566); re-check the commentary
# against the documents selected by this seed.
SAMPLE_SEED = 42
_sampler = random.Random(SAMPLE_SEED)
random_indices = _sampler.sample(range(len(newsgroups_train.data)), 5)

### 1er documento

In [None]:
# First sampled document: show its index, class and text.
idx = random_indices[0]
print(f"Document Index: {idx}")
print(f"Class: {newsgroups_train.target_names[y_train[idx]]}")
print(newsgroups_train.data[idx])

# Cosine similarity between this document and every training document.
cossim = cosine_similarity(X_train[idx], X_train)[0]

# Rank by decreasing similarity and drop the query by index, not by position:
# with an exact duplicate or an all-zero query vector the query itself is not
# guaranteed to be ranked first.
ranking = np.argsort(cossim)[::-1]
mostsim = ranking[ranking != idx][:5]

print("5 Most Similar Documents:")
for i in mostsim:
    print(f"Document Index {i}: Class - {newsgroups_train.target_names[y_train[i]]}")

Document Index: 677
Class: comp.windows.x
I'm trying to set up an IPX for another group. I copied all the
X stuff that I compiled on my 4/280 (which runs SunOS 4.1.1) using
gcc 2.1, and most things run just fine. however, I did find a
couple of bugs, and when I try to recompile those clients on the IPX
(which runs 4.1.3), I get

ld: Undefined symbol
   _XShapeQueryExtension
   _XShapeCombineMask

I know that I can include libXext and get rid of those messages,
but I can't figure out why I get them on the IPX and not on the
4/280. any ideas?

5 Most Similar Documents:
Document Index 7972: Class - comp.windows.x
Document Index 6117: Class - comp.windows.x
Document Index 7496: Class - comp.windows.x
Document Index 7967: Class - comp.windows.x
Document Index 9623: Class - talk.politics.mideast


El documento describe un problema al recompilar clientes de X en una estación IPX (símbolos indefinidos al enlazar). Su clase es comp.windows.x, lo cual tiene sentido. Los primeros 4 documentos más similares pertenecen a la misma clase, pero el 5to pertenece a talk.politics.mideast.

### 2do documento

In [None]:
# Second sampled document: show its index, class and text.
idx = random_indices[1]
print(f"Document Index: {idx}")
print(f"Class: {newsgroups_train.target_names[y_train[idx]]}")
print(newsgroups_train.data[idx])

# Cosine similarity between this document and every training document.
cossim = cosine_similarity(X_train[idx], X_train)[0]

# Rank by decreasing similarity and drop the query by index, not by position:
# with an exact duplicate or an all-zero query vector the query itself is not
# guaranteed to be ranked first.
ranking = np.argsort(cossim)[::-1]
mostsim = ranking[ranking != idx][:5]

print("5 Most Similar Documents:")
for i in mostsim:
    print(f"Document Index {i}: Class - {newsgroups_train.target_names[y_train[i]]}")

Document Index: 1858
Class: sci.crypt

I have a question that is a slight variation on the previously mentioned
examples that perhaps people could give me some pointers on (it has been a
couple of years since my Con Law class in college so I hope I am not
missing something obvious here...)

Basic Scenario:

	I set up a bbs that uses public-key encryption and encryption of
	files on disk.  The general setup is designed so that when users 
	connect they send a private key encrypted using the system public
	key and the user's public-private keypair is used to wrap the
	one-time session keys used for encrypting the files on disk.  The
	result of this is that even if I reveal the system private key it
	is impossible for anyone to gain access to the files stored on the
	machine.  What is possible is for someone to use the revealed
	system private key to entice users into revealing thier personal
	private keys during the authentication sequence.

Questions:

	Does the fact that the system pri

El documento habla sobre criptografía, está clasificado como sci.crypt y los 5 documentos más similares también.

### 3er documento

In [None]:
# Third sampled document: show its index, class and text.
idx = random_indices[2]
print(f"Document Index: {idx}")
print(f"Class: {newsgroups_train.target_names[y_train[idx]]}")
print(newsgroups_train.data[idx])

# Cosine similarity between this document and every training document.
cossim = cosine_similarity(X_train[idx], X_train)[0]

# Rank by decreasing similarity and drop the query by index, not by position:
# with an exact duplicate or an all-zero query vector the query itself is not
# guaranteed to be ranked first.
ranking = np.argsort(cossim)[::-1]
mostsim = ranking[ranking != idx][:5]

print("5 Most Similar Documents:")
for i in mostsim:
    print(f"Document Index {i}: Class - {newsgroups_train.target_names[y_train[i]]}")

(1, 101631)
Document Index: 7264
Class: comp.os.ms-windows.misc
I am using WFW 2.0c with a Canon BJ10e. The printer driver is that 
which comes with Windows 3.1. Unfortatunately, I am having a problem with 
printing page numbers on the bottom of the page. I can print page number 
on the top of the page, but not on the bottom. Has anybody had a similar 
problem and/or does anybody have a solution for such a problem.

Thanks
pwoodcoc@business.uwo.ca
5 Most Similar Documents:
Document Index 10607: Class - comp.sys.mac.hardware
Document Index 9443: Class - sci.med
Document Index 5090: Class - comp.os.ms-windows.misc
Document Index 348: Class - comp.graphics
Document Index 6057: Class - comp.graphics


El documento habla sobre problemas a la hora de imprimir números de página en el pie de página. La clasificación del mismo es comp.os.ms-windows.misc, la cual tiene sentido. 4 de los documentos más similares pertenecen a clases de hardware, gráficos o sistema operativo Windows. Sin embargo, el 2do documento más similar pertenece a la categoría sci.med (ciencia/medicina), lo cual parece errado.

Observamos el contenido del documento con índice 9443 y efectivamente habla sobre la ubicación de un elemento dentro de una página. Comparte vocabulario con el documento de consulta ("page", "bottom"), lo que explica la alta similaridad coseno a pesar de pertenecer a otra clase.

In [None]:
# Inspect the sci.med document that appeared among the nearest neighbours of
# the third sampled document. The index is hard-coded from that run's output.
print(newsgroups_train.data[9443])


It's on page 315, about 2 1/2 inches up from the bottom and an inch in
from the right.

At least we know what some people *haven't* read and remembered.



### 4to documento

In [None]:
# Fourth sampled document: show its index, class and text.
idx = random_indices[3]
print(f"Document Index: {idx}")
print(f"Class: {newsgroups_train.target_names[y_train[idx]]}")
print(newsgroups_train.data[idx])

# Cosine similarity between this document and every training document.
cossim = cosine_similarity(X_train[idx], X_train)[0]

# Rank by decreasing similarity and drop the query by index, not by position:
# with an exact duplicate or an all-zero query vector the query itself is not
# guaranteed to be ranked first.
ranking = np.argsort(cossim)[::-1]
mostsim = ranking[ranking != idx][:5]

print("5 Most Similar Documents:")
for i in mostsim:
    print(f"Document Index {i}: Class - {newsgroups_train.target_names[y_train[i]]}")

Document Index: 7350
Class: talk.politics.mideast
Now we have strong evidence of where the CPR really stands.
Unbelievable and disgusting.  It only proves that we must
never forget...


!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


Not so unconventional.  Eugenic solutions to the Jewish Problem
have been suggested by Northern Europeans in the past.

  Eugenics: a science that deals with the improvement (as by
  control of human mating) of hereditory qualities of race
  or breed.  -- Webster's Ninth Collegiate Dictionary.


This is nothing more than Feisal Husseini's statement that the
Zionist entity must be disolved by forcing it to "engage" the
surrounding "normal" Arab society.

"a strong mixed stock", "integration of Israeli society into
the Middle East in a graceful manner," these are the phrases
of Nazi racial engineering pure and simple.  As if Israeli
society has no right to exist per se!


"the continued existance of a specific Jewish People overrides
any o

El documento claramente pertenece a la clase talk.politics.mideast y los 5 documentos más similares también.

### 5to documento

In [None]:
# Fifth sampled document: show its index, class and text.
idx = random_indices[4]
print(f"Document Index: {idx}")
print(f"Class: {newsgroups_train.target_names[y_train[idx]]}")
print(newsgroups_train.data[idx])

# Cosine similarity between this document and every training document.
cossim = cosine_similarity(X_train[idx], X_train)[0]

# Rank by decreasing similarity and drop the query by index, not by position:
# with an exact duplicate or an all-zero query vector the query itself is not
# guaranteed to be ranked first.
ranking = np.argsort(cossim)[::-1]
mostsim = ranking[ranking != idx][:5]

print("5 Most Similar Documents:")
for i in mostsim:
    print(f"Document Index {i}: Class - {newsgroups_train.target_names[y_train[i]]}")

Document Index: 566
Class: soc.religion.christian

Yes, it's important to realize that all actions have consequences,
and that "rules" were made for our own good.  But to suggest that a
*disease* is a *punishment* for certain types of sin I think is 
taking things much too far.  If we got some kind of mouth disease
for lying, would any of us have mouths left?  What if we developed
blindness every time we lusted after someone or something?  I dare
say all of us would be walking into walls.

Yes, sin can have terrible consequences, but we need to be *real*
careful when saying that the consequences are a *punishment* for 
sin.  The Jews of Jesus's time believed that all sickness was the
result of a sin.  Then Jesus healed a blind man and said that man was
blind to show the glory of God, not because of sin.  If AIDS, or any
other STD is a *punishment" for sexual sin, what do we do with 
diseases like cancer, or multiple sclerosis, which are just as
debilitating and terrible as AIDS, yet ar

El documento claramente pertenece a la clase soc.religion.christian y los 5 documentos más similares también.

## **2**.
Entrenar modelos de clasificación Naïve Bayes para maximizar el desempeño de clasificación
(f1-score macro) en el conjunto de datos de test. Considerar cambiar parámetros
de instanciación del vectorizador y los modelos y probar modelos de Naïve Bayes Multinomial
y ComplementNB.

Usamos como baseline el modelo dado en la implementación inicial con un F1-Score de 0.5854345727938506.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer settings used for the tuned models: unigrams up to trigrams,
# document-frequency cut-offs (min_df / max_df) and English stop-word removal.
vectorizer_options = {
    'ngram_range': (1, 3),
    'min_df': 3,
    'max_df': 0.85,
    'stop_words': 'english',
}
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)

# Fit on the training texts only, then reuse the fitted vectorizer to encode
# the test texts. Note that this rebinds X_train / X_test from earlier cells.
X_train = tfidf_vectorizer.fit_transform(newsgroups_train.data)
X_test = tfidf_vectorizer.transform(newsgroups_test.data)
y_train, y_test = newsgroups_train.target, newsgroups_test.target

Se realiza una búsqueda de hiperparámetros para el modelo Multinomial mediante GridSearch. Los parámetros a probar son alpha y fit_prior con valores variados.

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

clf = MultinomialNB()

# Candidate smoothing strengths and whether to learn class priors from data.
param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0],
    'fit_prior': [True, False]
}

# Exhaustive search with 10-fold cross-validation on the training split,
# selecting the configuration with the best macro-averaged F1.
# NOTE(review): the vectorizer was fitted on the full training set before this
# search, so every validation fold already influenced the vocabulary and IDF
# weights; wrapping vectorizer + classifier in a Pipeline avoids that.
grid_search = GridSearchCV(clf, param_grid, cv=10, scoring='f1_macro')
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation F1 macro score:", grid_search.best_score_)

# The target metric (and the baseline figure) is macro F1 on the held-out test
# split, so report that as well. With the default refit=True the search object
# exposes the best configuration retrained on the whole training set.
y_pred_mnb = grid_search.best_estimator_.predict(X_test)
print("Test F1 macro score:", f1_score(y_test, y_pred_mnb, average='macro'))

Best parameters: {'alpha': 0.01, 'fit_prior': False}
Best cross-validation F1 macro score: 0.7653834304154792


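Se obtiene un F1 macro de 0.765 en validación cruzada (10 folds sobre el conjunto de train). Es un resultado significativamente mayor que el baseline (0.585), aunque no son directamente comparables: el baseline fue medido sobre el conjunto de test.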

Se realiza una búsqueda de hiperparámetros para el modelo ComplementNB mediante GridSearch. Los parámetros a probar son alpha, fit_prior y norm con valores variados.

In [None]:
from sklearn.naive_bayes import ComplementNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

clf = ComplementNB()

# Candidate smoothing strengths, prior handling, and whether to apply the
# additional weight normalization offered by ComplementNB.
param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0],
    'fit_prior': [True, False],
    'norm': [True, False],
}

# Exhaustive search with 10-fold cross-validation on the training split,
# selecting the configuration with the best macro-averaged F1.
# NOTE(review): the vectorizer was fitted on the full training set before this
# search, so every validation fold already influenced the vocabulary and IDF
# weights; wrapping vectorizer + classifier in a Pipeline avoids that.
grid_search = GridSearchCV(clf, param_grid, cv=10, scoring='f1_macro')
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation F1 macro score:", grid_search.best_score_)

# The target metric (and the baseline figure) is macro F1 on the held-out test
# split, so report that as well. With the default refit=True the search object
# exposes the best configuration retrained on the whole training set.
y_pred_cnb = grid_search.best_estimator_.predict(X_test)
print("Test F1 macro score:", f1_score(y_test, y_pred_cnb, average='macro'))

Best parameters: {'alpha': 0.5, 'fit_prior': True, 'norm': False}
Best cross-validation F1 macro score: 0.7700936364259501


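Se obtiene un F1 macro de 0.770 en validación cruzada. Es un resultado levemente mejor al del modelo Multinomial con hiperparámetros tuneados (0.765) y significativamente mayor que el baseline, aunque este último fue medido sobre el conjunto de test y no en validación cruzada.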

## 3.
Transponer la matriz documento-término. De esa manera se obtiene una matriz
término-documento que puede ser interpretada como una colección de vectorización de palabras.
Estudiar ahora similaridad entre palabras tomando 5 palabras y estudiando sus 5 más similares. **La elección de palabras no debe ser al azar para evitar la aparición de términos poco interpretables, elegirlas "manualmente"**.

Se utiliza el vectorizer con los mismos parámetros definidos en el punto anterior.

In [None]:
# Rebuild the vectorizer with the same settings used for the tuned classifiers
# and re-encode both splits.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=3, max_df=0.85, stop_words='english')

train_texts, test_texts = newsgroups_train.data, newsgroups_test.data
X_train = tfidf_vectorizer.fit_transform(train_texts)
X_test = tfidf_vectorizer.transform(test_texts)

y_train, y_test = newsgroups_train.target, newsgroups_test.target

Se toman 5 palabras y los términos incluidos en el vectorizer del punto 2. (Este no incluye todos los términos del dataset, ya que los parámetros min_df y max_df descartan los términos que aparecen en muy pocos o en demasiados documentos, y además se eliminan las stop words del inglés.)

In [None]:
# Term-document matrix: one row per vocabulary term, one column per document.
# Transposing the CSR document-term matrix yields CSC, where selecting a row
# is inefficient; convert back to CSR once because the term rows are sliced
# repeatedly when computing similarities.
X_train_transposed = X_train.T.tocsr()

# Hand-picked (not random) query words, plus the vocabulary as a list whose
# position i is the term stored in row i of the transposed matrix.
words = ['god', 'gun', 'moon', 'car', 'baseball']
terms = list(tfidf_vectorizer.get_feature_names_out())

In [None]:
# For each query word, print the 5 terms whose document-occurrence vectors are
# closest to it under cosine similarity.
for word in words:
    # Dictionary lookup in the fitted vocabulary instead of a linear scan of
    # `terms`; a word filtered out by the vectorizer is reported and skipped
    # rather than aborting the loop with a ValueError.
    word_index = tfidf_vectorizer.vocabulary_.get(word)
    if word_index is None:
        print(f"\n'{word}' is not in the vectorizer vocabulary; skipping.")
        continue

    cossim = cosine_similarity(X_train_transposed[word_index], X_train_transposed)[0]

    # Drop the query term by index rather than assuming it is ranked first:
    # a term with an identical occurrence pattern also scores 1.0.
    ranking = np.argsort(cossim)[::-1]
    most_similar = ranking[ranking != word_index][:5]

    print(f"\nMost similar words to '{word}':")
    for i in most_similar:
        print(terms[i])



Most similar words to 'god':
god god
jesus
god does
christ
bible

Most similar words to 'gun':
gun control
guns
gun owners
pro gun
using gun

Most similar words to 'moon':
lunar
phases
moon landing
orbit moon
moon mars

Most similar words to 'car':
car car
new car
bought car
car like
car accident

Most similar words to 'baseball':
baseball games
baseball game
league baseball
tommorrow
earl weaver




*   **God**: Todas las palabras son relacionadas a la religión
*   **Gun**: Todas las palabras son relacionadas a las armas
*   **Moon**: Parecieran ser 5 términos relacionados a la Luna y la exploración espacial. "Phases" podría referirse a las fases de la Luna o a las fases de un cohete espacial.
*   **Car**: Todas las palabras son relacionadas a los autos.
*   **Baseball**: 4 de las 5 palabras parecen acertadas. "Tommorrow" ("tomorrow" mal escrito en el corpus) podría tratarse de una palabra utilizada en anuncios de partidos de baseball o deportes en general. "Earl Weaver" era una figura reconocida en el mundo del baseball.

