# Carlos Méndez

### Vectorización de texto y modelo de clasificación Naïve Bayes con el dataset 20 newsgroups

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics import f1_score
import random

# 20newsgroups por ser un dataset clásico de NLP ya viene incluido y formateado
# en sklearn
from sklearn.datasets import fetch_20newsgroups
import numpy as np

## Carga de datos

In [3]:
# cargamos los datos (ya separados de forma predeterminada en train y test)
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

## Vectorización

In [4]:
# instanciamos un vectorizador
# ver diferentes parámetros de instanciación en la documentación de sklearn
tfidfvect = TfidfVectorizer()

In [5]:
# en el atributo `data` accedemos al texto
newsgroups_train.data[0]

'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.'

In [6]:
print(len(newsgroups_train.data))
print(len(newsgroups_test.data))

11314
7532


In [7]:
# con la interfaz habitual de sklearn podemos fitear el vectorizador
# (obtener el vocabulario y calcular el vector IDF)
# y transformar directamente los datos
X_train = tfidfvect.fit_transform(newsgroups_train.data)
# `X_train` la podemos denominar como la matriz documento-término

In [8]:
# recordar que las vectorizaciones por conteos son esparsas
# por ello sklearn convenientemente devuelve los vectores de documentos
# como matrices esparsas
print(type(X_train))
print(f'shape: {X_train.shape}')
print(f'cantidad de documentos: {X_train.shape[0]}')
print(f'tamaño del vocabulario (dimensionalidad de los vectores): {X_train.shape[1]}')

<class 'scipy.sparse._csr.csr_matrix'>
shape: (11314, 101631)
cantidad de documentos: 11314
tamaño del vocabulario (dimensionalidad de los vectores): 101631


In [9]:
# una vez fiteado el vectorizador, podemos acceder a atributos como el vocabulario
# aprendido. Es un diccionario que va de términos a índices.
# El índice es la posición en el vector de documento.
tfidfvect.vocabulary_['car']

25775

In [10]:
# es muy útil tener el diccionario opuesto que va de índices a términos
idx2word = {v: k for k,v in tfidfvect.vocabulary_.items()}

In [11]:
# en `y_train` guardamos los targets que son enteros
y_train = newsgroups_train.target
y_train[:10]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

In [12]:
# hay 20 clases correspondientes a los 20 grupos de noticias
print(f'clases {np.unique(newsgroups_test.target)}')
newsgroups_test.target_names

clases [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

## Similaridad de documentos

In [13]:
# Veamos similaridad de documentos. Tomemos algún documento
idx = 4811
print(newsgroups_train.data[idx])

THE WHITE HOUSE

                  Office of the Press Secretary
                   (Pittsburgh, Pennslyvania)
______________________________________________________________
For Immediate Release                         April 17, 1993     

             
                  RADIO ADDRESS TO THE NATION 
                        BY THE PRESIDENT
             
                Pittsburgh International Airport
                    Pittsburgh, Pennsylvania
             
             
10:06 A.M. EDT
             
             
             THE PRESIDENT:  Good morning.  My voice is coming to
you this morning through the facilities of the oldest radio
station in America, KDKA in Pittsburgh.  I'm visiting the city to
meet personally with citizens here to discuss my plans for jobs,
health care and the economy.  But I wanted first to do my weekly
broadcast with the American people. 
             
             I'm told this station first broadcast in 1920 when
it reported that year's presidential elec

In [14]:
# midamos la similaridad coseno con todos los documentos de train
cossim = cosine_similarity(X_train[idx], X_train)[0]

In [15]:
# podemos ver los valores de similaridad ordenados de mayor a menos
np.sort(cossim)[::-1]

array([1.        , 0.70930477, 0.67474953, ..., 0.        , 0.        ,
       0.        ])

In [16]:
# y a qué documentos corresponden
np.argsort(cossim)[::-1]

array([ 4811,  6635,  4253, ...,  1534, 10055,  4750])

In [17]:
# los 5 documentos más similares:
mostsim = np.argsort(cossim)[::-1][1:6]

In [18]:
# el documento original pertenece a la clase:
newsgroups_train.target_names[y_train[idx]]

'talk.politics.misc'

In [19]:
# y los 5 más similares son de las clases:
for i in mostsim:
  print(newsgroups_train.target_names[y_train[i]])

talk.politics.misc
talk.politics.misc
talk.politics.misc
talk.politics.misc
talk.politics.misc


### Modelo de clasificación Naïve Bayes

In [20]:
# es muy fácil instanciar un modelo de clasificación Naïve Bayes y entrenarlo con sklearn
clf = MultinomialNB()
clf.fit(X_train, y_train)

In [21]:
# con nuestro vectorizador ya fiteado en train, vectorizamos los textos
# del conjunto de test
X_test = tfidfvect.transform(newsgroups_test.data)
y_test = newsgroups_test.target
y_pred =  clf.predict(X_test)

In [22]:
# el F1-score es una metrica adecuada para reportar desempeño de modelos de claificación
# es robusta al desbalance de clases. El promediado 'macro' es el promedio de los
# F1-score de cada clase. El promedio 'micro' es equivalente a la accuracy que no
# es una buena métrica cuando los datasets son desbalanceados
f1_score(y_test, y_pred, average='macro')

0.5854345727938506

# Consigna del desafío 1

**1**. Vectorizar documentos. Tomar 5 documentos al azar y medir similaridad con el resto de los documentos.
Estudiar los 5 documentos más similares de cada uno analizar si tiene sentido
la similaridad según el contenido del texto y la etiqueta de clasificación.

**2**. Entrenar modelos de clasificación Naïve Bayes para maximizar el desempeño de clasificación
(f1-score macro) en el conjunto de datos de test. Considerar cambiar parámteros
de instanciación del vectorizador y los modelos y probar modelos de Naïve Bayes Multinomial
y ComplementNB.

**3**. Transponer la matriz documento-término. De esa manera se obtiene una matriz
término-documento que puede ser interpretada como una colección de vectorización de palabras.
Estudiar ahora similaridad entre palabras tomando 5 palabras y estudiando sus 5 más similares. **La elección de palabras no debe ser al azar para evitar la aparición de términos poco interpretables, elegirlas "manualmente"**.


# Función Auxiliar

In [23]:
def mostrar_similaridad(most_similar_indices):

  """Muestra los documentos similares junto con su categoría y texto."""
  for sim_index in most_similar_indices:
      doc_text = newsgroups_train.data[sim_index]
      category = newsgroups_train.target_names[y_train[sim_index]]

      print(f'\nDocumento seleccionado: {sim_index}')
      print(f'Categoría: {category}')
      print(f'Texto: {doc_text[:500]}...')  # Muestra solo los primeros 500 caracteres
      print("="*100)


def general_info(indice, most_similar_indices):

  print(f'La Categoría asociada al documento original es: {newsgroups_train.target_names[y_train[indice]]}')
  print(f'Los 5 documentos con mayor similitud al documento {indice} son: {most_similar_indices}')

  print("="*45, "Doc",indice,"="*45)
  print(newsgroups_train.data[indice])
  print("="*100)


# Selección de los 5 documentos al azar

In [24]:
random.seed(49)
random_indices = random.sample(range(len(newsgroups_train.data)), 5)
selected_docs = [newsgroups_train.data[i] for i in random_indices]
selected_targets = [newsgroups_train.target[i] for i in random_indices]

In [25]:
print("Los documentos seleccionados fueron:",random_indices)

Los documentos seleccionados fueron: [1095, 5641, 6770, 1810, 5300]


# Análisis para el documento 1095

In [26]:
indice_1 = random_indices[0]
# Calcula la similaridad coseno entre el indice_1 y todo el resto de los documentos
cos_sim = cosine_similarity(X_train[indice_1], X_train)[0]

# Devuelve las mejores 5 similaridades
most_similar_indices = np.argsort(cos_sim)[::-1][1:6]



In [27]:
general_info(indice_1, most_similar_indices)

La Categoría asociada al documento original es: talk.politics.guns
Los 5 documentos con mayor similitud al documento 1095 son: [1292 5229 6643 6894 6437]


       As perhaps some insight into how this sort of thing works, the
local college newspaper had a big crusade to have the U.T. police
release crime stats.  (The school claimed that to do so would violate
federal education records privacy laws).  They swore up and down they
weren't interested in student discipline records, only for stats so people
could make an evaluation of how safe the campus was.

       It was barely a week after crime stats were released before the
Daily Beacon had an editorial calling for student disciplinary stats
to be released, because they complained certain segments of the campus
population were treated administratively rather than turned over to the
police and therefore the criminal states weren't accurate.

       What people say they want public today may not be what they
say tomorrow.



In [28]:
mostrar_similaridad(most_similar_indices)


Documento seleccionado: 1292
Categoría: talk.politics.mideast
Texto: Accounts of Anti-Armenian Human Right Violations in Azerbaijan #008 Part B
                 Prelude to Current Events in Nagorno-Karabakh

				(Part B of #008)

      +------------------------------------------------------------------+
      |                                                                  |
      | "Oh, yes, I just remembered. While they were raping me they      |
      |  repeated quite frequently, "Let the Armenian women have babies  |
      |  for us, Muslim babies, let the...

Documento seleccionado: 5229
Categoría: rec.sport.baseball
Texto: Hello, my friends and I are running the Homewood Fantasy Baseball
League (pure fantasy baseball teams). Unfortunely, we are running the league
using Earl Weaver Baseball II with the Comm. Disk II and we need the stats
for the 1992 season. (Preferably the 1992 Major League Stat Disk) We have
the '92 total stats but EWB2 needs the split stats otherwise we hav

Se puede evidenciar que para el documento analizado (1095) los documentos más similares tratan principalmente de temas de política, armas y violaciones de derechos. Sin embargo sorprendentemente aparece unos relacionados a deportes, esto debe ser consecuncia de que en estos textos se mencionan recurrentemente la palabra **"stats"** (**estadísticas**).

# Análisis para el documento 5641

In [29]:
indice_2 = random_indices[1]
# Calcula la similaridad coseno entre el indice_1 y todo el resto de los documentos
cos_sim = cosine_similarity(X_train[indice_2], X_train)[0]

# Devuelve las mejores 5 similaridades
most_similar_indices = np.argsort(cos_sim)[::-1][1:6]

In [30]:
general_info(indice_2, most_similar_indices)

La Categoría asociada al documento original es: alt.atheism
Los 5 documentos con mayor similitud al documento 5641 son: [10719  1308  3438  4339  3289]

Ahhh go back to alt.autotheism where you belong!


In [31]:
mostrar_similaridad(most_similar_indices)


Documento seleccionado: 10719
Categoría: sci.electronics
Texto: ...

Another April 1 posting.  Ahhh....

Documento seleccionado: 1308
Categoría: comp.os.ms-windows.misc
Texto: 
Do you mean the icons _of_ the program groups, or the icons of the
individual programs _in_ the program groups?  I assume you mean the
latter, and the answer is: sure you can.  Just click once (not double)
on the application icon, then Alt-F P (File | Properties).  Click on the
Change Icon box and tell it the icon filename.  Or use the Browse
sub-selection.


I use Alt-Tab.  Hold the Alt key and repeatedly press Tab until you see
Program Mangler up.  Then release the Alt key.

...

Documento seleccionado: 3438
Categoría: alt.atheism
Texto: Archive-name: atheism/overview
Alt-atheism-archive-name: overview
Last-modified: 5 April 1993
Version: 1.2

                                   Overview

Welcome to alt.atheism and alt.atheism.moderated.

This is the first in a series of regular postings aimed at new readers o

El texto seleccionado es bastante corto, por lo que la similaridad parece recoger algunas distorsiones, el documento trata de ateismo que es una expresión de no creencia en Dios; por lo que es natural que muestre alguna simitud con temas religiosos (lo cual ocurrió). Sin embargo tambien trajo similitud con la categoría "windows.misc" dado que el tema original usa la expresión "alt" que es una expresión frecuentemente utilizada cuando se habla de temas relacionados al software de Microsoft.

# Análisis del documento **6770**

In [32]:
indice_3 = random_indices[2]
# Calcula la similaridad coseno entre el indice_1 y todo el resto de los documentos
cos_sim = cosine_similarity(X_train[indice_3], X_train)[0]

# Devuelve las mejores 5 similaridades
most_similar_indices = np.argsort(cos_sim)[::-1][1:6]

In [33]:
general_info(indice_3, most_similar_indices)

La Categoría asociada al documento original es: sci.crypt
Los 5 documentos con mayor similitud al documento 6770 son: [7659 2314 6894 5856  913]









  A remark I heard the other day is beginning to take on increasingly
frightening significance.  The comment was made that "In other parts
of the world the Democrats [note the big "D"] would be known as
Socialists" 

  A [note the small "d"] democrat who wonders what Thomas Jefferson, on
this the 250th anniversary of his birth, would have thought of the state
of affairs between the government and the governed.


------- Any views expressed are those of myself and not my employer. --------
Steven C. Johnson, WB3IRU / VK2GDS      |
TRW                                     | johnson@trwacs.fp.trw.com
FP1 / 3133                              |         [129.193.172.90]
1 Federal Systems Park Drive            | Phone:        +1 (703) 968.1000
Fairfax, Virginia  22033-4412  U.S.A.   | Fax:          +1 (703) 803.5189


In [34]:
mostrar_similaridad(most_similar_indices)


Documento seleccionado: 7659
Categoría: sci.crypt
Texto: 
[...]

  Who makes them forget and destroy all copies of the key once they've
decided you're not a criminal today?  Just curious.
------- Any views expressed are those of myself and not my employer. --------
Steven C. Johnson, WB3IRU / VK2GDS      |
TRW                                     | johnson@trwacs.fp.trw.com
FP1 / 3133                              |         [129.193.172.90]
1 Federal Systems Park Drive            | Phone:        +1 (703) 968.1000
Fairfax, Virginia  22033-4412  U.S.A.   ...

Documento seleccionado: 2314
Categoría: comp.windows.x
Texto: A few days ago there was a posting in this group by Andrea Winkler
titled "X and Security / X Technical Conference".  I was one of the
instructors of that tutorial.  Unfortunately, my system purged
the message before I had a chance to see it, and I don't have
Andrea's email address.  If someone has Andrea's address and/or
the posting, I would really appreciate it if you'd 

In [35]:
mostrar_similaridad(most_similar_indices)


Documento seleccionado: 7659
Categoría: sci.crypt
Texto: 
[...]

  Who makes them forget and destroy all copies of the key once they've
decided you're not a criminal today?  Just curious.
------- Any views expressed are those of myself and not my employer. --------
Steven C. Johnson, WB3IRU / VK2GDS      |
TRW                                     | johnson@trwacs.fp.trw.com
FP1 / 3133                              |         [129.193.172.90]
1 Federal Systems Park Drive            | Phone:        +1 (703) 968.1000
Fairfax, Virginia  22033-4412  U.S.A.   ...

Documento seleccionado: 2314
Categoría: comp.windows.x
Texto: A few days ago there was a posting in this group by Andrea Winkler
titled "X and Security / X Technical Conference".  I was one of the
instructors of that tutorial.  Unfortunately, my system purged
the message before I had a chance to see it, and I don't have
Andrea's email address.  If someone has Andrea's address and/or
the posting, I would really appreciate it if you'd 

El texto en análisis **6770**, retorna los documentos con similitudes relevantes. En primera instancia está devolviendo documentos con una estructura muy similar (Texto de un correo).

# Análisis del documento **1810**

In [36]:
indice_4 = random_indices[3]
# Calcula la similaridad coseno entre el indice_1 y todo el resto de los documentos
cos_sim = cosine_similarity(X_train[indice_4], X_train)[0]

# Devuelve las mejores 5 similaridades
most_similar_indices = np.argsort(cos_sim)[::-1][1:6]

In [37]:
general_info(indice_4, most_similar_indices)

La Categoría asociada al documento original es: alt.atheism
Los 5 documentos con mayor similitud al documento 1810 son: [ 2509 10106 10229  3754  5820]

So what you're saying is that your mind is made up, and you'll just explain
away any differences at being statistically insignificant?


So you'll just explain away any inconsistancies in your "theory" as being
"a special case".


You just equated them.  Re-read your own words.


A study release in 1991 found that 11% of female seagulls are lesbians.


Now, apply this last sentence of your to YOUR theory.  Notice how your are
contridicting observations?


You don't know much math, do you?  The ability to use SAS to determine the
length of the third side of the triangle is fundemental to geometry.


Goals <> postulates.

Again, if one of the "goals" of this "objective/natural morality" system
you are proposing is "survival of the species", then homosexuality is
immoral.


In [38]:
mostrar_similaridad(most_similar_indices)


Documento seleccionado: 2509
Categoría: alt.atheism
Texto: 

Well, I've provided examples to show that the trend was general, and you
(or others) have provided some counterexamples, mostly ones surrounding
mating practices, etc.  I don't think that these few cases are enough to
disprove the general trend of natural morality.  And, again, the mating
practices need to be reexamined...


No, but mating practices are a special case.  I'll have to think about it
some more.


Indeed.  But, while the natural system is objective, all objective systems
are not t...

Documento seleccionado: 10106
Categoría: soc.religion.christian
Texto: [In looking through my files this weekend, I ran across some lyrics from
various rock groups that have content.  Here are two from Black Sabbath's
"Master of Reality".  I'll say this much for the music of the '60's and early
'70's, at least they asked questions of significance.  Jethro Tull is another
to asked and wrote about things that caused one to wonder. --

El texto seleccionado trata como tópico (categoría) acerca del ateismo por lo que es natural que muestre alguna simitud con temas religiosos (nuevamente). A diferencia del otro texto de ateismo, este documento contenía una más extensa distribución de términos de frecuente uso en discusiones filosóficas.

# Análisis del documento **5300**

In [45]:
indice_5 = random_indices[4]
# Calcula la similaridad coseno entre el indice_1 y todo el resto de los documentos
cos_sim = cosine_similarity(X_train[indice_5], X_train)[0]

# Devuelve las mejores 5 similaridades
most_similar_indices = np.argsort(cos_sim)[::-1][1:6]

In [46]:
general_info(indice_5, most_similar_indices)

La Categoría asociada al documento original es: rec.motorcycles
Los 5 documentos con mayor similitud al documento 5300 son: [9623 1292 7286 3282 5826]
:    I hate to admit this, and I'm still mentally kicking myself for it.
: I rode the brand new K75RT home last Friday night.  100 miles in rain
: and darkness.  No problems.  Got it home and put it on the center stand.
:    The next day I pushed it off the center stand in preparation for going
: over to a friend's house to pose.  You guessed it.  It got away from me
: and landed on its right side.  
:    Scratched the lower fairing, cracked the right mirror, and cracked the
: upper fairing.  
:    *DAMN* am I stupid!  It's going to cost me ~$200 to get the local
: body shop to fix it.  And that is after I take the fairing off for them.
: Still, that's probably cheaper than the mirror alone if I bought a 
: replacement from BMW.

You got off cheap.  My sister's ex-boyfriend was such an incessant pain
in the ass about wanting to ride my b

In [47]:
mostrar_similaridad(most_similar_indices)


Documento seleccionado: 9623
Categoría: talk.politics.mideast
Texto: Accounts of Anti-Armenian Human Right Violations in Azerbaijan #012
                 Prelude to Current Events in Nagorno-Karabakh

        +---------------------------------------------------------+
        |                                                         |
        |  I saw a naked girl with her hair down. They were       |
        |  dragging her. She kept falling because they were       |
        |  pushing her and kicking her. She fell down, it was     |
        |  muddy there, and ...

Documento seleccionado: 1292
Categoría: talk.politics.mideast
Texto: Accounts of Anti-Armenian Human Right Violations in Azerbaijan #008 Part B
                 Prelude to Current Events in Nagorno-Karabakh

				(Part B of #008)

      +------------------------------------------------------------------+
      |                                                                  |
      | "Oh, yes, I just remembered. While th

2. Entrenar modelos de **clasificación Naïve Bayes** para maximizar el desempeño de clasificación (f1-score macro) en el conjunto de datos de test. Considerar cambiar parámteros de instanciación del vectorizador y los modelos y probar modelos de Naïve Bayes Multinomial y ComplementNB.

# Modelos sin ajustes de parámetros

In [49]:
# Crear y ajustar el vectorizador
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 2))
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_test = vectorizer.transform(newsgroups_test.data)

In [50]:
# Función para entrenar y evaluar un modelo
def train_and_evaluate(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return f1_score(y_test, y_pred, average='macro')

In [51]:
# Entrenar y evaluar MultinomialNB
mnb = MultinomialNB(alpha=0.1)
mnb_f1 = train_and_evaluate(mnb, X_train, newsgroups_train.target, X_test, newsgroups_test.target)
print(f"F1-score para MultinomialNB: {mnb_f1:.4f}")

F1-score para MultinomialNB: 0.6463


In [52]:
# Entrenar y evaluar ComplementNB
cnb = ComplementNB(alpha=0.1)
cnb_f1 = train_and_evaluate(cnb, X_train, newsgroups_train.target, X_test, newsgroups_test.target)
print(f"F1-score para ComplementNB: {cnb_f1:.4f}")

F1-score para ComplementNB: 0.7043


In [53]:
# Comparar los resultados
if mnb_f1 > cnb_f1:
    print("MultinomialNB tiene un mejor desempeño.")
elif cnb_f1 > mnb_f1:
    print("ComplementNB tiene un mejor desempeño.")
else:
    print("Ambos modelos tienen un desempeño similar.")

ComplementNB tiene un mejor desempeño.


# Naive Bayes con ajustes de parámetros

In [54]:
def train_and_evaluate(vectorizer, model, X_train, y_train, X_test, y_test):
    # Vectorizar los datos
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Entrenar el modelo
    model.fit(X_train_vec, y_train)

    # Predecir y calcular el f1-score
    y_pred = model.predict(X_test_vec)
    f1 = f1_score(y_test, y_pred, average='macro')

    return f1

In [57]:
from sklearn.model_selection import GridSearchCV

In [58]:
# Definir los parámetros para la búsqueda de hiperparámetros
vectorizer_params = {
    'max_df': [0.5, 0.75, 1.0],
    'min_df': [1, 2, 3],
    'ngram_range': [(1, 1), (1, 2)],
}

mnb_params = {
    'alpha': [0.1, 0.5, 1.0],
}

cnb_params = {
    'alpha': [0.1, 0.5, 1.0],
    'norm': [True, False],
}

# Realizar búsqueda de hiperparámetros para MultinomialNB
mnb_grid = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=mnb_params,
    cv=5,
    scoring='f1_macro'
)

vectorizer_grid = GridSearchCV(
    estimator=TfidfVectorizer(),
    param_grid=vectorizer_params,
    cv=5,
    scoring='f1_macro'
)

In [59]:
# Ajustar el vectorizador y el modelo MNB
vectorizer_grid.fit(newsgroups_train.data, newsgroups_train.target)
best_vectorizer = vectorizer_grid.best_estimator_

X_train_vec = best_vectorizer.transform(newsgroups_train.data)
mnb_grid.fit(X_train_vec, newsgroups_train.target)

# Evaluar el mejor modelo MNB
best_mnb = mnb_grid.best_estimator_
mnb_f1 = train_and_evaluate(best_vectorizer, best_mnb, newsgroups_train.data, newsgroups_train.target, newsgroups_test.data, newsgroups_test.target)

print(f"Mejor F1-score para MultinomialNB: {mnb_f1}")
print(f"Mejores parámetros para MultinomialNB: {mnb_grid.best_params_}")
print(f"Mejores parámetros para el vectorizador: {vectorizer_grid.best_params_}")


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_response.py", line 218, in _get_response_values
    y_pred, pos_label = estimator.predict(X), None
AttributeError: 'TfidfVectorizer' object has no attribute 'predict'



Mejor F1-score para MultinomialNB: 0.6619598560030805
Mejores parámetros para MultinomialNB: {'alpha': 0.1}
Mejores parámetros para el vectorizador: {'max_df': 0.5, 'min_df': 1, 'ngram_range': (1, 1)}


In [60]:
# Realizar búsqueda de hiperparámetros para ComplementNB
cnb_grid = GridSearchCV(
    estimator=ComplementNB(),
    param_grid=cnb_params,
    cv=5,
    scoring='f1_macro'
)

# Ajustar el modelo CNB
cnb_grid.fit(X_train_vec, newsgroups_train.target)


In [61]:
# Evaluar el mejor modelo CNB
best_cnb = cnb_grid.best_estimator_
cnb_f1 = train_and_evaluate(best_vectorizer, best_cnb, newsgroups_train.data, newsgroups_train.target, newsgroups_test.data, newsgroups_test.target)

In [63]:
print(f"\nMejor F1-score para ComplementNB: {cnb_f1}")
print(f"Mejores parámetros para ComplementNB: {cnb_grid.best_params_}")
print(f"Mejores parámetros para el vectorizador: {vectorizer_grid.best_params_}")


Mejor F1-score para ComplementNB: 0.6948065380578043
Mejores parámetros para ComplementNB: {'alpha': 0.1, 'norm': False}
Mejores parámetros para el vectorizador: {'max_df': 0.5, 'min_df': 1, 'ngram_range': (1, 1)}


# Vectorizador ajustado a los mejores parámetros

In [65]:
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=10000,
    ngram_range=(1, 1),
    min_df=1,
    max_df=0.5
)


# Ajuste del vectorizador
X_train_tfidf = vectorizer.fit_transform(newsgroups_train.data)

In [67]:
X_terms_documentos = X_train_tfidf.T
terminos = vectorizer.get_feature_names_out()

print(f"Forma de la matriz TF-IDF: {X_train_tfidf.shape}")
print(f"Número total de términos: {len(terms)}")


Forma de la matriz TF-IDF: (11314, 10000)
Número total de términos: 10000


In [98]:
selection = ['baseball', 'rights', 'laws', 'groups', 'crime']

In [88]:

def similar_words(selection, X_terms_documentos, terminos, top_n=5):

  """
  Encuentra las palabras más similares para una lista de palabras seleccionadas.

  Parámetros:
  - selection: Lista de palabras para las que se buscan similares.
  - X_terms_documentos: Matriz transpuesta de TF-IDF (términos x documentos).
  - terminos: Lista de todos los términos (características).
  - top_n: Número de palabras similares a devolver para cada palabra seleccionada.

  Retorna:
  - Un diccionario con las palabras seleccionadas como claves y listas de tuplas
    (palabra similar, puntuación de similitud) como valores.
  """
  resultados = {}

  for palabra in selection:
      if palabra in terminos:
          indice = np.where(terminos == palabra)[0][0]
          vector_palabra = X_terms_documentos[indice].reshape(1, -1)

          similitudes = cosine_similarity(vector_palabra, X_terms_documentos)[0]
          indices_similares = np.argsort(similitudes)[::-1][1:6]  # Top 5 excluyendo la propia palabra

          palabras_similares = [(terminos[i], similitudes[i]) for i in indices_similares]
          resultados[palabra] = palabras_similares
      else:
          resultados[palabra] = [("Palabra no encontrada en el vocabulario", 0)]

  return resultados

In [99]:
resultados = get_similar_words(selection, X_terms_documentos, terminos)
resultados

{'baseball': [('football', 0.17172640113534215),
  ('espn', 0.16474950555061119),
  ('game', 0.14537464671443387),
  ('weekly', 0.1382889762511507),
  ('basketball', 0.13401270465501627)],
 'rights': [('civil', 0.23214220587139758),
  ('human', 0.19078509940607413),
  ('constitution', 0.18801202117108323),
  ('jointly', 0.16705628676999643),
  ('survivor', 0.15821466496422032)],
 'laws': [('law', 0.19668110287453766),
  ('warned', 0.17096715626021214),
  ('federal', 0.15293140575149972),
  ('ceremonial', 0.14857842912720914),
  ('mw', 0.14008560998427702)],
 'groups': [('desktop', 0.1656075984781788),
  ('aspects', 0.1620945528052851),
  ('convenience', 0.16016025836152328),
  ('icons', 0.15501256499806135),
  ('group', 0.14203145468717335)],
 'crime': [('criminal', 0.264330876324275),
  ('violent', 0.24377710150912194),
  ('gun', 0.23313244350814974),
  ('rate', 0.18610995448559314),
  ('rape', 0.1852416061552619)]}

Se puede observar que el modelo se encuentra performando bien (aún cuando puedan haber aspectos de mejora). Si observamos los diferentes registros para cada una de las palabras seleccionadas se tiene:

Para la palabra **Baseball** arrojo similaritud con las siguientes {football, espn, game, wekly, basketball}. Para este caso, la similiritud esta relacionada con eventos deportivos más que con el deporte en particular.

Para la palabra **Rights** arrojo similaritud con las siguientes {civil, human, constitution, jointly, survivor}. Términos bastante asociados a la palabra "derechos".

Para la palabra **Laws** arrojo similaritud con las siguientes {law, warned, federal, ceremonial, mw}. Palabras con relación asociadas a términos legales y/o bufetes.

Para la palabra **Groups** arrojo similaritud con las siguientes {desktop, aspects, convenience, icons, group}

Para la palabra **Crime** arrojo similaritud con las siguientes {criminal, violent, gun, rate, rape}. Para este casos resultaron con mayor similitud palabras muy relacionadas a hechos delictivos (violentos).

