In [1]:
%pip install numpy scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


### Vectorización de texto y modelo de clasificación Naïve Bayes con el dataset 20 newsgroups

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics import f1_score

# 20newsgroups por ser un dataset clásico de NLP ya viene incluido y formateado
# en sklearn
from sklearn.datasets import fetch_20newsgroups
import numpy as np

## Carga de datos

In [3]:
# cargamos los datos (ya separados de forma predeterminada en train y test)
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

## Vectorización

In [4]:
# instanciamos un vectorizador
# ver diferentes parámetros de instanciación en la documentación de sklearn https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
tfidfvect = TfidfVectorizer()

In [5]:
# en el atributo `data` accedemos al texto
print(newsgroups_train.data[0])

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


In [6]:
# con la interfaz habitual de sklearn podemos fitear el vectorizador
# (obtener el vocabulario y calcular el vector IDF)
# y transformar directamente los datos
X_train = tfidfvect.fit_transform(newsgroups_train.data)
# `X_train` la podemos denominar como la matriz documento-término

In [7]:
# recordar que las vectorizaciones por conteos son esparsas
# por ello sklearn convenientemente devuelve los vectores de documentos
# como matrices esparsas
print(type(X_train))
print(f'shape: {X_train.shape}')
print(f'Cantidad de documentos: {X_train.shape[0]}')
print(f'Tamaño del vocabulario (dimensionalidad de los vectores): {X_train.shape[1]}')

<class 'scipy.sparse._csr.csr_matrix'>
shape: (11314, 101631)
Cantidad de documentos: 11314
Tamaño del vocabulario (dimensionalidad de los vectores): 101631


In [8]:
# una vez fiteado el vectorizador, podemos acceder a atributos como el vocabulario
# aprendido. Es un diccionario que va de términos a índices.
# El índice es la posición en el vector de documento.
tfidfvect.vocabulary_['car']

25775

In [9]:
# es muy útil tener el diccionario opuesto que va de índices a términos
idx2word = {v: k for k,v in tfidfvect.vocabulary_.items()}

In [10]:
# en `y_train` guardamos los targets que son enteros
y_train = newsgroups_train.target
y_train[:10]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

In [11]:
# hay 20 clases correspondientes a los 20 grupos de noticias
print(f'clases {np.unique(newsgroups_test.target)}')
newsgroups_test.target_names

clases [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

## Similaridad de documentos

In [12]:
# Veamos similaridad de documentos. Tomemos algún documento
idx = 4811
print(newsgroups_train.data[idx])

THE WHITE HOUSE

                  Office of the Press Secretary
                   (Pittsburgh, Pennslyvania)
______________________________________________________________
For Immediate Release                         April 17, 1993     

             
                  RADIO ADDRESS TO THE NATION 
                        BY THE PRESIDENT
             
                Pittsburgh International Airport
                    Pittsburgh, Pennsylvania
             
             
10:06 A.M. EDT
             
             
             THE PRESIDENT:  Good morning.  My voice is coming to
you this morning through the facilities of the oldest radio
station in America, KDKA in Pittsburgh.  I'm visiting the city to
meet personally with citizens here to discuss my plans for jobs,
health care and the economy.  But I wanted first to do my weekly
broadcast with the American people. 
             
             I'm told this station first broadcast in 1920 when
it reported that year's presidential elec

In [13]:
# midamos la similaridad coseno con todos los documentos de train
cossim = cosine_similarity(X_train[idx], X_train)[0]

In [14]:
# podemos ver los valores de similaridad ordenados de mayor a menos
np.sort(cossim)[::-1]

array([1.        , 0.70930477, 0.67474953, ..., 0.        , 0.        ,
       0.        ], shape=(11314,))

In [15]:
# y a qué documentos corresponden
np.argsort(cossim)[::-1]

array([ 4811,  6635,  4253, ...,  6385,  1149, 11238], shape=(11314,))

In [16]:
# los 5 documentos más similares:
mostsim = np.argsort(cossim)[::-1][1:6]

In [17]:
# el documento original pertenece a la clase:
newsgroups_train.target_names[y_train[idx]]

'talk.politics.misc'

In [18]:
# y los 5 más similares son de las clases:
for i in mostsim:
  print(newsgroups_train.target_names[y_train[i]])

talk.politics.misc
talk.politics.misc
talk.politics.misc
talk.politics.misc
talk.politics.misc


### Modelo de clasificación Naïve Bayes

In [19]:
# es muy fácil instanciar un modelo de clasificación Naïve Bayes y entrenarlo con sklearn
clf = MultinomialNB()
clf.fit(X_train, y_train)

In [20]:
# con nuestro vectorizador ya fiteado en train, vectorizamos los textos
# del conjunto de test
X_test = tfidfvect.transform(newsgroups_test.data)
y_test = newsgroups_test.target
y_pred =  clf.predict(X_test)

In [21]:
# el F1-score es una metrica adecuada para reportar desempeño de modelos de claificación
# es robusta al desbalance de clases. El promediado 'macro' es el promedio de los
# F1-score de cada clase. El promedio 'micro' es equivalente a la accuracy que no
# es una buena métrica cuando los datasets son desbalanceados
f1_score(y_test, y_pred, average='macro')

0.5854345727938506

### Consigna del desafío 1

**1**. Vectorizar documentos. Tomar 5 documentos al azar y medir similaridad con el resto de los documentos.
Estudiar los 5 documentos más similares de cada uno analizar si tiene sentido
la similaridad según el contenido del texto y la etiqueta de clasificación.

**2**. Construir un modelo de clasificación por prototipos (tipo zero-shot). Clasificar los documentos de un conjunto de test comparando cada uno con todos los de entrenamiento y asignar la clase al label del documento del conjunto de entrenamiento con mayor similaridad.

**3**. Entrenar modelos de clasificación Naïve Bayes para maximizar el desempeño de clasificación
(f1-score macro) en el conjunto de datos de test. Considerar cambiar parámteros
de instanciación del vectorizador y los modelos y probar modelos de Naïve Bayes Multinomial
y ComplementNB.

**4**. Transponer la matriz documento-término. De esa manera se obtiene una matriz
término-documento que puede ser interpretada como una colección de vectorización de palabras.
Estudiar ahora similaridad entre palabras tomando 5 palabras y estudiando sus 5 más similares. **La elección de palabras no debe ser al azar para evitar la aparición de términos poco interpretables, elegirlas "manualmente"**.


### Consigna 1
**1**. Vectorizar documentos. Tomar 5 documentos al azar y medir similaridad con el resto de los documentos.
Estudiar los 5 documentos más similares de cada uno analizar si tiene sentido
la similaridad según el contenido del texto y la etiqueta de clasificación.

In [24]:
from IPython.display import Markdown
import random
# Generar 5 índices aleatorios (de 0 a X_train.shape[0]-1)
idx_documentos = [random.randint(0, X_train.shape[0]-1) for _ in range(5)]
n_carac = 400 # Cantidad de caracteres a mostrar 

# Diccionario para almacenar los índices de documentos similares
idx_documentos_similares = {}
for idx in idx_documentos:
    # Calcular similitud coseno entre el documento idx y todos los documentos
    cossim = cosine_similarity(X_train[idx], X_train)[0]
    # Ordenar índices por similitud (de mayor a menor) y tomar los 5 más similares (excluyendo el propio documento)
    cossim_top5 = np.argsort(cossim)[::-1][1:6]  # Tomar índices 1 al 5
    idx_documentos_similares[idx] = cossim_top5

# Iterar sobre los índices de documentos similares
for idx in idx_documentos_similares.keys():
    # Crear una cadena Markdown para el documento original, mostrando solo los primeros n_carac caracteres
    markdown_output = f"**Documento original (Índice {idx}):**<br>**Categoría:** {newsgroups_train.target_names[newsgroups_train.target[idx]]}<br>{newsgroups_train.data[idx][:n_carac]}...<br><br>"
    markdown_output += "**Top 5 documentos similares:**<br>"
    
    # Agregar cada documento similar como una lista, mostrando solo los primeros n_carac caracteres
    for i, idx_similar in enumerate(idx_documentos_similares[idx], 1):
        markdown_output += f"{i}. **Índice {idx_similar}:** **Categoría:** {newsgroups_train.target_names[newsgroups_train.target[idx_similar]]}<br>{newsgroups_train.data[idx_similar][:n_carac]}...<br>"
    
    # Mostrar la salida con formato Markdown
    display(Markdown(markdown_output))

**Documento original (Índice 5806):**<br>**Categoría:** comp.os.ms-windows.misc<br>Two-part question:

1)  What is Windows NT - a 'real' windows OS?

2)  This past weekend, a local 'hacker' radio show metioned a new product
    from Microsoft called 'Chicago' if I recall.  Anyone know what this is?

That is it -

Thanks a heap.

- Alan
...<br><br>**Top 5 documentos similares:**<br>1. **Índice 820:** **Categoría:** comp.os.ms-windows.misc<br>This is the official Request for Discussion (RFD) for the creation of two
new newsgroups for Microsoft Windows NT.  This is a second RFD, replacing
the one originally posted in January '93 (and never taken to a vote).  The
proposed groups are described below:

NAME: 	 comp.os.ms-windows.nt.setup
STATUS:  Unmoderated.
PURPOSE: Discussions about setting up and installing Windows NT, and about
	 syst...<br>2. **Índice 9356:** **Categoría:** comp.windows.x<br>Hi folks,
]
Does anybody know for a good 32-bit C++/C compiler for OS/2 that supports
OS/2 API and Microsoft windows (maybe Windows NT)?

thanx...<br>3. **Índice 9313:** **Categoría:** comp.os.ms-windows.misc<br>Has anyone used the Number Nine (# 9) Video Graphics adaptor with Windows
or Windows NT?  What do you think???...<br>4. **Índice 8355:** **Categoría:** comp.os.ms-windows.misc<br>WINDOWS NT


I need some information on the new Windows NT.
Anything you have would be appreciated. I know nothing about it.
(Well, except that it exists.)
Some questions... 

Memory requirements, hard drive space, release date? is it out?
How is IBM reacting? Intel?
Can it replace other LAN OS's?

ANYTHING else like specs, speed, etc..

Thanks in advance!

Luke...<br>5. **Índice 6515:** **Categoría:** comp.os.ms-windows.misc<br>hi,
  Have you used Mac system 6.x or 7.x? If the answer is positive, you would
know if ms-windows is a "mature" OS.

  Days ago people doubted that ms-windows is not a real OS. I can see why
they have such question. Ms-windows confuses many people. Microsoft
simulated Mac, but it did a lousy job.  For example:

(1) You can not create hierarchy groups. There is no way to create a group
    in a gr...<br>

**Documento original (Índice 10003):**<br>**Categoría:** alt.atheism<br>
  I answer from the position that we would indeed place these people
  in prison for life.

  That depends not only on their predisposition towards murder, but
  also in their success rate at escape and therefore their ability
  to commit the same crimes again.

  In other words, if lifetime imprisonment doesn't work, perhaps
  it's not because we're not executing these people, but because
  we'r...<br><br>**Top 5 documentos similares:**<br>1. **Índice 8514:** **Categoría:** alt.atheism<br>
  More info please.  I'm not well exposed to these ideas.

/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\ 

Bob Beauchaine bobbe@vice.ICO.TEK.COM 

They said that Queens could stay, they blew the Bronx away,
and sank Manhattan out at sea....<br>2. **Índice 3987:** **Categoría:** alt.atheism<br>
  Do I smell .sig material here?


/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\ 

Bob Beauchaine bobbe@vice.ICO.TEK.COM 

They said that Queens could stay, they blew the Bronx away,
and sank Manhattan out at sea....<br>3. **Índice 7219:** **Categoría:** alt.atheism<br>
  So now we're judging the Qur'an by what's not in it?  

  How many mutton headed arguments am I going to have to wade
  through today?


  One would hope.

/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\ 

Bob Beauchaine bobbe@vice.ICO.TEK.COM 

They said that Queens could stay, they blew the Bronx away,
and sank Manhattan out at sea....<br>4. **Índice 4304:** **Categoría:** alt.atheism<br>
  Bobby:

  Get this the hell out of your .sig until you 1) learn what it
  stands for and 2) really mean it.

/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\ 

Bob Beauchaine bobbe@vice.ICO.TEK.COM 

They said that Queens could stay, they blew the Bronx away,
and sank Manhattan out at sea....<br>5. **Índice 10117:** **Categoría:** alt.atheism<br>
  Or, with no dictionary available, they could gain first hand
  knowledge by suffering through one of your posts.


/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\ 

Bob Beauchaine bobbe@vice.ICO.TEK.COM 

They said that Queens could stay, they blew the Bronx away,
and sank Manhattan out at sea....<br>

**Documento original (Índice 1655):**<br>**Categoría:** rec.sport.hockey<br>Did you ever notice how many people on the net have trouble in the
comparitively easy task of spelling the nick name of our fair city?  I
never knew that Philadelphia becomes Phillie or Philli when spoken of.  So
for all you who don't know yet here's a _little_ clue.

	IT IS SPELLED:       P H I L L Y

OK...thank you.

Oh yeah, about that drug-induced trade rumor....I don't think the Sniders
are t...<br><br>**Top 5 documentos similares:**<br>1. **Índice 4088:** **Categoría:** sci.space<br>I'm not very impressed by the old so-called "prospecting" work from
LPI, it has almost all been geared towards industrially silly processes on
the moon as an excuse to put astronauts there.   [...]

Translation:  It doesn't support the Nick Szabo Vision of the Future
to Which You MUST Subscribe...

Fred, we're all supporting what each of us thinks should be done, to some
degree.  If you have a pro...<br>2. **Índice 11155:** **Categoría:** talk.politics.mideast<br>

  Oh....I see...I didn't realize this...

  I think that perhaps you should print flyers on this topic, and your
  reasons for thinking the way you do. You should then distribute them
  amongst the world's population. You see, I don't think there are many
  people who are aware of this fact. Thank you for telling us the truth.

  BTW: I would start by sending your flyers to each of the UN offici...<br>3. **Índice 6635:** **Categoría:** talk.politics.misc<br>THE WHITE HOUSE

                    Office of the Press Secretary
______________________________________________________________
For Immediate Release                             April 15, 1993     

	     
                       REMARKS BY THE PRESIDENT
                   TO LAW ENFORCEMENT ORGANIZATIONS
	     
	     
                           The Rose Garden 


2:52 P.M. EDT


	     THE PRESID...<br>4. **Índice 3596:** **Categoría:** talk.politics.misc<br>THE WHITE HOUSE

                    Office of the Press Secretary
_________________________________________________________________
For Immediate Release                             April 14, 1993     

	     
                       REMARKS BY THE PRESIDENT
                      AT SUMMER JOBS CONFERENCE

	     	  
                            Hyatt Regency
                        Crystal City, Vi...<br>5. **Índice 8726:** **Categoría:** talk.politics.mideast<br>

[After a small refresh Hasan got on the track again.]




   |>    |> I get the impression Hasan realized he goofed and is now
   |>    |> trying to drop the thread. Let him. It might save some
   |>    |> miniscule portion of his sorry face.

   |>    Not really. since i am a logical person who likes furthering himself
   |>    from any "name calling", i started trashing any article that contai...<br>

**Documento original (Índice 5927):**<br>**Categoría:** comp.sys.ibm.pc.hardware<br>
...<br><br>**Top 5 documentos similares:**<br>1. **Índice 11313:** **Categoría:** rec.motorcycles<br>Stolen from Pasadena between 4:30 and 6:30 pm on 4/15.

Blue and white Honda CBR900RR california plate KG CBR.   Serial number
JH2SC281XPM100187, engine number 2101240.

No turn signals or mirrors, lights taped over for track riders session
at Willow Springs tomorrow.  Guess I'll miss it.  :-(((

Help me find my baby!!!...<br>2. **Índice 11312:** **Categoría:** comp.graphics<br>
Wouldn't this require a hyper-sphere.  In 3-space, 4 points over specifies
a sphere as far as I can see.  Unless that is you can prove that a point
exists in 3-space that is equi-distant from the 4 points, and this may not
necessarily happen.

Correct me if I'm wrong (which I quite possibly am!)

steve
---


...<br>3. **Índice 11311:** **Categoría:** comp.sys.ibm.pc.hardware<br>I just installed a DX2-66 CPU in a clone motherboard, and tried mounting a CPU 
cooler on the chip.  After about 1/2 hour, the weight of the cooler was enough 
to dislodge the CPU from its mount.  It ended up bending a few pins
on the CPU, but luckily the power was not on yet.  I ended up
pressing the CPU deeply into its socket and then putting the CPU
cooler back on.  So far so good.

Have others...<br>4. **Índice 11310:** **Categoría:** comp.sys.mac.hardware<br>I have a (very old) Mac 512k and a Mac Plus, both of which 
have the same problem.

Their screens blank out, sometimes after a minor physical jolt
(such as inserting a floppy in the internal drive), sometimes 
all by themselves (computer left to itself just goes blank).

I have replaced the wires connecting the logic boards and the 
video board, because it seemed at first that jiggling the wires
m...<br>5. **Índice 11309:** **Categoría:** sci.med<br>DN> From: nyeda@cnsvax.uwec.edu (David Nye)
DN> A neurology
DN> consultation is cheaper than a scan.

And also better, because a neurologist can make a differential
diagnosis between migraine, tension-type headache, cluster, benign
intracranial hypertension, chronic paroxysmal hemicrania, and other
headache syndromes that all appear normal on a scan.  A neurologist
can also recommend a course of t...<br>

**Documento original (Índice 1488):**<br>**Categoría:** sci.electronics<br>
 >
 >: Thus, a deciBell (deci-, l., tenth of + Bell) is a fractional part of the
 >: original Bell.  For example, SouthWestern Bell is a deciBell.
 >
 >Out of what hat did you pull this one?  dB is a ratio not an RBOC!
 >
 >: And the measure of current, Amp, is actually named after both the AMP company
 >: and the Amphenol company.  Both companies revolutionized electronics by
 >: simulatenously ...<br><br>**Top 5 documentos similares:**<br>1. **Índice 2215:** **Categoría:** sci.electronics<br>: 
: >Similarly, people usually use dB for dBm. Another common mistake is spelling
: >``db'' instead of ``dB'' as you did in your article. See the ``B'' is for 
: >``Bell'' company, the mother of AT&T and should be capitalized.
: 
: Thus, a deciBell (deci-, l., tenth of + Bell) is a fractional part of the 
: original Bell.  For example, SouthWestern Bell is a deciBell.

Out of what hat did you pul...<br>2. **Índice 2996:** **Categoría:** sci.electronics<br>

Thus, a deciBell (deci-, l., tenth of + Bell) is a fractional part of the 
original Bell.  For example, SouthWestern Bell is a deciBell.

And the measure of current, Amp, is actually named after both the AMP company
and the Amphenol company.  Both companies revolutionized electronics by
simulatenously realizing that the performance of connectors and sockets 
were affected by the amount of curren...<br>3. **Índice 1407:** **Categoría:** sci.electronics<br>: > 
: > : And the measure of current, Amp, is actually named after both the AMP company
: > : and the Amphenol company.  Both companies revolutionized electronics by
: > : simulatenously realizing that the performance of connectors and sockets 
: > : were affected by the amount of current running through the wires.
: > 
: > Sorry.  The unit for current is the AMPERE which is the name of a french-...<br>4. **Índice 4110:** **Categoría:** sci.electronics<br>}Out of what hat did you pull this one?  dB is a ratio not an RBOC!        
} [...]
}Sorry.  The unit for current is the AMPERE which is the name of a french-man
}named AMPERE who studied electrical current.  The term AMP is just an abbreviation
}of it.  The company AMP came after the AMPERE unit was already in use.
} [...]
}I don't know about this one, but it doesn't sound right.
} [...]
}Well yo...<br>5. **Índice 9780:** **Categoría:** sci.electronics<br>
<stuff deleted>


Good gravy! Decibels are all *ratios.* The question that remains in
any ratio is the reference unit used. Sometimes, this will be a reference
power, such as 1 milliwatt (given a certain circuit impedance which
should also be included in the fine print or known, like 50 ohms
in an RF circuit of that impedance), leading to an accepted
notation of dBm. Maybe it might be dBV, disreg...<br>

### Consigna 2
**2**. Construir un modelo de clasificación por prototipos (tipo zero-shot). Clasificar los documentos de un conjunto de test comparando cada uno con todos los de entrenamiento y asignar la clase al label del documento del conjunto de entrenamiento con mayor similaridad.

In [36]:
# Generamos matriz de similaridades coseno entre test y train
cossim_matrix = cosine_similarity(X_test, X_train)

# Chequeamos que las dimensiones sean correctas
print(X_test.shape)
print(X_train.shape)
print(cossim.shape)


(7532, 101631)
(11314, 101631)
(7532, 11314)


In [37]:
# Encontrar el índice del documento más similar en X_train para cada documento en X_test
indices_mas_similares = np.argmax(cossim_matrix, axis=1) 

# Almacenar predicciones
predicciones = {}

for idx_test in range(X_test.shape[0]):
    idx_train_prediccion = indices_mas_similares[idx_test]
    predicciones[idx_test] = {
        'indice_train_mas_similar': idx_train_prediccion,
        'prediccion': newsgroups_train.target_names[newsgroups_train.target[idx_train_prediccion]]
    }


In [42]:
# Seleccionar 10 índices aleatorios de X_test para mostrar en Markdown
idx_documentos = random.sample(range(X_test.shape[0]), 10)

# Iterar sobre los índices seleccionados
for idx_test in idx_documentos:
    idx_train = indices_mas_similares[idx_test]
    
    # Crear una cadena Markdown para el documento de X_test y su documento más similar de X_train
    markdown_output = f"**Documento de X_test (Índice {idx_test}):**<br>"
    markdown_output += f"**Categoría real:** {newsgroups_test.target_names[newsgroups_test.target[idx_test]]}<br>"
    markdown_output += f"**Categoría predicha:** {newsgroups_train.target_names[newsgroups_train.target[idx_train]]}<br>"
    
    # Mostrar la salida con formato Markdown
    display(Markdown(markdown_output))

**Documento de X_test (Índice 404):**<br>**Categoría real:** comp.sys.ibm.pc.hardware<br>**Categoría predicha:** comp.sys.ibm.pc.hardware<br>

**Documento de X_test (Índice 5017):**<br>**Categoría real:** talk.politics.mideast<br>**Categoría predicha:** talk.politics.mideast<br>

**Documento de X_test (Índice 1392):**<br>**Categoría real:** rec.motorcycles<br>**Categoría predicha:** rec.autos<br>

**Documento de X_test (Índice 1125):**<br>**Categoría real:** sci.med<br>**Categoría predicha:** sci.med<br>

**Documento de X_test (Índice 2626):**<br>**Categoría real:** comp.graphics<br>**Categoría predicha:** comp.os.ms-windows.misc<br>

**Documento de X_test (Índice 5532):**<br>**Categoría real:** comp.sys.ibm.pc.hardware<br>**Categoría predicha:** sci.space<br>

**Documento de X_test (Índice 2731):**<br>**Categoría real:** soc.religion.christian<br>**Categoría predicha:** soc.religion.christian<br>

**Documento de X_test (Índice 3334):**<br>**Categoría real:** rec.sport.hockey<br>**Categoría predicha:** rec.sport.hockey<br>

**Documento de X_test (Índice 6582):**<br>**Categoría real:** comp.windows.x<br>**Categoría predicha:** comp.os.ms-windows.misc<br>

**Documento de X_test (Índice 7442):**<br>**Categoría real:** sci.med<br>**Categoría predicha:** alt.atheism<br>

### Consigna 3

Realizamos una optimización de hiperparámetros con Optuna. Para evitar realizar un overfitting al conjunto de testeo, la optimización se realiza sobre un conjunto de validación generado a partir del mismo conjunto de entrenamiento X_train, mediante la técnica K-Fold Cross Validation. Debido a que tenemos distintas 20 clases distintas, realizamos la separación de forma estratificada:

In [25]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Definir la función objetivo para Optuna
def objective(trial):
    # Sugerir hiperparámetros a optimizar
    alpha = trial.suggest_float('alpha', 1e-4, 10.0, log=True)  # Parámetro de suavizado en escala logarítmica
    fit_prior = trial.suggest_categorical('fit_prior', [True, False])  # Si se aprenden las probabilidades previas de las clases
    
    # Inicializar el clasificador con los hiperparámetros sugeridos
    clf = MultinomialNB(alpha=alpha, fit_prior=fit_prior)
    
    # Configurar validación cruzada estratificada con 5 pliegues
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    
    # Iterar sobre los pliegues de validación cruzada
    for train_idx, val_idx in skf.split(X_train, y_train):
        # Dividir los datos en entrenamiento y validación
        X_tr, X_val = X_train[train_idx], X_train[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
        # Entrenar el modelo
        clf.fit(X_tr, y_tr)
        # Predecir y evaluar en el conjunto de validación
        y_pred = clf.predict(X_val)
        score = f1_score(y_val, y_pred, average='macro')
        scores.append(score)
    
    # Devolver la precisión media de los pliegues
    return np.mean(scores)

# Crear un estudio de Optuna para maximizar la precisión
study = optuna.create_study(direction='maximize')
# Ejecutar la optimización con un máximo de 30 pruebas
study.optimize(objective, n_trials=30)

# Imprimir los mejores hiperparámetros y la mejor precisión
print("Mejores hiperparámetros: ", study.best_params)
print("Mejor F1-score de validación: ", study.best_value)

# Entrenar el modelo final con los mejores hiperparámetros en todo el conjunto de entrenamiento
best_clf = MultinomialNB(
    alpha=study.best_params['alpha'],
    fit_prior=study.best_params['fit_prior']
)
best_clf.fit(X_train, y_train)

[I 2025-09-07 11:16:26,254] A new study created in memory with name: no-name-e930a137-d2fa-4c12-9497-46b653a7285f
[I 2025-09-07 11:16:26,569] Trial 0 finished with value: 0.745740912560087 and parameters: {'alpha': 0.029930550960227904, 'fit_prior': True}. Best is trial 0 with value: 0.745740912560087.
[I 2025-09-07 11:16:26,884] Trial 1 finished with value: 0.7545688436531444 and parameters: {'alpha': 0.004260584231418462, 'fit_prior': True}. Best is trial 1 with value: 0.7545688436531444.
[I 2025-09-07 11:16:27,206] Trial 2 finished with value: 0.7484418581040952 and parameters: {'alpha': 0.0005496016965175176, 'fit_prior': True}. Best is trial 1 with value: 0.7545688436531444.
[I 2025-09-07 11:16:27,515] Trial 3 finished with value: 0.5431709365807051 and parameters: {'alpha': 2.9085279717835943, 'fit_prior': True}. Best is trial 1 with value: 0.7545688436531444.
[I 2025-09-07 11:16:27,842] Trial 4 finished with value: 0.7327524806968804 and parameters: {'alpha': 0.05749239408154907

Mejores hiperparámetros:  {'alpha': 0.01129245740911025, 'fit_prior': False}
Mejor F1-score de validación:  0.7593239644835229


In [26]:
y_pred_optimizado = best_clf.predict(X_test)
f1_score(y_test, y_pred_optimizado, average='macro')

0.6894692817054662

In [27]:
# Definir la función objetivo para Optuna
def objective_complementnb(trial):
    # Sugerir hiperparámetros a optimizar
    alpha = trial.suggest_float('alpha', 1e-4, 10.0, log=True)  # Parámetro de suavizado en escala logarítmica
    fit_prior = trial.suggest_categorical('fit_prior', [True, False])  # Si se aprenden las probabilidades previas de las clases
    
    # Inicializar el clasificador con los hiperparámetros sugeridos
    clf = ComplementNB(alpha=alpha, fit_prior=fit_prior)
    
    # Configurar validación cruzada estratificada con 5 pliegues
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    
    # Iterar sobre los pliegues de validación cruzada
    for train_idx, val_idx in skf.split(X_train, y_train):
        # Dividir los datos en entrenamiento y validación
        X_tr, X_val = X_train[train_idx], X_train[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
        # Entrenar el modelo
        clf.fit(X_tr, y_tr)
        # Predecir y evaluar en el conjunto de validación
        y_pred = clf.predict(X_val)
        score = f1_score(y_val, y_pred, average='macro')
        scores.append(score)
    
    # Devolver la precisión media de los pliegues
    return np.mean(scores)

# Crear un estudio de Optuna para maximizar la precisión
study_complementnb = optuna.create_study(direction='maximize')
# Ejecutar la optimización con un máximo de 30 pruebas
study_complementnb.optimize(objective_complementnb, n_trials=30)

# Imprimir los mejores hiperparámetros y la mejor precisión
print("Mejores hiperparámetros: ", study_complementnb.best_params)
print("Mejor F1-score de validación: ", study_complementnb.best_value)

# Entrenar el modelo final con los mejores hiperparámetros en todo el conjunto de entrenamiento
best_clf_complementnb = ComplementNB(
    alpha=study_complementnb.best_params['alpha'],
    fit_prior=study_complementnb.best_params['fit_prior']
)
best_clf_complementnb.fit(X_train, y_train)

[I 2025-09-07 11:16:36,294] A new study created in memory with name: no-name-ae2f6e0b-57ac-4251-adda-30a3e454eded
[I 2025-09-07 11:16:36,630] Trial 0 finished with value: 0.7193585060642885 and parameters: {'alpha': 0.0012858633855637605, 'fit_prior': False}. Best is trial 0 with value: 0.7193585060642885.
[I 2025-09-07 11:16:36,962] Trial 1 finished with value: 0.7601018835900443 and parameters: {'alpha': 0.3303165817702721, 'fit_prior': False}. Best is trial 1 with value: 0.7601018835900443.
[I 2025-09-07 11:16:37,288] Trial 2 finished with value: 0.7558925411703783 and parameters: {'alpha': 0.03785651242488598, 'fit_prior': True}. Best is trial 1 with value: 0.7601018835900443.
[I 2025-09-07 11:16:37,618] Trial 3 finished with value: 0.7379050140293708 and parameters: {'alpha': 0.008142932694501538, 'fit_prior': False}. Best is trial 1 with value: 0.7601018835900443.
[I 2025-09-07 11:16:37,948] Trial 4 finished with value: 0.7046351145273082 and parameters: {'alpha': 0.0002882844350

Mejores hiperparámetros:  {'alpha': 0.1962092602879557, 'fit_prior': False}
Mejor F1-score de validación:  0.7643122055499413


In [28]:
y_pred_complementnb = best_clf_complementnb.predict(X_test)
f1_score(y_test, y_pred_complementnb, average='macro')

0.6999155081250106