In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics import f1_score

## Consigna 1

Vectorizar documentos. Tomar 5 documentos al azar y medir similaridad con el resto de los documentos.
Estudiar los 5 documentos más similares de cada uno analizar si tiene sentido
la similaridad según el contenido del texto y la etiqueta de clasificación.

### Carga de datos y vectorización

In [2]:
# Carga del dataset 20newsgroups.
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

# Vectorización con TF-IDF del set de datos de entrenamiento.
tfidfvect = TfidfVectorizer()
X_train = tfidfvect.fit_transform(newsgroups_train.data) # Matriz documento-término
y_train = newsgroups_train.target

docs_count = X_train.shape[0]
print(f'El corpus consta de {docs_count} documentos\n')

print(f'Las clases en las que los documentos son clasificados son\n{newsgroups_test.target_names}\n')

# Porcentaje de documentos para cada clase.
for t in np.unique(newsgroups_test.target):
    print(f'El {np.count_nonzero(newsgroups_test.target == t) / docs_count:.2%} de los datos pertenece a la clase {newsgroups_test.target_names[t]}')

print('\nAlgunos de los términos del vectorizador:')
for term in np.sort(list(tfidfvect.vocabulary_.keys()))[20000:20020]:
    print(term)

El corpus consta de 11314 documentos

Las clases en las que los documentos son clasificados son
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

El 2.82% de los datos pertenece a la clase alt.atheism
El 3.44% de los datos pertenece a la clase comp.graphics
El 3.48% de los datos pertenece a la clase comp.os.ms-windows.misc
El 3.46% de los datos pertenece a la clase comp.sys.ibm.pc.hardware
El 3.40% de los datos pertenece a la clase comp.sys.mac.hardware
El 3.49% de los datos pertenece a la clase comp.windows.x
El 3.45% de los datos pertenece a la clase misc.forsale
El 3.50% de los datos pertenece a la clase rec.autos
El 3.52% de los datos pertenec

In [3]:

for t in np.unique(newsgroups_test.target):
    print(np.count_nonzero(newsgroups_test.target == t))

319
389
394
392
385
395
390
396
398
397
399
396
393
396
394
398
364
376
310
251


### Vectorizar 5 documentos y estudiar similitud

In [4]:
# Seleccionar al azar los índices de 5 documentos procedentes del set de entrenamiento.
# Para cada uno de ellos, obtener los índices de los 5 documentos más similares y almacenar
# en un diccionario.
indices = np.random.randint(0, X_train.shape[0] - 1, size=5)
doc_to_similars = {}
for i in indices:
    cos_sim = cosine_similarity(X_train[i], X_train)[0]
    most_sim = np.argsort(cos_sim)[::-1][1:6]
    doc_to_similars[i] = most_sim
    print(f'Los 5 documentos más similares al documento en índice {i} son: {most_sim}')

Los 5 documentos más similares al documento en índice 4558 son: [ 5217  1033   191 11251  8943]
Los 5 documentos más similares al documento en índice 5276 son: [2900 9623 4663 8224 7367]
Los 5 documentos más similares al documento en índice 7155 son: [10836  6894  5856   913  6272]
Los 5 documentos más similares al documento en índice 4912 son: [10370  8991  8691  2475  1275]
Los 5 documentos más similares al documento en índice 10260 son: [  887 10970  8107  6558  5160]


#### Analizar documento 1

In [5]:
# El documento 1 es:
doc_idx = 0
print(newsgroups_train.data[indices[doc_idx]])

# Su target es:
target_doc = newsgroups_train.target_names[y_train[indices[doc_idx]]]
print(f'\nSu target es {target_doc}')


 +----------------------------------------------------------------------------+
 | Kevin Marshall, Operational Support, Motorola ECID, Swindon, UK.           |
 | E-mail   : marshalk@zeus                                                   |
 | Phone    : +44 793 545127 (International)    (0793) 545127 (Domestic)      |
 +----------------------------------------------------------------------------+


Su target es comp.os.ms-windows.misc


In [6]:
# Los 5 más similares al documento 1 son:
targets_accum = 0
for i in range(5):
    j = doc_to_similars[indices[doc_idx]][i]
    print(newsgroups_train.data[j])
    target_similar = newsgroups_train.target_names[y_train[j]]
    targets_accum += 1 if target_doc == target_similar else 0
    print(f'\nSu target es {target_similar}')
    print('\n====================================================\n')

print(f'\nEl {100 * targets_accum / 5}% dos los 5 más similares comparten el mismo target.')

I tried to E-mail you, but the message bounced.

Motorola has a University Support Program through which (I've been told) folks
at schools can get sample quantities of parts.  If you'd like to try this 
route, e-mail me for the address/phone #...I don't wish to post it for all the
world to see.

Su target es sci.electronics


Heavy-duty, commercial, TINY,(6x3x1/2 inch) WATERPROOF, VHF 2 watt, 2 channel,
handheld two-way radio.  MOTOROLA EXPO purchased NEW for Amateur frequencies
146.10/70 & 146.34/94.  Absolute  M I N T  condition!  Never scratched, dropped,
opened, or otherwise "comprosmised"!  Can be re-crystaled for business band.
has PL slot.  
                                           Original Price:

MOTOROLA EXPO VHF 2WATT/2CHAN. HT--------------------$1200.00
(comes with portable charger, antenna, manual, 
NEW Ni-Cad pack, back housing belt clip)
MOTOROLA extra NEW Ni-Cad pack-----------------------$  40.00
MOTOROLA extra VHF rubber-duckie antenna-------------$  12.50
MOTOROLA

#### Analizar documento 2

In [7]:
# El documento 2 es:
doc_idx = 1
print(newsgroups_train.data[indices[doc_idx]])

# Su target es:
target_doc = newsgroups_train.target_names[y_train[indices[doc_idx]]]
print(f'\nSu target es {target_doc}')


Someone else said something similar.  I will not comment on the
value or lack of value of Elias's "proposal".  I just want to say
that it is very distressing that at least two people here are
profoundly ignorant of Nazi racial doctrine.  They were NOT
like Elias's idea, they were more like the opposite.  

Nazis believed in racial purity, not racial assimilation.  An 
instructive example is the Nazi attitude to Gypsies.  According to 
Nazi theoreticians, Gypsies were an Aryan race.  They were persecuted,
and in huge numbers murdered, because most European Gypies were
considered not pure Gypsies but "mongrels" formed from the pure Gypsy 
race and other undesirable races.  This was the key difference between 
the theoretical approach to Jews and Gypsies, by the way.  It is also 
true that towards the end of WWII even the "purist" Gypsies were 
hunted down as the theory was forgotten.

Su target es talk.politics.mideast


In [8]:
# Los 5 más similares al documento 2 son:
targets_accum = 0
for i in range(5):
    j = doc_to_similars[indices[doc_idx]][i]
    print(newsgroups_train.data[j])
    target_similar = newsgroups_train.target_names[y_train[j]]
    targets_accum += 1 if target_doc == target_similar else 0
    print(f'\nSu target es {target_similar}')
    print('\n====================================================\n')

print(f'\nEl {100 * targets_accum / 5}% dos los 5 más similares comparten el mismo target.')

Only Brendan McKay, or maybe ARF, would come to the rescue of Nazi
racial theory.  Is it distressing Brendan?  The point is that any
eugenic solution to the Jewish Problem as Elias has proposed smacks
of pure Nazism.  The fact that Elias' proposal cast the entire "problem"
as one of the abnormal presence of Israeli society in the Middle East,
and that he buried a slam against U.S. aid to Israel in the midst of
his "even-handed" solution of the Jewish Question, made it obvious what 
he had in mind: disolving the Jewish polity.  That *is* a Nazi doctrine:
rectification of the "abnormal presence" of the Jewish people within a 
larger body politic.  Whether your "solution" involves gas, monetary 
incentives to the poor Jews to marry out, or as Feisal Husseini has 
said, "disolve the Zionist entity by forcing it to engage the normal 
surrounding Arab culture," you are engaged in a Nazi project.

Just as obvious is your statement: "I will not comment on the value
or lack of value of Elias's 

#### Analizar documento 3

In [9]:
# El documento 3 es:
doc_idx = 2
print(newsgroups_train.data[indices[doc_idx]])

# Su target es:
target_doc = newsgroups_train.target_names[y_train[indices[doc_idx]]]
print(f'\nSu target es {target_doc}')




I hope you realize how trivial it is to manufacture these compounds.  Given
about $10k in lab equipment and chemicals (which are commercially available)
and given the knowledge that I have (graduating BS, Ch, 1993) I could 
synthesize enough of these compounds to make a serious dent in the population
of several major US cities.  As also noted, the knowledge is there for
the production of nuclear weapons.  It's not even that restricted.  The
only thing is the expense.  

Now I'm not going around making these things, but it's not 'cause of any
law; I simply don't get any marginal benefit out of killing anyone.  Any
law you enact in this respect is only going to give you the ability to 
add a charge against someone who does make and use said weapons.  In the
case of chemical agents, I seriously doubt that you would even know that
someone had set up a lab until after the weapons had been used.  

Part of the trouble with the chemical-weapons ban treaty between the US
and the USSR is tha

In [10]:
# Los 5 más similares al documento 3 son:
targets_accum = 0
for i in range(5):
    j = doc_to_similars[indices[doc_idx]][i]
    print(newsgroups_train.data[j])
    target_similar = newsgroups_train.target_names[y_train[j]]
    targets_accum += 1 if target_doc == target_similar else 0
    print(f'\nSu target es {target_similar}')
    print('\n====================================================\n')

print(f'\nEl {100 * targets_accum / 5}% dos los 5 más similares comparten el mismo target.')

Archive-name: atheism/faq
Alt-atheism-archive-name: faq
Last-modified: 5 April 1993
Version: 1.1

                    Alt.Atheism Frequently-Asked Questions

This file contains responses to articles which occur repeatedly in
alt.atheism.  Points covered here are ones which are not covered in the
"Introduction to Atheism"; you are advised to read that article as well
before posting.

These answers are not intended to be exhaustive or definitive. The purpose of
the periodic FAQ postings is not to stifle debate, but to raise its level. If
you have something to say concerning one of these questions and which isn't
covered by the answer given, please feel free to make your point.

Overview of contents:

   "What is the purpose of this newsgroup?"
   "Hitler was an atheist!"
   "The Bible proves it"
   "Pascal's Wager"
   "What is Occam's Razor?"
   "Why it's good to believe in Jesus"
   "Why I know that God exists"
   "Einstein and "God does not play dice""
   "Everyone worships something"


#### Analizar documento 4

In [11]:
# El documento 4 es:
doc_idx = 3
print(newsgroups_train.data[indices[doc_idx]])

# Su target es:
target_doc = newsgroups_train.target_names[y_train[indices[doc_idx]]]
print(f'\nSu target es {target_doc}')

Here it is

Zoom 14.4k  FAX/DATA v.32bis modem.  I have evreything only purchased in
January.  Will happily provide the Fax/Comm. software and BOX and manuals.
I am selling this for ONLY $125+s/h COD.

Nicolas Nowinski
703-435-9590 FEEL FREE TO CALL for quickest service.

Su target es misc.forsale


In [12]:
# Los 5 más similares al documento 4 son:
targets_accum = 0
for i in range(5):
    j = doc_to_similars[indices[doc_idx]][i]
    print(newsgroups_train.data[j])
    target_similar = newsgroups_train.target_names[y_train[j]]
    targets_accum += 1 if target_doc == target_similar else 0
    print(f'\nSu target es {target_similar}')
    print('\n====================================================\n')

print(f'\nEl {100 * targets_accum / 5}% dos los 5 más similares comparten el mismo target.')

I would like to know about the current fax software available for
Windows.  Does it take a 9600 baud fax/modem or 14.4K ?  Please
respond with info.

Thank you very much

Su target es comp.os.ms-windows.misc


I've recently ordered a Centris 650 and need to decide on which modem to
buy.  I'm pretty sure I want to get a fax/data modem that can run at 14.4k,
but is it worth it?  I'll primarily only be 'conversing' over CompuServe or
some other link to the Internet, but I'm not sure if those systems can
supply ME with data at 14.4k.  Another question I have is in some of the
modem lingo out there.  I understand baud rates, but what does V3.4 and
V3.4bis mean?  I could really use some suggestions as to what a good modem
for around $300 would be, and why it would be a good choice.

Thanks for your time.

Dave Bell
dbell@coral.bucknell.edu

Su target es comp.sys.mac.hardware


Finally a fax service to all Internet users in the continental U.S. without
prepayment of any kind. This service is 

#### Analizar documento 5

In [13]:
# El documento 5 es:
doc_idx = 4
print(newsgroups_train.data[indices[doc_idx]])

# Su target es:
target_doc = newsgroups_train.target_names[y_train[indices[doc_idx]]]
print(f'\nSu target es {target_doc}')

I'm attempting to transfer files from my home computer running
Windows 3.1 Terminal to a workstation at school.  The file transfer protocol
at home is Kermit for binary files.  I'm running Kermit on the workstation at school and
setting the file transfer protocol to binary.  I am unable to upload files
to school but can download files from school to home.  During download,
Terminal displays ther retrying message several times then the message '
Verify you're using the correct protocol'.  
	Anyone have any ideas on how to fix?  Either e-mail or post to this
group. 

Thanks, in advance,

Su target es comp.os.ms-windows.misc


In [14]:
# Los 5 más similares al documento 5 son:
targets_accum = 0
for i in range(5):
    j = doc_to_similars[indices[doc_idx]][i]
    print(newsgroups_train.data[j])
    target_similar = newsgroups_train.target_names[y_train[j]]
    targets_accum += 1 if target_doc == target_similar else 0
    print(f'\nSu target es {target_similar}')
    print('\n====================================================\n')

print(f'\nEl {100 * targets_accum / 5}% dos los 5 más similares comparten el mismo target.')


I'm an new to this.  Having found some files (public) to look into, I
ftp'ed them to a system I have access to.  I then used kermit to transmit
them via modem to my host computer, a PC-based file system.  I access
internet through modem access to a university mainframe.  From the PC
file server, I pull the files to a disk, and then pull them from disk
to a SGI Indigo (the SGI is not networked yet).  When I try to uncompress
and un-tar the files, they either come out as garbage or I get an error
in the tar process about directories being invalid.
What I'm wondering about is the transfer of UNIX files (compressed,
binary,ascii) about multiple platforms.  My guess is that it is the copy
to a 'dos' disk that is screwing things up.  Any help is appreciated.
bob


Su target es comp.windows.x


As a beginer, I just wonder how to transfer files from the Sun 
system (which is on the network) to my PC at home (not connected
to the network). I tried to use 'COMit' to do so, but it was very
slow 

### Comentarios

Se tomaron los 5 documentos más similares a cada uno de 5 documentos elegidos al azar. Se usó como métrica la distancia del coseno y a TF-IDF como técnica de vectorización. Para cada uno de los documentos analizados se calculó el porcentaje de los 5 más similares que poseen el mismo target que el documento analizado. En sucesivas pruebas se observó que a veces dicho porcentaje fue inferior al 50%, lo que sugiere que la técnica de vectorización empleada podría ser mejorada.

## Consigna 2

Entrenar modelos de clasificación Naive Bayes para maximizar el desempeño de clasificación
(f1-score macro) en el conjunto de datos de test. Considerar cambiar parámteros
de instanciación del vectorizador y los modelos y probar modelos de Naive Bayes Multinomial
y ComplementNB.

### Naive Bayes y Complement Naive Bayes usando la vectorización obtenida hasta aquí y parámetros por defecto

In [15]:
# Instanciar y fitear un multinomial Naive Bayes classifier.
mult_nb = MultinomialNB()
mult_nb.fit(X_train, y_train)

# Vectorizar set de datos de testing y realizar predict.
X_test = tfidfvect.transform(newsgroups_test.data)
y_test = newsgroups_test.target
y_pred =  mult_nb.predict(X_test)

# Calcular F1-score macro.
print(f'F1 score macro con multinomial Naive Bayes:', f1_score(y_test, y_pred, average='macro'))

# Instanciar y fitear un complement Naive Bayes classifier.
compl_nb = ComplementNB()
compl_nb.fit(X_train, y_train)

# Realizar predict.
y_pred = compl_nb.predict(X_test)

# Calcular F1-score macro.
print(f'F1 score macro con complement Naive Bayes:', f1_score(y_test, y_pred, average='macro'))

F1 score macro con multinomial Naive Bayes: 0.5854345727938506
F1 score macro con complement Naive Bayes: 0.692953349950875


In [24]:
# Volver a instanciar el vectorizador, esta vez usando stop words del inglés.
tfidfvect = TfidfVectorizer(stop_words='english')
X_train = tfidfvect.fit_transform(newsgroups_train.data)

X_test = tfidfvect.transform(newsgroups_test.data)

# Instanciar y fitear un complement Naive Bayes classifier.
compl_nb = ComplementNB()
compl_nb.fit(X_train, y_train)

# Realizar predict.
y_pred = compl_nb.predict(X_test)

print(f'F1 score macro con complement Naive Bayes:', f1_score(y_test, y_pred, average='macro'))

F1 score macro con complement Naive Bayes: 0.6936107849650025
