In [42]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics import f1_score

## Consigna 1

Vectorizar documentos. Tomar 5 documentos al azar y medir similaridad con el resto de los documentos.
Estudiar los 5 documentos más similares de cada uno analizar si tiene sentido
la similaridad según el contenido del texto y la etiqueta de clasificación.

### Carga de datos y vectorización

In [60]:
# Carga del dataset 20newsgroups.
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

# Vectorización con TF-IDF del set de datos de entrenamiento.
tfidfvect = TfidfVectorizer()
X_train = tfidfvect.fit_transform(newsgroups_train.data) # Matriz documento-término
y_train = newsgroups_train.target

docs_count = X_train.shape[0]
print(f'El corpus consta de {docs_count} documentos\n')

print(f'Las clases en las que los documentos son clasificados son\n{newsgroups_test.target_names}\n')

# Porcentaje de documentos para cada clase.
for t in np.unique(newsgroups_test.target):
    print(f'El {np.count_nonzero(newsgroups_test.target == t) / docs_count:.2%} de los datos pertenece a la clase {newsgroups_test.target_names[t]}')

print('\nAlgunos de los términos del vectorizador:')
for term in np.sort(list(tfidfvect.vocabulary_.keys()))[20000:20020]:
    print(term)

El corpus consta de 11314 documentos

Las clases en las que los documentos son clasificados son
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

El 2.82% de los datos pertenece a la clase alt.atheism
El 3.44% de los datos pertenece a la clase comp.graphics
El 3.48% de los datos pertenece a la clase comp.os.ms-windows.misc
El 3.46% de los datos pertenece a la clase comp.sys.ibm.pc.hardware
El 3.40% de los datos pertenece a la clase comp.sys.mac.hardware
El 3.49% de los datos pertenece a la clase comp.windows.x
El 3.45% de los datos pertenece a la clase misc.forsale
El 3.50% de los datos pertenece a la clase rec.autos
El 3.52% de los datos pertenec

In [55]:

for t in np.unique(newsgroups_test.target):
    print(np.count_nonzero(newsgroups_test.target == t))

319
389
394
392
385
395
390
396
398
397
399
396
393
396
394
398
364
376
310
251


### Vectorizar 5 documentos y estudiar similitud

In [3]:
# Seleccionar al azar los índices de 5 documentos procedentes del set de entrenamiento.
# Para cada uno de ellos, obtener los índices de los 5 documentos más similares y almacenar
# en un diccionario.
indices = np.random.randint(0, X_train.shape[0] - 1, size=5)
doc_to_similars = {}
for i in indices:
    cos_sim = cosine_similarity(X_train[i], X_train)[0]
    most_sim = np.argsort(cos_sim)[::-1][1:6]
    doc_to_similars[i] = most_sim
    print(f'Los 5 documentos más similares al documento en índice {i} son: {most_sim}')

Los 5 documentos más similares al documento en índice 5795 son: [4352  372 9179  769 7432]
Los 5 documentos más similares al documento en índice 2503 son: [2503 2893 2411 6719 4166]
Los 5 documentos más similares al documento en índice 10283 son: [10999  9477  2873 10432 10682]
Los 5 documentos más similares al documento en índice 899 son: [6291 6662 9793  644 9623]
Los 5 documentos más similares al documento en índice 4558 son: [ 5217  1033   191 11251  8943]


#### Analizar documento 1

In [4]:
# El documento 1 es:
doc_idx = 0
print(newsgroups_train.data[indices[doc_idx]])

# Su target es:
target_doc = newsgroups_train.target_names[y_train[indices[doc_idx]]]
print(f'\nSu target es {target_doc}')

1976 Montreal Olympics philatelic souvenirs: 
 
1. Color-illustrated booklet in French/English containing all stamps 
issued for the Games (mint never hinged) in slipcase, over $6.00 
face value in stamps. $13.00 + $2.00 insured first class mailing 
 
2. Unusual "desk pad holder" with Olympic rings on the cover and the 
Montreal stadium inside. All the Canadian Olympic stamps are 
displayed on the "cover" under heavy plastic. Again, over $6.00
face value. $11.00 + $2.50 insured first class mailing. 

Su target es misc.forsale


In [5]:
# Los 5 más similares al documento 1 son:
targets_accum = 0
for i in range(5):
    j = doc_to_similars[indices[doc_idx]][i]
    print(newsgroups_train.data[j])
    target_similar = newsgroups_train.target_names[y_train[j]]
    targets_accum += 1 if target_doc == target_similar else 0
    print(f'\nSu target es {target_similar}')
    print('\n====================================================\n')

print(f'\nEl {100 * targets_accum / 5}% dos los 5 más similares comparten el mismo target.')

MLB Standings and Scores for Tuesday, April 6th, 1993
	                 (including yesterday's games)

NATIONAL WEST	      Won  Lost   Pct.    GB   Last 10   Streak    Home   Road
Atlanta Braves         01   00   1.000    --     1-0      Won 1   00-00  01-00
Cincinnati Reds        01   00   1.000    --     1-0      Won 1   01-00  00-00
San Diego Padres       00   00    .000   0.5     0-0       ---    00-00  00-00
San Francisco Giants   00   00    .000   0.5     0-0       ---    00-00  00-00
Colorado Rockies       00   01    .000   1.0     0-1     Lost 1   00-00  00-01
Houston Astros         00   01    .000   1.0     0-1     Lost 1   00-01  00-00
Los Angeles Dodgers    00   01    .000   1.0     0-1     Lost 1   00-00  00-01

NATIONAL EAST
Florida Marlins        01   00   1.000    --     1-0      Won 1   01-00  00-00
New York Mets          01   00   1.000    --     1-0      Won 1   01-00  00-00
Philadelphia Phillies  01   00   1.000    --     1-0      Won 1   00-00  01-00
Pittsburgh Pira

#### Analizar documento 2

In [6]:
# El documento 2 es:
doc_idx = 1
print(newsgroups_train.data[indices[doc_idx]])

# Su target es:
target_doc = newsgroups_train.target_names[y_train[indices[doc_idx]]]
print(f'\nSu target es {target_doc}')

:     Help!! I need code/package/whatever to take 3-D data and turn it into
: a wireframe surface with hidden lines removed. I'm using a DOS machine, and
: the code can be in ANSI C or C++, ANSI Fortran or Basic. The data I'm using
: forms a rectangular grid.
:    Please post your replies to the net so that others may benefit. IMHO, this
: is a general interest question.
:    Thank you!!!!!!


Su target es comp.graphics


In [7]:
# Los 5 más similares al documento 2 son:
targets_accum = 0
for i in range(5):
    j = doc_to_similars[indices[doc_idx]][i]
    print(newsgroups_train.data[j])
    target_similar = newsgroups_train.target_names[y_train[j]]
    targets_accum += 1 if target_doc == target_similar else 0
    print(f'\nSu target es {target_similar}')
    print('\n====================================================\n')

print(f'\nEl {100 * targets_accum / 5}% dos los 5 más similares comparten el mismo target.')

:     Help!! I need code/package/whatever to take 3-D data and turn it into
: a wireframe surface with hidden lines removed. I'm using a DOS machine, and
: the code can be in ANSI C or C++, ANSI Fortran or Basic. The data I'm using
: forms a rectangular grid.
:    Please post your replies to the net so that others may benefit. IMHO, this
: is a general interest question.
:    Thank you!!!!!!


Su target es comp.graphics


I am working on a program to display 3d wireframe models with the user
being able to arbitrarily change any of the viewing parameters.  Also,
the wireframe objects are also going to have dynamic attributes so
that they can move around while the user is "exploring" the wireframe
world.

To do this, I am thinking of using the SRGP package described in the
Van Dam, Foley and Feiner book, but I was wondering if there was
another PD graphics package out there which was faster.  I would like
to make the program as fast as possible so that it provides
satisfactory real time 

#### Analizar documento 3

In [8]:
# El documento 3 es:
doc_idx = 2
print(newsgroups_train.data[indices[doc_idx]])

# Su target es:
target_doc = newsgroups_train.target_names[y_train[indices[doc_idx]]]
print(f'\nSu target es {target_doc}')


I've noticed some of you mentioning owning a Quadra 800 8/230 with CD300
and 1meg of VRAM.  It seems that this configuration was purchased
complete; that is, the CD300 and VRAM were already installed in the box. 
I am interested in that exact configuration and will be buying with an
educational discount but have not found the CD300 bundled with any Q800
smaller than the 8/500.
If you bought or know how to buy the 8/230 with CD installed, please let
me know what you know via email:
send messages to dmaluso@mhc.mtholyoke.edu
Thanks, all.
Diane Maluso
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
=     Diane Maluso            INTERNET:  dmaluso@mhc.mtholyoke.edu     =
=     Department of Psychology and Education                           =
=     Mount Holyoke College                                            =
=     South Hadley, MA  01075                                          =
=     (413) 538-2107                                                   =
=-=-=-=

In [9]:
# Los 5 más similares al documento 3 son:
targets_accum = 0
for i in range(5):
    j = doc_to_similars[indices[doc_idx]][i]
    print(newsgroups_train.data[j])
    target_similar = newsgroups_train.target_names[y_train[j]]
    targets_accum += 1 if target_doc == target_similar else 0
    print(f'\nSu target es {target_similar}')
    print('\n====================================================\n')

print(f'\nEl {100 * targets_accum / 5}% dos los 5 más similares comparten el mismo target.')




    To display Millions of colors on a 16" monitor you need 2MB of VRAM
in the Q950.


    Correct. This is the amount of on-board VRAM that the Q800 comes
with.


    Yes this is possible. Technically, you only need to take out 2 of
the VRAM SIMMs but leaving in the other two will not get you anything
because the 950 cannot really do anything with 1.5MB of VRAM. It only
knows 1MB and 2MB.


    Yes, this is correct. You get to 1MB by putting 2 256k VRAM SIMMs
into the VRAM SIMM slots on the Q800's motherboard.


Su target es comp.sys.mac.hardware


Hi there,


I have a question regarding Quadras VRAM. I have tried to find info on this
but I could not get precise answers.

On one hand, we have a Quadra 950 with a 16" monitor, which is capable of
32-bit color. How much VRAM does it have?
On the other hand, we have a Quadra 800 with a 16" monitor, which is capable
of 8-bit color only, so it must have 512 Ko of VRAM.

I would like to take VRAM SIMMs for the 950 and put them in the 800 

#### Analizar documento 4

In [10]:
# El documento 4 es:
doc_idx = 3
print(newsgroups_train.data[indices[doc_idx]])

# Su target es:
target_doc = newsgroups_train.target_names[y_train[indices[doc_idx]]]
print(f'\nSu target es {target_doc}')



It's funny, though, coming from you.


Who is it that executes these "pin-point attacks" on Israelis?  The
guys in the white hats or the ones in the black hats?  Neither?  You
mean that they are just civilians, farmers, teachers, school children?
Well, maybe they ARE terrorists, after all?  And maybe that
"propaganda" was correct, too?  Hmm?


Su target es talk.politics.mideast


In [11]:
# Los 5 más similares al documento 4 son:
targets_accum = 0
for i in range(5):
    j = doc_to_similars[indices[doc_idx]][i]
    print(newsgroups_train.data[j])
    target_similar = newsgroups_train.target_names[y_train[j]]
    targets_accum += 1 if target_doc == target_similar else 0
    print(f'\nSu target es {target_similar}')
    print('\n====================================================\n')

print(f'\nEl {100 * targets_accum / 5}% dos los 5 más similares comparten el mismo target.')

# |> >
# |> >It is NOT a "terrorist camp" as you and the Israelis like 
# |> >to view the villages they are small communities with kids playing soccer
# |> >in the streets, women preparing lunch, men playing cards, etc.....
# |> >SOME young men, usually aged between 17 to 30 years are members of
# |> >the Lebanese resistance.  Even the inhabitants of the village do not 
# |> >know who these are, they are secretive about it, but most people often
# |> >suspect who they are and what they are up to.  These young men are
# |> >supported financially by Iran most of the time.  They sneak arms and
# |> >ammunitions into the occupied zone where they set up booby traps
# |> >for Israeli patrols.  Every time an Israeli soldier is killed or injured
# |> >by these traps, Israel retalliates by indiscriminately bombing villages
# |> >of their own choosing often killing only innocent civilians.  
# |> 
# |> This a "tried and true" method utilized by guerilla and terrorists groups:
# |> to conduct ope

#### Analizar documento 5

In [12]:
# El documento 5 es:
doc_idx = 4
print(newsgroups_train.data[indices[doc_idx]])

# Su target es:
target_doc = newsgroups_train.target_names[y_train[indices[doc_idx]]]
print(f'\nSu target es {target_doc}')


 +----------------------------------------------------------------------------+
 | Kevin Marshall, Operational Support, Motorola ECID, Swindon, UK.           |
 | E-mail   : marshalk@zeus                                                   |
 | Phone    : +44 793 545127 (International)    (0793) 545127 (Domestic)      |
 +----------------------------------------------------------------------------+


Su target es comp.os.ms-windows.misc


In [13]:
# Los 5 más similares al documento 5 son:
targets_accum = 0
for i in range(5):
    j = doc_to_similars[indices[doc_idx]][i]
    print(newsgroups_train.data[j])
    target_similar = newsgroups_train.target_names[y_train[j]]
    targets_accum += 1 if target_doc == target_similar else 0
    print(f'\nSu target es {target_similar}')
    print('\n====================================================\n')

print(f'\nEl {100 * targets_accum / 5}% dos los 5 más similares comparten el mismo target.')

I tried to E-mail you, but the message bounced.

Motorola has a University Support Program through which (I've been told) folks
at schools can get sample quantities of parts.  If you'd like to try this 
route, e-mail me for the address/phone #...I don't wish to post it for all the
world to see.

Su target es sci.electronics


Heavy-duty, commercial, TINY,(6x3x1/2 inch) WATERPROOF, VHF 2 watt, 2 channel,
handheld two-way radio.  MOTOROLA EXPO purchased NEW for Amateur frequencies
146.10/70 & 146.34/94.  Absolute  M I N T  condition!  Never scratched, dropped,
opened, or otherwise "comprosmised"!  Can be re-crystaled for business band.
has PL slot.  
                                           Original Price:

MOTOROLA EXPO VHF 2WATT/2CHAN. HT--------------------$1200.00
(comes with portable charger, antenna, manual, 
NEW Ni-Cad pack, back housing belt clip)
MOTOROLA extra NEW Ni-Cad pack-----------------------$  40.00
MOTOROLA extra VHF rubber-duckie antenna-------------$  12.50
MOTOROLA

### Comentarios

Se tomaron los 5 documentos más similares a cada uno de 5 documentos elegidos al azar. Se usó como métrica la distancia del coseno y a TF-IDF como técnica de vectorización. Para cada uno de los documentos analizados se calculó el porcentaje de los 5 más similares que poseen el mismo target que el documento analizado. En sucesivas pruebas se observó que a veces dicho porcentaje fue inferior al 50%, lo que sugiere que la técnica de vectorización empleada podría ser mejorada.

## Consigna 2

Entrenar modelos de clasificación Naive Bayes para maximizar el desempeño de clasificación
(f1-score macro) en el conjunto de datos de test. Considerar cambiar parámteros
de instanciación del vectorizador y los modelos y probar modelos de Naïve Bayes Multinomial
y ComplementNB.

### Naive Bayes y Complement Naive Bayes usando la vectorización obtenida hasta aquí y parámetros por defecto

In [43]:
# Instanciar y fitear un multinomial Naive Bayes classifier.
mult_nb = MultinomialNB()
mult_nb.fit(X_train, y_train)

# Vectorizar set de datos de testing y realizar predict.
X_test = tfidfvect.transform(newsgroups_test.data)
y_test = newsgroups_test.target
y_pred =  mult_nb.predict(X_test)

# Calcular F1-score macro.
print(f'F1 score macro con multinomial Naive Bayes:', f1_score(y_test, y_pred, average='macro'))

# Instanciar y fitear un complement Naive Bayes classifier.
compl_nb = ComplementNB()
compl_nb.fit(X_train, y_train)

# Realizar predict.
y_pred = compl_nb.predict(X_test)

# Calcular F1-score macro.
print(f'F1 score macro con complement Naive Bayes:', f1_score(y_test, y_pred, average='macro'))

F1 score macro con multinomial Naive Bayes: 0.5854345727938506
F1 score macro con complement Naive Bayes: 0.692953349950875
