In [2]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
train_data = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
test_data = fetch_20newsgroups(subset='test')

print("Training texts:", len(train_data.data))
print("Test texts:", len(test_data.data))

Training texts: 11314
Test texts: 7532


In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Pasamos el fichero a una lista (una línea por item)
with open('words.txt') as f:
    dictionary = f.read().splitlines()

In [9]:
vectorizer = CountVectorizer(binary= False, vocabulary=dictionary, stop_words='english', ngram_range = (1,1))

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
dir(train_data)

['DESCR', 'data', 'filenames', 'target', 'target_names']

In [38]:
train_data.target[0:100]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4,  8, 19,  4, 14,  6,  0,  1,
        7, 12,  5,  0, 10,  6,  2,  4,  1, 12,  9, 15,  7,  6, 13, 12, 17,
       18, 10,  8, 11,  8, 16,  9,  4,  3,  9,  9,  4,  4,  8, 12, 14,  5,
       15,  2, 13, 17, 11,  7, 10,  2, 14, 12,  5,  4,  6,  7,  0, 11, 16,
        0,  6, 17,  7, 12,  7,  3, 12, 11,  7,  2,  2,  0, 16,  1,  2,  7,
        3,  2,  1, 10, 12, 12, 17, 12,  2,  8,  8, 18,  5,  0,  1])

In [329]:
set(train_data.target)

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}

In [330]:
set(train_data.target_names)

{'alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc'}

Esto es para saber que hay suficientes elementos de cada tipo, por lo que podremos hacer selecciones aleatorias hasta tener 3 de cada

In [182]:
import numpy as np

In [48]:
unique, counts = np.unique(train_data.target, return_counts=True)
dict(zip(unique, counts))

{0: 480,
 1: 584,
 2: 591,
 3: 590,
 4: 578,
 5: 593,
 6: 585,
 7: 594,
 8: 598,
 9: 597,
 10: 600,
 11: 595,
 12: 591,
 13: 594,
 14: 593,
 15: 599,
 16: 546,
 17: 564,
 18: 465,
 19: 377}

In [18]:
train_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [20]:
import numpy as np
import numpy.ma as ma

def write_terms (feature_names, data, vector_data, index):
    '''
    Escribe los términos presentes en un mensaje representado como bolsa de palabras.
    
    - feature_names: terminos usados para vectorizar
    - data: lista de mensajes original (si data==None no se muestra el mensaje original)
    - vector_data: matriz (dispersa) de mensaje vectorizados
    - index: posición del mensaje a mostrar
    '''
    # máscara para seleccionar sólo el mensaje en posición index
    mask=vector_data[index,:]>0
    
    # términos que aparecen en ese mensaje vectorizado
    terminos = ma.array(feature_names, mask = ~(mask[0].toarray()))
    
    # mostrar mensaje original
    if data:
        print('Mensaje', index, ':', data[index])
    
    # mostrar términos que aparecen en el mensaje vectorizado
    print('Mensaje', index, 'vectorizado:', terminos.compressed(),'\n')

In [21]:
train_vector_data = vectorizer.fit_transform(train_data.data)

In [334]:
print(train_vector_data)

  (0, 4151)	1
  (0, 52160)	1
  (0, 53807)	1
  (0, 56907)	1
  (0, 78575)	1
  (0, 112523)	1
  (0, 112565)	1
  (0, 124009)	1
  (0, 124372)	1
  (0, 149051)	1
  (0, 177802)	1
  (0, 184493)	1
  (0, 185287)	1
  (0, 190955)	1
  (0, 208543)	1
  (0, 213043)	1
  (0, 218699)	1
  (0, 221810)	1
  (0, 221819)	1
  (0, 225798)	1
  (0, 242690)	1
  (0, 251793)	1
  (0, 271493)	1
  (0, 306014)	1
  (0, 314710)	1
  :	:
  (11313, 12051)	1
  (11313, 31047)	1
  (11313, 110477)	1
  (11313, 124009)	1
  (11313, 163078)	1
  (11313, 170696)	1
  (11313, 192874)	1
  (11313, 217697)	1
  (11313, 218699)	1
  (11313, 239824)	1
  (11313, 265072)	2
  (11313, 271493)	1
  (11313, 306014)	1
  (11313, 340787)	1
  (11313, 359935)	1
  (11313, 360545)	1
  (11313, 365530)	1
  (11313, 385115)	2
  (11313, 389038)	1
  (11313, 391937)	1
  (11313, 398618)	1
  (11313, 400462)	1
  (11313, 410906)	1
  (11313, 413140)	1
  (11313, 419956)	1


In [22]:
feature_names = vectorizer.get_feature_names()

In [31]:
print(feature_names[265072])

number


In [35]:
write_terms(feature_names, None, train_vector_data, 1)

Mensaje 1 vectorizado: ['acceleration' 'adapters' 'answered' 'article' 'attained' 'brave' 'cards'
 'clock' 'days' 'detailing' 'disk' 'especially' 'experiences' 'final'
 'floppy' 'floppies' 'functionality' 'heat' 'hour' 'keywords' 'knowledge'
 'lines' 'message' 'network' 'number' 'organization' 'oscillator'
 'posting' 'procedure' 'rated' 'reports' 'requested' 'send' 'shared'
 'sinks' 'souls' 'speed' 'subject' 'summary' 'summarizing' 'thanks'
 'upgrade' 'upgraded' 'usage'] 



In [262]:
type(train_vector_data)

scipy.sparse.csr.csr_matrix

Procedemos a elegir 3 mensajes de cada tipo

In [263]:
import random

In [190]:
len(selected)

19

In [324]:
#while the last element of the sum of all the previous elements (which is the sum of the total selectec texts) 
#    is less than the desired amount, it will keep choosing candidates

selected = [0]*len(train_data.target_names)

number_samples = 3

selected_index = [0]*len(train_data.target_names)*number_samples

while (np.cumsum(selected)[-1] < len(selected)*number_samples):

    #Indice aleatorio de la lista de mensajes
    index = random.randint(0, len(train_data.target))
    #cat = numero de la categoria a la que pertenece el índice elegido
    cat = train_data.target[index]
    #Comprobamos que no haya ya 3 elementos de la categoría a la que pertenezca el index y lo añadimos a la cuenta y a la lista de índices
    #La lista de índices estará ordenada por categorías
    if(selected[cat] < 3):
        selected_index[cat*number_samples + selected[cat]] = index
        selected[cat] += 1

In [325]:
selected

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [326]:
np.cumsum(selected)[-1]

60

In [327]:
len(selected_index)

60

In [328]:
selected_index

[2252,
 648,
 1054,
 3496,
 4017,
 2472,
 8424,
 6318,
 9744,
 7022,
 11193,
 8097,
 5928,
 8737,
 3815,
 3850,
 7063,
 10923,
 6330,
 8413,
 3780,
 2834,
 9789,
 11077,
 3519,
 8821,
 3910,
 10579,
 10810,
 7588,
 4286,
 4783,
 9175,
 2899,
 3783,
 2388,
 9137,
 3014,
 8743,
 9489,
 4761,
 2360,
 6387,
 10371,
 10839,
 5696,
 2590,
 11253,
 10698,
 7333,
 8503,
 4288,
 6442,
 8847,
 8532,
 3316,
 9622,
 4805,
 7312,
 692]

In [331]:
from sklearn.metrics.pairwise import cosine_similarity

In [348]:
type(train_vector_data[2252])

scipy.sparse.csr.csr_matrix

In [357]:
indices = selected_index[0:3]
print(indices)
out1 = train_vector_data.tocsc()[indices,:]

[2252, 648, 1054]


In [358]:
print(out1)

  (2, 9)	1
  (1, 1425)	1
  (0, 1517)	1
  (1, 1804)	1
  (2, 3763)	1
  (2, 4194)	1
  (2, 6470)	1
  (2, 6989)	2
  (0, 11102)	1
  (1, 14528)	1
  (2, 20371)	1
  (2, 21179)	1
  (1, 25101)	1
  (2, 25101)	2
  (0, 25101)	1
  (0, 26597)	1
  (0, 27385)	1
  (0, 27387)	1
  (1, 28166)	1
  (2, 36460)	1
  (1, 38124)	2
  (2, 56946)	1
  (2, 59997)	1
  (1, 61234)	1
  (2, 61477)	1
  :	:
  (2, 389038)	2
  (0, 389038)	1
  (0, 391187)	1
  (2, 404001)	1
  (1, 405626)	1
  (2, 405626)	1
  (2, 405634)	1
  (0, 406443)	1
  (1, 406503)	1
  (2, 407247)	1
  (1, 408810)	1
  (1, 428525)	1
  (1, 445173)	1
  (1, 448144)	1
  (2, 448756)	2
  (1, 448800)	1
  (2, 453242)	2
  (2, 459301)	1
  (2, 461063)	1
  (2, 463885)	1
  (0, 463885)	1
  (2, 463913)	1
  (1, 463924)	1
  (1, 463986)	2
  (2, 463986)	1


In [343]:
cosSim = cosine_similarity(train_vector_data[2252], train_vector_data)

In [346]:
print(max(cosSim))

[0.08141255 0.05307449 0.12856801 ... 0.04450431 0.11214033 0.06394568]
