In [2]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
train_data = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
test_data = fetch_20newsgroups(subset='test')

print("Training texts:", len(train_data.data))
print("Test texts:", len(test_data.data))

Training texts: 11314
Test texts: 7532


In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Pasamos el fichero a una lista (una línea por item)
with open('words.txt') as f:
    dictionary = f.read().splitlines()

In [9]:
vectorizer = CountVectorizer(binary= False, vocabulary=dictionary, stop_words='english', ngram_range = (1,1))

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
dir(train_data)

['DESCR', 'data', 'filenames', 'target', 'target_names']

In [38]:
train_data.target[0:100]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4,  8, 19,  4, 14,  6,  0,  1,
        7, 12,  5,  0, 10,  6,  2,  4,  1, 12,  9, 15,  7,  6, 13, 12, 17,
       18, 10,  8, 11,  8, 16,  9,  4,  3,  9,  9,  4,  4,  8, 12, 14,  5,
       15,  2, 13, 17, 11,  7, 10,  2, 14, 12,  5,  4,  6,  7,  0, 11, 16,
        0,  6, 17,  7, 12,  7,  3, 12, 11,  7,  2,  2,  0, 16,  1,  2,  7,
        3,  2,  1, 10, 12, 12, 17, 12,  2,  8,  8, 18,  5,  0,  1])

Esto es para saber que hay suficientes elementos de cada tipo, por lo que podremos hacer selecciones aleatorias hasta tener 3 de cada

In [182]:
import numpy as np

In [48]:
unique, counts = np.unique(train_data.target, return_counts=True)
dict(zip(unique, counts))

{0: 480,
 1: 584,
 2: 591,
 3: 590,
 4: 578,
 5: 593,
 6: 585,
 7: 594,
 8: 598,
 9: 597,
 10: 600,
 11: 595,
 12: 591,
 13: 594,
 14: 593,
 15: 599,
 16: 546,
 17: 564,
 18: 465,
 19: 377}

In [18]:
train_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [20]:
import numpy as np
import numpy.ma as ma

def write_terms (feature_names, data, vector_data, index):
    '''
    Escribe los términos presentes en un mensaje representado como bolsa de palabras.
    
    - feature_names: terminos usados para vectorizar
    - data: lista de mensajes original (si data==None no se muestra el mensaje original)
    - vector_data: matriz (dispersa) de mensaje vectorizados
    - index: posición del mensaje a mostrar
    '''
    # máscara para seleccionar sólo el mensaje en posición index
    mask=vector_data[index,:]>0
    
    # términos que aparecen en ese mensaje vectorizado
    terminos = ma.array(feature_names, mask = ~(mask[0].toarray()))
    
    # mostrar mensaje original
    if data:
        print('Mensaje', index, ':', data[index])
    
    # mostrar términos que aparecen en el mensaje vectorizado
    print('Mensaje', index, 'vectorizado:', terminos.compressed(),'\n')

In [21]:
train_vector_data = vectorizer.fit_transform(train_data.data)

In [22]:
feature_names = vectorizer.get_feature_names()

In [31]:
print(feature_names[265072])

number


In [35]:
write_terms(feature_names, None, train_vector_data, 1)

Mensaje 1 vectorizado: ['acceleration' 'adapters' 'answered' 'article' 'attained' 'brave' 'cards'
 'clock' 'days' 'detailing' 'disk' 'especially' 'experiences' 'final'
 'floppy' 'floppies' 'functionality' 'heat' 'hour' 'keywords' 'knowledge'
 'lines' 'message' 'network' 'number' 'organization' 'oscillator'
 'posting' 'procedure' 'rated' 'reports' 'requested' 'send' 'shared'
 'sinks' 'souls' 'speed' 'subject' 'summary' 'summarizing' 'thanks'
 'upgrade' 'upgraded' 'usage'] 



Procedemos a elegir 3 mensajes de cada tipo

In [243]:
import random

selected = [0]*19
selected_index = []
number_samples = 3

In [190]:
len(selected)

19

In [250]:
#while the last element of the sum of all the previous elements (which is the sum of the total selectec texts) 
#    is less than the desired amount, it will keep choosing candidates

while (np.cumsum(selected)[-1] < len(selected)*3):

    #Indice aleatorio de la lista de mensajes
    index = random.randint(0, len(train_data.target))
    #cat = numero de la categoria a la que pertenece el índice elegido
    cat = train_data.target[index] - 1
    #Comprobamos que no haya ya 3 elementos de la categoría a la que pertenezca el index
    if(selected[cat] < 3):
        selected[cat] += 1
        
    selected_index.append(index)

In [245]:
selected

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [246]:
np.cumsum(selected)[-1]

57