## Clasificación TODOS LOS MODELOS

In [20]:
import pandas as pd
from gensim import corpora, models, similarities
from gensim.utils import simple_preprocess
from pprint import pprint

import numpy as np
import spacy
import unicodedata
import os
#from spellchecker import SpellChecker 
#from textblob import TextBlob 
#import contractions
import re
import random


from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC

Lo primero que necesitamos es cargar nuestros glosarios de términos para crear nuestro diccionario

In [3]:
def cargar_glosario(categoria, conjunto):
    fname = f"../Datos/Glosarios/{conjunto}/glosario_{categoria}.txt"

    glosario = []
    with open(fname, 'r') as f:
        glosario = [termino.rstrip('\n') for termino in f.readlines()]
    return glosario

In [11]:
categorias = ["deportes", "salud", "ciencia", "politica"]

glosarios = {}
for categoria in categorias:
    glosarios[categoria] = cargar_glosario(categoria, "train")

pprint(glosarios)

{'ciencia': ['vacao',
             'llama',
             'captura',
             'cola',
             'honor',
             'gemanidas',
             'banyoles',
             'neandertal',
             'agujero',
             'latigo',
             'supersa',
             'perfecto',
             'regalo',
             'bankman',
             'fried',
             'navidad',
             'estampido',
             'ftx',
             'york',
             'menta',
             'supermasivo',
             'barrera',
             'segundo',
             'ingeniero',
             'magic',
             'reencuentro',
             'leo',
             'nif',
             'cernan',
             'fauci'],
 'deportes': ['falso',
              'jamas',
              'smash',
              'mbappe',
              'gigante',
              'reserva',
              'nets',
              'carpena',
              'djokovic',
              'mclaren',
              'exencion',
              'butler',
    

In [12]:
glosarios_t = {}
for categoria in categorias:
    glosarios_t[categoria] = cargar_glosario(categoria, "test")

pprint(glosarios_t)


{'ciencia': ['sal',
             'cuantico',
             'idioma',
             'fraa',
             'vacuna',
             'isidro',
             'santo',
             'llnl',
             'agujero',
             'sonido',
             'antageno',
             'subtipo',
             'quipus',
             'congelacia',
             'ska',
             'ofensivo',
             'palabrota',
             'canaria',
             'relatividad',
             'estruendo',
             'programador',
             'telescopio',
             'congelar',
             'gripe',
             'alineacia',
             'almanaque',
             'escarcha',
             'helada',
             'sevilla',
             'alphacode'],
 'deportes': ['seguidor',
              'carlos',
              'mans',
              'warren',
              'doncic',
              'suarez',
              'juancho',
              'boston',
              'estabilidad',
              'horford',
              'hernanga',
 

Creamos nuestro diccionario de palabras en base a los terminos de todos los glosarios

In [13]:
def create_dictionary(glosarios):
    doc_tokens = [[termino for termino in glosario] for glosario in glosarios.values()]
    dictionary = corpora.Dictionary(doc_tokens)
    return dictionary

In [15]:
glosarios_dict = create_dictionary(glosarios)
print(glosarios_dict)

print("-"*100)

glosarios_dict_t = create_dictionary(glosarios_t)
print(glosarios_dict_t)

Dictionary<140 unique tokens: ['arabia', 'booker', 'butler', 'campazzo', 'carpena']...>
----------------------------------------------------------------------------------------------------
Dictionary<139 unique tokens: ['aerodinamico', 'booker', 'boston', 'brooklyn', 'cancha']...>


Cargamos nuestras noticias de test y las convertimos a un bag of words utilizando nuestro diccionario

In [16]:
noticias_test_dataframe = pd.read_csv("../Datos/noticias_train.csv")
noticias_test_dataframe_t = pd.read_csv("../Datos/noticias_test.csv")

In [17]:
def create_bag_of_words(docs_list, dictionary):
    doc_tokens = [simple_preprocess(corpus) for corpus in docs_list]
    docs_bow = [dictionary.doc2bow(doc) for doc in doc_tokens]
    return docs_bow

Ahora es momento de crear una bag of words para cada noticia en base a nuestro diccionario

In [18]:
docs_bow = create_bag_of_words(noticias_test_dataframe["corpus"].values, glosarios_dict)
print(docs_bow[19])

print("-"*100)

docs_bow_t = create_bag_of_words(noticias_test_dataframe_t["corpus"].values, glosarios_dict_t)
print(docs_bow_t[19])

[(1, 3), (36, 4), (47, 3), (84, 1)]
----------------------------------------------------------------------------------------------------
[(0, 5), (23, 6), (32, 6), (77, 3)]


Sacamos las caracteristicas

In [21]:
#Tf-idf
tfidf_vectorizer = TfidfTransformer()

#Funciones de WV.
def averaged_word_vectorizer(corpus):
    '''Aplica la función de cálculo del WE promedio a todos los
    documentos del corpus (cada doc es una lista de tokens)'''
    features = [nlp(doc).vector
                    for doc in corpus]
    return np.array(features)

def tfidf_wtd_avg_word_vectors(doc, word_tfidf_map):
    '''Aplica la función de cálculo del WE ponderado por TF-IDF
    a un documento (como lista de tokens)'''
    tokens = doc.split()

    feature_vector = np.zeros((nlp.vocab.vectors_length,),dtype="float64")
    wts = 0.      
    for word in tokens:
        if nlp.vocab[word].has_vector and word_tfidf_map.get(word, 0): #sólo considera palabras conocidas
            weighted_word_vector = word_tfidf_map[word] * nlp.vocab[word].vector
            wts = wts + 1
            feature_vector = np.add(feature_vector, weighted_word_vector)
    if wts:
        feature_vector = np.divide(feature_vector, wts)
        
    return feature_vector
    
def tfidf_weighted_averaged_word_vectorizer(corpus, word_tfidf_map):
    '''Aplica la función de cálculo del WE ponderado por TF-IDF a todos los
    documentos del corpus (cada doc es una lista de tokens)'''                                       
    features = [tfidf_wtd_avg_word_vectors(doc, word_tfidf_map)
                    for doc in corpus]
    return np.array(features)

In [22]:
# características bag of words
bow_train_features = docs_bow
bow_test_features = docs_bow_t


# características tfidf (a partir del BoW)
tfidf_train_features = tfidf_vectorizer.fit_transform(bow_train_features)
tfidf_test_features = tfidf_vectorizer.transform(bow_test_features)    

  array = numpy.asarray(array, order=order, dtype=dtype)


ValueError: Expected 2D array, got 1D array instead:
array=[list([(8, 1), (9, 4), (12, 1), (26, 3), (99, 2)])
 list([(31, 1), (105, 1), (133, 2)]) list([])
 list([(97, 2), (102, 1), (103, 1), (105, 1), (114, 2)])
 list([(14, 3), (18, 2), (23, 3), (45, 3)]) list([(93, 1), (105, 1)])
 list([(5, 4), (6, 1), (48, 6), (92, 1)]) list([(16, 5), (133, 1)])
 list([]) list([(19, 5), (20, 5), (72, 2), (133, 2)])
 list([(28, 6), (36, 1), (46, 6), (92, 1)]) list([(30, 5)]) list([])
 list([]) list([(42, 3), (133, 1)]) list([])
 list([(15, 3), (21, 3), (27, 2), (37, 2)]) list([(7, 3), (32, 6)])
 list([(3, 5), (6, 1), (27, 7), (37, 7), (118, 1)])
 list([(1, 3), (36, 4), (47, 3), (84, 1)])
 list([(22, 3), (31, 7), (34, 3), (38, 5)])
 list([(93, 1), (125, 1), (133, 1)])
 list([(6, 5), (8, 4), (25, 3), (33, 3), (40, 3)]) list([(0, 3)])
 list([(12, 6), (17, 4), (26, 6), (35, 4), (95, 1)])
 list([(2, 3), (24, 2), (109, 1), (139, 1)])
 list([(4, 3), (10, 3), (29, 2), (39, 2), (43, 2), (44, 2)])
 list([(11, 4), (105, 1)]) list([(13, 8), (49, 5), (133, 2)])
 list([(41, 3), (133, 5)]) list([(56, 8), (63, 15), (77, 13), (119, 1)])
 list([(37, 1), (73, 5)]) list([(19, 2), (27, 2), (114, 1), (129, 2)])
 list([(55, 20), (58, 15), (133, 1)]) list([]) list([(69, 5), (74, 4)])
 list([(76, 3)]) list([(54, 22)]) list([(50, 21), (133, 1)])
 list([(109, 1)]) list([]) list([(65, 3), (78, 5), (105, 1)])
 list([(19, 1)]) list([(12, 1), (133, 1)]) list([(62, 12)])
 list([(66, 7), (67, 4), (70, 4)]) list([(52, 7), (114, 1), (116, 1)])
 list([(123, 1)]) list([(14, 2), (79, 5), (102, 1), (109, 1), (123, 4)])
 list([(59, 6), (72, 1), (105, 1)]) list([(79, 14), (105, 1)]) list([])
 list([(51, 6), (57, 5), (61, 6), (64, 6), (114, 1)]) list([(123, 1)])
 list([(6, 1), (19, 1), (27, 2), (60, 26)])
 list([(68, 9), (93, 1), (125, 2)]) list([(53, 21), (72, 17)]) list([])
 list([(71, 6), (75, 5)]) list([(26, 1), (133, 3)])
 list([(80, 2), (105, 7), (116, 1)])
 list([(14, 1), (18, 1), (33, 6), (37, 1), (132, 1), (139, 1)])
 list([(85, 1)]) list([(27, 1)]) list([(93, 3)]) list([(124, 1)])
 list([(88, 6), (116, 3)]) list([(37, 1), (114, 2), (119, 2), (133, 1)])
 list([(81, 7), (89, 7), (90, 6), (109, 6)]) list([(26, 1)])
 list([(91, 5)]) list([(37, 1), (83, 10), (119, 1)])
 list([(33, 1), (38, 1), (43, 1), (95, 3), (137, 1)])
 list([(72, 1), (101, 1), (110, 1)])
 list([(26, 1), (82, 6), (98, 4), (100, 6)]) list([(18, 1)])
 list([(108, 27)]) list([(27, 1), (101, 5), (110, 1)])
 list([(113, 1), (116, 1)]) list([(84, 5), (86, 1)])
 list([(14, 1), (85, 6)]) list([(27, 2), (136, 1)])
 list([(18, 2), (67, 1)]) list([(36, 1), (96, 11), (99, 1)])
 list([(23, 1), (92, 8), (97, 4), (99, 10), (102, 6), (103, 4), (104, 6)])
 list([(18, 1), (86, 12), (87, 4), (94, 5), (107, 5)]) list([(110, 1)])
 list([]) list([(18, 2), (80, 21), (95, 10), (106, 6)]) list([])
 list([(26, 1), (27, 1), (83, 2), (132, 1), (133, 1), (137, 11)])
 list([(75, 4), (111, 4), (127, 1)]) list([(0, 2), (37, 3), (116, 2)])
 list([(72, 1)]) list([(33, 1), (96, 1)]) list([(26, 1)])
 list([(19, 1), (27, 3), (39, 1), (54, 1), (115, 1), (133, 1)])
 list([(33, 1)]) list([(14, 1), (27, 4), (37, 1), (125, 5)])
 list([(27, 1), (118, 3), (135, 4)]) list([(27, 2), (33, 1), (96, 2)])
 list([(27, 2), (29, 1), (102, 1)]) list([(113, 1), (119, 4), (121, 3)])
 list([(26, 1), (112, 6), (117, 5), (120, 6)])
 list([(14, 1), (72, 1), (116, 15), (134, 11)])
 list([(99, 1), (110, 2), (118, 14), (129, 2), (130, 5), (132, 2), (135, 13)])
 list([(114, 2), (131, 7)]) list([(105, 1)]) list([(26, 1)])
 list([(110, 2), (113, 2), (115, 2), (129, 3), (130, 5)])
 list([(124, 1), (127, 8)]) list([(75, 1), (132, 1)])
 list([(12, 1), (27, 1), (127, 8), (136, 4), (139, 8)]) list([])
 list([(138, 7)]) list([(27, 1), (33, 1), (114, 1)])
 list([(14, 3), (126, 3)]) list([(123, 2), (132, 5), (133, 6)])
 list([(114, 5), (122, 3), (131, 5), (138, 5)]) list([(124, 7), (128, 4)])].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.