# Datos para Streamlit

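Este notebook muestra cómo se generaron los datos que usa la visualización de Streamlit: se leen las noticias desde MariaDB, se normalizan, se ajusta un modelo LDA y el resultado se exporta a `data.csv`.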

In [1]:
import mysql.connector as mariadb
import sys

import os

# Connection settings are read from the environment so real credentials never
# have to be stored in the notebook; the fallbacks are the local-development
# values this notebook was originally run with.
try:
    conn = mariadb.connect(
        user=os.environ.get("MARIADB_USER", "root"),
        password=os.environ.get("MARIADB_PASSWORD", "root"),
        host=os.environ.get("MARIADB_HOST", "127.0.0.1"),
        port=int(os.environ.get("MARIADB_PORT", "3306")),
        # Selecting the schema at connect time replaces the string-formatted
        # "USE ..." statement and reports a missing database through the same
        # error path as any other connection failure.
        database=os.environ.get("MARIADB_DATABASE", "noticias"),
    )
except mariadb.Error as e:
    print(f"Error connecting to MariaDB Platform: {e}")
    sys.exit(1)

# Module-level cursor shared by the query helpers defined in later cells.
cursor = conn.cursor()


In [2]:
def get_corpus(seleccion= ["noticia"], fecha="fecha"):
    """Fetch the selected columns of the `notas` table.

    Args:
        seleccion: Column name, or list/tuple of column names, to select.
            The default list is never mutated.
        fecha: SQL expression compared against the `fecha` column. It is
            inserted verbatim, so the default (the column name itself) yields
            `WHERE fecha = fecha`; a literal date must arrive already quoted.

    Returns:
        The list of row tuples returned by `cursor.fetchall()`.

    Raises:
        ValueError: If `seleccion` is empty.

    Note:
        Column names cannot be bound as query parameters, so both arguments are
        interpolated into the SQL text. Only call this with trusted values.
    """
    # Accept a bare column name as well as a sequence of names.
    if isinstance(seleccion, str):
        seleccion = [seleccion]
    if not seleccion:
        raise ValueError("seleccion must contain at least one column name")

    # Joining the columns keeps the format string at exactly two placeholders;
    # the previous tuple-based formatting failed for more than one column.
    columnas = ", ".join(seleccion)
    cursor.execute("SELECT %s FROM notas WHERE fecha = %s" % (columnas, fecha))

    return cursor.fetchall()
    

In [3]:
# Load the corpus: one row tuple per news item holding only the `noticia` column.
# The filter argument is the column name `fecha`, so the query reads
# `WHERE fecha = fecha`.
corpus=get_corpus(["noticia"],"fecha")

In [4]:
# Number of rows returned by the query.
len(corpus)

2506

## Procesamiento de lenguaje

In [5]:
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
import spacy


from nltk.corpus import stopwords
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS

import matplotlib.colors as mcolors
from collections import Counter


In [6]:
# Spanish Snowball stemmer and spaCy's small Spanish pipeline; both are used
# as module-level objects by `normalizar`.
stemmer = SnowballStemmer('spanish')
nlp = spacy.load('es_core_news_sm')

In [7]:
def normalizar(texto):
    """Turn raw text into a list of lower-cased Spanish stems.

    The text is parsed with the module-level `nlp` pipeline. Punctuation,
    stop words and pronouns are discarded, as are tokens that are not purely
    alphabetic or have three characters or fewer. Each surviving token is
    lower-cased and reduced to its stem with the module-level `stemmer`.

    Args:
        texto: Text to normalize.

    Returns:
        List of stems, in document order.
    """
    raices = []
    for token in nlp(texto):
        # Skip tokens that carry no topical content.
        if token.is_punct or token.is_stop or token.pos_ == 'PRON':
            continue
        forma = token.orth_
        if len(forma) > 3 and forma.isalpha():
            raices.append(stemmer.stem(forma.lower()))
    return raices

In [8]:
# Smoke-test the normalizer on a sample Spanish sentence.
word_list = normalizar("Esto es lo que uno debería esperar después de que se pre-procesa un texto")

In [9]:
# Show the stems produced for the sample sentence.
word_list

['deb', 'esper', 'text']

In [10]:
def modificar_corpus(corpus):
    """Normalize every document of a corpus.

    Args:
        corpus: Iterable of rows whose first element is the document text.

    Returns:
        List with one `normalizar` result per row, in the same order.
    """
    return [normalizar(fila[0]) for fila in corpus]
    

In [11]:
# Normalized corpus: one list of stems per news item, in the same order as `corpus`.
CM=modificar_corpus(corpus)

## Aprendizaje No Supervisado

In [12]:
import gensim
from gensim import models
import gensim.corpora as corpora

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from gensim.models import LdaModel, CoherenceModel

In [13]:
# Vocabulary built from the normalized corpus (gensim maps each distinct stem to an integer id).
dictionary=corpora.Dictionary(CM)

In [14]:
# Prune the vocabulary in place: drop stems that occur in fewer than 15
# documents or in more than 50% of them, then keep at most 100000 stems.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)


In [15]:
# Bag-of-words representation of every document, using the pruned vocabulary.
bow_corpus = [dictionary.doc2bow(noticia) for noticia in CM]

In [16]:
# Number of LDA topics; used to train the model and by the cells that build the result tables.
num_of_topics=8

In [17]:
# Fit the LDA topic model on the bag-of-words corpus: `num_of_topics` topics,
# 13 passes over the corpus, 2 worker processes and a fixed random_state.
# With per_word_topics=True each per-document result is a tuple whose first
# element is the document's topic list, which is why later cells index [0].
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=num_of_topics, id2word=dictionary, 
                                       passes=13, workers=2,
                                      random_state=1,per_word_topics=True)

## Resultados

In [18]:
import pandas as pd

In [19]:
# Apply the trained model to every document of the bag-of-words corpus.
obj=lda_model[bow_corpus]

In [20]:
# Keep only the first element of each per-document result: the sequence of
# (topic_id, probability) pairs consumed by the cells below.
results=[row[0] for row in obj]

In [21]:
# One topic distribution per document; this should equal len(corpus).
len(results)

2506

In [29]:
# Empty frame with one column per topic id. The ids are derived from
# `num_of_topics` so the frame stays in sync if the topic count changes.
df = pd.DataFrame(columns=list(range(num_of_topics)))

In [31]:
# Build the document-by-topic probability matrix: one row per document, one
# column per topic id, 0.0 for topics the model did not report.
# The rows are collected first and the frame is created once, because
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration. Rebuilding `df` from scratch also makes the cell safe to
# re-run, since it no longer appends to whatever `df` already holds.
topic_rows = []
for result in results:
    row = [0.0] * num_of_topics
    for topic_id, probability in result:
        row[topic_id] = probability
    topic_rows.append(row)

df = pd.DataFrame(topic_rows, columns=list(range(num_of_topics)), dtype=float)

In [32]:
def get_topic_DF(ldamodel, corpus, bow_texts,original,num_of_topics):
    """Summarize the dominant topic of every document in a DataFrame.

    Args:
        ldamodel: Trained topic model. It must provide `show_topic(topic_id)`,
            returning (word, weight) pairs, and `ldamodel[corpus]`, yielding
            one result per document whose first element is the list of
            (topic_id, probability) pairs.
        corpus: Documents in the representation the model was trained on.
        bow_texts: Per-document content stored under 'Texto Normalizado' (this
            notebook passes the normalized token lists).
        original: Per-document original content stored under 'Texto Original'
            (this notebook passes the database row tuples).
        num_of_topics: Number of topics in the model.

    Returns:
        DataFrame with one row per document and the columns 'Tema Principal'
        (integer topic id), 'Contribucion' (dominant-topic probability rounded
        to 4 decimals), 'Palabras Clave', 'Texto Normalizado' and
        'Texto Original'.
    """
    # Keywords of each topic, joined into a single display string.
    topic_keywords = {
        topic_num: ", ".join(word for word, _ in ldamodel.show_topic(topic_num))
        for topic_num in range(num_of_topics)
    }

    # Collect one record per document and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    rows = []
    for row_list in ldamodel[corpus]:
        # max() returns the first pair with the highest probability, the same
        # element a stable descending sort would put first.
        topic_num, prob_topic = max(row_list[0], key=lambda pair: pair[1])
        rows.append((int(topic_num), round(prob_topic, 4), topic_keywords[topic_num]))

    topic_df = pd.DataFrame(rows, columns=['Tema Principal', 'Contribucion', 'Palabras Clave'])

    # Attach the normalized and original texts; concat aligns them by position.
    contents = pd.Series(bow_texts, name='Texto Normalizado')
    original = pd.Series(original, name='Texto Original')

    return pd.concat([topic_df, contents, original], axis=1)


# Per-document summary: dominant topic, its weight, the topic keywords, the
# normalized stems and the original database row.
topic_df = get_topic_DF(lda_model, bow_corpus,CM,corpus,num_of_topics)

# Preview the first ten documents.
topic_df.head(10)

Unnamed: 0,Tema Principal,Contribucion,Palabras Clave,Texto Normalizado,Texto Original
0,0.0,0.9948,"fot, public, instagram, leer, compart, amor, c...","[hollywood, comun, famos, sorprend, romanc, ju...",(En Hollywood es común que los famosos sorpren...
1,0.0,0.6709,"fot, public, instagram, leer, compart, amor, c...","[fri, nev, complic, segur, aument, consum, caf...",(Aunque el frío y las nevadas ha complicado a ...
2,0.0,0.7906,"fot, public, instagram, leer, compart, amor, c...","[asad, escabech, herv, simplement, acompañ, pl...","(Asado, en escabeche, hervido o simplemente a..."
3,0.0,0.9948,"fot, public, instagram, leer, compart, amor, c...","[cab, dud, necesit, termin, seman, broch, darl...",(No cabe duda que necesitábamos terminar la se...
4,0.0,0.6684,"fot, public, instagram, leer, compart, amor, c...","[abuelit, vacun, mes, inic, pandemi, coronavir...",(¿Tus abuelitos ya se vacunaron contra el covi...
5,2.0,0.8835,"vacun, dosis, salud, millon, mayor, president,...","[promedi, person, acud, institut, nacional, el...","(En promedio, 120 mil personas acudieron al In..."
6,0.0,0.9219,"fot, public, instagram, leer, compart, amor, c...","[pandemi, coronavirus, provoc, imagin, person,...",(La pandemia de coronavirus ha provocado que l...
7,5.0,0.9869,"años, mexican, equip, histori, libr, part, mex...","[anders, verjgang, jugador, profesional, famos...","(Anders Verjgang, el jugador profesional más f..."
8,0.0,0.7271,"fot, public, instagram, leer, compart, amor, c...","[duqu, sussex, harry, megh, esper, hij, portav...","(Los duques de Sussex, Harry y Meghan, están e..."
9,4.0,0.9123,"electr, mexic, aut, pes, usuari, servici, pais...","[miercol, usuari, servici, internet, izzi, rep...","(Este miércoles, usuarios del servicio de inte..."


In [33]:
# Inspect the document-by-topic probability matrix.
df

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.994782,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
1,0.662904,0.000000,0.000000,0.0,0.249694,0.079374,0.000000,0.000000
2,0.790555,0.000000,0.000000,0.0,0.000000,0.000000,0.201852,0.000000
3,0.994815,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
4,0.668393,0.000000,0.176966,0.0,0.000000,0.000000,0.150628,0.000000
...,...,...,...,...,...,...,...,...
2501,0.000000,0.487062,0.119670,0.0,0.000000,0.000000,0.000000,0.388859
2502,0.737844,0.046874,0.000000,0.0,0.000000,0.083661,0.000000,0.129799
2503,0.664313,0.000000,0.000000,0.0,0.253723,0.000000,0.000000,0.079156
2504,0.118673,0.000000,0.025948,0.0,0.852150,0.000000,0.000000,0.000000


In [34]:
# Name the topic columns tema0 .. tema{num_of_topics - 1}. The names are
# generated so they always match the number of topics in the model.
df.columns = [f"tema{i}" for i in range(num_of_topics)]

In [38]:
# Put the per-topic probabilities and the per-document summary side by side;
# axis=1 aligns the two frames on their row index.
DF=pd.concat([df,topic_df],axis=1)

In [39]:
# Inspect the combined table before exporting it.
DF

Unnamed: 0,tema0,tema1,tema2,tema3,tema4,tema5,tema6,tema7,Tema Principal,Contribucion,Palabras Clave,Texto Normalizado,Texto Original
0,0.994782,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.9948,"fot, public, instagram, leer, compart, amor, c...","[hollywood, comun, famos, sorprend, romanc, ju...",(En Hollywood es común que los famosos sorpren...
1,0.662904,0.000000,0.000000,0.0,0.249694,0.079374,0.000000,0.000000,0.0,0.6709,"fot, public, instagram, leer, compart, amor, c...","[fri, nev, complic, segur, aument, consum, caf...",(Aunque el frío y las nevadas ha complicado a ...
2,0.790555,0.000000,0.000000,0.0,0.000000,0.000000,0.201852,0.000000,0.0,0.7906,"fot, public, instagram, leer, compart, amor, c...","[asad, escabech, herv, simplement, acompañ, pl...","(Asado, en escabeche, hervido o simplemente a..."
3,0.994815,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.9948,"fot, public, instagram, leer, compart, amor, c...","[cab, dud, necesit, termin, seman, broch, darl...",(No cabe duda que necesitábamos terminar la se...
4,0.668393,0.000000,0.176966,0.0,0.000000,0.000000,0.150628,0.000000,0.0,0.6684,"fot, public, instagram, leer, compart, amor, c...","[abuelit, vacun, mes, inic, pandemi, coronavir...",(¿Tus abuelitos ya se vacunaron contra el covi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2501,0.000000,0.487062,0.119670,0.0,0.000000,0.000000,0.000000,0.388859,1.0,0.4863,"millon, cient, empres, xic, pes, ciud, servici...","[yucatan, otorg, credit, subsidi, incent, fisc...","(Yucatán.— Por el otorgamiento de créditos, su..."
2502,0.737844,0.046874,0.000000,0.0,0.000000,0.083661,0.000000,0.129799,0.0,0.7378,"fot, public, instagram, leer, compart, amor, c...","[chef, libanes, deeb, harak, anunci, proxim, a...",(El chef libanés Deeb Harake anuncia la próxim...
2503,0.664313,0.000000,0.000000,0.0,0.253723,0.000000,0.000000,0.079156,0.0,0.6643,"fot, public, instagram, leer, compart, amor, c...","[contingent, millon, person, mund, recurr, pla...",(Debido a la contingencia por Covid-19 millone...
2504,0.118673,0.000000,0.025948,0.0,0.852150,0.000000,0.000000,0.000000,4.0,0.8521,"electr, mexic, aut, pes, usuari, servici, pais...","[model, celul, comenz, parec, fabric, busc, di...",(Cuando todos los modelos de celular comenzaro...


In [40]:
# Export the combined table for the Streamlit visualization; index=False
# leaves the row index out of the file.
DF.to_csv('data.csv', index = False)