<a href="https://colab.research.google.com/github/AnIsAsPe/LDATopicModeling_pyspark/blob/main/LDA_con_sklearn_noticias_2020.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Instalar y cargar bibliotecas

In [1]:
!pip install pyLDAvis  #biblioteca que extrae información de un modelo LDA para obtener una visualización interactiva

Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 5.2 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting funcy
  Downloading funcy-1.16-py2.py3-none-any.whl (32 kB)
Collecting pandas>=1.2.0
  Downloading pandas-1.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.3 MB)
[K     |████████████████████████████████| 11.3 MB 26.4 MB/s 
Collecting numpy>=1.20.0
  Downloading numpy-1.21.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
[K     |████████████████████████████████| 15.7 MB 403 kB/s 
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (PEP 517) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-3.3.1-py2.py3-none-any.whl size=136897 sha256=00b5112fbeff6a

In [2]:
import pandas as pd
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer

import pyLDAvis
import matplotlib.pyplot as plt 
import seaborn as sns  

nltk.download('stopwords')
nltk.download('wordnet')  #WordNetLemmatizer
nltk.download('punkt')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Lectura y exploración de datos

En colab, al instalar la librería pyLDAvis causa un conflicto con Pandas. Si esto se refleja al correr la siguiente linea es necesario reiniciar el entorno de ejecución.

In [4]:
df = pd.read_csv('/content/drive/MyDrive/Datos/abcnews-date-text.csv', parse_dates=['publish_date'] )
print(df.shape)
df.head()

(1226258, 2)


Unnamed: 0,publish_date,headline_text
0,2003-02-19,aba decides against community broadcasting lic...
1,2003-02-19,act fire witnesses must be aware of defamation
2,2003-02-19,a g calls for infrastructure protection summit
3,2003-02-19,air nz staff in aust strike for pay rise
4,2003-02-19,air nz strike to affect australian travellers


In [6]:
df.publish_date.min(), df.publish_date.max()

(Timestamp('2003-02-19 00:00:00'), Timestamp('2020-12-31 00:00:00'))

In [7]:
# promedio de noticias por día
len(df)/(df.publish_date.max()-df.publish_date.min()).days


187.93226053639847

In [8]:

filtro = (df['publish_date']>= '2020-01-01')&(df['publish_date']<= '2020-12-31')
df = df[filtro]
len(df)

40240

# Vectorización de textos

In [9]:
stop = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
stop

re.compile(r"\b(i|me|my|myself|we|our|ours|ourselves|you|you're|you've|you'll|you'd|your|yours|yourself|yourselves|he|him|his|himself|she|she's|her|hers|herself|it|it's|its|itself|they|them|their|theirs|themselves|what|which|who|whom|this|that|that'll|these|those|am|is|are|was|were|be|been|being|have|has|had|having|do|does|did|doing|a|an|the|and|but|if|or|because|as|until|while|of|at|by|for|with|about|against|between|into|through|during|before|after|above|below|to|from|up|down|in|out|on|off|over|under|again|further|then|once|here|there|when|where|why|how|all|any|both|each|few|more|most|other|some|such|no|nor|not|only|own|same|so|than|too|very|s|t|can|will|just|don|don't|should|should've|now|d|ll|m|o|re|ve|y|ain|aren|aren't|couldn|couldn't|didn|didn't|doesn|doesn't|hadn|hadn't|hasn|hasn't|haven|haven't|isn|isn't|ma|mightn|mightn't|mustn|mustn't|needn|needn't|shan|shan't|shouldn|shouldn't|wasn|wasn't|weren|weren't|won|won't|wouldn|wouldn't)\b\s*",
re.UNICODE)

In [10]:
def preprocesar(texto):
  #convierte a minúsculas
  texto = (texto).lower()

  #elimina stopwords
  stop = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
  texto = stop.sub('', texto) 

  #lematizar y quedarnos con palabras que tengan más de tres caracteres
  lemmatizer = WordNetLemmatizer()
  texto = texto.split()
  texto = ' '.join([lemmatizer.lemmatize(i) for i in texto if len(i)>3])
  
  return(texto)

In [11]:
%%time
df['headline_pp']=df['headline_text'].apply(preprocesar)
df.head()

CPU times: user 9.66 s, sys: 950 ms, total: 10.6 s
Wall time: 10.6 s


In [12]:
df.head()

Unnamed: 0,publish_date,headline_text,headline_pp
1186018,2020-01-01,a new type of resolution for the new year,type resolution year
1186019,2020-01-01,adelaide records driest year in more than a de...,adelaide record driest year decade
1186020,2020-01-01,adelaide riverbank catches alight after new ye...,adelaide riverbank catch alight year firework
1186021,2020-01-01,adelaides 9pm fireworks spark blaze on riverbank,adelaide firework spark blaze riverbank
1186022,2020-01-01,archaic legislation governing nt women propert...,archaic legislation governing woman property r...


In [13]:
vectorizer = TfidfVectorizer(min_df=3, ngram_range=(1,2))
BOW = vectorizer.fit_transform(df['headline_pp'])
BOW.shape

(40240, 16579)

In [33]:
vocabulario = vectorizer.get_feature_names()
len(vocabulario)

16579

# Entrenamiento del modelo

In [14]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model=LatentDirichletAllocation(n_components=5,learning_method='online',random_state=42,max_iter=50) 

In [36]:
#%%time
#lda_model.fit(BOW) # entrena el modelo y obtienela matriz documento-topico

### Matriz documento tópico

In [18]:
doc_top = pd.DataFrame(lda_model.transform(BOW))
print(doc_top.shape)
doc_top.head()

(40240, 5)


Unnamed: 0,0,1,2,3,4
0,0.39443,0.380654,0.074977,0.074743,0.075196
1,0.199485,0.630972,0.056487,0.056418,0.056639
2,0.165149,0.666773,0.056001,0.055938,0.056139
3,0.062683,0.458073,0.062562,0.062565,0.354117
4,0.502484,0.067217,0.067445,0.295285,0.067569


In [27]:
pd.merge(df, doc_top)

MergeError: ignored

In [19]:
doc_top.sum(axis=1)

0        1.0
1        1.0
2        1.0
3        1.0
4        1.0
        ... 
40235    1.0
40236    1.0
40237    1.0
40238    1.0
40239    1.0
Length: 40240, dtype: float64

## Matriz tópicos-tokens

In [20]:
lda_model.components_

array([[0.20003306, 9.11176291, 4.07762402, ..., 0.21615756, 0.20097011,
        0.20145251],
       [2.04903392, 0.20652699, 0.21675812, ..., 6.47112853, 0.20072794,
        0.20068637],
       [0.20002956, 0.20379869, 0.20057681, ..., 0.20337748, 1.80184408,
        0.2014645 ],
       [0.20003373, 0.20223166, 0.22207763, ..., 0.20230281, 0.20002622,
        0.20508186],
       [0.20003115, 0.20180778, 0.20714152, ..., 0.20817562, 0.20002431,
        3.23219972]])

In [41]:
top_tokens= pd.DataFrame(lda_model.exp_dirichlet_component_,
                         columns=vocabulario)
print(top_tokens.shape)
top_tokens.head()

(5, 16579)


Unnamed: 0,0501,1000,10000,100000,100k,100m,100th,100th birthday,1080,10th,10yo,11th,12th,130,130 billion,13th,14th,14yo,14yo girl,150,150 million,1500,15th,15yo,16th,17th,17yo,18th,18yo,1970s,1980s,1983,1988,1990s,1995,1996,1999,19th,19yo,2000,...,zahra,zealand,zealand auckland,zealand australia,zealand coronavirus,zealand covid,zealand election,zealand host,zealand jacinda,zealand lift,zealand record,zealand travel,zealand vote,zealand white,zealand woman,zealander,zempilas,zero,zero carbon,zero case,zero coronavirus,zero covid,zero death,zero emission,zero local,ziffer,ziggy,ziggy ramo,zillmere,zillmere brawl,zimbabwe,zlate,zlate cvetanovski,zodiac,zombie,zone,zoo,zoom,zuckerberg,zverev
0,2.286448e-07,0.0003900503,0.0001624707,2.406738e-07,2.737546e-07,5.851247e-05,2.338646e-07,2.32755e-07,2.311711e-07,2.285429e-07,3.49894e-07,2.303602e-07,2.288257e-07,2.366014e-07,2.366014e-07,2.286472e-07,2.287862e-07,2.508619e-07,2.665478e-07,5.675037e-05,4.397578e-05,2.803134e-07,2.285444e-07,2.360641e-07,2.285925e-07,2.367225e-07,2.486037e-07,2.285483e-07,2.421351e-07,3.104442e-07,4.739532e-05,3.311473e-05,0.0001129386,2.286549e-07,6.046045e-05,2.299081e-07,2.638866e-07,2.811847e-07,5.132062e-05,2.426736e-07,...,5.046677e-05,2.594472e-07,2.316785e-07,8.6397e-05,2.304547e-07,2.28627e-07,2.29573e-07,2.434414e-07,2.314097e-07,2.286212e-07,2.285798e-07,2.378691e-07,2.365111e-07,2.30008e-07,2.349907e-07,2.363321e-07,4.912295e-05,2.380507e-07,2.449585e-07,2.328234e-07,2.290866e-07,2.286722e-07,2.603879e-07,2.340207e-07,2.286175e-07,2.297785e-07,2.343148e-07,2.343148e-07,0.0001782688,2.49398e-07,2.699803e-07,2.306239e-07,2.306239e-07,2.28661e-07,2.411602e-07,2.473867e-07,3.423362e-07,3.387741e-07,2.343149e-07,2.372684e-07
1,5.713565e-05,2.162577e-07,2.752854e-07,0.0001350408,1.976156e-07,1.942606e-07,9.132631e-05,7.477219e-05,1.832514e-07,1.831948e-07,6.316208e-05,1.975427e-07,6.144683e-05,3.517306e-05,3.517306e-05,2.050565e-07,1.84866e-07,8.050285e-05,5.2609e-05,1.93651e-07,1.990733e-07,1.944953e-07,1.831956e-07,1.921425e-07,1.937126e-07,2.296245e-07,1.882775e-07,1.831982e-07,1.855679e-07,1.862576e-07,1.925894e-07,1.924194e-07,1.896381e-07,2.016716e-07,2.152952e-07,1.86436e-07,2.590748e-07,1.874405e-07,1.833352e-07,0.0003289314,...,1.832219e-07,2.496565e-07,6.072428e-05,1.851099e-07,2.070598e-07,5.331767e-05,1.839982e-07,1.851894e-07,1.860551e-07,4.402226e-05,0.0001025721,2.089043e-07,1.83247e-07,4.976144e-05,1.859459e-07,2.2196e-07,1.832668e-07,2.297464e-07,1.832195e-07,2.046548e-07,0.0001004772,2.245654e-07,3.844311e-05,1.843312e-07,2.07412e-07,1.832614e-07,1.879249e-07,1.879249e-07,1.858173e-07,1.832406e-07,5.682577e-05,3.274243e-05,3.274243e-05,1.832957e-07,2.712625e-07,2.378071e-07,2.139289e-07,0.0002169345,1.866559e-07,1.864535e-07
2,2.044964e-07,2.253771e-07,2.074486e-07,2.225915e-07,2.091376e-07,2.18127e-07,2.240865e-07,2.216467e-07,2.126701e-07,9.840093e-05,2.179941e-07,0.0001078276,3.067811e-07,2.065507e-07,2.065507e-07,0.0001094681,2.990235e-07,2.068253e-07,2.057916e-07,2.105206e-07,2.09298e-07,2.241123e-07,0.0001208051,2.095189e-07,0.0001482759,0.0001075937,2.045364e-07,7.84153e-05,2.123365e-07,6.606152e-05,2.046673e-07,2.098424e-07,2.155743e-07,2.196816e-07,2.045626e-07,2.05954e-07,2.047048e-07,0.0001190951,2.067286e-07,2.25766e-07,...,2.044465e-07,2.152442e-07,2.044587e-07,2.202595e-07,2.138626e-07,2.080731e-07,2.077723e-07,2.100547e-07,2.048919e-07,2.060879e-07,2.044456e-07,2.101744e-07,2.088238e-07,2.044715e-07,2.100347e-07,2.055168e-07,2.067782e-07,0.001279159,3.547451e-05,0.0004115406,2.159718e-07,5.877832e-05,2.720541e-07,0.0001024482,8.703969e-05,0.000106791,2.044905e-07,2.044905e-07,2.083301e-07,2.044738e-07,2.197632e-07,2.065883e-07,2.065883e-07,7.056726e-05,2.046401e-07,0.0005468249,7.197047e-05,2.229798e-07,5.391498e-05,2.122946e-07
3,2.331657e-07,2.468705e-07,3.936884e-07,2.563348e-07,2.609231e-07,2.352784e-07,2.456233e-07,2.481829e-07,2.331358e-07,2.330603e-07,2.611397e-07,2.371325e-07,2.401435e-07,2.331605e-07,2.331605e-07,2.331698e-07,2.333199e-07,2.665347e-07,2.564076e-07,2.439546e-07,2.444e-07,0.0001150319,2.330626e-07,7.858483e-05,2.331123e-07,2.332333e-07,5.022052e-05,2.330663e-07,5.829484e-05,2.778797e-07,7.654813e-07,2.382012e-07,2.609154e-07,8.851199e-05,2.4289e-07,2.361443e-07,2.349002e-07,2.472742e-07,3.184691e-07,2.477454e-07,...,2.407261e-07,2.455954e-07,2.331142e-07,2.711245e-07,2.346077e-07,2.430118e-07,2.330712e-07,2.480915e-07,2.334872e-07,2.356704e-07,2.330974e-07,2.331712e-07,2.331441e-07,2.404601e-07,2.370566e-07,2.352372e-07,2.614115e-07,2.382785e-07,2.366687e-07,2.338283e-07,2.331003e-07,2.331947e-07,2.33292e-07,2.34214e-07,2.331372e-07,2.578351e-07,5.297888e-05,5.297888e-05,2.972183e-07,5.529366e-05,2.500762e-07,2.61569e-07,2.61569e-07,2.391486e-07,7.925653e-05,2.430388e-07,2.409397e-07,2.473224e-07,2.331197e-07,2.653854e-07
4,2.154443e-07,2.256449e-07,2.580827e-07,2.702449e-07,8.271356e-05,2.168211e-07,2.16852e-07,2.171276e-07,5.989604e-05,2.153532e-07,2.357651e-07,2.243688e-07,2.197469e-07,2.298592e-07,2.298592e-07,2.288222e-07,5.697816e-05,2.290591e-07,2.155015e-07,2.172934e-07,2.181828e-07,2.998718e-07,2.153547e-07,2.671395e-07,2.175872e-07,2.245196e-07,2.472249e-07,2.153582e-07,2.154261e-07,2.404772e-07,2.156264e-07,2.999181e-05,2.159803e-07,2.206859e-07,2.155415e-07,3.985681e-05,4.79134e-05,2.19322e-07,2.260558e-07,2.436389e-07,...,2.265264e-07,0.002622067,2.276636e-07,2.731502e-07,0.0002162276,2.351591e-07,6.802297e-05,4.929602e-05,9.825442e-05,2.364808e-07,2.268478e-07,0.0001528336,5.72623e-05,2.328129e-07,4.505119e-05,0.0001539672,2.218096e-07,2.286981e-07,2.153877e-07,2.202959e-07,2.153891e-07,2.154717e-07,2.169456e-07,2.239729e-07,2.16288e-07,2.154431e-07,2.174237e-07,2.174237e-07,2.188072e-07,2.199592e-07,2.234691e-07,2.154305e-07,2.154305e-07,2.16728e-07,2.156131e-07,2.38045e-07,2.460094e-07,2.646902e-07,2.154055e-07,0.0001171828


In [42]:
top_tokens.sum(axis=1)

0    0.808382
1    0.844104
2    0.830960
3    0.803892
4    0.820164
dtype: float64

## Perplejidad del modelo

In [None]:
%%time
lda_model.perplexity( BOW, sub_sampling = False)

CPU times: user 3.93 s, sys: 52.3 ms, total: 3.99 s
Wall time: 3.97 s


18307.9716880483

# Visualización del modelo

In [26]:
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

pyLDAvis.sklearn.prepare(lda_model, BOW, vectorizer)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


# Guardamos modelo

In [None]:
import pickle
path = '/content/drive/MyDrive/Modelos/modelosLDA/LDA_sklearn/'
tuple_models = (lda_model, BOW, vectorizer)
pickle.dump(tuple_models, open (path + "tuple_model_news2020.pkl", 'wb'))

# Leemos el guardado modelo

In [16]:
import pickle
path = '/content/drive/MyDrive/Modelos/modelosLDA/LDA_sklearn/'
lda_model, BOW, vectorizer = pickle.load(open(path + "tuple_model_news2020.pkl", 'rb'))
