<a href="https://colab.research.google.com/github/AnIsAsPe/LDA-TopicModeling_python/blob/main/Notebooks/LDA_con_sklearn_noticias_2020.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Instalar y cargar bibliotecas

In [1]:
!pip install pyLDAvis  #biblioteca que extrae información de un modelo LDA para obtener una visualización interactiva

Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 3.1 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting funcy
  Downloading funcy-1.17-py2.py3-none-any.whl (33 kB)
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (PEP 517) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-3.3.1-py2.py3-none-any.whl size=136898 sha256=19eb2154b0317abff8d16cfb4e7f2d58381eb4cb742997350ceef756a5de6d1f
  Stored in directory: /root/.cache/pip/wheels/c9/21/f6/17bcf2667e8a68532ba2fbf6d5c72fdf4c7f7d9abfa4852d2f
Successfully built pyLDAvis
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-1.17 pyLDAvis-3.3.1


In [2]:
import pandas as pd
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import pyLDAvis
import matplotlib.pyplot as plt 
import seaborn as sns  

nltk.download('stopwords')
nltk.download('wordnet')  #WordNetLemmatizer
nltk.download('punkt')


  from collections import Iterable
  from collections import Mapping


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Lectura y exploración de datos

En colab, al instalar la librería pyLDAvis causa un conflicto con Pandas. Si esto se refleja al correr la siguiente linea es necesario reiniciar el entorno de ejecución.

In [14]:
df = pd.read_csv('/content/drive/MyDrive/Datos/ABC News/abcnews-date-text.csv', parse_dates=['publish_date'] )
print(df.shape)
df.head()

(1226258, 2)


Unnamed: 0,publish_date,headline_text
0,2003-02-19,aba decides against community broadcasting lic...
1,2003-02-19,act fire witnesses must be aware of defamation
2,2003-02-19,a g calls for infrastructure protection summit
3,2003-02-19,air nz staff in aust strike for pay rise
4,2003-02-19,air nz strike to affect australian travellers


In [15]:
df.publish_date.min(), df.publish_date.max()

(Timestamp('2003-02-19 00:00:00'), Timestamp('2020-12-31 00:00:00'))

In [16]:
# promedio de noticias por día
len(df)/(df.publish_date.max()-df.publish_date.min()).days


187.93226053639847

In [17]:

filtro = (df['publish_date']>= '2020-01-01')&(df['publish_date']<= '2020-12-31')
df = df[filtro]
df = df.reset_index(drop=True )
len(df)

40240

# Vectorización de textos

In [27]:
def preprocesar(texto):
  #convierte a minúsculas
  texto = (texto).lower()

  #elimina stopwords
  stop = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
  texto = stop.sub('', texto) 

  #Quitar puntuación y números
  texto = re.sub('[^ña-z]+', ' ', texto)

  #lematizar y quedarnos con palabras que tengan más de tres caracteres
  lemmatizer = WordNetLemmatizer()
  texto = texto.split()
  texto = ' '.join([lemmatizer.lemmatize(i) for i in texto if len(i)>2])
  
  return(texto)

In [29]:
df['headline_pp']=df['headline_text'].apply(preprocesar)
df.head()

Unnamed: 0,publish_date,headline_text,headline_pp
0,2020-01-01,a new type of resolution for the new year,new type resolution new year
1,2020-01-01,adelaide records driest year in more than a de...,adelaide record driest year decade
2,2020-01-01,adelaide riverbank catches alight after new ye...,adelaide riverbank catch alight new year eve f...
3,2020-01-01,adelaides 9pm fireworks spark blaze on riverbank,adelaide firework spark blaze riverbank
4,2020-01-01,archaic legislation governing nt women propert...,archaic legislation governing woman property r...


In [30]:
vectorizer = CountVectorizer(min_df=3, ngram_range=(1,2))
BOW = vectorizer.fit_transform(df['headline_pp'])
BOW.shape

(40240, 17889)

In [31]:
vocabulario = vectorizer.get_feature_names_out()
len(vocabulario)

17889

In [32]:
list(vocabulario)[0:30]

['aacta',
 'aaron',
 'ab',
 'abandon',
 'abandoned',
 'abandoned baby',
 'abattoir',
 'abbott',
 'abc',
 'abc analyst',
 'abc china',
 'abc coronavirus',
 'abc heywire',
 'abc investigation',
 'abc journalist',
 'abc news',
 'abc radio',
 'abc reader',
 'abc reporter',
 'abc rural',
 'abc tasmania',
 'abducted',
 'abduction',
 'abdul',
 'abe',
 'abf',
 'able',
 'ablett',
 'aboard',
 'aboriginal']

# Entrenamiento del modelo

In [33]:
lda_model=LatentDirichletAllocation(n_components=5,learning_method='online',random_state=42,max_iter=50) 

In [34]:
%%time
lda_model.fit(BOW) # entrena el modelo y obtienela matriz documento-topico

CPU times: user 4min 22s, sys: 1.52 s, total: 4min 23s
Wall time: 4min 25s


LatentDirichletAllocation(learning_method='online', max_iter=50, n_components=5,
                          random_state=42)

### Distribución de temas en cada noticia  (O)

In [35]:
doc_top = pd.DataFrame(lda_model.transform(BOW))
print(doc_top.shape)
doc_top.head()

(40240, 5)


Unnamed: 0,0,1,2,3,4
0,0.199279,0.028573,0.184108,0.028812,0.559229
1,0.025041,0.025081,0.025191,0.025001,0.899686
2,0.693037,0.01693,0.017398,0.112981,0.159655
3,0.365887,0.532855,0.034198,0.033334,0.033726
4,0.239982,0.439999,0.04,0.04,0.240019


In [36]:
doc_top.sum(axis=1)

0        1.0
1        1.0
2        1.0
3        1.0
4        1.0
        ... 
40235    1.0
40236    1.0
40237    1.0
40238    1.0
40239    1.0
Length: 40240, dtype: float64

In [37]:
pd.merge(df, doc_top, left_index=True, right_index=True)

Unnamed: 0,publish_date,headline_text,headline_pp,0,1,2,3,4
0,2020-01-01,a new type of resolution for the new year,new type resolution new year,0.199279,0.028573,0.184108,0.028812,0.559229
1,2020-01-01,adelaide records driest year in more than a de...,adelaide record driest year decade,0.025041,0.025081,0.025191,0.025001,0.899686
2,2020-01-01,adelaide riverbank catches alight after new ye...,adelaide riverbank catch alight new year eve f...,0.693037,0.016930,0.017398,0.112981,0.159655
3,2020-01-01,adelaides 9pm fireworks spark blaze on riverbank,adelaide firework spark blaze riverbank,0.365887,0.532855,0.034198,0.033334,0.033726
4,2020-01-01,archaic legislation governing nt women propert...,archaic legislation governing woman property r...,0.239982,0.439999,0.040000,0.040000,0.240019
...,...,...,...,...,...,...,...,...
40235,2020-12-31,what abc readers learned from 2020 looking bac...,abc reader learned looking back year,0.020312,0.143208,0.020001,0.696724,0.119755
40236,2020-12-31,what are the south african and uk variants of ...,south african variant covid,0.033334,0.366653,0.034192,0.366597,0.199225
40237,2020-12-31,what victorias coronavirus restrictions mean f...,victoria coronavirus restriction mean new year...,0.374914,0.015385,0.015421,0.578345,0.015936
40238,2020-12-31,whats life like as an american doctor during c...,whats life like american doctor covid,0.149999,0.399955,0.025230,0.274785,0.150030


## Distribución de palabras en cada tema ($\mu$)

In [47]:
μs = pd.DataFrame(lda_model.exp_dirichlet_component_,
                         columns=vocabulario)
print(μs.shape)
μs.head()

(5, 17889)


Unnamed: 0,aacta,aaron,ab,abandon,abandoned,abandoned baby,abattoir,abbott,abc,abc analyst,...,zlate,zlate cvetanovski,zodiac,zoe,zombie,zone,zoo,zoom,zuckerberg,zverev
0,8.055449e-08,8.394868e-08,8.290329e-08,8.59712e-08,0.0002583584,4.856293e-05,8.847004e-08,0.0001606078,9.363507e-08,7.99874e-08,...,8.007074e-08,8.007074e-08,8.259616e-08,8.386654e-08,5.527396e-05,1.010441e-07,8.700062e-08,0.0002163308,7.941396e-08,8.010021e-08
1,5.479865e-05,9.063174e-08,8.905556e-08,9.687651e-08,9.750708e-08,9.057895e-08,0.0001858361,9.396337e-08,0.002232402,8.905083e-08,...,8.799667e-08,8.799667e-08,5.563037e-05,9.990114e-08,9.725587e-08,9.643613e-08,9.649468e-08,1.009692e-07,4.355457e-05,0.0001318067
2,8.937147e-08,4.060349e-05,8.383059e-08,0.000145318,9.298521e-08,9.201562e-08,9.192969e-08,8.576488e-08,8.999806e-08,8.442723e-08,...,9.009405e-08,9.009405e-08,8.384335e-08,4.324443e-05,8.518493e-08,0.0006206949,0.0005930656,9.235608e-08,8.460164e-08,8.441493e-08
3,6.273702e-08,6.972041e-08,0.0002316852,7.293856e-08,7.287104e-08,6.769409e-08,0.0001518431,6.654507e-08,0.001102018,4.525883e-05,...,6.478957e-08,6.478957e-08,6.27436e-08,6.877298e-08,6.273777e-08,7.352419e-08,6.603361e-08,6.68622e-08,7.040503e-08,6.2908e-08
4,7.476241e-08,7.598304e-08,7.571805e-08,7.872401e-08,8.212703e-08,7.659022e-08,8.005516e-08,7.808353e-08,8.047507e-08,7.373083e-08,...,3.923673e-05,3.923673e-05,7.70101e-08,7.356924e-08,7.502965e-08,8.437525e-08,8.074065e-08,7.812199e-08,7.355545e-08,7.354552e-08


In [48]:
μs.sum(axis=1)

0    0.927392
1    0.919499
2    0.923305
3    0.943717
4    0.931934
dtype: float64

# Visualización del modelo

In [49]:
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

pyLDAvis.sklearn.prepare(lda_model, BOW, vectorizer)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


# Guardamos modelo

In [50]:
import pickle
path = '/content/drive/MyDrive/Modelos/modelosLDA/LDA_sklearn/'
tuple_models = (lda_model, BOW, vectorizer)
pickle.dump(tuple_models, open (path + "tuple_model_news2020.pkl", 'wb'))

# Leemos el guardado modelo

In [51]:
import pickle
path = '/content/drive/MyDrive/Modelos/modelosLDA/LDA_sklearn/'
lda_model, BOW, vectorizer = pickle.load(open(path + "tuple_model_news2020.pkl", 'rb'))


# Entrenamiento del modelo 2

In [55]:
lda_model2=LatentDirichletAllocation(n_components=4,learning_method='online',random_state=42,max_iter=50) 

In [56]:
%%time
lda_model2.fit(BOW) # entrena el modelo y obtienela matriz documento-topico

CPU times: user 4min 13s, sys: 1.61 s, total: 4min 15s
Wall time: 4min 18s


LatentDirichletAllocation(learning_method='online', max_iter=50, n_components=4,
                          random_state=42)

### Distribución de temas en cada noticia  (O)

In [57]:
doc_top = pd.DataFrame(lda_model2.transform(BOW))
print(doc_top.shape)
doc_top.head()

(40240, 4)


Unnamed: 0,0,1,2,3
0,0.370215,0.035973,0.298384,0.295428
1,0.156453,0.031325,0.780589,0.031634
2,0.021096,0.020916,0.139193,0.818795
3,0.041673,0.407005,0.04295,0.508373
4,0.050425,0.05035,0.849224,0.050001


In [58]:
doc_top.sum(axis=1)

0        1.0
1        1.0
2        1.0
3        1.0
4        1.0
        ... 
40235    1.0
40236    1.0
40237    1.0
40238    1.0
40239    1.0
Length: 40240, dtype: float64

In [59]:
pd.merge(df, doc_top, left_index=True, right_index=True)

Unnamed: 0,publish_date,headline_text,headline_pp,0,1,2,3
0,2020-01-01,a new type of resolution for the new year,new type resolution new year,0.370215,0.035973,0.298384,0.295428
1,2020-01-01,adelaide records driest year in more than a de...,adelaide record driest year decade,0.156453,0.031325,0.780589,0.031634
2,2020-01-01,adelaide riverbank catches alight after new ye...,adelaide riverbank catch alight new year eve f...,0.021096,0.020916,0.139193,0.818795
3,2020-01-01,adelaides 9pm fireworks spark blaze on riverbank,adelaide firework spark blaze riverbank,0.041673,0.407005,0.042950,0.508373
4,2020-01-01,archaic legislation governing nt women propert...,archaic legislation governing woman property r...,0.050425,0.050350,0.849224,0.050001
...,...,...,...,...,...,...,...
40235,2020-12-31,what abc readers learned from 2020 looking bac...,abc reader learned looking back year,0.025003,0.123725,0.025499,0.825773
40236,2020-12-31,what are the south african and uk variants of ...,south african variant covid,0.374712,0.374964,0.208652,0.041672
40237,2020-12-31,what victorias coronavirus restrictions mean f...,victoria coronavirus restriction mean new year...,0.535564,0.019255,0.019901,0.425280
40238,2020-12-31,whats life like as an american doctor during c...,whats life like american doctor covid,0.447155,0.241130,0.031254,0.280461


## Distribución de palabras en cada tema ($\mu$)

In [60]:
μs = pd.DataFrame(lda_model2.exp_dirichlet_component_,
                         columns=vocabulario)
print(μs.shape)
μs.head()

(4, 17889)


Unnamed: 0,aacta,aaron,ab,abandon,abandoned,abandoned baby,abattoir,abbott,abc,abc analyst,...,zlate,zlate cvetanovski,zodiac,zoe,zombie,zone,zoo,zoom,zuckerberg,zverev
0,1.342399e-07,1.378323e-07,0.0001717455,1.59756e-07,0.0001508602,1.647154e-07,9.427261e-05,9.402175e-05,1.640185e-07,1.456304e-07,...,1.341521e-07,1.341521e-07,1.375759e-07,1.465176e-07,1.378864e-07,1.977326e-07,1.733638e-07,7.698171e-05,1.438595e-07,1.346116e-07
1,2.931848e-07,3.568762e-05,2.213863e-07,0.0001256446,4.679033e-07,4.485982e-05,0.0001833349,2.354529e-07,2.809044e-07,5.280814e-05,...,2.377463e-07,2.377463e-07,4.676785e-05,2.196424e-07,3.343146e-07,0.0005352001,2.611033e-07,7.399884e-05,3.659349e-05,0.0001099895
2,4.147231e-05,2.057025e-07,1.923475e-07,2.123087e-07,2.406799e-07,1.890966e-07,2.351272e-07,2.206493e-07,2.448707e-07,1.889143e-07,...,1.889621e-07,1.889621e-07,1.962005e-07,3.434275e-05,2.398229e-07,4.359226e-07,0.0004625539,2.357065e-07,1.890898e-07,1.93173e-07
3,1.951538e-07,2.1473e-07,2.048794e-07,3.004904e-07,2.115837e-07,2.022671e-07,2.547691e-07,2.712124e-07,0.002914184,1.998794e-07,...,3.658458e-05,3.658458e-05,2.117569e-07,2.056823e-07,4.704205e-05,2.175194e-07,2.232687e-07,2.686084e-07,2.571724e-07,1.980412e-07


In [61]:
μs.sum(axis=1)

0    0.948750
1    0.921152
2    0.927662
3    0.927779
dtype: float64

# Visualización del modelo 2

In [62]:
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

pyLDAvis.sklearn.prepare(lda_model, BOW, vectorizer)

  by='saliency', ascending=False).head(R).drop('saliency', 1)
