<a href="https://colab.research.google.com/github/AnIsAsPe/LDA-TopicModeling_python/blob/main/Notebooks/LDA_con_sklearn_noticias_2019.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Instalar y cargar bibliotecas

In [1]:
!pip install pyLDAvis  #biblioteca que extrae información de un modelo LDA para obtener una visualización interactiva

Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[?25l[K     |▏                               | 10 kB 19.2 MB/s eta 0:00:01[K     |▍                               | 20 kB 22.3 MB/s eta 0:00:01[K     |▋                               | 30 kB 12.2 MB/s eta 0:00:01[K     |▉                               | 40 kB 9.9 MB/s eta 0:00:01[K     |█                               | 51 kB 7.0 MB/s eta 0:00:01[K     |█▏                              | 61 kB 8.1 MB/s eta 0:00:01[K     |█▍                              | 71 kB 5.6 MB/s eta 0:00:01[K     |█▋                              | 81 kB 6.2 MB/s eta 0:00:01[K     |█▉                              | 92 kB 6.8 MB/s eta 0:00:01[K     |██                              | 102 kB 7.5 MB/s eta 0:00:01[K     |██▏                             | 112 kB 7.5 MB/s eta 0:00:01[K     |██▍                             | 122 kB 7.5 MB/s eta 0:00:01[K     |██▋                             | 133 kB 7.5 MB/s eta 0:00:01[K     |██

In [2]:
import pandas as pd
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import pyLDAvis
import matplotlib.pyplot as plt 
import seaborn as sns  

nltk.download('stopwords')
nltk.download('wordnet')  #WordNetLemmatizer
nltk.download('punkt')


  from collections import Iterable
  from collections import Mapping


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Lectura y exploración de datos

En colab, al instalar la librería pyLDAvis causa un conflicto con Pandas. Si esto se refleja al correr la siguiente linea es necesario reiniciar el entorno de ejecución.

In [3]:
df = pd.read_csv('/content/drive/MyDrive/Datos/ABC News/abcnews-date-text.csv', parse_dates=['publish_date'] )
print(df.shape)
df.head()

(1226258, 2)


Unnamed: 0,publish_date,headline_text
0,2003-02-19,aba decides against community broadcasting lic...
1,2003-02-19,act fire witnesses must be aware of defamation
2,2003-02-19,a g calls for infrastructure protection summit
3,2003-02-19,air nz staff in aust strike for pay rise
4,2003-02-19,air nz strike to affect australian travellers


In [4]:
df.publish_date.min(), df.publish_date.max()

(Timestamp('2003-02-19 00:00:00'), Timestamp('2020-12-31 00:00:00'))

In [5]:
# promedio de noticias por día
len(df)/(df.publish_date.max()-df.publish_date.min()).days


187.93226053639847

In [6]:

filtro = (df['publish_date']>= '2019-01-01')&(df['publish_date']<= '2019-12-31')
df = df[filtro]
df = df.reset_index(drop=True )
len(df)

34060

# Vectorización de textos

In [7]:
def preprocesar(texto):
  #convierte a minúsculas
  texto = (texto).lower()

  #elimina stopwords
  stop = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
  texto = stop.sub('', texto) 

  #Quitar puntuación y números
  texto = re.sub('[^ña-z]+', ' ', texto)

  #lematizar y quedarnos con palabras que tengan más de tres caracteres
  lemmatizer = WordNetLemmatizer()
  texto = texto.split()
  texto = ' '.join([lemmatizer.lemmatize(i) for i in texto if len(i)>2])
  
  return(texto)

In [8]:
df['headline_pp']=df['headline_text'].apply(preprocesar)
df.head()

Unnamed: 0,publish_date,headline_text,headline_pp
0,2019-01-01,after expo ministers approved an artificial is...,expo minister approved artificial island
1,2019-01-01,alyssa healy named worlds best womens t20 play...,alyssa healy named world best woman player
2,2019-01-01,australia called a free rider on tackling clim...,australia called free rider tackling climate c...
3,2019-01-01,australia still has no us ambassador,australia still ambassador
4,2019-01-01,bangladesh ruling coalition declared winner of...,bangladesh ruling coalition declared winner di...


In [9]:
vectorizer = CountVectorizer(min_df=3, ngram_range=(1,2))
BOW = vectorizer.fit_transform(df['headline_pp'])
BOW.shape

(34060, 15310)

In [10]:
vocabulario = vectorizer.get_feature_names_out()
len(vocabulario)

15310

In [11]:
list(vocabulario)[0:30]

['aaron',
 'aaron elli',
 'aaron finch',
 'ab',
 'ab data',
 'abalone',
 'abandon',
 'abandoned',
 'abares',
 'abares crop',
 'abattoir',
 'abbey',
 'abbey road',
 'abbot',
 'abbot point',
 'abbott',
 'abbott zali',
 'abbotts',
 'abc',
 'abc chair',
 'abc investigation',
 'abc managing',
 'abc news',
 'abc office',
 'abc presenter',
 'abc radio',
 'abc raid',
 'abc report',
 'abc reporter',
 'abc weather']

# Entrenamiento del modelo

In [12]:
lda_model=LatentDirichletAllocation(n_components=5,learning_method='online',random_state=42,max_iter=50) 

In [13]:
%%time
lda_model.fit(BOW) # entrena el modelo y obtienela matriz documento-topico

CPU times: user 3min 14s, sys: 1.13 s, total: 3min 15s
Wall time: 3min 16s


LatentDirichletAllocation(learning_method='online', max_iter=50, n_components=5,
                          random_state=42)

### Distribución de temas en cada noticia  (O)

In [14]:
doc_top = pd.DataFrame(lda_model.transform(BOW))
print(doc_top.shape)
doc_top.head()

(34060, 5)


Unnamed: 0,0,1,2,3,4
0,0.033335,0.366004,0.033555,0.033335,0.533772
1,0.020001,0.020654,0.02005,0.319527,0.619768
2,0.699813,0.140241,0.119797,0.020097,0.020052
3,0.050313,0.05058,0.05,0.798849,0.050258
4,0.025001,0.025001,0.025308,0.776513,0.148176


In [15]:
doc_top.sum(axis=1)

0        1.0
1        1.0
2        1.0
3        1.0
4        1.0
        ... 
34055    1.0
34056    1.0
34057    1.0
34058    1.0
34059    1.0
Length: 34060, dtype: float64

In [16]:
pd.merge(df, doc_top, left_index=True, right_index=True)

Unnamed: 0,publish_date,headline_text,headline_pp,0,1,2,3,4
0,2019-01-01,after expo ministers approved an artificial is...,expo minister approved artificial island,0.033335,0.366004,0.033555,0.033335,0.533772
1,2019-01-01,alyssa healy named worlds best womens t20 play...,alyssa healy named world best woman player,0.020001,0.020654,0.020050,0.319527,0.619768
2,2019-01-01,australia called a free rider on tackling clim...,australia called free rider tackling climate c...,0.699813,0.140241,0.119797,0.020097,0.020052
3,2019-01-01,australia still has no us ambassador,australia still ambassador,0.050313,0.050580,0.050000,0.798849,0.050258
4,2019-01-01,bangladesh ruling coalition declared winner of...,bangladesh ruling coalition declared winner di...,0.025001,0.025001,0.025308,0.776513,0.148176
...,...,...,...,...,...,...,...,...
34055,2019-12-31,vision of flames approaching corryong in victoria,vision flame approaching corryong victoria,0.239760,0.040001,0.440168,0.040001,0.240071
34056,2019-12-31,wa police and government backflip on drug amne...,police government backflip drug amnesty bin,0.157379,0.274911,0.025073,0.517528,0.025110
34057,2019-12-31,we have fears for their safety: victorian premier,fear safety victorian premier,0.033334,0.034067,0.199996,0.366604,0.366000
34058,2019-12-31,when do the 20s start,start,0.100000,0.599999,0.100000,0.100000,0.100000


## Distribución de palabras en cada tema ($\mu$)

In [17]:
μs = pd.DataFrame(lda_model.exp_dirichlet_component_,
                         columns=vocabulario)
print(μs.shape)
μs.head()

(5, 15310)


Unnamed: 0,aaron,aaron elli,aaron finch,ab,ab data,abalone,abandon,abandoned,abares,abares crop,...,zipline,zlatan,zoe,zombie,zone,zone change,zoo,zoom,zuckerberg,zverev
0,1.165882e-07,1.142381e-07,1.041173e-07,1.128111e-07,1.050417e-07,1.074401e-07,1.099981e-07,0.000167048,0.0001401724,5.769569e-05,...,1.068557e-07,1.133271e-07,1.143372e-07,0.0002774461,0.0004424139,4.82528e-05,1.165312e-07,1.05378e-07,1.062255e-07,1.155185e-07
1,1.008305e-07,1.010258e-07,1.017063e-07,0.0002004628,1.018411e-07,1.080892e-07,0.0002389434,1.010924e-07,9.601201e-08,9.617893e-08,...,1.042736e-07,9.991715e-08,5.576734e-05,1.20893e-07,1.122073e-07,9.578523e-08,1.080009e-07,4.933455e-05,0.0001132896,1.53624e-05
2,1.00414e-07,9.837552e-08,9.873605e-08,1.025169e-07,9.923577e-08,7.615516e-05,1.130018e-07,0.0001135953,1.010884e-07,1.029202e-07,...,1.010977e-07,6.006067e-05,1.097432e-07,1.032753e-07,1.056264e-07,9.775623e-08,1.093966e-07,1.008333e-07,9.784042e-08,9.738872e-08
3,0.0002768259,7.213952e-05,6.643189e-05,1.062265e-07,9.763503e-08,1.062207e-07,1.087872e-07,1.198866e-07,9.818551e-08,9.764118e-08,...,1.033816e-07,1.026995e-07,1.017436e-07,1.030692e-07,1.069405e-07,9.726929e-08,0.0004039032,1.099081e-07,9.939553e-08,9.784671e-08
4,1.073122e-07,1.033457e-07,1.072838e-07,1.97616e-07,5.367641e-05,1.078152e-07,1.119844e-07,1.115558e-07,1.042809e-07,1.020046e-07,...,5.731894e-05,1.020243e-07,1.403873e-07,1.174925e-07,1.148552e-07,1.082318e-07,1.067244e-07,1.034218e-07,1.058423e-07,2.502929e-05


In [18]:
μs.sum(axis=1)

0    0.917485
1    0.925248
2    0.923990
3    0.924217
4    0.920748
dtype: float64

# Visualización del modelo

In [19]:
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

pyLDAvis.sklearn.prepare(lda_model, BOW, vectorizer)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


# Guardamos modelo

In [20]:
import pickle
path = '/content/drive/MyDrive/Modelos/modelosLDA/LDA_sklearn/'
tuple_models = (lda_model, BOW, vectorizer)
pickle.dump(tuple_models, open (path + "tuple_model_news2019.pkl", 'wb'))

# Leemos el guardado modelo

In [21]:
import pickle
path = '/content/drive/MyDrive/Modelos/modelosLDA/LDA_sklearn/'
lda_model, BOW, vectorizer = pickle.load(open(path + "tuple_model_news2019.pkl", 'rb'))


# Entrenamiento del modelo 2

In [22]:
lda_model2=LatentDirichletAllocation(n_components=4,learning_method='online',random_state=42,max_iter=50) 

In [23]:
%%time
lda_model2.fit(BOW) # entrena el modelo y obtienela matriz documento-topico

CPU times: user 3min 9s, sys: 1.28 s, total: 3min 10s
Wall time: 3min 10s


LatentDirichletAllocation(learning_method='online', max_iter=50, n_components=4,
                          random_state=42)

### Distribución de temas en cada noticia  (O)

In [24]:
doc_top = pd.DataFrame(lda_model2.transform(BOW))
print(doc_top.shape)
doc_top.head()

(34060, 4)


Unnamed: 0,0,1,2,3
0,0.041671,0.874987,0.041671,0.041671
1,0.02556,0.490073,0.025004,0.459363
2,0.805412,0.026529,0.141892,0.026167
3,0.062763,0.064329,0.062501,0.810407
4,0.031256,0.354974,0.03336,0.58041


In [25]:
doc_top.sum(axis=1)

0        1.0
1        1.0
2        1.0
3        1.0
4        1.0
        ... 
34055    1.0
34056    1.0
34057    1.0
34058    1.0
34059    1.0
Length: 34060, dtype: float64

In [26]:
pd.merge(df, doc_top, left_index=True, right_index=True)

Unnamed: 0,publish_date,headline_text,headline_pp,0,1,2,3
0,2019-01-01,after expo ministers approved an artificial is...,expo minister approved artificial island,0.041671,0.874987,0.041671,0.041671
1,2019-01-01,alyssa healy named worlds best womens t20 play...,alyssa healy named world best woman player,0.025560,0.490073,0.025004,0.459363
2,2019-01-01,australia called a free rider on tackling clim...,australia called free rider tackling climate c...,0.805412,0.026529,0.141892,0.026167
3,2019-01-01,australia still has no us ambassador,australia still ambassador,0.062763,0.064329,0.062501,0.810407
4,2019-01-01,bangladesh ruling coalition declared winner of...,bangladesh ruling coalition declared winner di...,0.031256,0.354974,0.033360,0.580410
...,...,...,...,...,...,...,...
34055,2019-12-31,vision of flames approaching corryong in victoria,vision flame approaching corryong victoria,0.401907,0.050004,0.279421,0.268668
34056,2019-12-31,wa police and government backflip on drug amne...,police government backflip drug amnesty bin,0.032203,0.656093,0.031252,0.280452
34057,2019-12-31,we have fears for their safety: victorian premier,fear safety victorian premier,0.041667,0.874828,0.041668,0.041837
34058,2019-12-31,when do the 20s start,start,0.125002,0.624994,0.125002,0.125002


## Distribución de palabras en cada tema ($\mu$)

In [27]:
μs = pd.DataFrame(lda_model2.exp_dirichlet_component_,
                         columns=vocabulario)
print(μs.shape)
μs.head()

(4, 15310)


Unnamed: 0,aaron,aaron elli,aaron finch,ab,ab data,abalone,abandon,abandoned,abares,abares crop,...,zipline,zlatan,zoe,zombie,zone,zone change,zoo,zoom,zuckerberg,zverev
0,2.956196e-07,2.96465e-07,2.256042e-07,2.332032e-07,2.447474e-07,2.558697e-07,0.0001897022,0.0002187125,0.0001043556,4.332911e-05,...,3.907076e-07,2.464007e-07,4.523316e-05,0.000149583,0.0003278381,3.640268e-05,2.555242e-07,2.444664e-07,2.259424e-07,2.287317e-07
1,2.469087e-07,2.29726e-07,2.741597e-07,0.0001624298,4.09888e-05,2.604014e-07,2.241632e-06,2.455758e-07,2.336722e-07,2.291994e-07,...,4.262101e-05,2.554289e-07,3.320083e-07,5.291372e-05,3.783016e-07,2.395008e-07,0.0003211833,4.052326e-05,4.107143e-07,2.579677e-07
2,2.589788e-07,2.743651e-07,2.455775e-07,2.954051e-07,2.630407e-07,6.674072e-05,3.720522e-07,2.754605e-07,2.834117e-07,2.770335e-07,...,4.831776e-07,5.283615e-05,3.7263e-07,2.773253e-07,3.197514e-07,2.46962e-07,3.187326e-07,2.44616e-07,0.0001002482,3.907181e-05
3,0.0002261288,5.92652e-05,5.478419e-05,2.729401e-07,2.531924e-07,2.472064e-07,2.850546e-07,2.96976e-07,2.568277e-07,2.465316e-07,...,2.705113e-07,2.282317e-07,2.794262e-07,2.639823e-07,2.653619e-07,2.451719e-07,2.587686e-07,2.282723e-07,3.101984e-07,2.332814e-07


In [28]:
μs.sum(axis=1)

0    0.927463
1    0.927860
2    0.922533
3    0.925351
dtype: float64

# Visualización del modelo 2

In [30]:
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

pyLDAvis.sklearn.prepare(lda_model2, BOW, vectorizer)

  by='saliency', ascending=False).head(R).drop('saliency', 1)
