<a href="https://colab.research.google.com/github/AnIsAsPe/LDA-TopicModeling_python/blob/main/Notebooks/LDA_con_sklearn_noticias_2020.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Instalar y cargar bibliotecas

In [1]:
!pip install pyLDAvis  #biblioteca que extrae información de un modelo LDA para obtener una visualización interactiva

Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 6.5 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting numpy>=1.20.0
  Downloading numpy-1.21.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
[K     |████████████████████████████████| 15.7 MB 27.4 MB/s 
Collecting funcy
  Downloading funcy-1.16-py2.py3-none-any.whl (32 kB)
Collecting pandas>=1.2.0
  Downloading pandas-1.3.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.3 MB)
[K     |████████████████████████████████| 11.3 MB 25.9 MB/s 
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (PEP 517) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-3.3.1-py2.py3-none-any.whl size=136897 sha256=cc1b6a111c7aa

In [15]:
import pandas as pd
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import pyLDAvis
import matplotlib.pyplot as plt 
import seaborn as sns  

nltk.download('stopwords')
nltk.download('wordnet')  #WordNetLemmatizer
nltk.download('punkt')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Lectura y exploración de datos

En colab, al instalar la librería pyLDAvis causa un conflicto con Pandas. Si esto se refleja al correr la siguiente linea es necesario reiniciar el entorno de ejecución.

In [2]:
df = pd.read_csv('/content/drive/MyDrive/Datos/abcnews-date-text.csv', parse_dates=['publish_date'] )
print(df.shape)
df.head()

(1226258, 2)


Unnamed: 0,publish_date,headline_text
0,2003-02-19,aba decides against community broadcasting lic...
1,2003-02-19,act fire witnesses must be aware of defamation
2,2003-02-19,a g calls for infrastructure protection summit
3,2003-02-19,air nz staff in aust strike for pay rise
4,2003-02-19,air nz strike to affect australian travellers


In [3]:
df.publish_date.min(), df.publish_date.max()

(Timestamp('2003-02-19 00:00:00'), Timestamp('2020-12-31 00:00:00'))

In [5]:
# promedio de noticias por día
len(df)/(df.publish_date.max()-df.publish_date.min()).days


187.93226053639847

In [6]:

filtro = (df['publish_date']>= '2020-01-01')&(df['publish_date']<= '2020-12-31')
df = df[filtro]
df = df.reset_index(drop=True )
len(df)

40240

# Vectorización de textos

In [7]:
def preprocesar(texto):
  #convierte a minúsculas
  texto = (texto).lower()

  #elimina stopwords
  stop = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
  texto = stop.sub('', texto) 

  #lematizar y quedarnos con palabras que tengan más de tres caracteres
  lemmatizer = WordNetLemmatizer()
  texto = texto.split()
  texto = ' '.join([lemmatizer.lemmatize(i) for i in texto if len(i)>3])
  
  return(texto)

In [8]:
%%time
df['headline_pp']=df['headline_text'].apply(preprocesar)
df.head()

CPU times: user 8.36 s, sys: 615 ms, total: 8.98 s
Wall time: 8.95 s


In [9]:
df.head()

Unnamed: 0,publish_date,headline_text,headline_pp
0,2020-01-01,a new type of resolution for the new year,type resolution year
1,2020-01-01,adelaide records driest year in more than a de...,adelaide record driest year decade
2,2020-01-01,adelaide riverbank catches alight after new ye...,adelaide riverbank catch alight year firework
3,2020-01-01,adelaides 9pm fireworks spark blaze on riverbank,adelaide firework spark blaze riverbank
4,2020-01-01,archaic legislation governing nt women propert...,archaic legislation governing woman property r...


In [10]:
vectorizer = TfidfVectorizer(min_df=3, ngram_range=(1,2))
BOW = vectorizer.fit_transform(df['headline_pp'])
BOW.shape

(40240, 16579)

In [12]:
vocabulario = vectorizer.get_feature_names_out()
len(vocabulario)

16579

In [14]:
list(vocabulario)

['0501',
 '1000',
 '10000',
 '100000',
 '100k',
 '100m',
 '100th',
 '100th birthday',
 '1080',
 '10th',
 '10yo',
 '11th',
 '12th',
 '130',
 '130 billion',
 '13th',
 '14th',
 '14yo',
 '14yo girl',
 '150',
 '150 million',
 '1500',
 '15th',
 '15yo',
 '16th',
 '17th',
 '17yo',
 '18th',
 '18yo',
 '1970s',
 '1980s',
 '1983',
 '1988',
 '1990s',
 '1995',
 '1996',
 '1999',
 '19th',
 '19yo',
 '2000',
 '2000 olympics',
 '20000',
 '200000',
 '2006',
 '2008',
 '200m',
 '2010',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '2020 australia',
 '2020 carpet',
 '2020 coronavirus',
 '2020 covid',
 '2020 election',
 '2020 grand',
 '2020 labor',
 '2020 live',
 '2020 need',
 '2020 nobel',
 '2020 olympics',
 '2020 oscar',
 '2020 presidential',
 '2020 season',
 '2020 vote',
 '2020 winner',
 '2020 year',
 '2021',
 '2021 coronavirus',
 '2021 election',
 '2021 season',
 '2022',
 '2023',
 '2024',
 '2025',
 '2050',
 '20th',
 '20yo',
 '21st',
 '23rd',
 '24th',
 '2500',
 '25000',
 '25km',


# Entrenamiento del modelo

In [14]:

lda_model=LatentDirichletAllocation( n_components=5,learning_method='online',random_state=42,max_iter=50) 

In [17]:
# %%time
# lda_model.fit(BOW) # entrena el modelo y obtienela matriz documento-topico

### Matriz documento tópico

In [18]:
doc_top = pd.DataFrame(lda_model.transform(BOW))
print(doc_top.shape)
doc_top.head()

(40240, 5)


Unnamed: 0,0,1,2,3,4
0,0.39443,0.380654,0.074977,0.074743,0.075196
1,0.199485,0.630972,0.056487,0.056418,0.056639
2,0.165149,0.666773,0.056001,0.055938,0.056139
3,0.062683,0.458073,0.062562,0.062565,0.354117
4,0.502484,0.067217,0.067445,0.295285,0.067569


In [20]:
pd.merge(df, doc_top, left_index=True, right_index=True)

Unnamed: 0,publish_date,headline_text,headline_pp,0,1,2,3,4
0,2020-01-01,a new type of resolution for the new year,type resolution year,0.394430,0.380654,0.074977,0.074743,0.075196
1,2020-01-01,adelaide records driest year in more than a de...,adelaide record driest year decade,0.199485,0.630972,0.056487,0.056418,0.056639
2,2020-01-01,adelaide riverbank catches alight after new ye...,adelaide riverbank catch alight year firework,0.165149,0.666773,0.056001,0.055938,0.056139
3,2020-01-01,adelaides 9pm fireworks spark blaze on riverbank,adelaide firework spark blaze riverbank,0.062683,0.458073,0.062562,0.062565,0.354117
4,2020-01-01,archaic legislation governing nt women propert...,archaic legislation governing woman property r...,0.502484,0.067217,0.067445,0.295285,0.067569
...,...,...,...,...,...,...,...,...
40235,2020-12-31,what abc readers learned from 2020 looking bac...,reader learned 2020 looking back year,0.386176,0.183985,0.322685,0.053479,0.053675
40236,2020-12-31,what are the south african and uk variants of ...,south african variant covid,0.063945,0.309327,0.498443,0.063974,0.064311
40237,2020-12-31,what victorias coronavirus restrictions mean f...,victoria coronavirus restriction mean year,0.054950,0.660306,0.175077,0.054712,0.054956
40238,2020-12-31,whats life like as an american doctor during c...,whats life like american doctor covid,0.493583,0.335155,0.058012,0.056906,0.056344


In [None]:
doc_top.sum(axis=1)

## Matriz tópicos-tokens

In [24]:
(lda_model.components_).shape

(5, 16579)

In [None]:
top_tokens= pd.DataFrame(lda_model.exp_dirichlet_component_,
                         columns=vocabulario)
print(top_tokens.shape)
top_tokens.head()

In [None]:
top_tokens.sum(axis=1)

## Perplejidad del modelo

In [None]:
%%time
lda_model.perplexity( BOW, sub_sampling = False)

# Visualización del modelo

In [25]:
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

pyLDAvis.sklearn.prepare(lda_model, BOW, vectorizer)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


# Guardamos modelo

In [None]:
import pickle
path = '/content/drive/MyDrive/Modelos/modelosLDA/LDA_sklearn/'
tuple_models = (lda_model, BOW, vectorizer)
pickle.dump(tuple_models, open (path + "tuple_model_news2020.pkl", 'wb'))

# Leemos el guardado modelo

In [17]:
import pickle
path = '/content/drive/MyDrive/Modelos/modelosLDA/LDA_sklearn/'
lda_model, BOW, vectorizer = pickle.load(open(path + "tuple_model_news2020.pkl", 'rb'))


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
