<a href="https://colab.research.google.com/github/AnIsAsPe/LDATopicModeling_pyspark/blob/main/LDA_con_pySpark_ngrams_lematizacion_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Instalación de PySpark en Colab

In [1]:
!pip install pyspark



In [1]:
import os
os.cpu_count()

4

In [2]:
!echo $(($(getconf _PHYS_PAGES) * $(getconf PAGE_SIZE) / (1024 * 1024)))

26070


## Crear Sesión Colab

In [3]:
# Crear una sesión de spark

from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master("local")\
        .appName("colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

In [4]:
spark

## Importar bibliotecas 

In [5]:
from pyspark.sql.types import *
from pyspark.sql.functions import udf, concat, split, col
from pyspark.ml.feature import RegexTokenizer, NGram, VectorAssembler, CountVectorizer, IDF
from pyspark.ml.clustering import LDA, LocalLDAModel
from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel
from pyspark.ml.feature import  CountVectorizerModel
from pyspark.ml.clustering import LocalLDAModel
from nltk.stem import WordNetLemmatizer


import re
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('wordnet')  #WordNetLemmatizer


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Leer los datos

In [6]:
df = spark.read.csv("/content/drive/MyDrive/Datos/abcnews-date-text.csv",header=True)

print('Cantidad de renglones: ', df.count())

Cantidad de renglones:  1226258


In [7]:
df.printSchema() # explorar la estructura el dataframe

root
 |-- publish_date: string (nullable = true)
 |-- headline_text: string (nullable = true)



In [8]:
df.head(3)

[Row(publish_date='20030219', headline_text='aba decides against community broadcasting licence'),
 Row(publish_date='20030219', headline_text='act fire witnesses must be aware of defamation'),
 Row(publish_date='20030219', headline_text='a g calls for infrastructure protection summit')]

In [9]:
df.show() # default 20 renglones

+------------+--------------------+
|publish_date|       headline_text|
+------------+--------------------+
|    20030219|aba decides again...|
|    20030219|act fire witnesse...|
|    20030219|a g calls for inf...|
|    20030219|air nz staff in a...|
|    20030219|air nz strike to ...|
|    20030219|ambitious olsson ...|
|    20030219|antic delighted w...|
|    20030219|aussie qualifier ...|
|    20030219|aust addresses un...|
|    20030219|australia is lock...|
|    20030219|australia to cont...|
|    20030219|barca take record...|
|    20030219|bathhouse plans m...|
|    20030219|big hopes for lau...|
|    20030219|big plan to boost...|
|    20030219|blizzard buries u...|
|    20030219|brigadier dismiss...|
|    20030219|british combat tr...|
|    20030219|bryant leads lake...|
|    20030219|bushfire victims ...|
+------------+--------------------+
only showing top 20 rows



In [10]:
df.select('headline_text').take(1)

[Row(headline_text='aba decides against community broadcasting licence')]

In [11]:
# accedemos al texto del primer registro
df.select('headline_text').take(1)[0][0]

'aba decides against community broadcasting licence'

In [12]:
type(df)

pyspark.sql.dataframe.DataFrame

Queremos un inice consecutivo, para ello vamos a utilizar solo la columna 'hedline_text' y usando rdd creamos el indice

In [14]:
texts = df.rdd.map(lambda x: x['headline_text'])
headlines = texts.zipWithIndex( )   
type(headlines)

pyspark.rdd.PipelinedRDD

In [17]:
data = spark.createDataFrame(headlines, ["headlines",'index'])
data.show()

+--------------------+-----+
|           headlines|index|
+--------------------+-----+
|aba decides again...|    0|
|act fire witnesse...|    1|
|a g calls for inf...|    2|
|air nz staff in a...|    3|
|air nz strike to ...|    4|
|ambitious olsson ...|    5|
|antic delighted w...|    6|
|aussie qualifier ...|    7|
|aust addresses un...|    8|
|australia is lock...|    9|
|australia to cont...|   10|
|barca take record...|   11|
|bathhouse plans m...|   12|
|big hopes for lau...|   13|
|big plan to boost...|   14|
|blizzard buries u...|   15|
|brigadier dismiss...|   16|
|british combat tr...|   17|
|bryant leads lake...|   18|
|bushfire victims ...|   19|
+--------------------+-----+
only showing top 20 rows



# Preprocesar Texto

## Normalizar (minusculas y quitar puntuación)

In [18]:
# Creamos una función para quitar puntuación (usf -User Define Function )
removePunct = udf(lambda s: s.strip().lower(), StringType())

data_norm = data.withColumn("text", removePunct(data.headlines))

## Tokenizar

In [19]:
# tokenizar
tokenizer = RegexTokenizer(inputCol="text", outputCol="words",
                           gaps=True, pattern=r'\s+', minTokenLength=4)
df_tokens = tokenizer.transform(data_norm)

In [20]:
df_tokens.show()

+--------------------+-----+--------------------+--------------------+
|           headlines|index|                text|               words|
+--------------------+-----+--------------------+--------------------+
|aba decides again...|    0|aba decides again...|[decides, against...|
|act fire witnesse...|    1|act fire witnesse...|[fire, witnesses,...|
|a g calls for inf...|    2|a g calls for inf...|[calls, infrastru...|
|air nz staff in a...|    3|air nz staff in a...|[staff, aust, str...|
|air nz strike to ...|    4|air nz strike to ...|[strike, affect, ...|
|ambitious olsson ...|    5|ambitious olsson ...|[ambitious, olsso...|
|antic delighted w...|    6|antic delighted w...|[antic, delighted...|
|aussie qualifier ...|    7|aussie qualifier ...|[aussie, qualifie...|
|aust addresses un...|    8|aust addresses un...|[aust, addresses,...|
|australia is lock...|    9|australia is lock...|[australia, locke...|
|australia to cont...|   10|australia to cont...|[australia, contr...|
|barca

## Quitar palabras vacías

In [21]:
stopwords = stopwords.words("english")
removeStop=udf(lambda word: [x for x in word if x not in stopwords])
df_tokens=df_tokens.withColumn('noStopWords',removeStop(df_tokens['words']))

In [22]:
df_tokens.show()

+--------------------+-----+--------------------+--------------------+--------------------+
|           headlines|index|                text|               words|         noStopWords|
+--------------------+-----+--------------------+--------------------+--------------------+
|aba decides again...|    0|aba decides again...|[decides, against...|[decides, communi...|
|act fire witnesse...|    1|act fire witnesse...|[fire, witnesses,...|[fire, witnesses,...|
|a g calls for inf...|    2|a g calls for inf...|[calls, infrastru...|[calls, infrastru...|
|air nz staff in a...|    3|air nz staff in a...|[staff, aust, str...|[staff, aust, str...|
|air nz strike to ...|    4|air nz strike to ...|[strike, affect, ...|[strike, affect, ...|
|ambitious olsson ...|    5|ambitious olsson ...|[ambitious, olsso...|[ambitious, olsso...|
|antic delighted w...|    6|antic delighted w...|[antic, delighted...|[antic, delighted...|
|aussie qualifier ...|    7|aussie qualifier ...|[aussie, qualifie...|[aussie, q

## Lematización

In [28]:
lemma = WordNetLemmatizer()
def lematizacion(in_vec):
    out_vec = [lemma.lemmatize(w) for w in in_vec]
    return out_vec

lemma_udf = udf(lambda x:lematizacion(x),ArrayType(StringType()))
df_tokens=df_tokens.withColumn('finalwords',lemma_udf(df_tokens['noStopWords']))


In [29]:
df_tokens.show()

+--------------------+-----+--------------------+--------------------+--------------------+--------------------+
|           headlines|index|                text|               words|         noStopWords|          finalwords|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+
|aba decides again...|    0|aba decides again...|[decides, against...|[decides, communi...|[decides, communi...|
|act fire witnesse...|    1|act fire witnesse...|[fire, witnesses,...|[fire, witnesses,...|[fire, witness, m...|
|a g calls for inf...|    2|a g calls for inf...|[calls, infrastru...|[calls, infrastru...|[call, infrastruc...|
|air nz staff in a...|    3|air nz staff in a...|[staff, aust, str...|[staff, aust, str...|[staff, aust, str...|
|air nz strike to ...|    4|air nz strike to ...|[strike, affect, ...|[strike, affect, ...|[strike, affect, ...|
|ambitious olsson ...|    5|ambitious olsson ...|[ambitious, olsso...|[ambitious, olsso...|[ambi

###N-Grams

## Vectorizar con unigramas y bigramas

En pyspark primero tenemos que crear todos los n-gramas que nos interesen y luego utilizar el modelo CountVectorizer y unirlo todo utilizando VectorAssembler

In [31]:
def build_ngrams(inputCol="finalwords", n=2):

    ngrams = [
        NGram(n=i, inputCol="finalwords", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]

    vectorizers = [
        CountVectorizer(inputCol="{0}_grams".format(i),
            outputCol="{0}_counts".format(i), minDF=20)
        for i in range(1, n + 1)
    ]

    assembler = [VectorAssembler(
        inputCols=["{0}_counts".format(i) for i in range(1, n + 1)],
        outputCol="features_cv"
    )]

    return Pipeline(stages=ngrams + vectorizers + assembler)

In [32]:
%%time
# TF
pipline_vectorizer_ngrams = build_ngrams().fit(df_tokens)


CPU times: user 706 ms, sys: 85.7 ms, total: 792 ms
Wall time: 2min 21s


In [33]:
pipline_vectorizer_ngrams.stages

[NGram_60119ec6142e,
 NGram_d737ad96255a,
 CountVectorizerModel: uid=CountVectorizer_f40d5801c5bf, vocabularySize=16599,
 CountVectorizerModel: uid=CountVectorizer_a612b2d3de70, vocabularySize=19985,
 VectorAssembler_255725244f3f]

In [34]:
vectorizers = [s for s in pipline_vectorizer_ngrams.stages if isinstance(s, CountVectorizerModel)]
vocabArray = [v.vocabulary for v in vectorizers]
len(vocabArray)

2

In [35]:
# Con cuantas palabras y bigramas nos quedamos
palabras=vocabArray[0]
biGramas=vocabArray[1]

In [36]:
# muestra de bigramas
biGramas[0:20]

['gold coast',
 'country hour',
 'donald trump',
 'face court',
 'pleads guilty',
 'asylum seeker',
 'mental health',
 'climate change',
 'police investigate',
 'north korea',
 'police probe',
 'broken hill',
 'share market',
 'rate rise',
 'royal commission',
 'police officer',
 'plane crash',
 'body found',
 'front court',
 'govt urged']

In [38]:
len(palabras), len(biGramas)

(16599, 19985)

In [39]:
vocabulario = palabras + biGramas
len(vocabulario)

36584

__ahora hacemos la transformación con la vectorización hecha para obtener TF__

In [41]:
result_cv = pipline_vectorizer_ngrams.transform(df_tokens)

In [42]:
result_cv.columns  # la vectorización se encuentra en la última columna

['headlines',
 'index',
 'text',
 'words',
 'noStopWords',
 'finalwords',
 '1_grams',
 '2_grams',
 '1_counts',
 '2_counts',
 'features_cv']

In [44]:
result_cv.take(1)

[Row(headlines='aba decides against community broadcasting licence', index=0, text='aba decides against community broadcasting licence', words=['decides', 'against', 'community', 'broadcasting', 'licence'], noStopWords='[decides, community, broadcasting, licence]', finalwords=['decides', 'community', 'broadcasting', 'licence'], 1_grams=['decides', 'community', 'broadcasting', 'licence'], 2_grams=['decides community', 'community broadcasting', 'broadcasting licence'], 1_counts=SparseVector(16599, {111: 1.0, 958: 1.0, 5213: 1.0, 8099: 1.0}), 2_counts=SparseVector(19985, {}), features_cv=SparseVector(36584, {111: 1.0, 958: 1.0, 5213: 1.0, 8099: 1.0}))]

__es el turno de obtener IDF__

In [45]:
%%time
# IDF
idf = IDF(inputCol="features_cv", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv) 

CPU times: user 338 ms, sys: 41.8 ms, total: 380 ms
Wall time: 1min 11s


In [46]:
result_tfidf.take(1)

[Row(headlines='aba decides against community broadcasting licence', index=0, text='aba decides against community broadcasting licence', words=['decides', 'against', 'community', 'broadcasting', 'licence'], noStopWords='[decides, community, broadcasting, licence]', finalwords=['decides', 'community', 'broadcasting', 'licence'], 1_grams=['decides', 'community', 'broadcasting', 'licence'], 2_grams=['decides community', 'community broadcasting', 'broadcasting licence'], 1_counts=SparseVector(16599, {111: 1.0, 958: 1.0, 5213: 1.0, 8099: 1.0}), 2_counts=SparseVector(19985, {}), features_cv=SparseVector(36584, {111: 1.0, 958: 1.0, 5213: 1.0, 8099: 1.0}), features=SparseVector(36584, {111: 5.2528, 958: 6.8735, 5213: 9.0088, 8099: 9.729}))]

## Entrenamiento del Modelo

In [47]:
%%time
num_topics=30
max_iterations=50
lda = LDA(k=num_topics, maxIter=max_iterations)
ldaModel = lda.fit(result_tfidf)

CPU times: user 3.68 s, sys: 467 ms, total: 4.15 s
Wall time: 12min 39s


### Guardar los modelos

In [52]:
# print(ldaModel.isDistributed())
# path = "/content/drive/MyDrive/Modelos/modelosLDA/"

# model_number = '2'
# pipline_vectorizer_ngrams.save(path + 'PipelineVectorizerModel'+ model_number)    # Modelo BOW
# ldaModel.save(path + 'LDAModel'+ model_number)  # Modelo entrenado
# lda.save(path + 'LDA_'+ model_number)
# idfModel.save(path + 'idfModel'+ model_number) 

### Cargar modelos

In [None]:
# path = "/content/drive/MyDrive/Modelos/modelosLDA/"
# model_number = '2'
# pipline_vectorizer_ngrams = PipelineModel.load(path + 'PipelineVectorizerModel'+ model_number )   # Modelo BOW
# #lda = LocalLDAModel.load(path + 'LDA_'+ model_number)
# #ldaModel = LocalLDAModel.load(path + 'LDAModel'+ model_number)               # Modelo entrenado

# Resultados

In [49]:
# Imprimir las palabras más frecuentes por topic
numTopics = 30  # cantidad de topicos a explorar
topics = ldaModel.describeTopics(maxTermsPerTopic=5)
ListOfIndexToWords = udf(lambda wl: list([vocabulario[w] for w in wl]))
FormatNumbers = udf(lambda nl: ["{:1.4f}".format(x) for x in nl])

toptopics = topics.select((topics.topic + 1).alias('topic'),
                          ListOfIndexToWords(topics.termIndices).alias('words'),
                          FormatNumbers(topics.termWeights).alias('weights'))
toptopics.show(truncate=False, n=numTopics)
print('Topics:', numTopics, 'Vocabulary:', len(vocabArray))



+-----+-------------------------------------------------+----------------------------------------+
|topic|words                                            |weights                                 |
+-----+-------------------------------------------------+----------------------------------------+
|1    |[national, rural, return, news, price]           |[0.0186, 0.0184, 0.0175, 0.0117, 0.0107]|
|2    |[budget, fight, cut, back, federal]              |[0.0143, 0.0109, 0.0107, 0.0096, 0.0086]|
|3    |[game, plane, anti, cyclone, murray]             |[0.0145, 0.0129, 0.0123, 0.0120, 0.0119]|
|4    |[rate, probe, trump, rise, break]                |[0.0212, 0.0202, 0.0185, 0.0174, 0.0129]|
|5    |[health, chief, interview, question, change]     |[0.0245, 0.0126, 0.0124, 0.0123, 0.0120]|
|7    |[state, coronavirus, iraq, kill, india]          |[0.0200, 0.0157, 0.0150, 0.0128, 0.0124]|
|8    |[told, defence, turn, five, expected]            |[0.0184, 0.0135, 0.0131, 0.0131, 0.0125]|
|9    |[ro

In [53]:
Topic_Words = ldaModel.topicsMatrix()
Topic_Words

DenseMatrix(36584, 30, [1670.8399, 1717.1773, 1653.722, 1664.0101, 287.5126, 1289.1435, 740.0777, 1117.2444, ..., 0.4158, 0.2885, 0.2121, 0.2544, 0.2653, 0.2467, 0.2516, 0.2675], 0)

In [55]:
Doc_Topic =  ldaModel.transform(result_tfidf)
Doc_Topic

DataFrame[headlines: string, index: bigint, text: string, words: array<string>, noStopWords: string, finalwords: array<string>, 1_grams: array<string>, 2_grams: array<string>, 1_counts: vector, 2_counts: vector, features_cv: vector, features: vector, topicDistribution: vector]

In [56]:
Doc_Topic.show()

+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|           headlines|index|                text|               words|         noStopWords|          finalwords|             1_grams|             2_grams|            1_counts|            2_counts|         features_cv|            features|   topicDistribution|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|aba decides again...|    0|aba decides again...|[decides, against...|[decides, communi...|[decides, communi...|[decides, communi...|[decides communit...|(16599,[111,958,5...|       (19985,[],[])|(36584,[111,958,5...|(36

In [59]:
len(Doc_Topic.select('topicDistribution').take(1)[0][0])

30

In [89]:
M_doc_top = Doc_Topic.rdd.map(lambda x: x['topicDistribution'])
distribucion_doc = M_doc_top.zipWithIndex( )   
Documentos_topics = spark.createDataFrame( distribucion_doc, ['topicDistribution','index' ])
Documentos_topics.show(2)

+--------------------+-----+
|   topicDistribution|index|
+--------------------+-----+
|[0.00104592518438...|    0|
|[9.55270301542676...|    1|
+--------------------+-----+
only showing top 2 rows



In [91]:
# Documentos_topicos = Documentos_topics.toPandas()
# Doc_top = Documentos_topicos['topicDistribution'].apply(lambda x: pd.Series(x.toArray()))

In [None]:
# !pip install pyLDAvis 

In [None]:
# #https://pyldavis.readthedocs.io/en/latest/modules/API.html#pyLDAvis.prepare
# visualizacion = pyLDAvis.prepare(topic_term_dists = 
#                                  doc_topic_dists = 
#                                  doc_lengths = 
#                                  vocab = 
#                                  term_frequency = 
#                                  )
# visualizacion