<a href="https://colab.research.google.com/github/AnIsAsPe/LDATopicModeling_pyspark/blob/main/LDA_con_pySpark_ngrams_lematizacion_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Instalación de PySpark en Colab

In [2]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/89/db/e18cfd78e408de957821ec5ca56de1250645b05f8523d169803d8df35a64/pyspark-3.1.2.tar.gz (212.4MB)
[K     |████████████████████████████████| 212.4MB 84kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 22.2MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=737463d20f8a73e32e41265b25ad5f4ed75e1d1c968ee395e27b11723bcbae8f
  Stored in directory: /root/.cache/pip/wheels/40/1b/2c/30f43be2627857ab80062bef1527c0128f7b4070b6b2d02139
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


In [3]:
import os
os.cpu_count()

2

In [4]:
!echo $(($(getconf _PHYS_PAGES) * $(getconf PAGE_SIZE) / (1024 * 1024)))

12993


## Crear Sesión Colab

In [5]:
# Crear una sesión de spark

from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master("local")\
        .appName("colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

In [6]:
spark

## Importar bibliotecas 

In [7]:
from pyspark.sql.types import *
from pyspark.sql.functions import udf, concat, split, col
from pyspark.ml.feature import RegexTokenizer, NGram, VectorAssembler, CountVectorizer, IDF
from pyspark.ml.clustering import LDA, LocalLDAModel
from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel
from pyspark.ml.feature import  CountVectorizerModel
from pyspark.ml.clustering import LocalLDAModel
from nltk.stem import WordNetLemmatizer


import re
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('wordnet')  #WordNetLemmatizer


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

# Leer los datos

In [10]:
df = spark.read.csv("/content/drive/MyDrive/Datos/abcnews-date-text.csv",header=True)

print('Cantidad de renglones: ', df.count())

Cantidad de renglones:  1226258


In [11]:
df.printSchema() # explorar la estructura el dataframe

root
 |-- publish_date: string (nullable = true)
 |-- headline_text: string (nullable = true)



In [12]:
df.head(3)

[Row(publish_date='20030219', headline_text='aba decides against community broadcasting licence'),
 Row(publish_date='20030219', headline_text='act fire witnesses must be aware of defamation'),
 Row(publish_date='20030219', headline_text='a g calls for infrastructure protection summit')]

In [13]:
df.show(10) # default 20 renglones

+------------+--------------------+
|publish_date|       headline_text|
+------------+--------------------+
|    20030219|aba decides again...|
|    20030219|act fire witnesse...|
|    20030219|a g calls for inf...|
|    20030219|air nz staff in a...|
|    20030219|air nz strike to ...|
|    20030219|ambitious olsson ...|
|    20030219|antic delighted w...|
|    20030219|aussie qualifier ...|
|    20030219|aust addresses un...|
|    20030219|australia is lock...|
+------------+--------------------+
only showing top 10 rows



In [14]:
df.select('headline_text').take(1)

[Row(headline_text='aba decides against community broadcasting licence')]

In [15]:
# raw text of the first entry 
df.select('headline_text').head(1)[0][0]

'aba decides against community broadcasting licence'

In [16]:
type(df)

pyspark.sql.dataframe.DataFrame

Queremos un inice consecutivo, para ello vamos a utilizar solo la columna 'hedline_text' y usando rdd creamos el indice

In [17]:
texts = df.rdd.map(lambda x: x['headline_text'])
headlines=texts.zipWithIndex( )                    

In [18]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(spark)
#Creating dataframe
data = sqlContext.createDataFrame(headlines, ["headlines",'index'])

# Preprocesar Texto

## Normalizar y tokenizar

In [19]:
removePunct = udf(lambda s: s.strip().lower(), StringType())

data_norm = data.withColumn("text", removePunct(data.headlines))

In [20]:
# tokenize 
tokenizer = RegexTokenizer(inputCol="text", outputCol="words",
                           gaps=True, pattern=r'\s+', minTokenLength=4)
df_tokens = tokenizer.transform(data_norm)

In [21]:
df_tokens.show()

+--------------------+-----+--------------------+--------------------+
|           headlines|index|                text|               words|
+--------------------+-----+--------------------+--------------------+
|aba decides again...|    0|aba decides again...|[decides, against...|
|act fire witnesse...|    1|act fire witnesse...|[fire, witnesses,...|
|a g calls for inf...|    2|a g calls for inf...|[calls, infrastru...|
|air nz staff in a...|    3|air nz staff in a...|[staff, aust, str...|
|air nz strike to ...|    4|air nz strike to ...|[strike, affect, ...|
|ambitious olsson ...|    5|ambitious olsson ...|[ambitious, olsso...|
|antic delighted w...|    6|antic delighted w...|[antic, delighted...|
|aussie qualifier ...|    7|aussie qualifier ...|[aussie, qualifie...|
|aust addresses un...|    8|aust addresses un...|[aust, addresses,...|
|australia is lock...|    9|australia is lock...|[australia, locke...|
|australia to cont...|   10|australia to cont...|[australia, contr...|
|barca

## Removing stopwords

In [22]:
stopwords = stopwords.words("english")
removeStop=udf(lambda word: [x for x in word if x not in stopwords])
df_tokens=df_tokens.withColumn('noStopWords',removeStop(df_tokens['words']))

In [23]:
df_tokens.show()

+--------------------+-----+--------------------+--------------------+--------------------+
|           headlines|index|                text|               words|         noStopWords|
+--------------------+-----+--------------------+--------------------+--------------------+
|aba decides again...|    0|aba decides again...|[decides, against...|[decides, communi...|
|act fire witnesse...|    1|act fire witnesse...|[fire, witnesses,...|[fire, witnesses,...|
|a g calls for inf...|    2|a g calls for inf...|[calls, infrastru...|[calls, infrastru...|
|air nz staff in a...|    3|air nz staff in a...|[staff, aust, str...|[staff, aust, str...|
|air nz strike to ...|    4|air nz strike to ...|[strike, affect, ...|[strike, affect, ...|
|ambitious olsson ...|    5|ambitious olsson ...|[ambitious, olsso...|[ambitious, olsso...|
|antic delighted w...|    6|antic delighted w...|[antic, delighted...|[antic, delighted...|
|aussie qualifier ...|    7|aussie qualifier ...|[aussie, qualifie...|[aussie, q

## Lematización

In [24]:
lemma= WordNetLemmatizer()
def lematizacion(in_vec):
    out_vec = [lemma.lemmatize(w) for w in in_vec]
    return out_vec

lemma_udf = udf(lambda x:lematizacion(x),ArrayType(StringType()))
df_tokens=df_tokens.withColumn('finalwords',lemma_udf(df_tokens['noStopWords']))


In [25]:
df_tokens.show()

+--------------------+-----+--------------------+--------------------+--------------------+--------------------+
|           headlines|index|                text|               words|         noStopWords|          finalwords|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+
|aba decides again...|    0|aba decides again...|[decides, against...|[decides, communi...|[decides, communi...|
|act fire witnesse...|    1|act fire witnesse...|[fire, witnesses,...|[fire, witnesses,...|[fire, witness, m...|
|a g calls for inf...|    2|a g calls for inf...|[calls, infrastru...|[calls, infrastru...|[call, infrastruc...|
|air nz staff in a...|    3|air nz staff in a...|[staff, aust, str...|[staff, aust, str...|[staff, aust, str...|
|air nz strike to ...|    4|air nz strike to ...|[strike, affect, ...|[strike, affect, ...|[strike, affect, ...|
|ambitious olsson ...|    5|ambitious olsson ...|[ambitious, olsso...|[ambitious, olsso...|[ambi

In [26]:
df_tokens.printSchema()

root
 |-- headlines: string (nullable = true)
 |-- index: long (nullable = true)
 |-- text: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- noStopWords: string (nullable = true)
 |-- finalwords: array (nullable = true)
 |    |-- element: string (containsNull = true)



###N-Grams

## Vectorizar con unigramas y bigramas

En pyspark tenemos que primero que crear todos los n-gramas que nos interesen y luego utilizar el modelo CountVectorizer y unirlo todo utilizando VectorAssembler

In [27]:
def build_ngrams(inputCol="finalwords", n=2):

    ngrams = [
        NGram(n=i, inputCol="finalwords", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]

    vectorizers = [
        CountVectorizer(inputCol="{0}_grams".format(i),
            outputCol="{0}_counts".format(i), minDF=20)
        for i in range(1, n + 1)
    ]

    assembler = [VectorAssembler(
        inputCols=["{0}_counts".format(i) for i in range(1, n + 1)],
        outputCol="features_cv"
    )]

    return Pipeline(stages=ngrams + vectorizers + assembler)

In [28]:
%%time
# TF
pipline_vectorizer_ngrams = build_ngrams().fit(df_tokens)


CPU times: user 577 ms, sys: 70.7 ms, total: 648 ms
Wall time: 2min 29s


In [29]:
pipline_vectorizer_ngrams.stages

[NGram_425ddb78a44e,
 NGram_a4e1798d38dd,
 CountVectorizerModel: uid=CountVectorizer_2843d8e3b008, vocabularySize=16599,
 CountVectorizerModel: uid=CountVectorizer_cd1a46cd2af6, vocabularySize=19985,
 VectorAssembler_75e2e8977f81]

In [30]:
vectorizers = [s for s in pipline_vectorizer_ngrams.stages if isinstance(s, CountVectorizerModel)]
vocabArray = [v.vocabulary for v in vectorizers]
len(vocabArray)

2

In [31]:
# Con cuantas palabras y bigramas nos quedamos
palabras=vocabArray[0]
biGramas=vocabArray[1]

In [32]:
# muestra de bigramas
biGramas[0:20]

['gold coast',
 'country hour',
 'donald trump',
 'face court',
 'pleads guilty',
 'asylum seeker',
 'mental health',
 'climate change',
 'police investigate',
 'north korea',
 'police probe',
 'broken hill',
 'share market',
 'rate rise',
 'royal commission',
 'police officer',
 'plane crash',
 'body found',
 'front court',
 'govt urged']

In [33]:
vocabulario = palabras + biGramas
len(vocabulario)

36584

__ahora hacemos la transformación con la vectorización hecha para obtener TF__

In [34]:
%%time
result_cv = pipline_vectorizer_ngrams.transform(df_tokens)

CPU times: user 20.4 ms, sys: 1.5 ms, total: 21.9 ms
Wall time: 803 ms


In [35]:
result_cv.columns  # la vectorización se encuentra en la última columna

['headlines',
 'index',
 'text',
 'words',
 'noStopWords',
 'finalwords',
 '1_grams',
 '2_grams',
 '1_counts',
 '2_counts',
 'features_cv']

In [36]:
result_cv.take(1)

[Row(headlines='aba decides against community broadcasting licence', index=0, text='aba decides against community broadcasting licence', words=['decides', 'against', 'community', 'broadcasting', 'licence'], noStopWords='[decides, community, broadcasting, licence]', finalwords=['decides', 'community', 'broadcasting', 'licence'], 1_grams=['decides', 'community', 'broadcasting', 'licence'], 2_grams=['decides community', 'community broadcasting', 'broadcasting licence'], 1_counts=SparseVector(16599, {111: 1.0, 958: 1.0, 5213: 1.0, 8099: 1.0}), 2_counts=SparseVector(19985, {}), features_cv=SparseVector(36584, {111: 1.0, 958: 1.0, 5213: 1.0, 8099: 1.0}))]

__es el turno de obtener IDF__

In [37]:
%%time
# IDF
idf = IDF(inputCol="features_cv", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv) 

CPU times: user 286 ms, sys: 38.7 ms, total: 325 ms
Wall time: 1min 17s


In [38]:
result_tfidf.take(1)

[Row(headlines='aba decides against community broadcasting licence', index=0, text='aba decides against community broadcasting licence', words=['decides', 'against', 'community', 'broadcasting', 'licence'], noStopWords='[decides, community, broadcasting, licence]', finalwords=['decides', 'community', 'broadcasting', 'licence'], 1_grams=['decides', 'community', 'broadcasting', 'licence'], 2_grams=['decides community', 'community broadcasting', 'broadcasting licence'], 1_counts=SparseVector(16599, {111: 1.0, 958: 1.0, 5213: 1.0, 8099: 1.0}), 2_counts=SparseVector(19985, {}), features_cv=SparseVector(36584, {111: 1.0, 958: 1.0, 5213: 1.0, 8099: 1.0}), features=SparseVector(36584, {111: 5.2528, 958: 6.8735, 5213: 9.0088, 8099: 9.729}))]

## Train Model

In [40]:
%%time
num_topics=30
max_iterations=50
lda = LDA(k=num_topics, maxIter=max_iterations)
ldaModel = lda.fit(result_tfidf)

CPU times: user 2.18 s, sys: 235 ms, total: 2.42 s
Wall time: 10min 28s


### Guardar los modelos

In [44]:
print(ldaModel.isDistributed())
path = "/content/drive/MyDrive/Modelos/modelosLDA/"

model_number = '2'
pipline_vectorizer_ngrams.save(path + 'PipelineVectorizerModel'+ model_number)    # Modelo BOW
ldaModel.save(path + 'LDAModel'+ model_number)  # Modelo entrenado
lda.save(path + 'LDA_'+ model_number)
idfModel.save(path + 'idfModel'+ model_number) 

# Cargar modelos

In [None]:
path = "/content/drive/MyDrive/Modelos/modelosLDA/"
model_number = '2'
pipline_vectorizer_ngrams = PipelineModel.load(path + 'PipelineVectorizerModel'+ model_number )   # Modelo BOW
#lda = LocalLDAModel.load(path + 'LDA_'+ model_number)
ldaModel = LocalLDAModel.load(path + 'LDAModel'+ model_number)               # Modelo entrenado

In [42]:
# Print topics and top-weighted terms
numTopics = 30  # cantidad de topicos a explorar
topics = ldaModel.describeTopics(maxTermsPerTopic=5)
ListOfIndexToWords = udf(lambda wl: list([vocabulario[w] for w in wl]))
FormatNumbers = udf(lambda nl: ["{:1.4f}".format(x) for x in nl])

toptopics = topics.select((topics.topic + 1).alias('topic'),
                          ListOfIndexToWords(topics.termIndices).alias('words'),
                          FormatNumbers(topics.termWeights).alias('weights'))
toptopics.show(truncate=False, n=numTopics)
print('Topics:', numTopics, 'Vocabulary:', len(vocabArray))



+-----+------------------------------------------------+----------------------------------------+
|topic|words                                           |weights                                 |
+-----+------------------------------------------------+----------------------------------------+
|1    |[second, flood, boost, start, england]          |[0.0118, 0.0113, 0.0111, 0.0106, 0.0096]|
|2    |[claim, dead, three, shot, killed]              |[0.0203, 0.0159, 0.0111, 0.0094, 0.0084]|
|3    |[child, probe, fire, police, abuse]             |[0.0235, 0.0170, 0.0138, 0.0113, 0.0104]|
|4    |[price, close, jailed, officer, china]          |[0.0171, 0.0161, 0.0159, 0.0133, 0.0125]|
|5    |[council, land, iraq, drug, troop]              |[0.0185, 0.0132, 0.0118, 0.0088, 0.0086]|
|6    |[year, west, road, storm, blue]                 |[0.0134, 0.0108, 0.0101, 0.0096, 0.0096]|
|7    |[market, local, body, aussie, medium]           |[0.0191, 0.0126, 0.0121, 0.0118, 0.0116]|
|8    |[action, lega