# Latent Dirichlet Allocation

In [1]:
# Import Sparksession
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("LDA").getOrCreate()

In [2]:
# Print PySpark and Python versions
import sys
print('Python version: '+sys.version)
print('Spark version: '+spark.version)

Python version: 3.8.4 | packaged by conda-forge | (default, Jul 17 2020, 15:16:46) 
[GCC 7.5.0]
Spark version: 3.0.0


## Load Data

In [3]:
# Read data
file_location = "lda_data.csv"
file_type = "csv"
infer_schema = "false"
first_row_is_header = "true"


df = spark.read.format(file_type)\
.option("inferSchema", infer_schema)\
.option("header", first_row_is_header)\
.load(file_location)


In [4]:
# Print Metadata
df.printSchema()

root
 |-- Pageurl: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Review Text: string (nullable = true)
 |-- Review Color: string (nullable = true)
 |-- User Verified: string (nullable = true)
 |-- Review Date: string (nullable = true)
 |-- Review Useful Count: string (nullable = true)
 |-- Configuration Text: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Declaration Text: string (nullable = true)



In [5]:
#  Count data
df.count()
print('The total number of records in the credit card dataset are '+str(df.count()))

The total number of records in the credit card dataset are 6855


In [6]:
%%bash 
pip install nltk



# Import appropriate libraries


In [7]:
# Import appropriate libraries
from pyspark.sql.types import *
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors as MLlibVectors

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data processing

In [8]:
reviews = df.rdd.map(lambda x : x['Review Text']).filter(lambda x: x is not None)
StopWords = stopwords.words("english")
tokens = reviews.map(lambda document: document.strip().lower())\
    .map( lambda document: re.split("[\s;,#]", document)) \
    .map( lambda word: [x for x in word if x.isalpha()]) \
    .map( lambda word: [x for x in word if len(x) > 3] )\
    .map( lambda word: [x for x in word if x not in StopWords]).zipWithIndex()


df_txts = spark.createDataFrame(tokens, ['list_of_words','index'])
# TF
cv = CountVectorizer(inputCol="list_of_words", outputCol="raw_features", vocabSize=5000, minDF=10)
cvmodel = cv.fit(df_txts)
result_cv = cvmodel.transform(df_txts)
# IDF
idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv) 


In [9]:
num_topics = 5
max_iterations = 100
lda_model=LDA.train(result_tfidf.select("index", "features").rdd.mapValues(MLlibVectors.fromML).map(list),k = num_topics, maxIterations = max_iterations)

In [10]:
wordNumbers = 5  
data_topics=lda_model.describeTopics(maxTermsPerTopic = wordNumbers)
vocabArray = cvmodel.vocabulary
topicIndices = spark.sparkContext.parallelize(data_tp)
def topic_render(topic):
    terms = topic[0]
    result = []
    for i in range(wordNumbers):
        term = vocabArray[terms[i]]
        result.append(term)
    return result



topics_final = topicIndices.map(lambda topic: topic_render(topic)).collect()
for topic in range(len(topics_final)):
    print ("Topic" + str(topic) + ":")
    for term in topics_final[topic]:
        print (term)
    print ('\n')

NameError: name 'data_tp' is not defined