In [1]:
import findspark
findspark.init('C:\spark\spark-2.2.1-bin-hadoop2.7')

import pandas as pd

from pyspark import SparkContext
from pyspark.sql import SparkSession

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

import gensim
from gensim import corpora

import os

import json



In [2]:
sc = SparkContext().getOrCreate()
spark = SparkSession.builder.appName('comment topic modeling').getOrCreate()

### read data

In [3]:
article_ids = []
 = []
for folder in os.listdir('comments_groupby_article'):
    article_id = folder.split('_')[2]
    inputloc = "comments_groupby_article/" + folder + "/*.csv" 
    df = spark.read.csv(inputloc, header="true")
    comment_text = df.select("comment_text").rdd.flatMap(lambda x: x).collect()
    comment_texts.append(comment_text)
    article_ids.append(article_id)

Wall time: 0 ns


In [4]:
comment_df = pd.DataFrame({'article_ids':article_ids,'comment_texts':comment_texts})
comment_df

Unnamed: 0,article_ids,comment_texts
0,10030963,[My 2 cents: fire every single last one of the...
1,10108172,[Stephen Harper is destroying Canada's interna...
2,10193881,"['Gay marriage is a non-issue in Canada, but i..."
3,10198519,[It isn't hard to govern a country with people...
4,10199024,[This says more about the so called discrimina...
5,10200251,[This openness is a credit to the Globe. For d...
6,10508775,[What is good about good Friday? Inherently no...
7,10527549,[Ms. Wente is making many assumptions here. In...
8,10549743,"[Ken, don't project your inferiority complex u..."
9,10556177,[Excuse me but... don't you think you could go...


### Build LDA model 

In [6]:
# convert comment_texts type: list -> string
comment_df['comment_texts'] = comment_df['comment_texts'].apply(lambda x: ', '.join(x))
comment_df

Unnamed: 0,article_ids,comment_texts
0,10030963,My 2 cents: fire every single last one of them...
1,10108172,Stephen Harper is destroying Canada's internat...
2,10193881,"'Gay marriage is a non-issue in Canada, but it..."
3,10198519,It isn't hard to govern a country with people ...
4,10199024,This says more about the so called discriminat...
5,10200251,This openness is a credit to the Globe. For de...
6,10508775,What is good about good Friday? Inherently not...
7,10527549,Ms. Wente is making many assumptions here. In ...
8,10549743,"Ken, don't project your inferiority complex up..."
9,10556177,Excuse me but... don't you think you could go ...


In [7]:
doc_complete = comment_df['comment_texts'].tolist()

In [9]:
# clean text
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in doc_complete]   

In [13]:
# Creating the term dictionary of our courpus, where every unique term is assigned an index. dictionary = corpora.Dictionary(doc_clean)
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [15]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=10, id2word = dictionary, passes=50)

In [16]:
# show comments topics
topics = ldamodel.show_topics()
for topic in topics:
    print(str(topic)+'\n')

(0, '0.010*"people" + 0.008*"woman" + 0.006*"one" + 0.006*"would" + 0.006*"nation" + 0.005*"first" + 0.005*"native" + 0.005*"like" + 0.005*"get" + 0.004*"right"')

(1, '0.014*"harper" + 0.009*"liberal" + 0.008*"party" + 0.008*"government" + 0.007*"would" + 0.007*"conservative" + 0.007*"canada" + 0.006*"canadian" + 0.006*"one" + 0.005*"like"')

(2, '0.012*"military" + 0.011*"canada" + 0.011*"war" + 0.007*"combat" + 0.007*"u" + 0.005*"canadian" + 0.005*"soldier" + 0.005*"fighter" + 0.005*"would" + 0.004*"air"')

(3, '0.011*"people" + 0.006*"gun" + 0.006*"would" + 0.006*"food" + 0.006*"drug" + 0.006*"police" + 0.005*"one" + 0.004*"u" + 0.004*"health" + 0.004*"like"')

(4, '0.010*"oil" + 0.008*"canada" + 0.007*"alberta" + 0.006*"would" + 0.005*"people" + 0.005*"u" + 0.005*"government" + 0.004*"pipeline" + 0.004*"like" + 0.004*"canadian"')

(5, '0.012*"tax" + 0.008*"year" + 0.006*"climate" + 0.006*"u" + 0.005*"would" + 0.005*"government" + 0.005*"one" + 0.005*"change" + 0.004*"canada" + 0.0

In [18]:
ldamodel[doc_term_matrix[89]]

[(0, 0.016908653),
 (1, 0.04636028),
 (4, 0.07732401),
 (5, 0.64408153),
 (7, 0.035113778),
 (8, 0.16354014),
 (9, 0.016613139)]

## extract topics of each comments and save it to csv

In [19]:
comment_topics = []
for doc_term in doc_term_matrix:
    topic = ldamodel[doc_term]
    comment_topics.append(topic)

In [21]:
comment_topic_df = pd.DataFrame({'article_ids':article_ids,'comment_texts':comment_texts, 'comment_topics': comment_topics})

In [22]:
comment_topic_df.to_csv('comment_topics.csv')