In [2]:
import os, pyspark

In [3]:
# Reuse the already-running SparkContext when this cell is executed again;
# calling the SparkContext constructor a second time in the same kernel
# raises a ValueError because only one context may be active.
sc = pyspark.SparkContext.getOrCreate()

In [4]:
# Sanity check: the SparkContext is usable if it can report its Spark version
spark_version = sc.version
print(spark_version)

2.4.0


In [5]:
# importing some libraries
import pandas as pd
import pyspark
from pyspark.sql import SQLContext
# SQL entry point bound to the notebook's SparkContext
sqlContext = SQLContext(sparkContext=sc)
# stuff we'll need for text processing
from nltk.corpus import stopwords
import re as re
from pyspark.ml.feature import CountVectorizer , IDF
# stuff we'll need for building the model

from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.clustering import LDA, LDAModel
from nltk.stem.wordnet import WordNetLemmatizer

In [7]:
# Load the CSV: the first row supplies the column names and the column types
# are inferred from the file contents.
csv_reader = (
    sqlContext.read.format("csv")
    .option("header", "true")
    .option("inferschema", "true")
)
data = csv_reader.load("../data/crunchbase_orgs.csv")

In [8]:
# Pull the short_description field out of every row and drop missing values
descriptions = (
    data.rdd
    .map(lambda row: row['short_description'])
    .filter(lambda text: text is not None)
)

# English stop-word list and WordNet lemmatizer used when tokenising below
StopWords = stopwords.words("english")
lmtzr = WordNetLemmatizer()

In [9]:
# Hash-based lookup: the stop-word list would otherwise be scanned linearly
# once for every single token of every document.
_stop_words = frozenset(StopWords)


def _tokenize(document):
    """Turn one description string into a list of lemmatized tokens.

    The text is stripped and lower-cased, then split on single spaces.
    A piece is kept only if it is purely alphabetic, longer than three
    characters and not an English stop word. Each surviving piece is
    lemmatized as a verb. Filtering happens before lemmatization, so a
    lemma may itself be shorter than four characters.
    """
    pieces = re.split(" ", document.strip().lower())
    kept = [
        piece for piece in pieces
        if piece.isalpha() and len(piece) > 3 and piece not in _stop_words
    ]
    return [lmtzr.lemmatize(piece, 'v') for piece in kept]


# Pair every token list with its position so each document has an integer id
tokens = descriptions.map(_tokenize).zipWithIndex()

In [10]:
# Name the two fields of each (token list, position) pair as DataFrame columns
df_txts = sqlContext.createDataFrame(tokens, schema=["list_of_words", "index"])

In [11]:
# Preview the first 20 rows, with long cell values truncated
df_txts.show(n=20, truncate=True)

+--------------------+-----+
|       list_of_words|index|
+--------------------+-----+
|[wetpaint, offer,...|    0|
|[zoho, offer, sui...|    1|
|[digg, operate, w...|    2|
|[facebook, online...|    3|
|[omnidrive, provi...|    4|
|[geni, online, co...|    5|
|[flektor, platfor...|    6|
|[interactive, med...|    7|
|[twitter, global,...|    8|
|[stumbleupon, dis...|    9|
|[gizmoz, develop,...|   10|
|[scribd, digital,...|   11|
|[interactive, int...|   12|
|[online, music, d...|   13|
|[mobile, service,...|   14|
|[ebay, online, ma...|   15|
|[social, network,...|   16|
|[meetmoi, date, s...|   17|
|[postini, offer, ...|   18|
|[joost, internet,...|   19|
+--------------------+-----+
only showing top 20 rows



In [13]:
# TF: learn a vocabulary of at most 5000 terms, skipping terms that occur in
# fewer than 10 documents, then encode every token list as a term-count vector.
cv = CountVectorizer(
    inputCol="list_of_words",
    outputCol="raw_features",
    vocabSize=5000,
    minDF=10.0,
)
cvmodel = cv.fit(df_txts)
result_cv = cvmodel.transform(df_txts)

In [14]:
# IDF: fit inverse-document-frequency weights on the raw term counts and
# write the rescaled vectors to the "features" column.
idf = IDF(
    inputCol="raw_features",
    outputCol="features",
)
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv)

In [15]:
# Preview the first 20 rows, with long cell values truncated
result_tfidf.show(n=20, truncate=True)

+--------------------+-----+--------------------+--------------------+
|       list_of_words|index|        raw_features|            features|
+--------------------+-----+--------------------+--------------------+
|[wetpaint, offer,...|    0|(5000,[3,4,6,15,1...|(5000,[3,4,6,15,1...|
|[zoho, offer, sui...|    1|(5000,[3,593,810]...|(5000,[3,593,810]...|
|[digg, operate, w...|    2|(5000,[32,33,47,5...|(5000,[32,33,47,5...|
|[facebook, online...|    3|(5000,[2,4,21,27,...|(5000,[2,4,21,27,...|
|[omnidrive, provi...|    4|(5000,[1,4,32,54,...|(5000,[1,4,32,54,...|
|[geni, online, co...|    5|(5000,[4,15,34,80...|(5000,[4,15,34,80...|
|[flektor, platfor...|    6|(5000,[6,33,63,28...|(5000,[6,33,63,28...|
|[interactive, med...|    7|(5000,[0,26,27,37...|(5000,[0,26,27,37...|
|[twitter, global,...|    8|(5000,[6,21,27,32...|(5000,[6,21,27,32...|
|[stumbleupon, dis...|    9|(5000,[68,71,191,...|(5000,[68,71,191,...|
|[gizmoz, develop,...|   10|(5000,[10,11,25,3...|(5000,[10,11,25,3...|
|[scri

In [16]:
import time

num_topics = 12
max_iterations = 50
# Fixed seed so that re-running the notebook reproduces the same topics
lda_seed = 42

# mllib's LDA takes an RDD of [document id, mllib vector]; the ml-package
# vectors in the "features" column are converted with Vectors.fromML.
lda_corpus = (
    result_tfidf.select('index', 'features')
    .rdd
    .mapValues(Vectors.fromML)
    .map(list)
)

start = time.time()
lda_model = LDA.train(
    lda_corpus,
    k=num_topics,
    maxIterations=max_iterations,
    seed=lda_seed,
)
# Report wall-clock training time in seconds
print("Time to train LDA: {:.1f} s".format(time.time() - start))

Time to train LDA2465.906224966049


In [26]:
# Number of top terms to list for each topic
wordNumbers = 15
topicIndices = sc.parallelize(lda_model.describeTopics(maxTermsPerTopic = wordNumbers))
# Each collected Row holds the term indices in _1 and their weights in _2
topic_list = topicIndices.toDF().collect()
# Walk over every topic that was actually returned; a hard-coded range(0, 11)
# skips the last of 12 topics and fails if fewer than 11 topics are trained.
for i, topic in enumerate(topic_list):
    print("Topic cluster: ", i)
    for index in topic._1:
        # Map the vocabulary index back to the term learned by CountVectorizer
        print(cvmodel.vocabulary[index])


Topic cluster:  0
company
provide
service
offer
online
solutions
market
software
technology
platform
develop
design
business
mobile
management
Topic cluster:  1
company
provide
service
offer
online
platform
market
software
solutions
develop
mobile
technology
help
design
management
Topic cluster:  2
company
provide
service
offer
online
market
solutions
software
platform
management
technology
business
mobile
develop
help
Topic cluster:  3
company
provide
service
offer
online
market
platform
software
solutions
mobile
design
management
business
technology
develop
Topic cluster:  4
company
provide
service
offer
online
platform
software
solutions
market
mobile
technology
management
business
develop
help
Topic cluster:  5
company
provide
service
offer
online
platform
software
solutions
market
mobile
management
technology
business
design
develop
Topic cluster:  6
company
provide
service
offer
online
solutions
software
platform
market
mobile
technology
develop
design
digital
business
Topic clus

In [None]:
#topics = lda_model.topicsMatrix()

In [27]:
# For every topic, list its top ten terms next to their weights
topics = lda_model.describeTopics(maxTermsPerTopic = 10)
for topic_id, (term_ids, term_weights) in enumerate(topics):
    print('TOPIC #: ' + str(topic_id))
    for term_id, weight in zip(term_ids, term_weights):
        # Vocabulary index -> term string learned by CountVectorizer
        print(cvmodel.vocabulary[term_id] + ' ' + str(weight))
    print()

TOPIC #: 0
company 0.009611703337536722
provide 0.008422405325344292
service 0.007911473989159033
offer 0.006521404179138772
online 0.005469394873732726
solutions 0.005180403201146749
market 0.005152727129959714
software 0.005025878815978552
technology 0.004558800348608596
platform 0.004500155594070384

TOPIC #: 1
company 0.00878852980635668
provide 0.007894481927577914
service 0.007259428868310243
offer 0.006117265468788435
online 0.005836047199793435
platform 0.005066649286008296
market 0.004912543239088004
software 0.004803896345932743
solutions 0.0047460748697289975
develop 0.004466627658153043

TOPIC #: 2
company 0.009397782458577641
provide 0.008237862264944275
service 0.008073037770973221
offer 0.0062105383792532024
online 0.00546313852600786
market 0.004997482667822664
solutions 0.0049930951659244265
software 0.004945539311464607
platform 0.004858218056719911
management 0.004460203260949737

TOPIC #: 3
company 0.00932940882102931
provide 0.007945871924811903
service 0.007458827