In [27]:
from pyspark import Row
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import udf, concat, lit, struct
from pyspark.sql.functions import monotonically_increasing_id, size, length
import re
from string import digits
from pyspark.sql import Window
from pyspark.sql import functions as F
from nltk.corpus import stopwords
import nltk
import pandas as pd
from pyspark.ml.feature import Tokenizer,CountVectorizer 
from nltk.stem.porter import *
from pyspark.ml.clustering import LDA
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint,Vectors,array
from operator import itemgetter
from textblob import TextBlob

In [2]:
# Reuse the already-running SparkContext when this cell is executed again;
# constructing a second SparkContext in the same kernel raises an error.
sc = SparkContext.getOrCreate()
# SQL entry point used for reading the JSON file and running SQL queries.
sqlContext = SQLContext(sc)

In [3]:
# Read the Amazon reviews JSON file into a Spark DataFrame.
# NOTE(review): the path is absolute and machine-specific.
amazon_reviews = sqlContext.read.json('/home/ubuntu/Cell_Phones_and_Accessories_5.json')
# NOTE(review): first() is not the cell's last expression, so its result is
# never displayed; the call only costs an extra Spark job.
amazon_reviews.first()
# Expose the DataFrame to SQL under the table name "reviews".
amazon_reviews.registerTempTable("reviews")
# Preview two rows through SQL, rendered as a pandas DataFrame.
sqlContext.sql("SELECT * FROM reviews LIMIT 2").toPandas()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,120401325X,"[0, 0]",4.0,They look good and stick good! I just don't li...,"05 21, 2014",A30TL5EWN6DFXT,christina,Looks Good,1400630400
1,120401325X,"[0, 0]",5.0,These stickers work like the review says they ...,"01 14, 2014",ASY55RVNIL0UD,emily l.,Really great product.,1389657600


In [4]:
# Same two-row preview as above, this time through the DataFrame API.
amazon_reviews.limit(2).toPandas()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,120401325X,"[0, 0]",4.0,They look good and stick good! I just don't li...,"05 21, 2014",A30TL5EWN6DFXT,christina,Looks Good,1400630400
1,120401325X,"[0, 0]",5.0,These stickers work like the review says they ...,"01 14, 2014",ASY55RVNIL0UD,emily l.,Really great product.,1389657600


In [5]:
# helpfulness = helpful[0] / helpful[1]
# (presumably [helpful votes, total votes] -- confirm against the dataset docs).
amazon_reviews = amazon_reviews.withColumn('helpfulness',(amazon_reviews.helpful[0]/amazon_reviews.helpful[1]))
# Reviews with helpful == [0, 0] produce a null ratio; treat them as 0.
# The fill is limited to `helpfulness` so missing values in other numeric
# columns (overall, unixReviewTime) are not silently turned into 0.
amazon_reviews = amazon_reviews.fillna(0, subset=['helpfulness'])

In [6]:
# Summary statistics (count, mean, stddev, min, max) of the numeric columns.
amazon_reviews.describe().show()

+-------+------------------+--------------------+-------------------+
|summary|           overall|      unixReviewTime|        helpfulness|
+-------+------------------+--------------------+-------------------+
|  count|            194439|              194439|             194439|
|   mean| 4.129912208970422|1.3687136230571027E9|0.19305457870106593|
| stddev|1.2224991825084577|3.2300324670308113E7| 0.3724174398848259|
|    min|               1.0|           982800000|                0.0|
|    max|               5.0|          1406073600|                1.0|
+-------+------------------+--------------------+-------------------+



In [7]:
# Number of reviews (an int, not a DataFrame) whose helpfulness ratio is above zero.
helpful_reviews = amazon_reviews.filter(amazon_reviews.helpfulness > 0).count()

In [8]:
# Take the denominator from the data instead of hard-coding the row count
# (194439), so the proportion stays correct if the input file changes.
total_reviews = amazon_reviews.count()
print ("Propotion of reviews having helpfulness greater than zero is" + " " + str(helpful_reviews/float(total_reviews)))

Propotion of reviews having helpfulness greater than zero is 0.22444057005024712


In [9]:
#Sample
# Small demonstration of the cleaning steps applied to the reviews below.
# `re` and `digits` come from the import cell at the top of the notebook.
sample_string = "I dont know!. I am just checking233. I hope this works-well1233"
# Raw string: '\W' in a normal string literal is an invalid escape sequence.
# Every run of non-word characters collapses into a single space.
cleanString = re.sub(r'\W+',' ', sample_string )
print(cleanString)

# Translation table that deletes every ASCII digit.
remove_digits = str.maketrans("","",digits)
res = cleanString.translate(remove_digits)
print (res)

I dont know I am just checking233 I hope this works well1233
I dont know I am just checking I hope this works well


In [9]:
# Attach a unique numeric id to every review.
# NOTE(review): per the Spark docs these ids are unique and increasing but not
# guaranteed to be consecutive.
amazon_reviews = amazon_reviews.withColumn("id", monotonically_increasing_id())
amazon_reviews.limit(2).toPandas()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,helpfulness,id
0,120401325X,"[0, 0]",4.0,They look good and stick good! I just don't li...,"05 21, 2014",A30TL5EWN6DFXT,christina,Looks Good,1400630400,0.0,0
1,120401325X,"[0, 0]",5.0,These stickers work like the review says they ...,"01 14, 2014",ASY55RVNIL0UD,emily l.,Really great product.,1389657600,0.0,1


In [10]:

# Translation table that deletes every ASCII digit; built once, not per review.
_DIGIT_REMOVAL_TABLE = str.maketrans("", "", digits)


def remove_spl_numeric(review):
    """Normalize a review: lowercase it and strip punctuation and digits.

    Steps, in order:
      1. drop apostrophes so contractions stay one word ("don't" -> "dont");
      2. lowercase;
      3. replace every run of non-word characters with a single space;
      4. delete ASCII digits.

    Args:
        review: raw review text, or None when the source field is null.

    Returns:
        The cleaned text. None or empty input yields "" rather than raising
        inside the Spark UDF.
    """
    if not review:
        return ""
    text = review.replace("'", "").lower()
    # Raw string avoids the invalid-escape warning for '\W'.
    text = re.sub(r'\W+', ' ', text)
    return text.translate(_DIGIT_REMOVAL_TABLE)

# Wrap the cleaner as a Spark UDF returning a string column.
review_clean = udf(remove_spl_numeric,StringType())
# Store the cleaned text next to the raw reviewText.
amazon_reviews = amazon_reviews.withColumn("reviewText_clean",review_clean(amazon_reviews.reviewText))

# Show four cleaned reviews without truncating the column.
amazon_reviews.select("reviewText_clean").show(4,False)
    
  

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|reviewText_clean                                                                                                                                                                                                                                                             |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|they look good and stick good i just dont like the rounded shape because i was always bumping it and siri kept popping up and it was irritating i just wont buy a product like this aga

In [11]:
# Count reviews per product through the RDD API: emit (row, 1) pairs and sum them.
# Each key is the whole single-field Row, which is why the resulting `_1`
# column is a struct holding `asin` (see the printed schema) rather than a string.
product_count = amazon_reviews.select("asin").rdd.map(lambda asin:(asin,1)).reduceByKey(lambda a,b:a+b).toDF()

In [12]:
# Inspect the schema produced by toDF(): `_1` is a struct, `_2` the count.
product_count.printSchema()

root
 |-- _1: struct (nullable = true)
 |    |-- asin: string (nullable = true)
 |-- _2: long (nullable = true)



In [13]:
def extract_one(asin):
    """Return the element at position 0 of the given row/sequence."""
    first_field = asin[0]
    return first_field

# UDF that unwraps the single-field struct into a plain string.
extract = udf(extract_one,StringType())
# Replace the struct column `_1` with its string value and name it `asin`.
product_count = product_count.withColumn("_1",extract(product_count._1)).withColumnRenamed("_1","asin")

In [14]:
# `_2` holds the per-product review count.
product_count = product_count.withColumnRenamed("_2","count")

In [15]:
# Register the per-product counts so they can be queried with SQL.
product_count.registerTempTable("prod_count")
# Running total of review counts, ordered from the most- to the least-reviewed product.
# NOTE(review): the window has no PARTITION BY, so Spark evaluates it on a single
# partition; products with equal counts have no defined relative order.
product_count = sqlContext.sql("SELECT *,sum(count) OVER (ORDER BY COUNT DESC ROWS UNBOUNDED PRECEDING) AS CUM_SUM FROM prod_count")
product_count.limit(10).toPandas()

Unnamed: 0,asin,count,CUM_SUM
0,B005SUHPO6,837,837
1,B0042FV2SI,694,1531
2,B008OHNZI0,657,2188
3,B009RXU59C,636,2824
4,B000S5Q9CA,628,3452
5,B007FHX9OK,557,4009
6,B0073FCPSK,526,4535
7,B008DJIIG8,510,5045
8,B0088LYCZC,488,5533
9,B0015RB39O,466,5999


In [16]:
# Cumulative share of all reviews covered by the products up to each row.
# The total comes from the data rather than the hard-coded 194439, so the
# share stays correct if the input file changes.
total_reviews = amazon_reviews.count()
product_count = product_count.withColumn("per_reviews",(product_count.CUM_SUM/float(total_reviews)))
product_count.limit(2).toPandas()

Unnamed: 0,asin,count,CUM_SUM,per_reviews
0,B005SUHPO6,837,837,0.004305
1,B0042FV2SI,694,1531,0.007874


In [17]:
# Keep the most-reviewed products whose cumulative share of reviews is at most 70%.
product_count = product_count.where(F.col("per_reviews") <= 0.70)

In [18]:
# Keep only the columns needed downstream. No rows are filtered here; the
# ~70% product restriction is applied later through a join on `asin`.
amazon_reviews = amazon_reviews.select("id","asin","reviewText_clean","helpfulness","reviewText")

#Stopwords
# Fetch NLTK's stop-word corpus and load the English list once on the driver.
nltk.download('stopwords')
cachedStopWords = stopwords.words("english")

#Stopwords removal
# Set membership is O(1) per word; the plain list was scanned for every token.
_STOP_WORD_SET = frozenset(cachedStopWords)

def stop_words_removal(review):
    """Drop English stop words from a whitespace-separated review string.

    Args:
        review: cleaned, lowercased review text, or None for a null cell.

    Returns:
        The remaining words joined by single spaces; "" for None/empty input.
    """
    if not review:
        return ""
    return ' '.join(word for word in review.split() if word not in _STOP_WORD_SET)
stop_words = udf(stop_words_removal,StringType())
amazon_reviews = amazon_reviews.withColumn("review_wo_stop",stop_words(amazon_reviews.reviewText_clean))

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#Tokenize
# Split the stop-word-free review text into a list of tokens per review.
tokenizer = Tokenizer(inputCol="review_wo_stop",outputCol="review_token")
tokenized = tokenizer.transform(amazon_reviews)

#Lemmatization
# NOTE(review): no lemmatization is performed in this cell; only stemming follows.

#Stemming
# Porter stemmer instance used by the stemming function below.
stemmer = PorterStemmer()

#Function for stemming
def stem_words(rev_tok):
    """Stem each token and keep only the stems longer than two characters."""
    stemmed_tokens = map(stemmer.stem, rev_tok)
    return [stem_word for stem_word in stemmed_tokens if len(stem_word) > 2]

# UDF producing an array<string> column of stems.
stem = udf(stem_words,ArrayType(StringType()))

# Add the stemmed tokens next to the raw tokens.
stemmed = tokenized.withColumn("rev_stemmed",stem(tokenized.review_token))

# Compare raw and stemmed tokens for two reviews.
stemmed.select("review_token","rev_stemmed").limit(2).toPandas()  


In [20]:
# Build one "document" per product: key by asin and concatenate the stemmed
# token lists of all of that product's reviews.
product_review = stemmed.select("asin","rev_stemmed").rdd.map(lambda row:(row[0],row[1])).reduceByKey(lambda a,b :a +b).toDF()
product_review.limit(2).toPandas()

Unnamed: 0,_1,_2
0,B001UO45SI,"[complain, someth, would, buy, long, term, use..."
1,B00IZOU1JS,"[work, flawlessli, howev, enjoy, sleep, dark, ..."


In [21]:
# Give the tuple-derived columns meaningful names, then count the products.
product_review = (
    product_review
    .withColumnRenamed("_1", "asin")
    .withColumnRenamed("_2", "Reviews_combined")
)
product_review.count()

10429

In [22]:
# Number of products that remain after the cumulative-share filter.
product_count.count()

3104

In [53]:
# Attach the combined stemmed reviews to each retained product (left join on asin).
top_rev_prod = product_count.join(product_review,"asin", how = "left" )
# The row count should equal the number of retained products.
top_rev_prod.count()

3104

In [54]:
# NOTE(review): the two commented-out lines are disabled code; top_rev_prod has
# no `helpfulness` column (see the column list printed by this cell).
#helpful_rev_fil = top_rev_prod.filter(top_rev_prod.helpfulness > 0).count()
# Per-product id, later used as the LDA document label.
top_rev_prod = top_rev_prod.withColumn("id", monotonically_increasing_id())
top_rev_prod.columns
#print ("Propotion of reviews having helpfulness greater than zero is" + " " + str(helpful_rev_fil/float(136096)))

['asin', 'count', 'CUM_SUM', 'per_reviews', 'Reviews_combined', 'id']

In [34]:
# vectorize
# Turn each product's combined token list into a term-count vector.
cv = CountVectorizer(inputCol="Reviews_combined", outputCol="vectors")
# Fitting learns the vocabulary; transforming adds the `vectors` column.
count_vectorizer_model = cv.fit(top_rev_prod)
final_review = count_vectorizer_model.transform(top_rev_prod)

In [35]:
#Creating the corpus
# Two columns only: the product id as `label` and the count vector as `features`.
corpus = final_review.select(
    F.col("id").cast("long").alias("label"),
    F.col("vectors").alias("features"),
)
 

In [41]:

#training data
# Fit a 3-topic LDA model; the fixed seed makes the run repeatable.
# NOTE(review): k=3 and maxIter=20 are not justified anywhere in the notebook.
lda = LDA(k=3, seed=12, maxIter=20)
model = lda.fit(corpus)

In [42]:
# extracting topics
# Top 10 terms per topic, as vocabulary indices with their weights.
topics = model.describeTopics(maxTermsPerTopic=10)


In [43]:
#extraction vocabulary
# Index -> term list learned by the CountVectorizer; used to decode term indices.
vocabulary = count_vectorizer_model.vocabulary


In [44]:
# Translate each topic's term indices into the actual vocabulary words.
indices_rdd = topics.select("termIndices").rdd.map(lambda x : x[0]).map(lambda y : [vocabulary[i] for i in y])

In [45]:
# Bring the decoded topic terms to the driver for display (3 topics x 10 terms).
indices_rdd.collect()

[['phone',
  'case',
  'use',
  'charg',
  'one',
  'like',
  'work',
  'batteri',
  'great',
  'screen'],
 ['case',
  'phone',
  'use',
  'screen',
  'one',
  'protect',
  'get',
  'like',
  'work',
  'well'],
 ['charg',
  'phone',
  'use',
  'batteri',
  'one',
  'charger',
  'devic',
  'like',
  'work',
  'time']]

In [46]:
# We can see that the two most significant topics are about battery/charge and case/phone/screen.
# Get the weights for these words to pick the words of importance in these topics.
indices_weight = topics.select("termWeights").rdd.map(lambda x: x[0])

**Filter the sentences in the review based on the identified topics**

In [47]:
# Fetch the topic term lists once; each take() call launches its own Spark job,
# so take(2)[1] followed by take(3)[2] evaluated the same RDD twice.
_topic_terms = indices_rdd.take(3)

# Index 1 is the case/screen/protect topic, index 2 the charge/battery/charger topic.
topic_1 = _topic_terms[1]

topic_2 = _topic_terms[2]


In [50]:
# Terms of the case/screen topic.
topic_1

['case',
 'phone',
 'use',
 'screen',
 'one',
 'protect',
 'get',
 'like',
 'work',
 'well']

In [51]:
# Terms of the charge/battery topic.
topic_2

['charg',
 'phone',
 'use',
 'batteri',
 'one',
 'charger',
 'devic',
 'like',
 'work',
 'time']

**Filtering for keywords from reviews and saving them in separate columns for each review**

In [55]:
# Mark every product in top_rev_prod (the products covering ~70% of total
# reviews) with flag = 1. No rows are filtered in this cell.
top_rev_prod = top_rev_prod.withColumn("flag",lit(1))



In [56]:
#Pull the flag to the reviews table
# Left join: reviews of products outside top_rev_prod get a null flag.
review_topic = stemmed.join(top_rev_prod.select("asin","flag"),"asin", how = "left")

In [62]:
# Null flags (products outside the top set) become 0 ...
# NOTE(review): fillna(0) without `subset` also fills nulls in every other numeric column.
review_topic = review_topic.fillna(0)
# ... and only reviews of the flagged products are kept.
review_topic = review_topic.filter(review_topic.flag == 1)

In [19]:
# Filter for key words from the reviews
# Hand-picked keyword lists. They overwrite the topic_1/topic_2 lists taken from
# the LDA output and use unstemmed spellings ('charge', 'battery', 'device'),
# presumably so they match words in the raw review text.
topic_1 = ['case','screen','protect','get','work','well']
topic_2 = ['charge','battery','charger','device','work','time']



In [20]:
# Columns available before the per-topic sentence extraction.
amazon_reviews.printSchema()

root
 |-- id: long (nullable = false)
 |-- asin: string (nullable = true)
 |-- reviewText_clean: string (nullable = true)
 |-- helpfulness: double (nullable = false)
 |-- reviewText: string (nullable = true)
 |-- review_wo_stop: string (nullable = true)



In [21]:
def assign_topic_1(rev, keywords=None):
    """Return the sentences of a review that mention a topic-1 keyword.

    The review is lowercased with apostrophes removed and split on full stops.
    A sentence is kept when, after replacing non-word characters with spaces
    and deleting digits, at least one of its words is a keyword.

    Args:
        rev: raw review text, or None when the source field is null.
        keywords: optional iterable of keywords; defaults to the module-level
            `topic_1` list, resolved at call time.

    Returns:
        List of matching sentences (lowercased, otherwise unmodified), in
        their original order. None or empty input yields [].
    """
    if not rev:
        return []
    # Build the lookup set and the digit-removal table once per review, not per sentence.
    keyword_set = set(topic_1 if keywords is None else keywords)
    remove_digits = str.maketrans("", "", digits)

    top_1_sent = []
    # Sentences are approximated by splitting on full stops.
    for sent in rev.replace("'", "").lower().split('.'):
        # Raw string avoids the invalid-escape warning for '\W'.
        words = re.sub(r'\W+', ' ', sent).translate(remove_digits).split()
        if keyword_set.intersection(words):
            top_1_sent.append(sent)
    return top_1_sent

# assign_topic_1 returns a list while the UDF is declared as StringType.
# Join the sentences back with the '.' they were split on, so the column holds
# real text instead of relying on Spark's implicit conversion of the list.
top_1 = udf(lambda rev: '.'.join(assign_topic_1(rev)), StringType())
amazon_reviews = amazon_reviews.withColumn("topic_1",top_1(amazon_reviews.reviewText))

amazon_reviews.printSchema()

root
 |-- id: long (nullable = false)
 |-- asin: string (nullable = true)
 |-- reviewText_clean: string (nullable = true)
 |-- helpfulness: double (nullable = false)
 |-- reviewText: string (nullable = true)
 |-- review_wo_stop: string (nullable = true)
 |-- topic_1: string (nullable = true)



In [22]:
def assign_topic_2(rev, keywords=None):
    """Return the sentences of a review that mention a topic-2 keyword.

    The review is lowercased with apostrophes removed and split on full stops.
    A sentence is kept when, after replacing non-word characters with spaces
    and deleting digits, at least one of its words is a keyword.

    Args:
        rev: raw review text, or None when the source field is null.
        keywords: optional iterable of keywords; defaults to the module-level
            `topic_2` list, resolved at call time.

    Returns:
        List of matching sentences (lowercased, otherwise unmodified), in
        their original order. None or empty input yields [].
    """
    if not rev:
        return []
    # Build the lookup set and the digit-removal table once per review, not per sentence.
    keyword_set = set(topic_2 if keywords is None else keywords)
    remove_digits = str.maketrans("", "", digits)

    top_2_sent = []
    # Sentences are approximated by splitting on full stops.
    for sent in rev.replace("'", "").lower().split('.'):
        # Raw string avoids the invalid-escape warning for '\W'.
        words = re.sub(r'\W+', ' ', sent).translate(remove_digits).split()
        if keyword_set.intersection(words):
            top_2_sent.append(sent)
    # The leftover debug print(type(...)) was removed: it ran once per row.
    return top_2_sent

# assign_topic_2 returns a list while the UDF is declared as StringType.
# Join the sentences back with the '.' they were split on, so the column holds
# real text instead of relying on Spark's implicit conversion of the list.
top_2 = udf(lambda rev: '.'.join(assign_topic_2(rev)), StringType())
amazon_reviews = amazon_reviews.withColumn("topic_2",top_2(amazon_reviews.reviewText))

amazon_reviews.printSchema()

root
 |-- id: long (nullable = false)
 |-- asin: string (nullable = true)
 |-- reviewText_clean: string (nullable = true)
 |-- helpfulness: double (nullable = false)
 |-- reviewText: string (nullable = true)
 |-- review_wo_stop: string (nullable = true)
 |-- topic_1: string (nullable = true)
 |-- topic_2: string (nullable = true)



In [49]:
def senti(rev):
    """Return the TextBlob polarity score of `rev` after converting it to a string."""
    blob = TextBlob(str(rev))
    return blob.sentiment.polarity
def sub(rev):
    """Return the TextBlob subjectivity score of `rev` after converting it to a string."""
    blob = TextBlob(str(rev))
    return blob.sentiment.subjectivity

# Float-typed UDFs for TextBlob polarity and subjectivity.
sentiment = udf(senti,FloatType())
subjectivity = udf(sub,FloatType())
# Polarity and subjectivity of the sentences extracted for each topic.
# The *_sub columns previously reused the polarity UDF (copy-paste slip), so
# they merely duplicated the *_pol columns; they now use `subjectivity`.
amazon_reviews = amazon_reviews.withColumn("topic_1_pol",sentiment(amazon_reviews.topic_1))
amazon_reviews = amazon_reviews.withColumn("topic_1_sub",subjectivity(amazon_reviews.topic_1))
amazon_reviews = amazon_reviews.withColumn("topic_2_pol",sentiment(amazon_reviews.topic_2))
amazon_reviews = amazon_reviews.withColumn("topic_2_sub",subjectivity(amazon_reviews.topic_2))

In [51]:
# Spot-check the topic-2 polarity scores of the first ten reviews.
amazon_reviews.select("topic_2_pol").show(10)

+-----------+
|topic_2_pol|
+-----------+
|        0.0|
|        0.0|
|        0.0|
|        0.9|
|        0.0|
|        0.0|
|        0.0|
|      0.125|
|       0.35|
|        0.0|
+-----------+
only showing top 10 rows



In [52]:
# Keep only reviews whose topic-2 polarity is non-zero.
amazon_reviews_pol = amazon_reviews.filter((amazon_reviews.topic_2_pol != 0))
# NOTE(review): the recorded run of this count ended in a KeyboardInterrupt.
# Presumably the action re-executes the whole chain of Python UDFs; consider
# caching amazon_reviews before counting -- TODO confirm.
amazon_reviews_pol.count()

KeyboardInterrupt: 