In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession \
        .builder \
        .master("yarn") \
        .appName("testing") \
        .config("spark.executor.instances", "2") \
        .config("spark.executor.memory","1g") \
        .config("spark.driver.memory","2g") \
        .config("spark.executor.cores",'1') \
        .config("spark.scheduler.mode","FIFO") \
        .getOrCreate()

In [3]:
from pyspark.sql import DataFrame
from pyspark.sql import *
from pyspark.ml import Pipeline
from pyspark.ml.classification import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.tuning import *
from pyspark.ml.feature import Imputer

In [4]:
import numpy as np
import pandas as pd
import scipy as sc

In [5]:
dat = spark.read.csv("/del2/Reviews.csv", header=True, inferSchema=True)

In [6]:
dat = dat.repartition(8)

In [7]:
dat.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- ProductId: string (nullable = true)
 |-- UserId: string (nullable = true)
 |-- ProfileName: string (nullable = true)
 |-- HelpfulnessNumerator: string (nullable = true)
 |-- HelpfulnessDenominator: string (nullable = true)
 |-- Score: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Text: string (nullable = true)



In [8]:
dat.take(1)

[Row(Id=250076, ProductId='B0029NII3C', UserId='ASALATXVOO93K', ProfileName='"OC Mom ""OC Mom"""', HelpfulnessNumerator='0', HelpfulnessDenominator='0', Score='3', Time='1337040000', Summary='My cats love this smelly fish', Text="My 2 cats are picky tuna lovers and this was all that was available at the time. They still love this, but the smelly fish scent is very strong and the kitchen smells for a bit after they eat it. They still love it. But I don't like the smell. I'd buy it again if there was no other choice. But if there is tuna, I'd go for that instead.")]

Seeing the details of the data and dropping irrelevant columns.

In [9]:
dat.drop("ProductId", "UserId", "ProfileName", "Time")

DataFrame[Id: int, HelpfulnessNumerator: string, HelpfulnessDenominator: string, Score: string, Summary: string, Text: string]

In [10]:
dat.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- ProductId: string (nullable = true)
 |-- UserId: string (nullable = true)
 |-- ProfileName: string (nullable = true)
 |-- HelpfulnessNumerator: string (nullable = true)
 |-- HelpfulnessDenominator: string (nullable = true)
 |-- Score: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Text: string (nullable = true)



In [11]:
score=dat["Score"]

In [12]:
dat.show()

+------+----------+--------------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+
|    Id| ProductId|        UserId|         ProfileName|HelpfulnessNumerator|HelpfulnessDenominator|Score|      Time|             Summary|                Text|
+------+----------+--------------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+
|250076|B0029NII3C| ASALATXVOO93K| "OC Mom ""OC Mom"""|                   0|                     0|    3|1337040000|My cats love this...|My 2 cats are pic...|
|250084|B0029NII3C| AYAH5HYV62EXV|    Connie S. Miller|                   0|                     0|    5|1327622400|"Just what ""The ...|"""The Cat"" will...|
|250092|B0029NII3C| A1RET8URKV6NV|              Melvin|                   0|                     0|    5|1318896000|My Cat Loves Whis...|"Cats can be very...|
|250100|B0029NII3C|A2FBIW0WJU5X3Y|Babsbny earn

Changing the number of stars to an integer to do analysis on and then dropping the columns

In [13]:
dat=dat.withColumn("Stars", col("Score").cast(IntegerType()))
dat=dat.withColumn("Likes", col("HelpfulnessNumerator").cast(IntegerType()))
dat=dat.withColumn("TotalHelp", col("HelpfulnessDenominator").cast(IntegerType()))


In [14]:
dat=dat.drop("Score","HelpfulnessNumerator","HelpfulnessDemoninator")

In [15]:
dat.groupby("Id").avg("Stars").show()

+-----+----------+
|   Id|avg(Stars)|
+-----+----------+
| 2659|       5.0|
| 5803|       5.0|
| 9427|       5.0|
|12027|       5.0|
|15619|       5.0|
|16339|       5.0|
|18051|       5.0|
|18979|       5.0|
|20683|       5.0|
|23571|       5.0|
|24171|       1.0|
|24347|       4.0|
|26755|       4.0|
|31035|       5.0|
|32539|       5.0|
|35947|       4.0|
|36131|       5.0|
|36355|       5.0|
|37251|       5.0|
|37307|       5.0|
+-----+----------+
only showing top 20 rows



In [16]:
dat.select("Summary").show()

+--------------------+
|             Summary|
+--------------------+
|"""Delight"" says...|
|The Best Hot Sauc...|
|  GREAT SWEET CANDY!|
|     Nasty No flavor|
|Great Irish oatme...|
|          Food-Great|
|       Don't like it|
|Awsome - Kids in ...|
|Low Carb Alternat...|
|     nothing special|
|Forget Molecular ...|
|Great food for my...|
|Perfect for our E...|
|       disappointing|
|          Tea review|
| Good for Feline UTI|
|Amazing to the la...|
|    Simply the BEST!|
|BROKEN BOTTLE BOT...|
|              JELL-O|
+--------------------+
only showing top 20 rows



In [17]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

The following code runs the TF-IDF hashing method to convert the words to a feature vector.

In [18]:
tokenizer = Tokenizer(inputCol="Summary", outputCol="words")
wordsData = tokenizer.transform(dat)
dat = wordsData
tokenizer2 = Tokenizer(inputCol="Text", outputCol="wordsText")
textData = tokenizer2.transform(dat)
dat = textData

In [19]:
dat.printSchema()
dat.take(1)

root
 |-- Id: integer (nullable = true)
 |-- ProductId: string (nullable = true)
 |-- UserId: string (nullable = true)
 |-- ProfileName: string (nullable = true)
 |-- HelpfulnessDenominator: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Text: string (nullable = true)
 |-- Stars: integer (nullable = true)
 |-- Likes: integer (nullable = true)
 |-- TotalHelp: integer (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- wordsText: array (nullable = true)
 |    |-- element: string (containsNull = true)



[Row(Id=3, ProductId='B000LQOCH0', UserId='ABXLMWJIXXAIN', ProfileName='"Natalia Corres ""Natalia Corres"""', HelpfulnessDenominator='1', Time='1219017600', Summary='"""Delight"" says it all"', Text='"This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis\' ""The Lion', Stars=4, Likes=1, TotalHelp=1, words=['"""delight""', 'says', 'it', 'all"'], wordsText=['"this', 'is', 'a', 'confection', 'that', 'has', 'been', 'around', 'a', 'few', 'centuries.', '', 'it', 'is', 'a', 'light,', 'pillowy', 'citrus', 'gelatin', 'with', 'nuts', '-', 'in', 'this', 'case', 'filberts.', 'and', 'it', 'is', 'cut', 'into', 'tiny', 'squares', 'and', 'then', 'liberally', 'coated', 'with', 'powdered',

In [20]:
hashingTF = HashingTF(inputCol="words", outputCol="rawSum")
dat = hashingTF.transform(dat)

hashingTF2 = HashingTF(inputCol="wordsText", outputCol="rawWords")
featurizedData = hashingTF2.transform(dat)


In [21]:
featurizedData.select("rawSum").show()

+--------------------+
|              rawSum|
+--------------------+
|(262144,[37852,41...|
|(262144,[29582,49...|
|(262144,[2692,378...|
|(262144,[8610,156...|
|(262144,[17444,17...|
|(262144,[7367,961...|
|(262144,[103838,1...|
|(262144,[9639,378...|
|(262144,[138356,2...|
|(262144,[2437,918...|
|(262144,[2437,363...|
|(262144,[9639,592...|
|(262144,[16332,23...|
|(262144,[13963,87...|
|(262144,[122823,1...|
|(262144,[96638,13...|
|(262144,[9639,378...|
|(262144,[16332,75...|
|(262144,[78,85161...|
|(262144,[12888,16...|
+--------------------+
only showing top 20 rows



In [22]:
idf = IDF(inputCol="rawSum", outputCol="featuresSum")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)


In [23]:
rescaledData.select("featuresSum").show()

+--------------------+
|         featuresSum|
+--------------------+
|(262144,[10804,86...|
|(262144,[42343,10...|
|(262144,[138356,1...|
|(262144,[93123,15...|
|(262144,[1911,163...|
|(262144,[18469],[...|
|(262144,[86175,11...|
|(262144,[33933,45...|
|(262144,[38270,61...|
|(262144,[46252,15...|
|(262144,[33209,45...|
|(262144,[16332,37...|
|(262144,[9616,163...|
|(262144,[77142],[...|
|(262144,[127412,2...|
|(262144,[16332,35...|
|(262144,[5381,164...|
|(262144,[16457,10...|
|(262144,[61666,12...|
|(262144,[47237],[...|
+--------------------+
only showing top 20 rows



We are performing our actual machine learning on the summary column as we feel there are more key words in the summary when compared to the discussion.

In [24]:
idf = IDF(inputCol="rawSum", outputCol="featuresText")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("featuresText").show()


+--------------------+
|        featuresText|
+--------------------+
|(262144,[10804,86...|
|(262144,[42343,10...|
|(262144,[138356,1...|
|(262144,[93123,15...|
|(262144,[1911,163...|
|(262144,[18469],[...|
|(262144,[86175,11...|
|(262144,[33933,45...|
|(262144,[38270,61...|
|(262144,[46252,15...|
|(262144,[33209,45...|
|(262144,[16332,37...|
|(262144,[9616,163...|
|(262144,[77142],[...|
|(262144,[127412,2...|
|(262144,[16332,35...|
|(262144,[5381,164...|
|(262144,[16457,10...|
|(262144,[61666,12...|
|(262144,[47237],[...|
+--------------------+
only showing top 20 rows



In [25]:
rescaledData.select("Stars", "featuresText").show()
dat.select("Stars", "Summary").show()


+-----+--------------------+
|Stars|        featuresText|
+-----+--------------------+
|    3|(262144,[37852,41...|
|    5|(262144,[29582,49...|
|    5|(262144,[2692,378...|
|    4|(262144,[8610,156...|
|    3|(262144,[17444,17...|
|    5|(262144,[7367,961...|
|    2|(262144,[103838,1...|
|    5|(262144,[9639,378...|
|    4|(262144,[138356,2...|
|    5|(262144,[2437,918...|
|    5|(262144,[2437,363...|
|    5|(262144,[9639,592...|
|    4|(262144,[16332,23...|
|    5|(262144,[13963,87...|
|    5|(262144,[122823,1...|
|    4|(262144,[96638,13...|
|    5|(262144,[9639,378...|
|    5|(262144,[16332,75...|
|    4|(262144,[78,85161...|
|    4|(262144,[12888,16...|
+-----+--------------------+
only showing top 20 rows

+-----+--------------------+
|Stars|             Summary|
+-----+--------------------+
|    3|My cats love this...|
|    5|"Just what ""The ...|
|    5|My Cat Loves Whis...|
|    4|Two enthusiastic ...|
|    3|      Finniky Kitty!|
|    5|Our favorite groc...|
|    2|stick with

In [26]:
dat.printSchema()


root
 |-- Id: integer (nullable = true)
 |-- ProductId: string (nullable = true)
 |-- UserId: string (nullable = true)
 |-- ProfileName: string (nullable = true)
 |-- HelpfulnessDenominator: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Text: string (nullable = true)
 |-- Stars: integer (nullable = true)
 |-- Likes: integer (nullable = true)
 |-- TotalHelp: integer (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- wordsText: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rawSum: vector (nullable = true)



We used a 10/90 split just to get our data to run as it takes about 45 mins with this split and hours with a proper split. This is something which would be aided by the server.

In [28]:
rescaledData =rescaledData.withColumnRenamed("Stars", "label")
rescaledData =rescaledData.withColumnRenamed("featuresText", "features")
rescaledData.printSchema()
rescaledData = rescaledData.filter(rescaledData.features.isNotNull())
rescaledData = rescaledData.filter(rescaledData.label.isNotNull())
train, test = rescaledData.randomSplit([0.10, 0.90])

root
 |-- Id: integer (nullable = true)
 |-- ProductId: string (nullable = true)
 |-- UserId: string (nullable = true)
 |-- ProfileName: string (nullable = true)
 |-- HelpfulnessDenominator: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Text: string (nullable = true)
 |-- label: integer (nullable = true)
 |-- Likes: integer (nullable = true)
 |-- TotalHelp: integer (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- wordsText: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rawSum: vector (nullable = true)
 |-- rawWords: vector (nullable = true)
 |-- features: vector (nullable = true)



In [29]:
y = train.select("label")
y = test.select("label")


In [30]:
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [31]:
# assembler = VectorAssembler(
#     inputCols='features',
#     outputCol='features')

# train_processed = assembler.transform(train)
# test_processed = assembler.transform(test)

In [32]:
regression_model = LogisticRegression(maxIter=10, tol=1, fitIntercept=True)
ovr = OneVsRest(classifier=regression_model)

ovrModel = ovr.fit(train)


We only tested on 10% of the test data for the same reason: time. 

In [33]:
test_processed, test_p = test.randomSplit([0.10, 0.9])

In [34]:
predictions = ovrModel.transform(test_processed)

In [35]:
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# compute the classification error on test data.
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % (accuracy))

Accuracy = 0.635963
