Opinion Mining Project on Amazon Product Review Dataset

Import all the libraries and function from SparkSQL and SparkMLib

In [0]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import *
#import wget
from pyspark.ml.feature import Bucketizer,RegexTokenizer,StopWordsRemover,CountVectorizer,IDF, NGram, HashingTF, StringIndexer
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression, RandomForestClassificationModel, RandomForestClassifier, GBTClassificationModel, GBTClassifier, NaiveBayes
from pyspark.ml import Pipeline,PipelineModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator


spark = SparkSession.builder.appName(
    "Opinion mining on Amazon Fashion product reviews"
).getOrCreate()

spark.sparkContext.setLogLevel("WARN")

In [0]:
#Create UnionAll function to merge dataframs
from functools import reduce  # For Python 3.x
from pyspark.sql import DataFrame
 
def unionAll(*dfs):
    return reduce(DataFrame.unionAll, dfs)
 

In [0]:
# File location and type for Clothing , Shoes and Jewelry data
file_location = "/FileStore/tables/reviews_Clothing_Shoes_and_Jewelry_5_json.gz"
file_type = "gz"

# The applied options are for CSV files. For other file types, these will be ignored.
clsj = spark.read.json(file_location)

In [0]:
# File location and type for beauty data
file_location = "/FileStore/tables/reviews_Beauty_5_json.gz"
file_type = "gz"

# The applied options are for CSV files. For other file types, these will be ignored.
bt = spark.read.json(file_location)

In [0]:
#Execute union to merge bt and clsj dataframes
df  = unionAll(bt,clsj)

In [0]:
#Drop the columns that are not required part 1
df = df.drop("image")\
    .drop("verified")\
    .drop("vote")
df.columns

Out[49]: ['asin',
 'helpful',
 'overall',
 'reviewText',
 'reviewTime',
 'reviewerID',
 'reviewerName',
 'summary',
 'unixReviewTime']

In [0]:
df.printSchema()

root
 |-- asin: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)



In [0]:
# Drop and columns and concatenate summary and reviewtext as Text column
df = df.withColumn("text",concat(F.col("summary"), lit(" "),F.col("reviewText")))\
 .drop("helpful")\
 .drop("reviewerID")\
 .drop("reviewerName")\
 .drop("reviewTime")

In [0]:
#Print the Count of number of instances 
df.count()

Out[53]: 477179

In [0]:
#Describe the Distribution of the ratings column 'overall' 
df.describe("overall").show()

+-------+------------------+
|summary|           overall|
+-------+------------------+
|  count|            477179|
|   mean|4.2223610007984425|
| stddev|1.1306299814841152|
|    min|               1.0|
|    max|               5.0|
+-------+------------------+



In [0]:
#Data Cleaning : Finding Null Values in each column
for col in df.columns:
    print(col,":",df[df[col].isNull()].count())
    

asin : 0
overall : 0
reviewText : 0
summary : 0
unixReviewTime : 0
text : 0


In [0]:
#Filter out the  rows with neutral overall ratings
df1 = df.filter("overall !=3")

#Create a split such that from -inf to 4 will be placed in bucket 0 and 4 to inf will be placed in bucket 1
splits = [-float("inf"), 4.0, float("inf")]

#Bucketize data and create labels 0 if overall rating is in (1.0,2.0), #otherwise 1
bucketizer = Bucketizer(splits=splits,\
                        inputCol="overall", outputCol="label")

df2= bucketizer.transform(df1)

df2.groupBy("overall","label").count().show()

+-------+-----+------+
|overall|label| count|
+-------+-----+------+
|    2.0|  0.0| 26919|
|    5.0|  1.0|277771|
|    1.0|  0.0| 21718|
|    4.0|  1.0| 98098|
+-------+-----+------+



In [0]:
#Diplay the dataframe after droping the columns and creating a new target variable/column 'label' based on overall ratings
df2.show()

+----------+-------+--------------------+--------------------+--------------+--------------------+-----+
|      asin|overall|          reviewText|             summary|unixReviewTime|                text|label|
+----------+-------+--------------------+--------------------+--------------+--------------------+-----+
|7806397051|    1.0|Very oily and cre...|Don't waste your ...|    1391040000|Don't waste your ...|  0.0|
|7806397051|    4.0|The texture of th...|       great quality|    1378425600|great quality The...|  1.0|
|7806397051|    2.0|I really can't te...|Do not work on my...|    1386460800|Do not work on my...|  0.0|
|7806397051|    5.0|I was very happy ...|  Very nice palette!|    1365984000|Very nice palette...|  1.0|
|7806397051|    1.0|PLEASE DONT DO IT...|              smh!!!|    1376611200|smh!!! PLEASE DON...|  0.0|
|7806397051|    2.0|Chalky,Not Pigmen...|Chalky, Not Pigme...|    1378252800|Chalky, Not Pigme...|  0.0|
|9759091062|    2.0|Did nothing for m...|no Lightening,

In [0]:
#Sample withought replacement to resolve the imbalance in the target column
fractions = {1.0 : .1, 0.0 : 1.0}
df3 = df2.stat.sampleBy("label", fractions, 36)
df3.groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0|48637|
|  1.0|37519|
+-----+-----+



In [0]:
#Split data as 80-20% Train and Test dataset
splitSeed = 5043
trainingData, testData = df3.randomSplit([.80, 0.20], splitSeed)

NLP Text Preprocessing, Pipelines and Model Build

In [0]:
#Tokenize the sentence based on the regex pattern 
tokenizer = RegexTokenizer(inputCol="text",outputCol="reviewTokensUf",pattern="\\s+|[,.()\"]")

#Remove Stop Words that do not contribute in any way to our analysis 
stopwords_remover = StopWordsRemover(stopWords=StopWordsRemover.loadDefaultStopWords("english"),inputCol="reviewTokensUf",outputCol="reviewTokens")

#converts word documents to vectors of token counts
cv = CountVectorizer(inputCol="reviewTokens",outputCol="cv",vocabSize=296337)

#IDF model
idf = IDF(inputCol="cv",outputCol="features")

#Logistic Boosted Classifier
lr = LogisticRegression(maxIter=100,regParam=0.02,elasticNetParam=0.3)

#Create a pipeline by combining all the functions we defined above - tokenizer , stopwords_remover, cv, idf, gbtc
steps =  [tokenizer, stopwords_remover, cv, idf, lr]
lr_pipeline = Pipeline(stages=steps)

#fit the training dataset dataset into the pipeline 
model = lr_pipeline.fit(trainingData)

#Obtain the predictions from the model 
predictions = model.transform(testData)


In [0]:
#Call the Binary Classification Evaluator function 
evaluator = BinaryClassificationEvaluator()  
areaUnderROC = evaluator.evaluate(lr_predictions)
print('Test Area Under ROC for Linear Regression model with Count Vectorizer is ' , areaUnderROC)

Test Area Under ROC for Linear Regression model with Count Vectorizer is  0.9398334608052311


Build a Vocabulary of all the relevant tokens and display the co-efficients

In [0]:
#Building the vocabulary to explore the coefficients 
vocabulary = model.stages[2].vocabulary
weights = model.stages[-1].coefficients.toArray()
weights = [float(weight) for weight in weights]

schema = StructType([StructField('word', StringType()),
                     StructField('weight', FloatType())
                     ])
cdf = spark.createDataFrame(zip(vocabulary, weights), schema)

In [0]:
#Shows the top 10 positive sentiment tokens
cdf.orderBy(desc("weight")).show(10)

+-----------+----------+
|       word|    weight|
+-----------+----------+
|      great|  0.500755|
|       love| 0.4556638|
|    perfect|0.30859488|
|       nice|0.28766462|
|      loves|0.28668725|
|comfortable|0.24460426|
|  excellent|0.22886382|
|     great!|0.22694735|
|compliments|0.21397798|
|     highly|0.20946273|
+-----------+----------+
only showing top 10 rows



In [0]:
#Shows the top 10 negative sentiment tokens
cdf.orderBy("weight").show(10)

+--------------+-----------+
|          word|     weight|
+--------------+-----------+
|  disappointed| -0.4081317|
|      returned|-0.31633556|
|         waste|-0.29761943|
| disappointing|-0.26205418|
|          poor|-0.23398674|
|     returning|-0.22233732|
| unfortunately|-0.22159778|
|        return|-0.22129418|
|disappointment|-0.20847303|
|         cheap|-0.19569387|
+--------------+-----------+
only showing top 10 rows



In [0]:
#model evaluation and metrics for Logistic Regression  vanila version
lp = predictions.select("label", "prediction")
counttotal = predictions.count()
correct = lp.filter(F.col("label")== F.col("prediction")).count()
wrong = lp.filter(~(F.col("label") == F.col("prediction"))).count()
ratioWrong = float(wrong) / float(counttotal)
ratioCorrect=correct/counttotal



trueneg =( lp.filter(F.col("label") == 0.0).filter(F.col("label") == F.col("prediction")).count()) /counttotal
truepos = (lp.filter(F.col("label") == 1.0).filter(F.col("label") == F.col("prediction")).count())/counttotal
falseneg = (lp.filter(F.col("label") == 0.0).filter(~(F.col("label") == F.col("prediction"))).count())/counttotal
falsepos = (lp.filter(F.col("label") == 1.0).filter(~(F.col("label") == F.col("prediction"))).count())/counttotal

precision = truepos / (truepos + falsepos)
recall  = truepos / (truepos + falseneg)
#fmeasure= 2  precision  recall / (precision + recall)
accuracy=(truepos + trueneg) / (truepos + trueneg + falsepos + falseneg)

print('counttotal   :', counttotal     )
print('correct      :', correct        )
print('wrong        :', wrong          )
print('ratioWrong   :', ratioWrong     )
print('ratioCorrect :', ratioCorrect   )
print('truen        :', trueneg          )
print('truep        :', truepos          )
print('falsen       :', falseneg         )
print('falsep       :', falsepos         )
print('precision    :', precision      )
print('recall       :', recall         )
print('accuracy     :', accuracy       )

counttotal   : 17244
correct      : 14993
wrong        : 2251
ratioWrong   : 0.1305381581999536
ratioCorrect : 0.8694618418000464
truen        : 0.518905126420784
truep        : 0.35055671537926236
falsen       : 0.050916260728369286
falsep       : 0.07962189747158432
precision    : 0.8149096791588029
recall       : 0.8731763686263181
accuracy     : 0.8694618418000464


In [0]:
# Display the columns of predictions dataframe after transformation
predictions.printSchema()
display(predictions)

root
 |-- asin: string (nullable = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)
 |-- text: string (nullable = true)
 |-- label: double (nullable = true)
 |-- reviewTokensUf: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- reviewTokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- cv: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



asin,overall,reviewText,summary,unixReviewTime,text,label,reviewTokensUf,reviewTokens,cv,features,rawPrediction,probability,prediction
9759091062,1.0,"Did nothing for my skin. Used as suggested and no signs of brightness. Wish it worked for me, but it didn't.",Nothing,1392681600,"Nothing Did nothing for my skin. Used as suggested and no signs of brightness. Wish it worked for me, but it didn't.",0.0,"List(nothing, did, nothing, for, my, skin, used, as, suggested, and, no, signs, of, brightness, wish, it, worked, for, me, but, it, didn't)","List(nothing, nothing, skin, used, suggested, signs, brightness, wish, worked)","Map(vectorType -> sparse, length -> 87097, indices -> List(8, 22, 122, 240, 298, 1695, 3030, 8018), values -> List(1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 87097, indices -> List(8, 22, 122, 240, 298, 1695, 3030, 8018), values -> List(2.2293398638773105, 2.2142154371841274, 6.22103204836356, 3.565528418941782, 3.7939449552728037, 5.774624103427492, 6.651963748717203, 8.250228360553178))","Map(vectorType -> dense, length -> 2, values -> List(1.390327565625129, -1.390327565625129))","Map(vectorType -> dense, length -> 2, values -> List(0.80064453198834, 0.19935546801166004))",0.0
B00004TMFE,2.0,"I was so very disappointed with my purchase of the Avalon Thickening shampoo and conditioner. I had such high hopes, but in only two shampooings, my hair was noticeably stripped of oils, had that ""squeaky clean"" feeling you get when using a very harsh product, and it became dry and brittle. It started looking very dull. They use many products that have an oil stripping effect on the hair, like orange oil and lemon oil. You might think because it says, 'oil' that it might add to your hair's shine, but these are ingredients in industrial strength degreasers. This product only damaged my hair. Avalon is only 70% organic and it had a rather unpleasant smell to me. Rather like mosquito repellant or dish soap. Using a harsh product like this is very detrimental for your hair. I know that a lot of people will be drawn to Avalon because of the low price, but if it drastically stripped my extremely oily hair in two shampoos, I can't imagine what it might do to yours. Although my hair is very fine, so perhaps ethnic girls, and women with coarser hair might benefit? I honestly wouldn't recommend it. I'd rather see people get something that will improve their self-esteem immediately, rather than give them new problems to worry about.In desperation I purchased the more expensive Just Naturals system for thinning hair and believe me, it blows Avalon out of the water. In just one shampoo my hair regained it's texture and became so silky it felt like Asian hair. I can't stop touching my hair now and I feel so much more confident. They use incredible ingredients that greatly benefit people with hair loss due to hormonal issues, or who have an increase in DHT. You do pay more than double for that shampoo, but having something that supports your hair, improves the texture, and smells fantastic, was worth it to me. I'm now quite confident in what I'm using and plan to go on using it, regardless of the cost. I'd pay 5x what I'm paying for it now, I love it so much. And their treatment spray is oil based. I use the system twice a day and I'm shocked at how long it is lasting. It has truly been an excellent investment.I would recommend that you shampoo and condition your hair twice a day if you want to see regrowth with any system. This is because one application of shampoo with DHT-fighting ingredients, can only do so much. Your body is constantly generating new hormones and new DHT. By helping to combat it twice a day, you have a better chance of seeing results sooner. This is coming from a 36 year old woman who has been fighting repeated hair loss due to hormonal imbalance from 16 on. Once a day just never cut it for me. Using a leave on treatment like Just Naturals has overnight, can also work wonders. I use it during the day too if I'm not going out.I hope this helps you recover your hair, and your self-esteem, a little faster and more efficiently. :o)","Industrial strength degreaser. ""Fullness"" is caused by stripping oils and making hair fluffy",1401840000,"Industrial strength degreaser. ""Fullness"" is caused by stripping oils and making hair fluffy I was so very disappointed with my purchase of the Avalon Thickening shampoo and conditioner. I had such high hopes, but in only two shampooings, my hair was noticeably stripped of oils, had that ""squeaky clean"" feeling you get when using a very harsh product, and it became dry and brittle. It started looking very dull. They use many products that have an oil stripping effect on the hair, like orange oil and lemon oil. You might think because it says, 'oil' that it might add to your hair's shine, but these are ingredients in industrial strength degreasers. This product only damaged my hair. Avalon is only 70% organic and it had a rather unpleasant smell to me. Rather like mosquito repellant or dish soap. Using a harsh product like this is very detrimental for your hair. I know that a lot of people will be drawn to Avalon because of the low price, but if it drastically stripped my extremely oily hair in two shampoos, I can't imagine what it might do to yours. Although my hair is very fine, so perhaps ethnic girls, and women with coarser hair might benefit? I honestly wouldn't recommend it. I'd rather see people get something that will improve their self-esteem immediately, rather than give them new problems to worry about.In desperation I purchased the more expensive Just Naturals system for thinning hair and believe me, it blows Avalon out of the water. In just one shampoo my hair regained it's texture and became so silky it felt like Asian hair. I can't stop touching my hair now and I feel so much more confident. They use incredible ingredients that greatly benefit people with hair loss due to hormonal issues, or who have an increase in DHT. You do pay more than double for that shampoo, but having something that supports your hair, improves the texture, and smells fantastic, was worth it to me. I'm now quite confident in what I'm using and plan to go on using it, regardless of the cost. I'd pay 5x what I'm paying for it now, I love it so much. And their treatment spray is oil based. I use the system twice a day and I'm shocked at how long it is lasting. It has truly been an excellent investment.I would recommend that you shampoo and condition your hair twice a day if you want to see regrowth with any system. This is because one application of shampoo with DHT-fighting ingredients, can only do so much. Your body is constantly generating new hormones and new DHT. By helping to combat it twice a day, you have a better chance of seeing results sooner. This is coming from a 36 year old woman who has been fighting repeated hair loss due to hormonal imbalance from 16 on. Once a day just never cut it for me. Using a leave on treatment like Just Naturals has overnight, can also work wonders. I use it during the day too if I'm not going out.I hope this helps you recover your hair, and your self-esteem, a little faster and more efficiently. :o)",0.0,"List(industrial, strength, degreaser, fullness, is, caused, by, stripping, oils, and, making, hair, fluffy, i, was, so, very, disappointed, with, my, purchase, of, the, avalon, thickening, shampoo, and, conditioner, i, had, such, high, hopes, but, in, only, two, shampooings, my, hair, was, noticeably, stripped, of, oils, had, that, ""squeaky, clean"", feeling, you, get, when, using, a, very, harsh, product, and, it, became, dry, and, brittle, it, started, looking, very, dull, they, use, many, products, that, have, an, oil, stripping, effect, on, the, hair, like, orange, oil, and, lemon, oil, you, might, think, because, it, says, 'oil', that, it, might, add, to, your, hair's, shine, but, these, are, ingredients, in, industrial, strength, degreasers, this, product, only, damaged, my, hair, avalon, is, only, 70%, organic, and, it, had, a, rather, unpleasant, smell, to, me, rather, like, mosquito, repellant, or, dish, soap, using, a, harsh, product, like, this, is, very, detrimental, for, your, hair, i, know, that, a, lot, of, people, will, be, drawn, to, avalon, because, of, the, low, price, but, if, it, drastically, stripped, my, extremely, oily, hair, in, two, shampoos, i, can't, imagine, what, it, might, do, to, yours, although, my, hair, is, very, fine, so, perhaps, ethnic, girls, and, women, with, coarser, hair, might, benefit?, i, honestly, wouldn't, recommend, it, i'd, rather, see, people, get, something, that, will, improve, their, self-esteem, immediately, rather, than, give, them, new, problems, to, worry, about, in, desperation, i, purchased, the, more, expensive, just, naturals, system, for, thinning, hair, and, believe, me, it, blows, avalon, out, of, the, water, in, just, one, shampoo, my, hair, regained, it's, texture, and, became, so, silky, it, felt, like, asian, hair, i, can't, stop, touching, my, hair, now, and, i, feel, so, much, more, confident, they, use, incredible, ingredients, that, greatly, benefit, people, with, hair, loss, due, to, hormonal, issues, or, who, have, an, increase, in, dht, you, do, pay, more, than, double, for, that, shampoo, but, having, something, that, supports, your, hair, improves, the, texture, and, smells, fantastic, was, worth, it, to, me, i'm, now, quite, confident, in, what, i'm, using, and, plan, to, go, on, using, it, regardless, of, the, cost, i'd, pay, 5x, what, i'm, paying, for, it, now, i, love, it, so, much, and, their, treatment, spray, is, oil, based, i, use, the, system, twice, a, day, and, i'm, shocked, at, how, long, it, is, lasting, it, has, truly, been, an, excellent, investment, i, would, recommend, that, you, shampoo, and, condition, your, hair, twice, a, day, if, you, want, to, see, regrowth, with, any, system, this, is, because, one, application, of, shampoo, with, dht-fighting, ingredients, can, only, do, so, much, your, body, is, constantly, generating, new, hormones, and, new, dht, by, helping, to, combat, it, twice, a, day, you, have, a, better, chance, of, seeing, results, sooner, this, is, coming, from, a, 36, year, old, woman, who, has, been, fighting, repeated, hair, loss, due, to, hormonal, imbalance, from, 16, on, once, a, day, just, never, cut, it, for, me, using, a, leave, on, treatment, like, just, naturals, has, overnight, can, also, work, wonders, i, use, it, during, the, day, too, if, i'm, not, going, out, i, hope, this, helps, you, recover, your, hair, and, your, self-esteem, a, little, faster, and, more, efficiently, :o)","List(industrial, strength, degreaser, fullness, caused, stripping, oils, making, hair, fluffy, disappointed, purchase, avalon, thickening, shampoo, conditioner, high, hopes, two, shampooings, hair, noticeably, stripped, oils, ""squeaky, clean"", feeling, get, using, harsh, product, became, dry, brittle, started, looking, dull, use, many, products, oil, stripping, effect, hair, like, orange, oil, lemon, oil, might, think, says, 'oil', might, add, hair's, shine, ingredients, industrial, strength, degreasers, product, damaged, hair, avalon, 70%, organic, rather, unpleasant, smell, rather, like, mosquito, repellant, dish, soap, using, harsh, product, like, detrimental, hair, know, lot, people, drawn, avalon, low, price, drastically, stripped, extremely, oily, hair, two, shampoos, imagine, might, although, hair, fine, perhaps, ethnic, girls, women, coarser, hair, might, benefit?, honestly, recommend, rather, see, people, get, something, improve, self-esteem, immediately, rather, give, new, problems, worry, desperation, purchased, expensive, naturals, system, thinning, hair, believe, blows, avalon, water, one, shampoo, hair, regained, texture, became, silky, felt, like, asian, hair, stop, touching, hair, feel, much, confident, use, incredible, ingredients, greatly, benefit, people, hair, loss, due, hormonal, issues, increase, dht, pay, double, shampoo, something, supports, hair, improves, texture, smells, fantastic, worth, quite, confident, using, plan, go, using, regardless, cost, pay, 5x, paying, love, much, treatment, spray, oil, based, use, system, twice, day, shocked, long, lasting, truly, excellent, investment, recommend, shampoo, condition, hair, twice, day, want, see, regrowth, system, one, application, shampoo, dht-fighting, ingredients, much, body, constantly, generating, new, hormones, new, dht, helping, combat, twice, day, better, chance, seeing, results, sooner, coming, 36, year, old, woman, fighting, repeated, hair, loss, due, hormonal, imbalance, 16, day, never, cut, using, leave, treatment, like, naturals, overnight, also, work, wonders, use, day, going, hope, helps, recover, hair, self-esteem, little, faster, efficiently, :o)","Map(vectorType -> sparse, length -> 87097, indices -> List(0, 1, 2, 4, 6, 10, 11, 16, 18, 23, 25, 28, 36, 38, 39, 44, 46, 47, 48, 51, 52, 54, 56, 62, 63, 64, 65, 69, 75, 76, 83, 86, 87, 102, 103, 105, 109, 113, 121, 123, 125, 139, 149, 150, 163, 164, 167, 172, 175, 179, 180, 195, 206, 230, 237, 243, 246, 249, 265, 266, 280, 299, 300, 309, 353, 358, 368, 373, 384, 389, 411, 416, 423, 446, 463, 465, 470, 471, 474, 498, 500, 511, 532, 570, 580, 602, 606, 614, 676, 681, 686, 691, 717, 764, 770, 781, 784, 787, 802, 803, 818, 820, 857, 865, 889, 899, 929, 1029, 1030, 1032, 1036, 1044, 1051, 1147, 1194, 1256, 1266, 1375, 1381, 1420, 1464, 1588, 1651, 1755, 1787, 1802, 1814, 1866, 1890, 1981, 2004, 2140, 2222, 2232, 2512, 2513, 2567, 2595, 2610, 2611, 2762, 2877, 3193, 3308, 3325, 3430, 3592, 3689, 3721, 3956, 4135, 4304, 4499, 4562, 4693, 4707, 5015, 5053, 5177, 5269, 5386, 5604, 5650, 5655, 5951, 6152, 6517, 7932, 8256, 8734, 9611, 10150, 11896, 14121, 14798, 14946, 18240, 21198, 22268, 22783, 24848, 29514, 31582, 39492), values -> List(5.0, 17.0, 3.0, 2.0, 4.0, 1.0, 2.0, 3.0, 1.0, 1.0, 1.0, 5.0, 1.0, 1.0, 1.0, 1.0, 2.0, 5.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 4.0, 1.0, 1.0, 1.0, 5.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 4.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 3.0, 1.0, 1.0, 4.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0, 2.0, 2.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 4.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 87097, indices -> List(0, 1, 2, 4, 6, 10, 11, 16, 18, 23, 25, 28, 36, 38, 39, 44, 46, 47, 48, 51, 52, 54, 56, 62, 63, 64, 65, 69, 75, 76, 83, 86, 87, 102, 103, 105, 109, 113, 121, 123, 125, 139, 149, 150, 163, 164, 167, 172, 175, 179, 180, 195, 206, 230, 237, 243, 246, 249, 265, 266, 280, 299, 300, 309, 353, 358, 368, 373, 384, 389, 411, 416, 423, 446, 463, 465, 470, 471, 474, 498, 500, 511, 532, 570, 580, 602, 606, 614, 676, 681, 686, 691, 717, 764, 770, 781, 784, 787, 802, 803, 818, 820, 857, 865, 889, 899, 929, 1029, 1030, 1032, 1036, 1044, 1051, 1147, 1194, 1256, 1266, 1375, 1381, 1420, 1464, 1588, 1651, 1755, 1787, 1802, 1814, 1866, 1890, 1981, 2004, 2140, 2222, 2232, 2512, 2513, 2567, 2595, 2610, 2611, 2762, 2877, 3193, 3308, 3325, 3430, 3592, 3689, 3721, 3956, 4135, 4304, 4499, 4562, 4693, 4707, 5015, 5053, 5177, 5269, 5386, 5604, 5650, 5655, 5951, 6152, 6517, 7932, 8256, 8734, 9611, 10150, 11896, 14121, 14798, 14946, 18240, 21198, 22268, 22783, 24848, 29514, 31582, 39492), values -> List(5.82279686615753, 36.82815228217995, 4.952207585168528, 3.2937738912189554, 7.12287871681434, 1.8616669550075484, 3.6546844249329475, 6.12525036751957, 2.0655068958811857, 2.182188649219123, 2.261127716374541, 11.990927679619649, 2.397387514973431, 2.3972279871753743, 2.395793381047444, 2.546075584096782, 4.883838101975456, 13.057391780839158, 2.724554109038063, 2.5649493574615367, 2.5606199389343405, 2.8284650108009304, 5.375677574384989, 2.9563653443545217, 5.541031584142635, 2.7018009944611174, 2.7406151279386464, 2.782168219418047, 2.8455509780142316, 5.683623448000284, 2.9033854150998537, 2.899687576760444, 2.895215650328596, 2.9480296472971688, 3.015265031735053, 14.425487168701013, 3.0603627022326405, 3.01822887504279, 3.0700065784998247, 18.32347074540973, 9.691129234055996, 3.159208536869273, 3.739368854036328, 3.3201606031871624, 3.3942994562179036, 3.3445421441332193, 3.4060412740945867, 13.406238867167463, 10.232594361192877, 3.4095469744422156, 3.626345465632933, 3.640623577497221, 3.719422589853949, 3.609047737042054, 3.6456138844988097, 3.6675310304171456, 3.7218192356985487, 3.674372562233862, 3.769739951912627, 3.898517759192381, 7.476297195262197, 4.376715209886908, 3.7722604321379616, 12.624456680630502, 3.9268318103307016, 3.941668877761169, 15.869633503850977, 4.02583067008288, 4.401263491092169, 4.072428118061301, 4.074986754851626, 4.136625981726663, 4.1044516246988065, 8.373831815157612, 4.242895175320707, 4.218941934298215, 8.625941767892982, 12.821000501962383, 4.280985214795141, 4.308646552883488, 4.304340841172276, 4.469833797603469, 4.402447623853385, 4.529904073731584, 4.582402315637074, 9.586421817586665, 4.624407042406379, 4.580984880956101, 4.679131942095625, 4.70605359966189, 4.920009948349604, 4.723867835937017, 4.742005183914135, 4.809098268555652, 4.841650871593401, 4.904230528245638, 4.828865309296428, 4.904230528245638, 4.882932530566704, 4.847180839602862, 4.88101865438442, 4.954391494548849, 4.918023850377974, 5.079143199521326, 5.378548735669166, 10.374713568323116, 5.018107308934956, 5.305789381386738, 16.40183055383355, 5.139185240488192, 5.221706264176197, 5.240702764866851, 10.338676557317761, 10.67696348614456, 5.329459125472642, 5.423572417043121, 5.417015016496962, 11.405041619052295, 5.523829020782771, 5.5876405335277255, 5.6231472219846355, 5.756105055660254, 5.827394139407556, 5.847295293724851, 5.867600559885596, 5.877909929544457, 5.914853444736141, 6.016636139046084, 5.925664360840357, 6.022606306032587, 6.05919575346488, 6.250250990227589, 12.60863642299573, 6.458468891325123, 6.3448095728526015, 12.791335980172185, 6.395667990086093, 6.440119752656926, 6.586723226848802, 6.369915493983678, 6.458468891325123, 13.812987227704166, 6.697948861959026, 7.046255556227242, 6.746150963776904, 13.567782583519502, 7.189356399867916, 6.8921048763999835, 6.921092413273236, 7.063062674543623, 14.378712799735831, 7.170308204897221, 7.402930500165975, 7.451720664335406, 7.427028051745035, 7.269399107541452, 14.903441328670812, 7.451720664335406, 14.954076944639393, 7.614239593833181, 7.4770384723196965, 7.773304288462869, 7.674864215649616, 7.773304288462869, 7.739402736787188, 7.739402736787188, 7.882503580427861, 8.307386774393127, 33.73019966938853, 8.307386774393127, 8.432549917347133, 8.575650760987806, 8.838015025455297, 9.061158576769508, 18.38937993878806, 9.19468996939403, 9.531162206015242, 9.754305757329453, 20.894905875778797, 9.754305757329453, 9.754305757329453, 10.041987829781233, 10.041987829781233, 10.447452937889398))","Map(vectorType -> dense, length -> 2, values -> List(1.856674536175129, -1.856674536175129))","Map(vectorType -> dense, length -> 2, values -> List(0.8649088670947155, 0.13509113290528452))",0.0
B00004TMFE,2.0,"Product dried out my hair. I was disappointed since this is organic. I expected it to be better. But, Pantene works better than this and does not dry my hair out. Smell was ok. It was not overwhelming, but it did not smell great either. It did make my hair look thicker, but dried out.",Dries out my hair,1341792000,"Dries out my hair Product dried out my hair. I was disappointed since this is organic. I expected it to be better. But, Pantene works better than this and does not dry my hair out. Smell was ok. It was not overwhelming, but it did not smell great either. It did make my hair look thicker, but dried out.",0.0,"List(dries, out, my, hair, product, dried, out, my, hair, i, was, disappointed, since, this, is, organic, i, expected, it, to, be, better, but, pantene, works, better, than, this, and, does, not, dry, my, hair, out, smell, was, ok, it, was, not, overwhelming, but, it, did, not, smell, great, either, it, did, make, my, hair, look, thicker, but, dried, out)","List(dries, hair, product, dried, hair, disappointed, since, organic, expected, better, pantene, works, better, dry, hair, smell, ok, overwhelming, smell, great, either, make, hair, look, thicker, dried)","Map(vectorType -> sparse, length -> 87097, indices -> List(1, 2, 3, 17, 38, 48, 50, 62, 72, 80, 102, 153, 223, 227, 555, 669, 675, 889, 1910, 2228), values -> List(4.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 87097, indices -> List(1, 2, 3, 17, 38, 48, 50, 62, 72, 80, 102, 153, 223, 227, 555, 669, 675, 889, 1910, 2228), values -> List(8.665447595807047, 1.6507358617228427, 1.5716067601508, 2.0755167591302985, 4.794455974350749, 2.724554109038063, 2.5857258600654167, 5.912730688709043, 2.8400715122496063, 2.913759228040765, 2.9480296472971688, 3.2500175837928063, 3.5543038112966228, 3.6111936606123303, 8.978056489719231, 4.672901392344989, 4.710880640410205, 5.378548735669166, 6.395667990086093, 6.213346433292138))","Map(vectorType -> dense, length -> 2, values -> List(1.533809894112177, -1.533809894112177))","Map(vectorType -> dense, length -> 2, values -> List(0.8225630634891373, 0.17743693651086268))",0.0
B00004TMFE,2.0,"We were pretty disappointed with this shampoo and conditioner. Had high hopes for it, and it did not deliver.The smell was okay.It was harsh and stripping on the hair follicles.Left our hair with that squeaky clean feeling, but not in a good way, more in a removed all the oil from your hair kinda away.Even in using the conditioner, my hair still looked frizzy. And I do not have frizzy hair! Mine is fine, thin, and straight. If anything I usually border on having my hair be too silky and need things to volumize it.Just cannot recommend this product.",Not impressed,1365724800,"Not impressed We were pretty disappointed with this shampoo and conditioner. Had high hopes for it, and it did not deliver.The smell was okay.It was harsh and stripping on the hair follicles.Left our hair with that squeaky clean feeling, but not in a good way, more in a removed all the oil from your hair kinda away.Even in using the conditioner, my hair still looked frizzy. And I do not have frizzy hair! Mine is fine, thin, and straight. If anything I usually border on having my hair be too silky and need things to volumize it.Just cannot recommend this product.",0.0,"List(not, impressed, we, were, pretty, disappointed, with, this, shampoo, and, conditioner, had, high, hopes, for, it, and, it, did, not, deliver, the, smell, was, okay, it, was, harsh, and, stripping, on, the, hair, follicles, left, our, hair, with, that, squeaky, clean, feeling, but, not, in, a, good, way, more, in, a, removed, all, the, oil, from, your, hair, kinda, away, even, in, using, the, conditioner, my, hair, still, looked, frizzy, and, i, do, not, have, frizzy, hair!, mine, is, fine, thin, and, straight, if, anything, i, usually, border, on, having, my, hair, be, too, silky, and, need, things, to, volumize, it, just, cannot, recommend, this, product)","List(impressed, pretty, disappointed, shampoo, conditioner, high, hopes, deliver, smell, okay, harsh, stripping, hair, follicles, left, hair, squeaky, clean, feeling, good, way, removed, oil, hair, kinda, away, even, using, conditioner, hair, still, looked, frizzy, frizzy, hair!, mine, fine, thin, straight, anything, usually, border, hair, silky, need, things, volumize, recommend, product)","Map(vectorType -> sparse, length -> 87097, indices -> List(1, 2, 5, 15, 28, 31, 42, 46, 58, 62, 82, 102, 105, 109, 123, 126, 136, 144, 149, 162, 164, 183, 186, 205, 249, 260, 404, 415, 447, 493, 787, 847, 929, 1060, 1147, 1250, 1775, 2256, 4439, 5015, 8726, 9185, 15620), values -> List(5.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 87097, indices -> List(1, 2, 5, 15, 28, 31, 42, 46, 58, 62, 82, 102, 105, 109, 123, 126, 136, 144, 149, 162, 164, 183, 186, 205, 249, 260, 404, 415, 447, 493, 787, 847, 929, 1060, 1147, 1250, 1775, 2256, 4439, 5015, 8726, 9185, 15620), values -> List(10.831809494758808, 1.6507358617228427, 1.6047762367946288, 1.9910718858699175, 2.39818553592393, 2.3064088002472714, 2.497303050237379, 2.441919050987728, 2.703966434891522, 2.9563653443545217, 2.8920709936491247, 2.9480296472971688, 3.606371792175253, 3.0603627022326405, 3.664694149081946, 3.1922148333374434, 3.204297425247383, 3.210753688706839, 7.478737708072656, 3.2971437140732274, 3.3445421441332193, 3.3848329483463453, 3.476722859745872, 3.649512524914467, 3.674372562233862, 3.6870382468059697, 4.063102064880111, 4.137534659662881, 4.277842205397942, 4.311888046807659, 4.904230528245638, 10.076083047671547, 5.018107308934956, 5.229803474408816, 5.33848174307228, 5.436817643793142, 5.837295210390267, 6.198957695840038, 7.269399107541452, 7.451720664335406, 8.368011396209562, 8.432549917347133, 9.19468996939403))","Map(vectorType -> dense, length -> 2, values -> List(3.6108698390173166, -3.6108698390173166))","Map(vectorType -> dense, length -> 2, values -> List(0.9736829786948055, 0.026317021305194532))",0.0
B00004U9UY,5.0,"This is one of my favorites! So glad I took a chance and bought this scent :) I only wish the tube lasted longer! It is such a great, clean scent-not overpowering and the staying power on your hands is amazing-your hands (feet too!) really feel wonderful after lotion is applied!",Love this scent so much :),1386115200,"Love this scent so much :) This is one of my favorites! So glad I took a chance and bought this scent :) I only wish the tube lasted longer! It is such a great, clean scent-not overpowering and the staying power on your hands is amazing-your hands (feet too!) really feel wonderful after lotion is applied!",1.0,"List(love, this, scent, so, much, :, this, is, one, of, my, favorites!, so, glad, i, took, a, chance, and, bought, this, scent, :, i, only, wish, the, tube, lasted, longer!, it, is, such, a, great, clean, scent-not, overpowering, and, the, staying, power, on, your, hands, is, amazing-your, hands, feet, too!, really, feel, wonderful, after, lotion, is, applied!)","List(love, scent, much, :, one, favorites!, glad, took, chance, bought, scent, :, wish, tube, lasted, longer!, great, clean, scent-not, overpowering, staying, power, hands, amazing-your, hands, feet, too!, really, feel, wonderful, lotion, applied!)","Map(vectorType -> sparse, length -> 87097, indices -> List(3, 4, 7, 10, 16, 26, 44, 100, 145, 205, 211, 220, 240, 261, 279, 410, 477, 616, 717, 810, 985, 1344, 1366, 1669, 8977, 11108, 86846), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 87097, indices -> List(3, 4, 7, 10, 16, 26, 44, 100, 145, 205, 211, 220, 240, 261, 279, 410, 477, 616, 717, 810, 985, 1344, 1366, 1669, 8977, 11108, 86846), values -> List(1.5716067601508, 1.6468869456094777, 1.743614218199151, 1.8616669550075484, 2.041750122506523, 2.180646034181244, 2.546075584096782, 3.1904502307973246, 7.007696012027126, 3.649512524914467, 3.5401977839489427, 4.036456025461816, 3.565528418941782, 7.79846767025405, 7.569918030950179, 4.08787906921702, 4.242895175320707, 4.786230077651992, 4.742005183914135, 4.89449335296778, 5.224398055841908, 5.4846083076294905, 5.553351460049093, 5.7653217107651775, 8.368011396209562, 8.742704845650973, 10.447452937889398))","Map(vectorType -> dense, length -> 2, values -> List(-2.418855006128121, 2.418855006128121))","Map(vectorType -> dense, length -> 2, values -> List(0.08174616181905964, 0.9182538381809404))",1.0
B00004U9V2,5.0,"This cream has wax in it, great for the nails, and really makes your skin look years younger! I use it at night",My favorate,1389571200,"My favorate This cream has wax in it, great for the nails, and really makes your skin look years younger! I use it at night",1.0,"List(my, favorate, this, cream, has, wax, in, it, great, for, the, nails, and, really, makes, your, skin, look, years, younger!, i, use, it, at, night)","List(favorate, cream, wax, great, nails, really, makes, skin, look, years, younger!, use, night)","Map(vectorType -> sparse, length -> 87097, indices -> List(3, 6, 7, 8, 17, 99, 119, 143, 262, 281, 1677, 14649), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 87097, indices -> List(3, 6, 7, 8, 17, 99, 119, 143, 262, 281, 1677, 14649), values -> List(1.5716067601508, 1.780719679203585, 1.743614218199151, 2.2293398638773105, 2.0755167591302985, 2.9474763969372755, 3.112144954335091, 3.6934317588493024, 4.157737366980401, 3.8544084037469606, 6.213346433292138, 9.19468996939403))","Map(vectorType -> dense, length -> 2, values -> List(-0.44379480235022184, 0.44379480235022184))","Map(vectorType -> dense, length -> 2, values -> List(0.3908371162238203, 0.6091628837761798))",1.0
B000050B6U,2.0,"I have long, thick, and straight hair. I bought this curling iron because I wanted to curl my hair for special occasions. This curling iron doesn't curl my hair much at all. All it does is make a little wave at the very end of my hair and that is when it's set to the highest setting. Pass this curling iron, especially if you have hard to curl hair.",Doesn't curl my hair much,1376438400,"Doesn't curl my hair much I have long, thick, and straight hair. I bought this curling iron because I wanted to curl my hair for special occasions. This curling iron doesn't curl my hair much at all. All it does is make a little wave at the very end of my hair and that is when it's set to the highest setting. Pass this curling iron, especially if you have hard to curl hair.",0.0,"List(doesn't, curl, my, hair, much, i, have, long, thick, and, straight, hair, i, bought, this, curling, iron, because, i, wanted, to, curl, my, hair, for, special, occasions, this, curling, iron, doesn't, curl, my, hair, much, at, all, all, it, does, is, make, a, little, wave, at, the, very, end, of, my, hair, and, that, is, when, it's, set, to, the, highest, setting, pass, this, curling, iron, especially, if, you, have, hard, to, curl, hair)","List(curl, hair, much, long, thick, straight, hair, bought, curling, iron, wanted, curl, hair, special, occasions, curling, iron, curl, hair, much, make, little, wave, end, hair, set, highest, setting, pass, curling, iron, especially, hard, curl, hair)","Map(vectorType -> sparse, length -> 87097, indices -> List(1, 16, 23, 26, 36, 50, 127, 131, 132, 251, 255, 302, 394, 447, 615, 729, 733, 861, 1167, 1954, 2489, 2945), values -> List(6.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 4.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 87097, indices -> List(1, 16, 23, 26, 36, 50, 127, 131, 132, 251, 255, 302, 394, 447, 615, 729, 733, 861, 1167, 1954, 2489, 2945), values -> List(12.998171393710571, 4.083500245013046, 2.182188649219123, 2.180646034181244, 2.397387514973431, 2.5857258600654167, 3.1108415980085202, 3.167100154424713, 3.221607630284097, 3.6715162335281084, 3.88359741135727, 3.7634664056153886, 14.018704177034966, 4.277842205397942, 19.554784941133818, 15.746867719870718, 4.786230077651992, 5.13424695884761, 5.302869671283403, 6.257798195862972, 6.320318552844306, 6.545480268314753))","Map(vectorType -> dense, length -> 2, values -> List(0.6189959350957723, -0.6189959350957723))","Map(vectorType -> dense, length -> 2, values -> List(0.6499901552579718, 0.35000984474202823))",0.0
B000050B6U,5.0,This is a nice curling iron. It has different settings but I only use one or two. It works great. Nice choice.,"Works great. Thank you. Arrived soon, well packed.",1373846400,"Works great. Thank you. Arrived soon, well packed. This is a nice curling iron. It has different settings but I only use one or two. It works great. Nice choice.",1.0,"List(works, great, thank, you, arrived, soon, well, packed, this, is, a, nice, curling, iron, it, has, different, settings, but, i, only, use, one, or, two, it, works, great, nice, choice)","List(works, great, thank, arrived, soon, well, packed, nice, curling, iron, different, settings, use, one, two, works, great, nice, choice)","Map(vectorType -> sparse, length -> 87097, indices -> List(3, 4, 6, 12, 20, 63, 80, 88, 294, 394, 530, 661, 729, 898, 1849, 2418), values -> List(2.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 87097, indices -> List(3, 4, 6, 12, 20, 63, 80, 88, 294, 394, 530, 661, 729, 898, 1849, 2418), values -> List(3.1432135203016, 1.6468869456094777, 1.780719679203585, 1.8421572982516146, 4.223323452415462, 2.7705157920713175, 5.82751845608153, 2.957482039054597, 3.745492571886857, 4.672901392344989, 4.346013537572844, 4.657492766992144, 5.248955906623572, 5.007202075452694, 6.040733690625144, 6.328415763076925))","Map(vectorType -> dense, length -> 2, values -> List(-3.525147445038509, 3.525147445038509))","Map(vectorType -> dense, length -> 2, values -> List(0.028605116980498644, 0.9713948830195014))",1.0
B000050B6U,5.0,"This is the first curling iron i ever used.. and i am not planning to purchase anything else.I had a problem with the Auto on/off button at the beginning since my hand kept on pushing it by mistake, but now that i know the proper way of holding it it doesn't bother me much.i use a heat protectant so i didn't notice any damage to my hair, on the contrary, my curls ended up being soft and shiny!",Loved it!,1346025600,"Loved it! This is the first curling iron i ever used.. and i am not planning to purchase anything else.I had a problem with the Auto on/off button at the beginning since my hand kept on pushing it by mistake, but now that i know the proper way of holding it it doesn't bother me much.i use a heat protectant so i didn't notice any damage to my hair, on the contrary, my curls ended up being soft and shiny!",1.0,"List(loved, it!, this, is, the, first, curling, iron, i, ever, used, and, i, am, not, planning, to, purchase, anything, else, i, had, a, problem, with, the, auto, on/off, button, at, the, beginning, since, my, hand, kept, on, pushing, it, by, mistake, but, now, that, i, know, the, proper, way, of, holding, it, it, doesn't, bother, me, much, i, use, a, heat, protectant, so, i, didn't, notice, any, damage, to, my, hair, on, the, contrary, my, curls, ended, up, being, soft, and, shiny!)","List(loved, it!, first, curling, iron, ever, used, planning, purchase, anything, else, problem, auto, on/off, button, beginning, since, hand, kept, pushing, mistake, know, proper, way, holding, bother, much, use, heat, protectant, notice, damage, hair, contrary, curls, ended, soft, shiny!)","Map(vectorType -> sparse, length -> 87097, indices -> List(1, 6, 16, 22, 31, 37, 69, 72, 74, 139, 144, 148, 177, 253, 310, 316, 342, 394, 441, 464, 473, 499, 592, 725, 729, 807, 813, 1049, 1074, 1813, 1975, 2460, 3908, 4373, 4539, 4673, 7104, 7442), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 87097, indices -> List(1, 6, 16, 22, 31, 37, 69, 72, 74, 139, 144, 148, 177, 253, 310, 316, 342, 394, 441, 464, 473, 499, 592, 725, 729, 807, 813, 1049, 1074, 1813, 1975, 2460, 3908, 4373, 4539, 4673, 7104, 7442), values -> List(2.1663618989517617, 1.780719679203585, 2.041750122506523, 2.2142154371841274, 2.3064088002472714, 2.392295206069719, 2.782168219418047, 2.8400715122496063, 2.865478488232834, 3.159208536869273, 3.210753688706839, 3.2370038288350416, 3.3844047745012253, 3.6450581745650865, 3.8082309125202807, 3.9612921489453083, 3.868896411561975, 4.672901392344989, 4.154958300810135, 4.242895175320707, 4.505966760587657, 4.304340841172276, 4.827052072172248, 4.7436704632331965, 5.248955906623572, 5.067555584348938, 5.0247079929663085, 5.164249209151409, 5.1951795098427676, 5.872741959386015, 5.9701161234111915, 6.288569854529726, 6.981717035089671, 7.248779820338716, 7.402930500165975, 7.402930500165975, 7.9625462881013975, 8.09607768072592))","Map(vectorType -> dense, length -> 2, values -> List(0.5909814928559289, -0.5909814928559289))","Map(vectorType -> dense, length -> 2, values -> List(0.6435903140292402, 0.35640968597075984))",0.0
B000052WY7,2.0,The pencil part is so hard I cannot line the rim of my eyes at all and it's also difficult to use it on my eyelids. It's fine for the eyebrow if that's all you want it for.,almost unusuable,1387670400,almost unusuable The pencil part is so hard I cannot line the rim of my eyes at all and it's also difficult to use it on my eyelids. It's fine for the eyebrow if that's all you want it for.,0.0,"List(almost, unusuable, the, pencil, part, is, so, hard, i, cannot, line, the, rim, of, my, eyes, at, all, and, it's, also, difficult, to, use, it, on, my, eyelids, it's, fine, for, the, eyebrow, if, that's, all, you, want, it, for)","List(almost, unusuable, pencil, part, hard, line, rim, eyes, also, difficult, use, eyelids, fine, eyebrow, want)","Map(vectorType -> sparse, length -> 87097, indices -> List(6, 18, 75, 109, 131, 166, 198, 242, 286, 490, 1348, 2392, 2499, 5874), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 87097, indices -> List(6, 18, 75, 109, 131, 166, 198, 242, 286, 490, 1348, 2392, 2499, 5874), values -> List(1.780719679203585, 2.0655068958811857, 2.8455509780142316, 3.0603627022326405, 3.167100154424713, 3.305812011061149, 3.7888002493915662, 3.656231475163212, 3.8333978036846044, 4.3054155323020415, 5.7840138437773305, 6.413212299737002, 6.48663976829182, 7.739402736787188))","Map(vectorType -> dense, length -> 2, values -> List(0.9225617263479777, -0.9225617263479777))","Map(vectorType -> dense, length -> 2, values -> List(0.7155637874329777, 0.28443621256702234))",0.0


In [0]:
#Model Logistic Regrssion with Bigrams and Hashing TF with IDF vectorization

#Create a pipeline by combining all the functions we defined above - tokenizer , stopwords_remover, cv, idf, gbtc

#Tokenize the sentence based on the regex pattern 
tokenizer = RegexTokenizer(inputCol="text",outputCol="reviewTokensUf",pattern="\\s+|[,.()\"]")


bigram = NGram(inputCol = "reviewTokensUf", outputCol = "bigrams", n = 2)


tfs  = HashingTF(inputCol="bigrams", outputCol="h_features")

#IDF model
idf = IDF(inputCol="h_features",outputCol="features")

lr = LogisticRegression(maxIter=20)

steps =  [tokenizer, stopwords_remover, bigram, tfs, idf, lr]
bigrams_pipeline = Pipeline(stages=steps)

model = bigrams_pipeline.fit(trainingData)
bi_predictions = model.transform(testData)

In [0]:
evaluator = BinaryClassificationEvaluator()  
areaUnderROC = evaluator.evaluate(bi_predictions)
print('Test Area Under ROC for Bigrams linear regression', areaUnderROC)

Test Area Under ROC for Bigrams linear regression 0.9467960427315577


In [0]:
#model evaluation
lp = bi_predictions.select("label", "prediction")
counttotal = bi_predictions.count()
correct = lp.filter(F.col("label")== F.col("prediction")).count()
wrong = lp.filter(~(F.col("label") == F.col("prediction"))).count()
ratioWrong = float(wrong) / float(counttotal)
ratioCorrect=correct/counttotal



trueneg =( lp.filter(F.col("label") == 0.0).filter(F.col("label") == F.col("prediction")).count()) /counttotal
truepos = (lp.filter(F.col("label") == 1.0).filter(F.col("label") == F.col("prediction")).count())/counttotal
falseneg = (lp.filter(F.col("label") == 0.0).filter(~(F.col("label") == F.col("prediction"))).count())/counttotal
falsepos = (lp.filter(F.col("label") == 1.0).filter(~(F.col("label") == F.col("prediction"))).count())/counttotal

precision = truepos / (truepos + falsepos)
recall  = truepos / (truepos + falseneg)
#fmeasure= 2  precision  recall / (precision + recall)
accuracy=(truepos + trueneg) / (truepos + trueneg + falsepos + falseneg)

print('counttotal   :', counttotal     )
print('correct      :', correct        )
print('wrong        :', wrong          )
print('ratioWrong   :', ratioWrong     )
print('ratioCorrect :', ratioCorrect   )
print('truen        :', trueneg          )
print('truep        :', truepos          )
print('falsen       :', falseneg         )
print('falsep       :', falsepos         )
print('precision    :', precision      )
print('recall       :', recall         )
#print('fmeasure     :', fmeasure       )
print('accuracy     :', accuracy       )

counttotal   : 17244
correct      : 15314
wrong        : 1930
ratioWrong   : 0.11192298770586871
ratioCorrect : 0.8880770122941313
truen        : 0.5028995592669914
truep        : 0.38517745302713985
falsen       : 0.06692182788216192
falsep       : 0.045001159823706796
precision    : 0.8953895928821785
recall       : 0.8519753719856337
accuracy     : 0.8880770122941313


In [0]:
# BiGRams created after the transformation
bi_predictions.select(F.col("bigrams")).show(5)

+--------------------+
|             bigrams|
+--------------------+
|[nothing did, did...|
|[industrial stren...|
|[dries out, out m...|
|[not impressed, i...|
|[love this, this ...|
+--------------------+
only showing top 5 rows



d
RandomForest Implementation

In [0]:
#RandomForest Implementation 
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")

#Remove Stop Words that do not contribute in any way to our analysis 
stopwords_remover_rf = \
StopWordsRemover(stopWords=StopWordsRemover.loadDefaultStopWords("english"),inputCol="words",outputCol="filtered")

# bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)


label_stringIdx = StringIndexer(inputCol = "label", outputCol = "new_label")

rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="new_label", seed=42, \
                            leafCol="leafId")

rf = RandomForestClassifier(labelCol="new_label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)

steps =  [regexTokenizer, stopwords_remover_rf, countVectors, label_stringIdx, rf]
randomforests_pipeline = Pipeline(stages=steps)

odel = randomforests_pipeline.fit(trainingData)
rf_predictions = model.transform(testData)

In [0]:
rf_predictions.printSchema()
rf_predictions.show(5)

root
 |-- asin: string (nullable = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)
 |-- text: string (nullable = true)
 |-- label: double (nullable = true)
 |-- reviewTokensUf: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- reviewTokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- bigrams: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- h_features: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)

+----------+-------+--------------------+--------------------+--------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------

In [0]:
evaluator = BinaryClassificationEvaluator()  
areaUnderROC = evaluator.evaluate(rf_predictions)
print('Test Area Under ROC for Random Forests Classification is ', areaUnderROC)

Test Area Under ROC for Random Forests Classification is  0.8941074905567711


In [0]:
y_pred=rf_predictions.select("prediction").collect()
y_orig=rf_predictions.select("label").collect()

cf = classification_report(y_pred, y_orig)
print(cf)


              precision    recall  f1-score   support

         0.0       0.88      0.92      0.90      9448
         1.0       0.90      0.85      0.87      7796

    accuracy                           0.89     17244
   macro avg       0.89      0.88      0.89     17244
weighted avg       0.89      0.89      0.89     17244



In [0]:
#model evaluation
lp = rf_predictions.select("label", "prediction")
counttotal = rf_predictions.count()
correct = lp.filter(F.col("label")== F.col("prediction")).count()
wrong = lp.filter(~(F.col("label") == F.col("prediction"))).count()
ratioWrong = float(wrong) / float(counttotal)
ratioCorrect=correct/counttotal



trueneg =( lp.filter(F.col("label") == 0.0).filter(F.col("label") == F.col("prediction")).count()) /counttotal
truepos = (lp.filter(F.col("label") == 1.0).filter(F.col("label") == F.col("prediction")).count())/counttotal
falseneg = (lp.filter(F.col("label") == 0.0).filter(~(F.col("label") == F.col("prediction"))).count())/counttotal
falsepos = (lp.filter(F.col("label") == 1.0).filter(~(F.col("label") == F.col("prediction"))).count())/counttotal

precision = truepos / (truepos + falsepos)
recall  = truepos / (truepos + falseneg)
#fmeasure= 2  precision  recall / (precision + recall)
accuracy=(truepos + trueneg) / (truepos + trueneg + falsepos + falseneg)

print('counttotal   :', counttotal     )
print('correct      :', correct        )
print('wrong        :', wrong          )
print('ratioWrong   :', ratioWrong     )
print('ratioCorrect :', ratioCorrect   )
print('truen        :', trueneg          )
print('truep        :', truepos          )
print('falsen       :', falseneg         )
print('falsep       :', falsepos         )
print('precision    :', precision      )
print('recall       :', recall         )
#print('fmeasure     :', fmeasure       )
print('accuracy     :', accuracy       )

counttotal   : 17244
correct      : 10028
wrong        : 7216
ratioWrong   : 0.4184643934122013
ratioCorrect : 0.5815356065877987
truen        : 0.5695314312224542
truep        : 0.012004175365344467
falsen       : 0.00028995592669914174
falsep       : 0.4181744374855022
precision    : 0.027905095713130223
recall       : 0.9764150943396226
accuracy     : 0.5815356065877987


Naive Bayes

In [0]:
nb = NaiveBayes(smoothing=1)
steps =  [regexTokenizer, stopwords_remover_rf, countVectors, label_stringIdx, nb]

nb_pipeline = Pipeline(stages=steps)
model = nb_pipeline.fit(trainingData)
nb_predictions = model.transform(testData)

In [0]:
evaluator = BinaryClassificationEvaluator()  
areaUnderROC = evaluator.evaluate(nb_predictions)
print('Test Area Under ROC for Naive Bayes Classification is ', areaUnderROC)

Test Area Under ROC for Naive Beyes Classification is  0.5399622753791408


In [0]:
#model evaluation
lp = nb_predictions.select("label", "prediction")
counttotal = nb_predictions.count()
correct = lp.filter(F.col("label")== F.col("prediction")).count()
wrong = lp.filter(~(F.col("label") == F.col("prediction"))).count()
ratioWrong = float(wrong) / float(counttotal)
ratioCorrect=correct/counttotal



trueneg =( lp.filter(F.col("label") == 0.0).filter(F.col("label") == F.col("prediction")).count()) /counttotal
truepos = (lp.filter(F.col("label") == 1.0).filter(F.col("label") == F.col("prediction")).count())/counttotal
falseneg = (lp.filter(F.col("label") == 0.0).filter(~(F.col("label") == F.col("prediction"))).count())/counttotal
falsepos = (lp.filter(F.col("label") == 1.0).filter(~(F.col("label") == F.col("prediction"))).count())/counttotal

precision = truepos / (truepos + falsepos)
recall  = truepos / (truepos + falseneg)
#fmeasure= 2  precision  recall / (precision + recall)
accuracy=(truepos + trueneg) / (truepos + trueneg + falsepos + falseneg)

print('counttotal   :', counttotal     )
print('correct      :', correct        )
print('wrong        :', wrong          )
print('ratioWrong   :', ratioWrong     )
print('ratioCorrect :', ratioCorrect   )
print('truen        :', trueneg          )
print('truep        :', truepos          )
print('falsen       :', falseneg         )
print('falsep       :', falsepos         )
print('precision    :', precision      )
print('recall       :', recall         )
#print('fmeasure     :', fmeasure       )
print('accuracy     :', accuracy       )

counttotal   : 17244
correct      : 15240
wrong        : 2004
ratioWrong   : 0.116214335421016
ratioCorrect : 0.883785664578984
truen        : 0.5045813036418464
truep        : 0.37920436093713755
falsen       : 0.0652400835073069
falsep       : 0.050974251913709114
precision    : 0.881504448638447
recall       : 0.8532098121085595
accuracy     : 0.883785664578984


In [0]:
#GBTClassifier : Gradiennt Boosted Trees

#Tokenize the sentence based on the regex pattern 
tokenizer = RegexTokenizer(inputCol="text",outputCol="reviewTokensUf",pattern="\\s+|[,.()\"]")


#Remove Stop Words that do not contribute in any way to our analysis 
stopwords_remover = StopWordsRemover(stopWords=StopWordsRemover.loadDefaultStopWords("english"),inputCol="reviewTokensUf",outputCol="reviewTokens")

#converts word documents to vectors of token counts
cv = CountVectorizer(inputCol="reviewTokens",outputCol="cv",vocabSize=296337)



steps =  [tokenizer, stopwords_remover, cv]
Pipeline_mo = Pipeline(stages=steps)
transformed_model = Pipeline_mo.fit(trainingData)
train_tr = transformed_model.transform(trainingData)
test_tr = transformed_model.transform(testData)


In [0]:
train_tr.printSchema()

root
 |-- asin: string (nullable = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)
 |-- text: string (nullable = true)
 |-- label: double (nullable = true)
 |-- reviewTokensUf: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- reviewTokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- cv: vector (nullable = true)



In [0]:
train_tr.show(5)

+----------+-------+--------------------+--------------------+--------------+--------------------+-----+--------------------+--------------------+--------------------+
|      asin|overall|          reviewText|             summary|unixReviewTime|                text|label|      reviewTokensUf|        reviewTokens|                  cv|
+----------+-------+--------------------+--------------------+--------------+--------------------+-----+--------------------+--------------------+--------------------+
|7806397051|    1.0|PLEASE DONT DO IT...|              smh!!!|    1376611200|smh!!! PLEASE DON...|  0.0|[smh!!!, please, ...|[smh!!!, please, ...|(87097,[0,4,5,9,4...|
|7806397051|    1.0|Very oily and cre...|Don't waste your ...|    1391040000|Don't waste your ...|  0.0|[don't, waste, yo...|[waste, money, oi...|(87097,[43,55,67,...|
|7806397051|    2.0|Chalky,Not Pigmen...|Chalky, Not Pigme...|    1378252800|Chalky, Not Pigme...|  0.0|[chalky, not, pig...|[chalky, pigmente...|(87097,[46,103

In [0]:
# training  
gbtc = GBTClassifier(featuresCol="cv", labelCol="label", maxIter=20)
gbtc = gbtc.fit(train_tr)

# prediction
pred = gbtc.transform(test_tr)
pred.show(3) 

+----------+-------+--------------------+--------------------+--------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|      asin|overall|          reviewText|             summary|unixReviewTime|                text|label|      reviewTokensUf|        reviewTokens|                  cv|       rawPrediction|         probability|prediction|
+----------+-------+--------------------+--------------------+--------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|9759091062|    1.0|Did nothing for m...|             Nothing|    1392681600|Nothing Did nothi...|  0.0|[nothing, did, no...|[nothing, nothing...|(87097,[8,22,122,...|[0.59522853597419...|[0.76682280121619...|       0.0|
|B00004TMFE|    2.0|I was so very dis...|Industrial streng...|    1401840000|Industrial streng...|  0.0|[industrial,

In [0]:
print(gbtc)

GBTClassificationModel: uid = GBTClassifier_b7bb4030176d, numTrees=20, numClasses=2, numFeatures=87097


In [0]:
evaluator = BinaryClassificationEvaluator()  
areaUnderROC = evaluator.evaluate(pred)
print('Test Area Under ROC for Gradient Boost Classification is ', areaUnderROC)

Test Area Under ROC for Naive Bayes Classification is  0.8692959846983227


In [0]:
pred.show(5)

+----------+-------+--------------------+--------------------+--------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|      asin|overall|          reviewText|             summary|unixReviewTime|                text|label|      reviewTokensUf|        reviewTokens|                  cv|       rawPrediction|         probability|prediction|
+----------+-------+--------------------+--------------------+--------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|9759091062|    1.0|Did nothing for m...|             Nothing|    1392681600|Nothing Did nothi...|  0.0|[nothing, did, no...|[nothing, nothing...|(87097,[8,22,122,...|[0.59522853597419...|[0.76682280121619...|       0.0|
|B00004TMFE|    2.0|I was so very dis...|Industrial streng...|    1401840000|Industrial streng...|  0.0|[industrial,

In [0]:
from sklearn.metrics import confusion_matrix, classification_report

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator=MulticlassClassificationEvaluator(metricName="accuracy")
#e
acc = evaluator.evaluate(pred)
 
print("Prediction Accuracy: ", acc)

y_pred=pred.select("prediction").collect()
y_orig=pred.select("label").collect()

cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:")
print(cm) 



Prediction Accuracy:  0.7811992577128276
Confusion Matrix:
[[8332 1494]
 [2279 5139]]


In [0]:
from sklearn.metrics import confusion_matrix, classification_report

cf = classification_report(y_orig, y_pred)
print(cf)

              precision    recall  f1-score   support

         0.0       0.79      0.85      0.82      9826
         1.0       0.77      0.69      0.73      7418

    accuracy                           0.78     17244
   macro avg       0.78      0.77      0.77     17244
weighted avg       0.78      0.78      0.78     17244

