# 1. Loading Libraries and create spark session

In [37]:
#importation des modules
import time
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover

#create Spark session
appName = "Sentiment Analysis in Spark"
spark = SparkSession \
    .builder \
    .appName(appName) \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()


### Read data file into Spark dataFrame

In [2]:
#Specifiez votre path
df= spark.read.csv('./tripadvisor_hotel_reviews.csv', inferSchema=True, header=True)

AnalysisException: Path does not exist: file:/C:/Jupyter/tripadvisor_hotel_reviews.csv

## Basic visualization of dataset

In [39]:
df.count()

20491

In [40]:
df.printSchema()

root
 |-- Review: string (nullable = true)
 |-- Rating: integer (nullable = true)



In [41]:
df.summary().show()

+-------+--------------------+------------------+
|summary|              Review|            Rating|
+-------+--------------------+------------------+
|  count|               20491|             20491|
|   mean|                null| 3.952222927138744|
| stddev|                null|1.2330297776950543|
|    min|1 best hotels new...|                 1|
|    25%|                null|                 3|
|    50%|                null|                 4|
|    75%|                null|                 5|
|    max|zero hip mid-bost...|                 5|
+-------+--------------------+------------------+



In [42]:
df.show()

+--------------------+------+
|              Review|Rating|
+--------------------+------+
|nice hotel expens...|     4|
|ok nothing specia...|     2|
|nice rooms not 4*...|     3|
|unique, great sta...|     5|
|great stay great ...|     5|
|love monaco staff...|     5|
|cozy stay rainy c...|     5|
|excellent staff, ...|     4|
|hotel stayed hote...|     5|
|excellent stayed ...|     5|
|poor value stayed...|     2|
|nice value seattl...|     4|
|nice hotel good l...|     4|
|nice hotel not ni...|     3|
|great hotel night...|     4|
|horrible customer...|     1|
|disappointed say ...|     2|
|fantastic stay mo...|     5|
|good choice hotel...|     5|
|hmmmmm say really...|     3|
+--------------------+------+
only showing top 20 rows



In [43]:
from pyspark.sql.functions import *

#si Rating = 5,4,3 le rendre dans la table égale a 1
#si Rating = 0,1,2 le rendre dans la table égale a 0

df = df.withColumn('Rating', regexp_replace('Rating', '0', '0'))
df = df.withColumn('Rating', regexp_replace('Rating', '1', '0'))
df = df.withColumn('Rating', regexp_replace('Rating', '2', '0'))

df = df.withColumn('Rating', regexp_replace('Rating', '5', '1'))
df = df.withColumn('Rating', regexp_replace('Rating', '4', '1'))
df = df.withColumn('Rating', regexp_replace('Rating', '3', '1'))

In [44]:
df.show()

+--------------------+------+
|              Review|Rating|
+--------------------+------+
|nice hotel expens...|     1|
|ok nothing specia...|     0|
|nice rooms not 4*...|     1|
|unique, great sta...|     1|
|great stay great ...|     1|
|love monaco staff...|     1|
|cozy stay rainy c...|     1|
|excellent staff, ...|     1|
|hotel stayed hote...|     1|
|excellent stayed ...|     1|
|poor value stayed...|     0|
|nice value seattl...|     1|
|nice hotel good l...|     1|
|nice hotel not ni...|     1|
|great hotel night...|     1|
|horrible customer...|     0|
|disappointed say ...|     0|
|fantastic stay mo...|     1|
|good choice hotel...|     1|
|hmmmmm say really...|     1|
+--------------------+------+
only showing top 20 rows



## Select the related data

In [45]:
data = df.select("Review", col("Rating").cast("Int").alias("label"))
data.show()

+--------------------+-----+
|              Review|label|
+--------------------+-----+
|nice hotel expens...|    1|
|ok nothing specia...|    0|
|nice rooms not 4*...|    1|
|unique, great sta...|    1|
|great stay great ...|    1|
|love monaco staff...|    1|
|cozy stay rainy c...|    1|
|excellent staff, ...|    1|
|hotel stayed hote...|    1|
|excellent stayed ...|    1|
|poor value stayed...|    0|
|nice value seattl...|    1|
|nice hotel good l...|    1|
|nice hotel not ni...|    1|
|great hotel night...|    1|
|horrible customer...|    0|
|disappointed say ...|    0|
|fantastic stay mo...|    1|
|good choice hotel...|    1|
|hmmmmm say really...|    1|
+--------------------+-----+
only showing top 20 rows



### Removing rows with missing values

In [46]:
data.na.drop()

DataFrame[Review: string, label: int]

## Divide data into training and testing data¶

In [47]:
#diviser le dataset en 70% trainingdata et 30% testdata
dividedData = data.randomSplit([0.7, 0.3]) 
trainingData = dividedData[0] 
testingData = dividedData[1] 
train_rows = trainingData.count()
test_rows = testingData.count()
print ("Nombre de TRAINING DATA", train_rows, "; Nombre de TEST DATA", test_rows)

Nombre de TRAINING DATA 14245 ; Nombre de TEST DATA 6246


# 2. Data Cleaning and Text Preprocessing

In [48]:
from pyspark.ml.feature import IDF
from pyspark.ml import Pipeline


In [49]:
#Separattion en utilisant tokenizer

tokenizer = Tokenizer(inputCol="Review", outputCol="Words")

In [50]:
#suppression des stop words(anglais) 
#MeaningfulWords pour dire qu'on prend que les mots significatifs
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="MeaningfulWords") 

In [51]:
#Convertion des mots en numéro

hashTF = HashingTF(inputCol=swr.getOutputCol(), outputCol="rawFeatures", numFeatures=300)

In [52]:
#IDF (Inverse Document Frequency)
idf = IDF(inputCol="rawFeatures", outputCol="features")

In [53]:
#creation du piepline
pipeline = Pipeline(stages=[tokenizer,swr, hashTF,idf])
model = pipeline.fit(trainingData)

In [54]:
#transformation sur dataframe
data=model.transform(trainingData)
data.show()

+--------------------+-----+--------------------+--------------------+--------------------+--------------------+
|              Review|label|               Words|     MeaningfulWords|         rawFeatures|            features|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+
|1 spot location f...|    1|[1, spot, locatio...|[1, spot, locatio...|(300,[6,7,11,13,1...|(300,[6,7,11,13,1...|
|1 star best locat...|    0|[1, star, best, l...|[1, star, best, l...|(300,[0,4,16,27,3...|(300,[0,4,16,27,3...|
|10 points, 10 poi...|    1|[10, points,, 10,...|[10, points,, 10,...|(300,[16,44,54,58...|(300,[16,44,54,58...|
|10/10 impressed, ...|    1|[10/10, impressed...|[10/10, impressed...|(300,[0,1,4,9,16,...|(300,[0,1,4,9,16,...|
|100 perfect husba...|    1|[100, perfect, hu...|[100, perfect, hu...|(300,[0,3,4,6,8,1...|(300,[0,3,4,6,8,1...|
|100 stays 18 mont...|    1|[100, stays, 18, ...|[100, stays, 18, ...|(300,[0,4,6,27,33...|(300,

In [55]:
data=data.drop("Words").drop("rawFeatures")

In [56]:
#fd = data.select("label","features").rdd.map(parsePoint)
#fd.cache()

# 1. SVM Model

### Train the model using training data

In [57]:
from pyspark.ml.classification import LinearSVC
lsvc = LinearSVC(maxIter=10, regParam=0.1)

# Fit the model
t0 = time.time()
lsvcModel = lsvc.fit(data)
t1 = time.time()

print("Fin du training")
#https://spark.apache.org/docs/latest/ml-classification-regression.html

Fin du training


### Prepare testing data

In [58]:
model = pipeline.fit(testingData)
test=model.transform(testingData).drop("Words").drop("rawFeatures")

### Predict testing data

In [59]:
t2 = time.time()
predictions= lsvcModel.transform(test)
t3 = time.time()


In [60]:
predictions.show()

+--------------------+-----+--------------------+--------------------+--------------------+----------+
|              Review|label|     MeaningfulWords|            features|       rawPrediction|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+----------+
|100 star rating, ...|    1|[100, star, ratin...|(300,[0,4,13,16,2...|[-1.1260419750497...|       1.0|
|15 group lovely t...|    1|[15, group, lovel...|(300,[4,11,58,66,...|[-0.7300255947919...|       1.0|
|1925 elegance opu...|    1|[1925, elegance, ...|(300,[0,13,16,33,...|[-0.7964480115357...|       1.0|
|2 star hotel 4 st...|    0|[2, star, hotel, ...|(300,[19,24,30,70...|[-0.4298052551710...|       1.0|
|3 stays feel comp...|    1|[3, stays, feel, ...|(300,[0,1,2,4,6,1...|[-2.7945766636415...|       1.0|
|3rd time stayed v...|    1|[3rd, time, staye...|(300,[4,17,20,28,...|[-0.9033231195475...|       1.0|
|5 star service gr...|    1|[5, star, service...|(300,[0,4,7,12,13...|[-3

#### execution time

In [61]:
time_linear_train = t1-t0
time_linear_predict = t3-t2

print("Temps de training: %fs; Temps de prediction: %fs" % (time_linear_train, time_linear_predict))

Temps de training: 7.156293s; Temps de prediction: 0.063727s


#### calculate the accuracy model

In [62]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Precision = " + str(accuracy))

Precision = 0.856227985910983


# 1. NaiveBayes Model

### Train the model using training data

In [63]:
from pyspark.mllib.classification import NaiveBayesModel
from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes()
#utilisation du naive bayes sur training data après prétraitement

t0 = time.time()
NBmodel = nb.fit(data)
t1 = time.time()

print("Fin du training")
#https://spark.apache.org/docs/latest/ml-classification-regression.html

Fin du training


### Predict testing data

In [64]:
t2 = time.time()
NB_predictions = NBmodel.transform(test)
t3 = time.time()

In [65]:
NB_predictions.show()

+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|              Review|label|     MeaningfulWords|            features|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|100 star rating, ...|    1|[100, star, ratin...|(300,[0,4,13,16,2...|[-455.96914057742...|[2.28367424401234...|       1.0|
|15 group lovely t...|    1|[15, group, lovel...|(300,[4,11,58,66,...|[-100.87373408807...|[0.02286460027457...|       1.0|
|1925 elegance opu...|    1|[1925, elegance, ...|(300,[0,13,16,33,...|[-338.80073009137...|[0.01762810776677...|       1.0|
|2 star hotel 4 st...|    0|[2, star, hotel, ...|(300,[19,24,30,70...|[-232.02906507727...|[0.60556180812210...|       0.0|
|3 stays feel comp...|    1|[3, stays, feel, ...|(300,[0,1,2,4,6,1...|[-785.69616366993...|[5.78070194370217...|       1.0|
|3rd tim

#### execution time

In [66]:
time_linear_train = t1-t0
time_linear_predict = t3-t2

print("Fin du training %fs; Temps de prediction: %fs" % (time_linear_train, time_linear_predict))

Fin du training 3.857038s; Temps de prediction: 0.075362s


#### calculate the accuracy model

In [67]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(NB_predictions)
print("Precision = " + str(accuracy))

Precision = 0.8603906500160102


# 1. RANDOM FOREST Model

### Train the model using training data

In [68]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')

t0 = time.time()
rfModel = rf.fit(data)
t1 = time.time()

print("training is done")
#https://spark.apache.org/docs/latest/ml-classification-regression.html

training is done


### Predict testing data

In [69]:
t2 = time.time()
RF_predictions = rfModel.transform(test)
t3 = time.time()

In [70]:
RF_predictions.show()

+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|              Review|label|     MeaningfulWords|            features|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|100 star rating, ...|    1|[100, star, ratin...|(300,[0,4,13,16,2...|[3.12752129263693...|[0.15637606463184...|       1.0|
|15 group lovely t...|    1|[15, group, lovel...|(300,[4,11,58,66,...|[2.60960770343154...|[0.13048038517157...|       1.0|
|1925 elegance opu...|    1|[1925, elegance, ...|(300,[0,13,16,33,...|[3.06494339439460...|[0.15324716971973...|       1.0|
|2 star hotel 4 st...|    0|[2, star, hotel, ...|(300,[19,24,30,70...|[3.38375617359375...|[0.16918780867968...|       1.0|
|3 stays feel comp...|    1|[3, stays, feel, ...|(300,[0,1,2,4,6,1...|[2.08256511352616...|[0.10412825567630...|       1.0|
|3rd tim

#### execution time

In [71]:
time_linear_train = t1-t0
time_linear_predict = t3-t2

print("Temps de training: %fs; Temps de prediction: %fs" % (time_linear_train, time_linear_predict))

Temps de training: 11.625375s; Temps de prediction: 0.166021s


#### calculate the accuracy model

In [72]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(RF_predictions)
print("Precision = " + str(accuracy))

Precision = 0.8395773294908742
