# Sentiment analysis with naive Bayes on Spark

In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import split
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from sklearn.model_selection import train_test_split

spark=SparkSession\
    .builder\
    .master('spark://0.0.0.0:7077')\
    .appName('NaiveBayesClassifier')\
    .getOrCreate()

25/03/15 03:45:53 WARN Utils: Your hostname, codespaces-a85663 resolves to a loopback address: 127.0.0.1; using 10.0.1.238 instead (on interface eth0)
25/03/15 03:45:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/15 03:45:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## 1. Load data

In [2]:
data_df=pd.read_csv('https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv')
data_df.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


## 2. Train-test split

In [3]:
training_df, testing_df=train_test_split(
    data_df,
    test_size=0.3,
    random_state=315
)

## 3. Convert to Spark dataframe

In [4]:
training_sdf=spark.createDataFrame(data_df[['review', 'polarity']])
training_sdf=training_sdf.withColumn('review_array', split(training_sdf['review'], ' '))

testing_sdf=spark.createDataFrame(data_df[['review', 'polarity']])
testing_sdf=testing_sdf.withColumn('review_array', split(testing_sdf['review'], ' '))

training_sdf.show()

                                                                                

+--------------------+--------+--------------------+
|              review|polarity|        review_array|
+--------------------+--------+--------------------+
| privacy at least...|       0|[, privacy, at, l...|
| messenger issues...|       0|[, messenger, iss...|
| profile any time...|       0|[, profile, any, ...|
| the new features...|       0|[, the, new, feat...|
| forced reload on...|       0|[, forced, reload...|
| idk i can't edit...|       0|[, idk, i, can't,...|
| major flaws cons...|       0|[, major, flaws, ...|
| video issues sin...|       0|[, video, issues,...|
|  this update com...|       0|[, , this, update...|
| posting issues f...|       0|[, posting, issue...|
| what the heck?! ...|       0|[, what, the, hec...|
| its suck. any ti...|       0|[, its, suck., an...|
| connection issue...|       0|[, connection, is...|
| external links a...|       0|[, external, link...|
| fix this please....|       0|[, fix, this, ple...|
| buggy when i edi...|       0|[, buggy, when,

## 4. Create and fit spark pipeline

In [5]:
vectorizer=CountVectorizer(inputCol='review_array', outputCol='features')
naive_bayes=NaiveBayes(smoothing=1.0, modelType='multinomial')
naive_bayes.setFeaturesCol('features')
naive_bayes.setLabelCol('polarity')
pipeline=Pipeline(stages=[vectorizer, naive_bayes])

model=pipeline.fit(training_sdf)

                                                                                

## 5. Evaluate accuracy on test set

In [6]:
predictions=model.transform(testing_sdf)

evaluator=MulticlassClassificationEvaluator(
    labelCol='polarity',
    predictionCol='prediction',
    metricName='accuracy'
)

accuracy=evaluator.evaluate(predictions)
print(f'Test set accuracy: {accuracy*100:.1f}%')

[Stage 8:>                                                          (0 + 2) / 2]

Test set accuracy: 96.4%


                                                                                

## 6. End SparkSession

In [7]:
spark.stop()