## packages

In [1]:
!pip install findspark
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl siz

## Setup

In [2]:
from sklearn.decomposition import SparsePCA
import seaborn as sns
import pandas as pd

In [3]:
# Default figure size (width, height) applied to every plot via seaborn's rc settings.
FIGURE_SIZE = (11.7, 8.27)
sns.set(rc={"figure.figsize": FIGURE_SIZE})

In [4]:
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [5]:
from pyspark.ml.classification import LinearSVC, LinearSVCModel

In [6]:
import re

import findspark
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import pyspark.sql.functions as sqlf
from pyspark.ml.feature import IDFModel, IDF, PCA, VectorAssembler
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.linalg import VectorUDT
from pyspark.sql import SparkSession
from pyspark.sql.column import _to_java_column, _to_seq, Column
from pyspark.sql.types import StructType, StringType, IntegerType, LongType, FloatType, BooleanType, ArrayType

In [7]:
from google.colab import drive

In [8]:
# Mount Google Drive at /content/drive/ so the project folder is reachable from the Colab VM.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [9]:
import os

In [10]:
# Show the current working directory (the bare expression is rendered as the cell output).
os.getcwd()

'/content'

In [11]:
# Absolute path keeps this cell re-runnable: a relative chdir only works while
# the working directory is still /content and fails on a second execution.
os.chdir("/content/drive/MyDrive/SteamReviews2021Project")

In [12]:
# List the entries of the current working directory (the project folder after the chdir).
os.listdir()

['data',
 'BDA_Project_Preprocessing.ipynb',
 'models',
 'sanitize_original_dataset.ipynb',
 'Plotting_vectorization.ipynb',
 'SVM_pipelines.ipynb',
 'Metrics&Results_pipeline.ipynb',
 'Preprocessing+tokenization.ipynb',
 'Model_test_pipeline.ipynb',
 'BayesBDA.ipynb']

## constants

In [13]:
# Schema of the full Steam-reviews table, one field per column.
# Every field is declared nullable (third argument True) despite the name.
# NOTE(review): no other cell visible in this notebook references this schema.
non_null_schema = (
    StructType()
    .add("#", IntegerType(), True)
    .add("app_id", IntegerType(), True)
    .add("app_name", StringType(), True)
    .add("review_id", IntegerType(), True)
    .add("language", StringType(), True)
    .add("review", StringType(), True)
    .add("timestamp_created", IntegerType(), True)
    .add("timestamp_updated", IntegerType(), True)
    .add("recommended", BooleanType(), True)
    .add("votes_helpful", IntegerType(), True)
    .add("votes_funny", IntegerType(), True)
    .add("weighted_vote_score", FloatType(), True)
    .add("comment_count", IntegerType(), True)
    .add("steam_purchase", BooleanType(), True)
    .add("received_for_free", BooleanType(), True)
    .add("written_during_early_access", BooleanType(), True)
    # Steam account ids are 64-bit (SteamID64); a 32-bit IntegerType cannot
    # represent them, so the column is declared as LongType.
    .add("author_steamid", LongType(), True)
    .add("author_num_games_owned", IntegerType(), True)
    .add("author_num_reviews", IntegerType(), True)
    .add("author_playtime_forever", FloatType(), True)
    .add("author_playtime_last_two_weeks", FloatType(), True)
    .add("author_playtime_at_review", FloatType(), True)
    .add("author_last_played", IntegerType(), True)
)

In [14]:
# Column-name constants, listed in the same order as the fields of the
# full review schema.

# --- row / application identity ---
INDEX = "#"
APP_ID = "app_id"
APP_NAME = "app_name"

# --- the review itself ---
REVIEW_ID = "review_id"
LANGUAGE = "language"
REVIEW = "review"
TIMESTAMP_CREATED = "timestamp_created"
TIMESTAMP_UPDATED = "timestamp_updated"
RECOMMENDED = "recommended"

# --- community feedback on the review ---
VOTES_HELPFUL = "votes_helpful"
VOTES_FUNNY = "votes_funny"
WEIGHTED_VOTE_SCORE = "weighted_vote_score"
COMMENT_COUNT = "comment_count"

# --- purchase / access flags ---
STEAM_PURCHASE = "steam_purchase"
RECEIVED_FOR_FREE = "received_for_free"
WRITTEN_DURING_EARLY_ACCESS = "written_during_early_access"

# --- review author ---
AUTHOR_STEAMID = "author_steamid"
AUTHOR_NUM_GAMES_OWNED = "author_num_games_owned"
AUTHOR_NUM_REVIEWS = "author_num_reviews"
AUTHOR_PLAYTIME_FOREVER = "author_playtime_forever"
AUTHOR_PLAYTIME_LAST_TWO_WEEKS = "author_playtime_last_two_weeks"
AUTHOR_PLAYTIME_AT_REVIEW = "author_playtime_at_review"
AUTHOR_LAST_PLAYED = "author_last_played"

In [15]:
# Column name "aux" — presumably for a temporary helper column.
# NOTE(review): not referenced by any other cell visible in this notebook.
AUX_COL = "aux"

In [16]:
# Layout of the pre-vectorized datasets: integer review id, integer label,
# and the review text already encoded as an ML vector. All fields nullable.
vectorization_schema = (
    StructType()
    .add(REVIEW_ID, IntegerType(), True)
    .add(RECOMMENDED, IntegerType(), True)
    .add(REVIEW, VectorUDT(), True)
)

## vectorization

In [17]:
findspark.init()
# "local[*]" runs one worker thread per available core; plain "local" uses a
# single thread. In local mode tasks execute inside the driver JVM, so the heap
# is sized through spark.driver.memory (effective only when the JVM has not
# been started yet, i.e. on a fresh kernel). The spark.executor.* settings are
# kept for parity with a cluster deployment.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Plotting vectorizations")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.cores", "8")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

In [18]:
# Path (relative to the working directory) of the IDF-vectorized dataset in JSON format.
IDF_DATASET_PATH = "data/idf_tokenization/idf_dataset.json"

In [19]:
# Load the IDF-vectorized reviews as UTF-8 JSON using the explicit schema.
df_idf = spark.read.json(
    IDF_DATASET_PATH,
    schema=vectorization_schema,
    encoding="utf-8",
)

In [20]:
# Path (relative to the working directory) of the bag-of-words ("bow") vectorized dataset in JSON format.
CV_DATASET_PATH = "data/bow_tokenization/bow_dataset.json"

In [21]:
# Load the bag-of-words vectorized reviews as UTF-8 JSON using the explicit schema.
df_cv = spark.read.json(
    CV_DATASET_PATH,
    schema=vectorization_schema,
    encoding="utf-8",
)

In [22]:
# Preview the first five rows with the sparse vectors printed in full.
df_cv.show(n=5, truncate=False)

+---------+-----------+------------------------------------------------------------------------------------------------+
|review_id|recommended|review                                                                                          |
+---------+-----------+------------------------------------------------------------------------------------------------+
|84222416 |1          |(10817,[3,4,6,184,1684],[1.0,1.0,1.0,1.0,1.0])                                                  |
|83056151 |1          |(10817,[6,7,60,64,84,168,184,622,684,1150],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])           |
|82605667 |0          |(10817,[0,2,5,15,108,143,184,331,399,545],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])            |
|82535616 |1          |(10817,[0,3,4,60,79,184,461],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])                                     |
|82520252 |1          |(10817,[2,3,4,184,279,303,338,542,1399,2109,4320],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
+---------+-----------+---------

# Bayes


## CV

In [23]:
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [24]:
# Naive Bayes estimator for the bag-of-words data: features come from the
# vectorized review column, labels from the recommended column. All other
# parameters (including modelType) keep their defaults.
nb_cv = NaiveBayes(
    featuresCol=REVIEW,
    labelCol=RECOMMENDED,
)

In [25]:
# explainParams() returns one newline-separated string; print it so each
# parameter appears on its own line instead of a single repr with "\n" escapes.
print(nb_cv.explainParams())

"featuresCol: features column name. (default: features, current: review)\nlabelCol: label column name. (default: label, current: recommended)\nmodelType: The model type which is a string (case-sensitive). Supported options: multinomial (default), bernoulli and gaussian. (default: multinomial)\npredictionCol: prediction column name. (default: prediction)\nprobabilityCol: Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities. (default: probability)\nrawPredictionCol: raw prediction (a.k.a. confidence) column name. (default: rawPrediction)\nsmoothing: The smoothing parameter, should be >= 0, default is 1.0 (default: 1.0)\nthresholds: Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values > 0, excepting that at most one value may be 0. T

In [26]:
# The assignment rebinds nb_cv from the estimator to the fitted model, and a
# fitted model has no .fit(). The guard makes a second execution of this cell
# a no-op instead of an AttributeError.
if not isinstance(nb_cv, NaiveBayesModel):
    nb_cv = nb_cv.fit(df_cv)

In [27]:
# Score the same DataFrame the model was fitted on. Output columns left by an
# earlier execution of this cell are dropped first (a no-op on the first run),
# so re-running does not fail on already-existing prediction columns.
df_cv = nb_cv.transform(
    df_cv.drop(
        nb_cv.getRawPredictionCol(),
        nb_cv.getProbabilityCol(),
        nb_cv.getPredictionCol(),
    )
)

In [28]:
# Preview the first ten scored rows (long cells are truncated by default).
df_cv.show(n=10)

+---------+-----------+--------------------+--------------------+--------------------+----------+
|review_id|recommended|              review|       rawPrediction|         probability|prediction|
+---------+-----------+--------------------+--------------------+--------------------+----------+
| 84222416|          1|(10817,[3,4,6,184...|[-34.814950613517...|[0.00160822813728...|       1.0|
| 83056151|          1|(10817,[6,7,60,64...|[-75.556151759296...|[1.11934630090208...|       1.0|
| 82605667|          0|(10817,[0,2,5,15,...|[-63.226822426973...|[0.91917168712356...|       0.0|
| 82535616|          1|(10817,[0,3,4,60,...|[-45.416061843376...|[5.45054011321029...|       1.0|
| 82520252|          1|(10817,[2,3,4,184...|[-85.928282942309...|[0.00105579559937...|       1.0|
| 81316845|          1|(10817,[0,6,184],...|[-18.232630293710...|[0.02238579947899...|       1.0|
| 80846756|          1|(10817,[0,2,24,18...|[-24.355759256283...|[0.00770342604232...|       1.0|
| 79226741|         

In [29]:
# Remove the feature vector and the raw scores, leaving review_id,
# recommended, probability and prediction.
df_cv = df_cv.drop(REVIEW, "rawPrediction")

In [30]:
# Display the DataFrame's column names and types (bare expression → repr).
df_cv

DataFrame[review_id: int, recommended: int, probability: vector, prediction: double]

In [31]:
# Accuracy on the TRAINING data: df_cv is the same DataFrame the model was
# fitted on, so this figure is not a held-out test score.
evaluator = MulticlassClassificationEvaluator(
    labelCol=RECOMMENDED,
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(df_cv)
print(f"Accuracy = {accuracy}")

Accuracy = 0.8981249373307931


In [32]:
# Directory for the fitted bag-of-words Naive Bayes model; only the
# (currently commented-out) save/load cells refer to it.
CV_NB_PATH = "models/nb_cv"

In [33]:
#nb_cv.write().overwrite().save(CV_NB_PATH)

In [34]:
#nb_cv = NaiveBayesModel.load(CV_NB_PATH)

In [35]:
# Output location for the bag-of-words model's predictions; the cell that
# writes them is currently commented out.
TRAIN_NB_CV_PREDICTIONS = "data/train_nb_cv_predictions"

In [36]:
#df_cv.repartition(1).write.json(TRAIN_NB_CV_PREDICTIONS)

## IDF

In [37]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [38]:
# Naive Bayes estimator for the IDF-weighted data: features come from the
# vectorized review column, labels from the recommended column. All other
# parameters (including modelType) keep their defaults.
nb_idf = NaiveBayes(
    featuresCol=REVIEW,
    labelCol=RECOMMENDED,
)

In [39]:
# explainParams() returns one newline-separated string; print it so each
# parameter appears on its own line instead of a single repr with "\n" escapes.
print(nb_idf.explainParams())

"featuresCol: features column name. (default: features, current: review)\nlabelCol: label column name. (default: label, current: recommended)\nmodelType: The model type which is a string (case-sensitive). Supported options: multinomial (default), bernoulli and gaussian. (default: multinomial)\npredictionCol: prediction column name. (default: prediction)\nprobabilityCol: Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities. (default: probability)\nrawPredictionCol: raw prediction (a.k.a. confidence) column name. (default: rawPrediction)\nsmoothing: The smoothing parameter, should be >= 0, default is 1.0 (default: 1.0)\nthresholds: Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values > 0, excepting that at most one value may be 0. T

In [40]:
# The assignment rebinds nb_idf from the estimator to the fitted model, and a
# fitted model has no .fit(). The guard makes a second execution of this cell
# a no-op instead of an AttributeError.
if not isinstance(nb_idf, NaiveBayesModel):
    nb_idf = nb_idf.fit(df_idf)

In [41]:
# Score the same DataFrame the model was fitted on. Output columns left by an
# earlier execution of this cell are dropped first (a no-op on the first run),
# so re-running does not fail on already-existing prediction columns.
df_idf = nb_idf.transform(
    df_idf.drop(
        nb_idf.getRawPredictionCol(),
        nb_idf.getProbabilityCol(),
        nb_idf.getPredictionCol(),
    )
)

In [42]:
# Preview the first ten scored rows (long cells are truncated by default).
df_idf.show(n=10)

+---------+-----------+--------------------+--------------------+--------------------+----------+
|review_id|recommended|              review|       rawPrediction|         probability|prediction|
+---------+-----------+--------------------+--------------------+--------------------+----------+
| 84222416|          1|(10817,[3,4,6,184...|[-140.87302551102...|[4.84830403428446...|       1.0|
| 83056151|          1|(10817,[6,7,60,64...|[-329.66272352229...|[1.85483626353400...|       1.0|
| 82605667|          0|(10817,[0,2,5,15,...|[-250.84480455973...|[0.94786551986272...|       0.0|
| 82535616|          1|(10817,[0,3,4,60,...|[-160.94216893492...|[1.86775151033598...|       1.0|
| 82520252|          1|(10817,[2,3,4,184...|[-458.59734782585...|[3.23693012508318...|       1.0|
| 81316845|          1|(10817,[0,6,184],...|[-55.597902584703...|[7.20054840357554...|       1.0|
| 80846756|          1|(10817,[0,2,24,18...|[-74.556275163943...|[8.58305930722889...|       1.0|
| 79226741|         

In [43]:
# Remove the feature vector and the raw scores, leaving review_id,
# recommended, probability and prediction.
df_idf = df_idf.drop(REVIEW, "rawPrediction")

In [44]:
# Display the DataFrame's column names and types (bare expression → repr).
df_idf

DataFrame[review_id: int, recommended: int, probability: vector, prediction: double]

In [45]:
# Accuracy on the TRAINING data: df_idf is the same DataFrame the model was
# fitted on, so this figure is not a held-out test score.
evaluator = MulticlassClassificationEvaluator(
    labelCol=RECOMMENDED,
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(df_idf)
print(f"Accuracy = {accuracy}")

Accuracy = 0.9165747518299409


In [46]:
# Directory for the fitted IDF Naive Bayes model; only the (currently
# commented-out) save/load cells refer to it.
IDF_NB_PATH = "models/nb_idf"

In [47]:
#nb_idf.write().overwrite().save(IDF_NB_PATH)

In [48]:
#nb_idf = NaiveBayesModel.load(IDF_NB_PATH)

In [49]:
# Output location for the IDF model's predictions; the cell that writes them
# is currently commented out.
TRAIN_NB_IDF_PREDICTIONS = "data/train_nb_idf_predictions"

In [50]:
#df_idf.repartition(1).write.json(TRAIN_NB_IDF_PREDICTIONS)

# Bernoulli Bayes


## CV

In [None]:
# from pyspark.ml.classification import NaiveBayes
# from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# bnb_cv = NaiveBayes(featuresCol=REVIEW, labelCol=RECOMMENDED,modelType="bernoulli")

In [None]:
# bnb_cv.explainParams()

"featuresCol: features column name. (default: features, current: review)\nlabelCol: label column name. (default: label, current: recommended)\nmodelType: The model type which is a string (case-sensitive). Supported options: multinomial (default), bernoulli and gaussian. (default: multinomial, current: bernoulli)\npredictionCol: prediction column name. (default: prediction)\nprobabilityCol: Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities. (default: probability)\nrawPredictionCol: raw prediction (a.k.a. confidence) column name. (default: rawPrediction)\nsmoothing: The smoothing parameter, should be >= 0, default is 1.0 (default: 1.0)\nthresholds: Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values > 0, excepting that at most o

In [None]:
# bnb_cv = bnb_cv.fit(df_cv)

In [None]:
# df_cv =  bnb_cv.transform(df_cv)

In [None]:
# df_cv.show(10)

+---------+-----------+--------------------+--------------------+--------------------+----------+
|review_id|recommended|              review|       rawPrediction|         probability|prediction|
+---------+-----------+--------------------+--------------------+--------------------+----------+
| 84222416|          1|(10817,[3,4,6,184...|[-38.196563931617...|[2.19138371708132...|       1.0|
| 83056151|          1|(10817,[6,7,60,64...|[-64.718098843449...|[1.20814906944990...|       1.0|
| 82605667|          0|(10817,[0,2,5,15,...|[-51.157825244124...|[0.74056281732409...|       0.0|
| 82535616|          1|(10817,[0,3,4,60,...|[-42.341151942956...|[2.08078352386915...|       1.0|
| 82520252|          1|(10817,[2,3,4,184...|[-71.972124132361...|[1.88439005095278...|       1.0|
| 81316845|          1|(10817,[0,6,184],...|[-26.804606771957...|[1.98261206652067...|       1.0|
| 80846756|          1|(10817,[0,2,24,18...|[-29.912080457617...|[1.09352703702978...|       1.0|
| 79226741|         

In [None]:
# df_cv = df_cv.drop(REVIEW, "rawPrediction")

In [None]:
# df_cv

DataFrame[review_id: int, recommended: int, probability: vector, prediction: double]

In [None]:
# # compute accuracy on the training set (same data the model was fitted on)
# evaluator = MulticlassClassificationEvaluator(labelCol="recommended", predictionCol="prediction",
#                                               metricName="accuracy")
# accuracy = evaluator.evaluate(df_cv)
# print("Accuracy = " + str(accuracy))

Accuracy = 0.786323072295197


In [None]:
# CV_BNB_PATH = "models/bnb_cv"

In [None]:
# bnb_cv.save(CV_BNB_PATH)

In [None]:
# bnb_cv = NaiveBayesModel.load(CV_BNB_PATH)

In [None]:
# TRAIN_BNB_CV_PREDICTIONS = "data/train_bnb_cv_predictions"

In [None]:
# df_cv.repartition(1).write.json(TRAIN_BNB_CV_PREDICTIONS)

## IDF

In [None]:
# from pyspark.ml.classification import NaiveBayes
# from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# from pyspark.ml.feature import Binarizer

In [None]:
# bnb_idf = NaiveBayes(featuresCol="binary_features", labelCol=RECOMMENDED,modelType="bernoulli")

In [None]:
# bnb_idf.explainParams()

"featuresCol: features column name. (default: features, current: binary_features)\nlabelCol: label column name. (default: label, current: recommended)\nmodelType: The model type which is a string (case-sensitive). Supported options: multinomial (default), bernoulli and gaussian. (default: multinomial, current: bernoulli)\npredictionCol: prediction column name. (default: prediction)\nprobabilityCol: Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities. (default: probability)\nrawPredictionCol: raw prediction (a.k.a. confidence) column name. (default: rawPrediction)\nsmoothing: The smoothing parameter, should be >= 0, default is 1.0 (default: 1.0)\nthresholds: Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values > 0, excepting that 

In [None]:
# binarizer = Binarizer(threshold=0.0, inputCol=REVIEW, outputCol="binary_features")

In [None]:
# df_idf = binarizer.transform(df_idf)

In [None]:
# bnb_idf = bnb_idf.fit(df_idf)

In [None]:
# df_idf =  bnb_idf.transform(df_idf)

In [None]:
# df_idf.show(10)

+---------+-----------+--------------------+--------------------+--------------------+--------------------+----------+
|review_id|recommended|              review|     binary_features|       rawPrediction|         probability|prediction|
+---------+-----------+--------------------+--------------------+--------------------+--------------------+----------+
| 84222416|          1|(10817,[3,4,6,184...|(10817,[3,4,6,184...|[-38.196563931617...|[2.19138371708132...|       1.0|
| 83056151|          1|(10817,[6,7,60,64...|(10817,[6,7,60,64...|[-64.718098843449...|[1.20814906944990...|       1.0|
| 82605667|          0|(10817,[0,2,5,15,...|(10817,[0,2,5,15,...|[-51.157825244124...|[0.74056281732409...|       0.0|
| 82535616|          1|(10817,[0,3,4,60,...|(10817,[0,3,4,60,...|[-42.341151942956...|[2.08078352386915...|       1.0|
| 82520252|          1|(10817,[2,3,4,184...|(10817,[2,3,4,184...|[-71.972124132361...|[1.88439005095278...|       1.0|
| 81316845|          1|(10817,[0,6,184],...|(108

In [None]:
# df_idf = df_idf.drop(REVIEW, "rawPrediction")

In [None]:
# df_idf

DataFrame[review_id: int, recommended: int, binary_features: vector, probability: vector, prediction: double]

In [None]:
# # compute accuracy on the training set (same data the model was fitted on)
# evaluator = MulticlassClassificationEvaluator(labelCol="recommended", predictionCol="prediction",
#                                               metricName="accuracy")
# accuracy = evaluator.evaluate(df_idf)
# print("Accuracy = " + str(accuracy))

Accuracy = 0.786323072295197


In [None]:
# IDF_BNB_PATH = "models/bnb_idf"

In [None]:
# bnb_idf.save(IDF_BNB_PATH)

In [None]:
# bnb_idf = NaiveBayesModel.load(IDF_BNB_PATH)

In [None]:
# TRAIN_BNB_IDF_PREDICTIONS = "data/train_bnb_idf_predictions"

In [None]:
# df_idf.repartition(1).write.json(TRAIN_BNB_IDF_PREDICTIONS)