## Install packages

In [2]:
!pip install findspark
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl siz

# Imports and Env

In [3]:
from sklearn.decomposition import SparsePCA
import seaborn as sns
import pandas as pd

In [4]:
from pyspark.mllib.evaluation import MulticlassMetrics

In [5]:
sns.set(rc={'figure.figsize':(11.7,8.27)})

In [6]:
from pyspark.ml.classification import LinearSVC, LinearSVCModel

In [85]:
import findspark
from pyspark.ml.linalg import VectorUDT
from pyspark.sql.column import _to_java_column, _to_seq, Column
from pyspark.sql import SparkSession
from pyspark.ml.feature import IDFModel, IDF, PCA, VectorAssembler
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.types import StructType, StringType, IntegerType, FloatType, BooleanType, ArrayType
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import pyspark.sql.functions as sqlf

In [8]:
from google.colab import drive

In [9]:
drive.mount('/content/drive/')

Mounted at /content/drive/


In [10]:
import os

In [11]:
os.getcwd()

'/content'

In [12]:
os.chdir('drive/MyDrive/SteamReviews2021Project')

In [13]:
os.listdir()

['data',
 'BDA_Project_Preprocessing.ipynb',
 'models',
 'sanitize_original_dataset.ipynb',
 'Plotting_vectorization.ipynb',
 'SVM_pipelines.ipynb',
 'Preprocessing+tokenization.ipynb',
 'Metrics&Results_pipeline.ipynb',
 'BayesBDA.ipynb',
 'Model_test_pipeline.ipynb']

# Constants

In [14]:
non_null_schema = StructType() \
    .add("#", IntegerType(), True) \
    .add("app_id", IntegerType(), True) \
    .add("app_name", StringType(), True) \
    .add("review_id", IntegerType(), True) \
    .add("language", StringType(), True) \
    .add("review", StringType(), True) \
    .add("timestamp_created", IntegerType(), True) \
    .add("timestamp_updated", IntegerType(), True) \
    .add("recommended", BooleanType(), True) \
    .add("votes_helpful", IntegerType(), True) \
    .add("votes_funny", IntegerType(), True) \
    .add("weighted_vote_score", FloatType(), True) \
    .add("comment_count", IntegerType(), True) \
    .add("steam_purchase", BooleanType(), True) \
    .add("received_for_free", BooleanType(), True) \
    .add("written_during_early_access", BooleanType(), True) \
    .add("author_steamid", IntegerType(), True) \
    .add("author_num_games_owned", IntegerType(), True) \
    .add("author_num_reviews", IntegerType(), True) \
    .add("author_playtime_forever", FloatType(), True) \
    .add("author_playtime_last_two_weeks", FloatType(), True) \
    .add("author_playtime_at_review", FloatType(), True) \
    .add("author_last_played", IntegerType(), True)

In [15]:
INDEX = "#"
APP_ID = "app_id"
APP_NAME = "app_name"
REVIEW_ID = "review_id"
LANGUAGE = "language"
REVIEW = "review"
TIMESTAMP_CREATED = "timestamp_created"
TIMESTAMP_UPDATED = "timestamp_updated"
RECOMMENDED = "recommended"
VOTES_HELPFUL = "votes_helpful"
VOTES_FUNNY = "votes_funny"
WEIGHTED_VOTE_SCORE = "weighted_vote_score"
COMMENT_COUNT = "comment_count"
STEAM_PURCHASE = "steam_purchase"
RECEIVED_FOR_FREE = "received_for_free"
WRITTEN_DURING_EARLY_ACCESS = "written_during_early_access"
AUTHOR_STEAMID = "author_steamid"
AUTHOR_NUM_GAMES_OWNED = "author_num_games_owned"
AUTHOR_NUM_REVIEWS = "author_num_reviews"
AUTHOR_PLAYTIME_FOREVER = "author_playtime_forever"
AUTHOR_PLAYTIME_LAST_TWO_WEEKS = "author_playtime_last_two_weeks"
AUTHOR_PLAYTIME_AT_REVIEW = "author_playtime_at_review"
AUTHOR_LAST_PLAYED = "author_last_played"

In [16]:
AUX_COL = "aux"

In [17]:
vectorization_schema = StructType() \
    .add("review_id", IntegerType(), True) \
    .add("recommended", IntegerType(), True) \
    .add("review", VectorUDT(), True)

In [103]:
PRESENTATION_ROW_COUNT = 10

In [110]:
RAW_PREDICTION = "rawPrediction"

In [301]:
PREDICTION = "prediction"
LABEL = "label"

# Load Data

In [241]:
TEST_DATASET_PATH = "data/final_dataset_50k_en_recommended_25000per.csv"

In [242]:
findspark.init()
spark = SparkSession.builder.master("local") \
                    .appName("Preprocessing reviews") \
                    .config("spark.executor.cores", "8") \
                    .config("spark.executor.memory", "16g") \
                    .getOrCreate()

In [243]:
df = spark.read.format("csv") \
    .option("header", True) \
    .option("encoding", "utf-8") \
    .schema(non_null_schema) \
    .load(TEST_DATASET_PATH)

In [244]:
df.show(PRESENTATION_ROW_COUNT, truncate=False)

+-----+------+------------------------+---------+--------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+-----------------+-----------+-------------+-----------+-------------------+-------------+--------------+-----------------+---------------------------+--------------+----------------------+------------------+-----------------------+------------------------------+-------------------------+------------------+
|#    |app_id|app_name                |review_id|language|review                                                                                                                                                                                                                             |timestamp_created|timestamp_updated|recommended|votes_helpful|votes_funny|weighted_vote_score|comment

# Preprocessing

Lowercase

In [245]:
df = df.withColumn("review", sqlf.lower(sqlf.col("review")))

Remove Urls

In [246]:
url_removal = re.compile(r'https?://\S+')
url_remover_udf = sqlf.udf(lambda element: url_removal.sub('', element))
df = df.withColumn("review", url_remover_udf("review"))

Remove Unnecessary Spaces

In [247]:
spaces_remover_udf = sqlf.udf(lambda element: element.replace("  ", " ")
                                            .replace("\n", " ")
                                            .replace("\t", " ")
                                            .replace("\r\n", " "))
df = df.withColumn("review", spaces_remover_udf("review"))

Remove Steam Mentions

In [248]:
mentions_removal = re.compile(r'(@[A-Za-z0-9]+)')
mentions_remover_udf = sqlf.udf(lambda element: mentions_removal.sub(' ', element))
df = df.withColumn("review", mentions_remover_udf("review"))

Remove Emails

In [249]:
mails_removal = re.compile(r'^[a-z0-9]+[\._]?[a-z0-9]+[@]\w+[.]\w{2,3}$')
mails_remover_udf = sqlf.udf(lambda element: mails_removal.sub('', element))
df = df.withColumn("review", mails_remover_udf("review"))

Remove non essential characters

In [250]:
non_essential_symbols_removal = re.compile(r'[.,;:?!#-$%^&*()_+={}\[\]()//]')
non_essential_symbols_remover_udf = sqlf.udf(lambda element: non_essential_symbols_removal.sub('', element))
df = df.withColumn("review", non_essential_symbols_remover_udf("review"))

Remove digits

In [251]:
digits_removal = re.compile(r'[0123456789]')
digits_remover_udf = sqlf.udf(lambda element: digits_removal.sub('', element))
df = df.withColumn("review", digits_remover_udf("review"))

Remove unicode

In [252]:
unicode_remover_udf = sqlf.udf(lambda element: (element.encode("ascii", "ignore")).decode())
df = df.withColumn("review", unicode_remover_udf("review"))

Replace Abbreviations

In [253]:
abbreviations_removal = {
    'dm': 'direct message',
    'pm': 'private message',
    'thx': 'thanks',
    'cuz': 'because',
    'dming': 'direct messaging',
    'dmed': 'direct messaged',
    'plz': 'please',
    'u': 'you',
    'youre': 'you are',
    'asap': 'as soon as possible',
    'r': 'are',
    'gg': 'good game',
    'gut': 'good',
    'gud': 'good',
    'gl': 'good luck',
    'hf': 'have fun',
    'og': 'original gangster',
    'nt': 'nice try',
    'ofc': 'of course',
    'wp': 'well played',
    'ez': 'easy',
    'bb': 'bye',
    'btw': 'by the way',
    'aka': 'also known as',
    'eg': 'for example',
    'fps': 'first person shooter',
    'ie': 'that is',
    'lol': 'laughing out loud', # or league of legends?
    'lvl': 'level',
    'pr': 'power rank',
    'xp': 'experience points',
    'inv': 'invite',
    'lfm': 'looking for member', 
    'wtb': 'want to buy', 
    'wts': 'want to sell', 
    'wtt': 'want to trade',
}

abbreviations_remover_udf = sqlf.udf(lambda element: " ".join([abbreviations_removal.get(word, word) for word in element.split()]))
df = df.withColumn("review", abbreviations_remover_udf("review"))

Remove Stop Words

In [254]:
df = df.withColumn(REVIEW, sqlf.split(sqlf.col(REVIEW), "\s+"))
stop_words_remover = StopWordsRemover(
    inputCol=REVIEW, outputCol=AUX_COL, locale="en_US")
df = stop_words_remover.transform(df)

In [255]:
df = df.drop(REVIEW)
df = df.withColumnRenamed(AUX_COL, REVIEW)

Stemming

In [256]:
stemmer = PorterStemmer()


def stem(input_vector):
    output_vector = []
    for element in input_vector:
        output_vector.append(stemmer.stem(element))
    return output_vector


stemmer_udf = sqlf.udf(lambda element: stem(element), ArrayType(StringType()))
df = df.withColumn(REVIEW, stemmer_udf(REVIEW))

In [257]:
df.show(PRESENTATION_ROW_COUNT, truncate=False)

+-----+------+------------------------+---------+--------+-----------------+-----------------+-----------+-------------+-----------+-------------------+-------------+--------------+-----------------+---------------------------+--------------+----------------------+------------------+-----------------------+------------------------------+-------------------------+------------------+-------------------------------------------------------------------------------------------------------------------------+
|#    |app_id|app_name                |review_id|language|timestamp_created|timestamp_updated|recommended|votes_helpful|votes_funny|weighted_vote_score|comment_count|steam_purchase|received_for_free|written_during_early_access|author_steamid|author_num_games_owned|author_num_reviews|author_playtime_forever|author_playtime_last_two_weeks|author_playtime_at_review|author_last_played|review                                                                                                       

# Data Augmentation

In [258]:
def embedded_with_threshold(df, col, threshold, tag):
  df = df.withColumn(
            AUX_COL,
            sqlf.when(sqlf.col(col) >= threshold, 
                  sqlf.array_union(df.review, sqlf.array(sqlf.lit(tag))))
            .otherwise(sqlf.col(REVIEW))
        )
  df = df.drop(REVIEW)
  df = df.withColumnRenamed(AUX_COL, REVIEW)
  return df
def embedded_with_boolean(df, col, value, tag):
  df = df.withColumn(
            AUX_COL,
            sqlf.when(sqlf.col(col) == value, 
                  sqlf.array_union(df.review, sqlf.array(sqlf.lit(tag))))
            .otherwise(sqlf.col(REVIEW))
        )
  df = df.drop(REVIEW)
  df = df.withColumnRenamed(AUX_COL, REVIEW)
  return df

In [259]:
df = df.withColumn(
    AUX_COL, 
    sqlf.array_union(df.review, sqlf.array(sqlf.lit(df.app_id.cast(StringType()))))
)

In [260]:
df = df.drop(REVIEW)
df = df.withColumnRenamed(AUX_COL, REVIEW)

In [261]:
df = embedded_with_threshold(df, VOTES_FUNNY, 2.0, "[FUNNY]")

In [262]:
df = embedded_with_threshold(df, VOTES_HELPFUL, 3.0, "[HELPFUL]")

In [263]:
df = embedded_with_threshold(df, COMMENT_COUNT, 1.0, "[WITH_COMMENTS]")

In [264]:
df = embedded_with_threshold(df, AUTHOR_PLAYTIME_FOREVER, 10.0, "[LONG_PLAYTIME]")

In [265]:
df = embedded_with_threshold(df, AUTHOR_PLAYTIME_FOREVER, 2.0, "[BIGGER_COLLECTION]")

In [266]:
df = embedded_with_boolean(df, WRITTEN_DURING_EARLY_ACCESS, True, "[EA]")

In [267]:
df = embedded_with_boolean(df, RECEIVED_FOR_FREE, True, "[FREE]")

In [268]:
df = embedded_with_boolean(df, STEAM_PURCHASE, False, "[BOUGHT_ELSEWHERE]")

In [269]:
df.select(REVIEW, RECOMMENDED).show(PRESENTATION_ROW_COUNT, truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------+-----------+
|review                                                                                                                                              |recommended|
+----------------------------------------------------------------------------------------------------------------------------------------------------+-----------+
|[, 292030, [LONG_PLAYTIME], [BIGGER_COLLECTION]]                                                                                                    |false      |
|[best, game, niggybytejakecom, 292030, [LONG_PLAYTIME], [BIGGER_COLLECTION], [FREE]]                                                                |true       |
|[best, game, 292030, [LONG_PLAYTIME], [BIGGER_COLLECTION]]                                                                                          |true       |
|[start, found, bit, b

# Tokenization

 Load TFIDF and BOW Tokenizers

In [270]:
CV_SAVE_PATH = "models/cv_model"

In [271]:
TFIDF_SAVE_PATH = "models/idf_model"

In [272]:
cv = CountVectorizerModel.load(CV_SAVE_PATH)

In [273]:
idf = IDFModel.load(TFIDF_SAVE_PATH)

Transform to BOW vectors

In [274]:
df = cv.transform(df)
df = df.drop(REVIEW)
df = df.withColumnRenamed(AUX_COL, REVIEW)

In [275]:
df.select(REVIEW, RECOMMENDED).show(PRESENTATION_ROW_COUNT, truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------+-----------+
|review                                                                                                                                        |recommended|
+----------------------------------------------------------------------------------------------------------------------------------------------+-----------+
|(10817,[3,4,97,184],[1.0,1.0,1.0,1.0])                                                                                                        |false      |
|(10817,[0,3,4,24,62,184],[1.0,1.0,1.0,1.0,1.0,1.0])                                                                                           |true       |
|(10817,[0,3,4,24,184],[1.0,1.0,1.0,1.0,1.0])                                                                                                  |true       |
|(10817,[0,2,3,4,11,13,58,63,79,122,143,184,200,206,237,26

Transform to TFIDF vectors

In [276]:
df = idf.transform(df)
df = df.drop(REVIEW)
df = df.withColumnRenamed(AUX_COL, REVIEW)

In [277]:
df.select(REVIEW, RECOMMENDED).show(PRESENTATION_ROW_COUNT, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+
|review                                                                                                                                                                                                                                                                                                                                                                                                         |recommended|
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# ML Model Prediction Phase

Load ML models

In [278]:
IDF_SVM_PATH = "models/svm_idf"

In [279]:
svm_idf = LinearSVCModel.load(IDF_SVM_PATH)

Select relevant columns from dataset

In [280]:
df = df.select(REVIEW_ID, REVIEW, RECOMMENDED)

In [281]:
df = df.withColumn(RECOMMENDED,df.recommended.cast('integer'))

In [282]:
df.select(RECOMMENDED, REVIEW).show(PRESENTATION_ROW_COUNT, truncate=False)

+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|recommended|review                                                                                                                                                                                                                                                                                                                                                                                                         |
+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------

In [283]:
df.printSchema()

root
 |-- review_id: integer (nullable = true)
 |-- review: vector (nullable = true)
 |-- recommended: integer (nullable = true)



Predict SVM

In [293]:
df_svm_results = svm_idf.transform(df)

In [296]:
df_svm_results = df_svm_results.select(REVIEW_ID, RECOMMENDED, PREDICTION)

Predict NaiveBayes

# Analyzing Results

In [297]:
df_svm_results.show(PRESENTATION_ROW_COUNT, truncate=False)

+---------+-----------+----------+
|review_id|recommended|Prediction|
+---------+-----------+----------+
|84794967 |0          |1.0       |
|84781532 |1          |1.0       |
|84518667 |1          |1.0       |
|84441156 |1          |1.0       |
|84168972 |0          |1.0       |
|83943596 |1          |1.0       |
|83918031 |1          |1.0       |
|83868393 |1          |1.0       |
|83727382 |0          |0.0       |
|83469758 |0          |0.0       |
+---------+-----------+----------+
only showing top 10 rows



In [307]:
def get_confusion_matrix(df):
  df = df.drop(REVIEW_ID)
  df = df.withColumnRenamed(RECOMMENDED, LABEL)
  df = df.withColumn(LABEL, df.label.cast("double"))
  metrics = MulticlassMetrics(df.rdd)
  return metrics.confusionMatrix().toArray()

In [315]:
def print_metrics(cm):
  accuracy=(cm[0][0]+cm[1][1])/cm.sum()
  precision=(cm[0][0])/(cm[0][0]+cm[1][0])
  recall=(cm[0][0])/(cm[0][0]+cm[0][1])
  f1=2* (precision * recall) / (precision+recall)
  print(
      f"Accuracy: {accuracy}\n"
      f"Precision: {precision}\n"
      f"Recall: {recall}\n"
      f"F-1: {f1}\n"
  )

SVM

In [308]:
svm_confusion_matrix = get_confusion_matrix(df_svm_results)



In [310]:
svm_confusion_matrix

array([[20590.,  4283.],
       [ 4650., 20817.]])

In [316]:
print_metrics(svm_confusion_matrix)

Accuracy: 0.8225466825586015
Precision: 0.8157686212361331
Recall: 0.827805250673421
F-1: 0.8217428611338375



NaiveBayes