## Install packages

In [1]:
!pip install findspark
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 38 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 45.3 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=224cd5598efca2bc8e0ec576f5f77ffc1c17fb0cfe20cab462ad40b1c0e0442a
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e407

# Imports and Env

In [7]:
from sklearn.decomposition import SparsePCA
import seaborn as sns
import pandas as pd

In [8]:
sns.set(rc={'figure.figsize':(11.7,8.27)})

In [74]:
from pyspark.ml.classification import LinearSVC, LinearSVCModel

In [9]:
import findspark
from pyspark.ml.linalg import VectorUDT
from pyspark.sql.column import _to_java_column, _to_seq, Column
from pyspark.sql import SparkSession
from pyspark.ml.feature import IDFModel, IDF, PCA, VectorAssembler
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.types import StructType, StringType, IntegerType, FloatType, BooleanType, ArrayType
from pyspark.ml.feature import CountVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import pyspark.sql.functions as sqlf

In [10]:
from google.colab import drive

In [11]:
drive.mount('/content/drive/')

Mounted at /content/drive/


In [12]:
import os

In [13]:
os.getcwd()

'/content'

In [14]:
os.chdir('drive/MyDrive/SteamReviews2021Project')

In [15]:
os.listdir()

['data',
 'BDA_Project_Preprocessing.ipynb',
 'models',
 'Preprocessing+tokenization.ipynb',
 'Plotting_vectorization.ipynb',
 'Metrics&Results_pipeline.ipynb',
 'SVM_pipelines.ipynb']

# Constants

In [16]:
non_null_schema = StructType() \
    .add("#", IntegerType(), True) \
    .add("app_id", IntegerType(), True) \
    .add("app_name", StringType(), True) \
    .add("review_id", IntegerType(), True) \
    .add("language", StringType(), True) \
    .add("review", StringType(), True) \
    .add("timestamp_created", IntegerType(), True) \
    .add("timestamp_updated", IntegerType(), True) \
    .add("recommended", BooleanType(), True) \
    .add("votes_helpful", IntegerType(), True) \
    .add("votes_funny", IntegerType(), True) \
    .add("weighted_vote_score", FloatType(), True) \
    .add("comment_count", IntegerType(), True) \
    .add("steam_purchase", BooleanType(), True) \
    .add("received_for_free", BooleanType(), True) \
    .add("written_during_early_access", BooleanType(), True) \
    .add("author_steamid", IntegerType(), True) \
    .add("author_num_games_owned", IntegerType(), True) \
    .add("author_num_reviews", IntegerType(), True) \
    .add("author_playtime_forever", FloatType(), True) \
    .add("author_playtime_last_two_weeks", FloatType(), True) \
    .add("author_playtime_at_review", FloatType(), True) \
    .add("author_last_played", IntegerType(), True)

In [17]:
INDEX = "#"
APP_ID = "app_id"
APP_NAME = "app_name"
REVIEW_ID = "review_id"
LANGUAGE = "language"
REVIEW = "review"
TIMESTAMP_CREATED = "timestamp_created"
TIMESTAMP_UPDATED = "timestamp_updated"
RECOMMENDED = "recommended"
VOTES_HELPFUL = "votes_helpful"
VOTES_FUNNY = "votes_funny"
WEIGHTED_VOTE_SCORE = "weighted_vote_score"
COMMENT_COUNT = "comment_count"
STEAM_PURCHASE = "steam_purchase"
RECEIVED_FOR_FREE = "received_for_free"
WRITTEN_DURING_EARLY_ACCESS = "written_during_early_access"
AUTHOR_STEAMID = "author_steamid"
AUTHOR_NUM_GAMES_OWNED = "author_num_games_owned"
AUTHOR_NUM_REVIEWS = "author_num_reviews"
AUTHOR_PLAYTIME_FOREVER = "author_playtime_forever"
AUTHOR_PLAYTIME_LAST_TWO_WEEKS = "author_playtime_last_two_weeks"
AUTHOR_PLAYTIME_AT_REVIEW = "author_playtime_at_review"
AUTHOR_LAST_PLAYED = "author_last_played"

In [18]:
AUX_COL = "aux"

In [19]:
vectorization_schema = StructType() \
    .add("review_id", IntegerType(), True) \
    .add("recommended", IntegerType(), True) \
    .add("review", VectorUDT(), True)

# Load Vectorization

In [20]:
findspark.init()
spark = SparkSession.builder.master("local") \
                    .appName("Plotting vectorizations") \
                    .config("spark.executor.cores", "8") \
                    .config("spark.executor.memory", "8g") \
                    .getOrCreate()

In [21]:
IDF_DATASET_PATH = "data/idf_tokenization/idf_dataset.json"

In [22]:
df_idf = spark.read.option("encoding", "utf-8") \
      .schema(vectorization_schema) \
      .json(IDF_DATASET_PATH)

In [23]:
df_idf.show(5, truncate=False)

+---------+-----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|review_id|recommended|review                                                                                                                                                                                                                                                     |
+---------+-----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|84222416 |1          |(10817,[3,4,6,184,1684],[1.8212661372761738,1.8786432498594763,2.04301660733421,4.454146794999703,6.905151893112022])                                

In [49]:
CV_DATASET_PATH = "data/bow_tokenization/bow_dataset.json"

In [50]:
df_cv = spark.read.option("encoding", "utf-8") \
      .schema(vectorization_schema) \
      .json(CV_DATASET_PATH)

In [51]:
df_cv.show(5, truncate=False)

+---------+-----------+------------------------------------------------------------------------------------------------+
|review_id|recommended|review                                                                                          |
+---------+-----------+------------------------------------------------------------------------------------------------+
|84222416 |1          |(10817,[3,4,6,184,1684],[1.0,1.0,1.0,1.0,1.0])                                                  |
|83056151 |1          |(10817,[6,7,60,64,84,168,184,622,684,1150],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])           |
|82605667 |0          |(10817,[0,2,5,15,108,143,184,331,399,545],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])            |
|82535616 |1          |(10817,[0,3,4,60,79,184,461],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])                                     |
|82520252 |1          |(10817,[2,3,4,184,279,303,338,542,1399,2109,4320],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
+---------+-----------+---------

# Train SVM

## CV

In [52]:
svm_cv = LinearSVC(featuresCol=REVIEW, labelCol=RECOMMENDED)

In [53]:
svm_cv.explainParams()

'aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)\nfeaturesCol: features column name. (default: features, current: review)\nfitIntercept: whether to fit an intercept term. (default: True)\nlabelCol: label column name. (default: label, current: recommended)\nmaxBlockSizeInMB: maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optimal value, depends on specific algorithm. Must be >= 0. (default: 0.0)\nmaxIter: max number of iterations (>= 0). (default: 100)\npredictionCol: prediction column name. (default: prediction)\nrawPredictionCol: raw prediction (a.k.a. confidence) column name. (default: rawPrediction)\nregParam: regularization parameter (>= 0). (default: 0.0)\nstandardization: whether to standardize the training features before fitting the model. (default: True)\nthreshold: The threshold in binary cl

In [54]:
svm_cv = svm_cv.fit(df_cv)

In [55]:
df_cv =  svm_cv.transform(df_cv)

In [56]:
df_cv.show(10)

+---------+-----------+--------------------+--------------------+----------+
|review_id|recommended|              review|       rawPrediction|prediction|
+---------+-----------+--------------------+--------------------+----------+
| 84222416|          1|(10817,[3,4,6,184...|[-6.9831860151405...|       1.0|
| 83056151|          1|(10817,[6,7,60,64...|[-9.3474968369114...|       1.0|
| 82605667|          0|(10817,[0,2,5,15,...|[1.00139573135693...|       0.0|
| 82535616|          1|(10817,[0,3,4,60,...|[-4.9766560679726...|       1.0|
| 82520252|          1|(10817,[2,3,4,184...|[-3.2494828024150...|       1.0|
| 81316845|          1|(10817,[0,6,184],...|[-4.5323988523509...|       1.0|
| 80846756|          1|(10817,[0,2,24,18...|[-4.2683540895946...|       1.0|
| 79226741|          1|(10817,[2,3,4,6,1...|[-5.0233350282669...|       1.0|
| 77340930|          1|(10817,[0,2,16,24...|[-3.2757248103219...|       1.0|
| 76875452|          0|(10817,[0,2,5,65,...|[1.13450307746905...|       0.0|

In [58]:
df_cv = df_cv.drop(REVIEW, "rawPrediction")

In [59]:
df_cv

DataFrame[review_id: int, recommended: int, prediction: double]

In [66]:
CV_SVM_PATH = "models/svm_cv"

In [67]:
# svm_cv.save(CV_SVM_PATH)

In [75]:
# svm_cv = LinearSVCModel.load(CV_SVM_PATH)

In [80]:
TRAIN_CV_SVM_PREDICTIONS = "data/train_cv_svm_predictions"

In [81]:
# df_cv.repartition(1).write.json(TRAIN_CV_SVM_PREDICTIONS)

## tfidf

In [60]:
svm_idf = LinearSVC(featuresCol=REVIEW, labelCol=RECOMMENDED)

In [61]:
svm_idf = svm_idf.fit(df_idf)

In [62]:
df_idf =  svm_cv.transform(df_idf)

In [63]:
df_idf.show(10)

+---------+-----------+--------------------+--------------------+----------+
|review_id|recommended|              review|       rawPrediction|prediction|
+---------+-----------+--------------------+--------------------+----------+
| 84222416|          1|(10817,[3,4,6,184...|[-29.157142081362...|       1.0|
| 83056151|          1|(10817,[6,7,60,64...|[-30.821786210642...|       1.0|
| 82605667|          0|(10817,[0,2,5,15,...|[-0.3788002817133...|       1.0|
| 82535616|          1|(10817,[0,3,4,60,...|[-19.476709184987...|       1.0|
| 82520252|          1|(10817,[2,3,4,184...|[-14.705507081101...|       1.0|
| 81316845|          1|(10817,[0,6,184],...|[-15.328209192841...|       1.0|
| 80846756|          1|(10817,[0,2,24,18...|[-16.346545907778...|       1.0|
| 79226741|          1|(10817,[2,3,4,6,1...|[-16.110464746819...|       1.0|
| 77340930|          1|(10817,[0,2,16,24...|[-21.784499460522...|       1.0|
| 76875452|          0|(10817,[0,2,5,65,...|[19.1417016758451...|       0.0|

In [64]:
df_idf = df_idf.drop(REVIEW, "rawPrediction")

In [65]:
df_idf

DataFrame[review_id: int, recommended: int, prediction: double]

In [76]:
IDF_SVM_PATH = "models/svm_idf"

In [77]:
# svm_idf.save(IDF_SVM_PATH)

In [78]:
# svm_idf = LinearSVCModel.load(IDF_SVM_PATH)

In [79]:
TRAIN_IDF_SVM_PREDICTIONS = "data/train_idf_svm_predictions"

In [82]:
# df_idf.repartition(1).write.json(TRAIN_IDF_SVM_PREDICTIONS)