# Install packages

In [2]:
!pip install findspark
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 44 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 54.0 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=6981d97d824eda7cf17c945c530dab6fa581eb0530b90a1d7ee45895a7545a23
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e407

# Imports and Env

In [3]:
from sklearn.decomposition import SparsePCA
import seaborn as sns
import pandas as pd

In [31]:
from pyspark.mllib.evaluation import MulticlassMetrics

In [4]:
# Default figure size for every plot: 11.7 x 8.27 inches (A4 landscape).
sns.set(rc={"figure.figsize": (11.7, 8.27)})

In [23]:
import findspark
from pyspark.ml.linalg import VectorUDT
from pyspark.sql.column import _to_java_column, _to_seq, Column
from pyspark.sql import SparkSession
from pyspark.ml.feature import IDFModel, IDF, PCA, VectorAssembler
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.types import StructType, StringType, IntegerType, FloatType, BooleanType, ArrayType, DoubleType
from pyspark.ml.feature import CountVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import pyspark.sql.functions as sqlf

In [6]:
from google.colab import drive

In [7]:
# Mount Google Drive at /content/drive/ so the project folder can be used as a local path.
drive.mount("/content/drive/")

Mounted at /content/drive/


In [8]:
import os

In [9]:
# Show the current working directory before switching to the project folder.
os.getcwd()

'/content'

In [10]:
# Absolute path so the cell can be re-run safely: a relative chdir only works
# while the working directory is still /content.
os.chdir('/content/drive/MyDrive/SteamReviews2021Project')

In [11]:
# List the contents of the project folder (the current working directory).
os.listdir(".")

['data',
 'BDA_Project_Preprocessing.ipynb',
 'models',
 'Plotting_vectorization.ipynb',
 'SVM_pipelines.ipynb',
 'Preprocessing+tokenization.ipynb',
 'Metrics&Results_pipeline.ipynb']

# Constants

In [12]:
# Column layout of the full review records. Every field is nullable
# (nullable=True is the default of StructType.add).
# This schema is not referenced by the cells visible in this notebook.
# NOTE(review): author_steamid is declared as a 32-bit IntegerType — confirm
# that the stored ids fit, otherwise a 64-bit type is required.
non_null_schema = (
    StructType()
    .add("#", IntegerType())
    .add("app_id", IntegerType())
    .add("app_name", StringType())
    .add("review_id", IntegerType())
    .add("language", StringType())
    .add("review", StringType())
    .add("timestamp_created", IntegerType())
    .add("timestamp_updated", IntegerType())
    .add("recommended", BooleanType())
    .add("votes_helpful", IntegerType())
    .add("votes_funny", IntegerType())
    .add("weighted_vote_score", FloatType())
    .add("comment_count", IntegerType())
    .add("steam_purchase", BooleanType())
    .add("received_for_free", BooleanType())
    .add("written_during_early_access", BooleanType())
    .add("author_steamid", IntegerType())
    .add("author_num_games_owned", IntegerType())
    .add("author_num_reviews", IntegerType())
    .add("author_playtime_forever", FloatType())
    .add("author_playtime_last_two_weeks", FloatType())
    .add("author_playtime_at_review", FloatType())
    .add("author_last_played", IntegerType())
)

In [13]:
# Column-name constants, one per field of non_null_schema.

# Row index and application identity.
INDEX = "#"
APP_ID = "app_id"
APP_NAME = "app_name"

# The review itself.
REVIEW_ID = "review_id"
LANGUAGE = "language"
REVIEW = "review"
TIMESTAMP_CREATED = "timestamp_created"
TIMESTAMP_UPDATED = "timestamp_updated"
RECOMMENDED = "recommended"

# Community feedback on the review.
VOTES_HELPFUL = "votes_helpful"
VOTES_FUNNY = "votes_funny"
WEIGHTED_VOTE_SCORE = "weighted_vote_score"
COMMENT_COUNT = "comment_count"

# Purchase / access flags.
STEAM_PURCHASE = "steam_purchase"
RECEIVED_FOR_FREE = "received_for_free"
WRITTEN_DURING_EARLY_ACCESS = "written_during_early_access"

# Author columns.
AUTHOR_STEAMID = "author_steamid"
AUTHOR_NUM_GAMES_OWNED = "author_num_games_owned"
AUTHOR_NUM_REVIEWS = "author_num_reviews"
AUTHOR_PLAYTIME_FOREVER = "author_playtime_forever"
AUTHOR_PLAYTIME_LAST_TWO_WEEKS = "author_playtime_last_two_weeks"
AUTHOR_PLAYTIME_AT_REVIEW = "author_playtime_at_review"
AUTHOR_LAST_PLAYED = "author_last_played"

In [14]:
# Column-name constant; not referenced by the cells visible in this notebook.
AUX_COL = 'aux'

In [33]:
# Column names used when computing metrics: PREDICTION matches the
# "prediction" field of results_schema, and LABEL is the name the
# "recommended" column is renamed to in the Metrics section.
LABEL = "label"
PREDICTION = "prediction"

In [16]:
# Schema pairing a review id and its integer `recommended` flag with the
# review stored as an ML vector (VectorUDT). All fields are nullable, which
# is the default of StructType.add.
vectorization_schema = (
    StructType()
    .add("review_id", IntegerType())
    .add("recommended", IntegerType())
    .add("review", VectorUDT())
)

In [27]:
# Schema of the saved prediction records: review id, the integer
# `recommended` flag and the model's prediction as a double.
# All fields are nullable, which is the default of StructType.add.
results_schema = (
    StructType()
    .add("review_id", IntegerType())
    .add("recommended", IntegerType())
    .add("prediction", DoubleType())
)

# Load Results

In [18]:
# Let findspark locate the Spark installation, then create (or reuse) the
# session used by the rest of the notebook.
# NOTE(review): with master "local" everything runs inside the driver JVM, so
# the spark.executor.* settings below probably have no effect — confirm, and
# consider "local[N]" / spark.driver.memory if more resources are intended.
findspark.init()
spark = (
    SparkSession.builder
    .master("local")
    .appName("Metrics")
    .config("spark.executor.cores", "8")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

In [28]:
# Prediction file to evaluate, relative to the project folder that was made
# the working directory earlier in the notebook.
RESULTS_PATH = (
    "data/train_idf_svm_predictions/"
    "part-00000-fe4c1b7f-34ed-4b26-93b7-da705481dfe5-c000.json"
)

In [29]:
# Load the saved predictions as UTF-8 JSON using the explicit results_schema
# rather than letting Spark infer the column types.
df = (
    spark.read
    .option("encoding", "utf-8")
    .schema(results_schema)
    .json(RESULTS_PATH)
)

In [30]:
# Preview the first five rows without truncating cell contents.
df.show(n=5, truncate=False)

+---------+-----------+----------+
|review_id|recommended|prediction|
+---------+-----------+----------+
|84222416 |1          |1.0       |
|83056151 |1          |1.0       |
|82605667 |0          |1.0       |
|82535616 |1          |1.0       |
|82520252 |1          |1.0       |
+---------+-----------+----------+
only showing top 5 rows



# Metrics

In [32]:
# The review id is not needed for the metrics; keep only the recommended and prediction columns.
df = df.drop(REVIEW_ID)

In [34]:
# Rename the ground-truth column from "recommended" to "label".
df = df.withColumnRenamed(RECOMMENDED, LABEL)

In [36]:
# Cast the integer label to double so both columns handed to the metrics
# evaluator are doubles. The column is looked up through the LABEL constant
# instead of the hard-coded `df.label` attribute so the two cannot drift apart.
df = df.withColumn(LABEL, df[LABEL].cast("double"))

DataFrame[label: double, prediction: double]

In [40]:
metrics = MulticlassMetrics(df.rdd)



In [41]:
cm=metrics.confusionMatrix().toArray()

In [42]:
accuracy=(cm[0][0]+cm[1][1])/cm.sum()

In [43]:
precision=(cm[0][0])/(cm[0][0]+cm[1][0])

In [44]:
recall=(cm[0][0])/(cm[0][0]+cm[0][1])

In [45]:
f1=2* (precision * recall) / (precision+recall)

# End Results

In [46]:
# Display the accuracy computed in the Metrics section.
accuracy

0.9558808783716033

In [47]:
# Display the precision computed in the Metrics section.
precision

0.9573450269085111

In [48]:
# Display the recall computed in the Metrics section.
recall

0.9550606482402068

In [49]:
# Display the F1 score (harmonic mean of the precision and recall above).
f1

0.9562014732231734