# Imports & Env

In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

In [2]:
import collections
collections.Callable = collections.abc.Callable

In [3]:
from pyreadline import Readline
readline = Readline()
import rlcompleter
readline.parse_and_bind("tab: complete")

In [4]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, FloatType 
from pyspark.sql.types import ArrayType, DoubleType, BooleanType
from pyspark.sql.functions import col,array_contains

In [5]:
import os

In [6]:
os.chdir("..")

In [7]:
os.getcwd()

'E:\\coding\\repos\\BigDataProject'

In [8]:
spark = SparkSession.builder.master("local").appName('SamplingReviews')\
.config('spark.executor.cores', '8').config('spark.executor.memory','16g').getOrCreate()

# Sampling

## Init Dataset

In [21]:
DATASET_PATH = "data/steam_reviews.csv"

In [13]:
NON_NULL_DATASET_PATH = "data/steam_reviews_non_null"

In [10]:
schema = StructType() \
      .add("#",IntegerType(),True) \
      .add("app_id",IntegerType(),True) \
      .add("app_name",StringType(),True) \
      .add("review_id",IntegerType(),True) \
      .add("language",StringType(),True) \
      .add("review",StringType(),True) \
      .add("timestamp_created",IntegerType(),True) \
      .add("timestamp_updated",IntegerType(),True) \
      .add("recommended",BooleanType(),True) \
      .add("votes_helpful",IntegerType(),True) \
      .add("votes_funny",IntegerType(),True) \
      .add("weighted_vote_score",FloatType(),True) \
      .add("comment_count",IntegerType(),True) \
      .add("steam_purchase",BooleanType(),True) \
      .add("received_for_free",BooleanType(),True) \
      .add("written_during_early_access",BooleanType(),True) \
      .add("author.steamid",IntegerType(),True) \
      .add("author.num_games_owned",IntegerType(),True) \
      .add("author.num_reviews",IntegerType(),True) \
      .add("author.playtime_forever",FloatType(),True) \
      .add("author.playtime_last_two_weeks",FloatType(),True) \
      .add("author.playtime_at_review",FloatType(),True) \
      .add("author.last_played",IntegerType(),True)

In [11]:
INDEX = "#"
APP_ID = "app_id"
APP_NAME = "app_name"
REVIEW_ID = "review_id"
LANGUAGE = "language"
REVIEW = "review"
TIMESTAMP_CREATED = "timestamp_created"
TIMESTAMP_UPDATED = "timestamp_updated"
RECOMMENDED = "recommended"
VOTES_HELPFUL = "votes_helpful"
VOTES_FUNNY = "votes_funny"
WEIGHTED_VOTE_SCORE = "weighted_vote_score"
COMMENT_COUNT = "comment_count"
STEAM_PURCHASE = "steam_purchase"
RECEIVED_FOR_FREE = "received_for_free"
WRITTEN_DURING_EARLY_ACCESS = "written_during_early_access"
AUTHOR_STEAMID = "author_steamid"
AUTHOR_NUM_GAMES_OWNED = "author_num_games_owned"
AUTHOR_NUM_REVIEWS = "author_num_reviews"
AUTHOR_PLAYTIME_FOREVER = "author_playtime_forever"
AUTHOR_PLAYTIME_LAST_TWO_WEEKS = "author_playtime_last_two_weeks"
AUTHOR_PLAYTIME_AT_REVIEW = "author_playtime_at_review"
AUTHOR_LAST_PLAYED = "author_last_played"

In [12]:
df_with_schema = spark.read.format("csv") \
      .option("header", True) \
      .option("encoding", "utf-8") \
      .schema(schema) \
      .load(DATASET_PATH)

In [13]:
new_columns = [x.replace(".","_") for x in df_with_schema.columns]

In [14]:
df_with_schema = df_with_schema.toDF(*new_columns)

In [15]:
df_with_schema.printSchema()

root
 |-- #: integer (nullable = true)
 |-- app_id: integer (nullable = true)
 |-- app_name: string (nullable = true)
 |-- review_id: integer (nullable = true)
 |-- language: string (nullable = true)
 |-- review: string (nullable = true)
 |-- timestamp_created: integer (nullable = true)
 |-- timestamp_updated: integer (nullable = true)
 |-- recommended: boolean (nullable = true)
 |-- votes_helpful: integer (nullable = true)
 |-- votes_funny: integer (nullable = true)
 |-- weighted_vote_score: float (nullable = true)
 |-- comment_count: integer (nullable = true)
 |-- steam_purchase: boolean (nullable = true)
 |-- received_for_free: boolean (nullable = true)
 |-- written_during_early_access: boolean (nullable = true)
 |-- author_steamid: integer (nullable = true)
 |-- author_num_games_owned: integer (nullable = true)
 |-- author_num_reviews: integer (nullable = true)
 |-- author_playtime_forever: float (nullable = true)
 |-- author_playtime_last_two_weeks: float (nullable = true)
 |-- au

In [17]:
df_with_schema.count()

40848659

In [17]:
df_non_null = df_with_schema.where(df_with_schema[INDEX].isNull()==False)

In [19]:
df_non_null.count()

21768122

In [25]:
df_non_null.repartition(1).write.options(header='True', delimiter=',').csv(NON_NULL_DATASET_PATH)

# load the non null df

In [None]:
# create the bellow mentioned samples of the dataset

In [None]:
# move as methods, the code for creating each dataset in the codebase

In [None]:
# move all the constants and variables defined into the codebase

In [30]:
non_null_schema = StructType() \
      .add("#",IntegerType(),True) \
      .add("app_id",IntegerType(),True) \
      .add("app_name",StringType(),True) \
      .add("review_id",IntegerType(),True) \
      .add("language",StringType(),True) \
      .add("review",StringType(),True) \
      .add("timestamp_created",IntegerType(),True) \
      .add("timestamp_updated",IntegerType(),True) \
      .add("recommended",BooleanType(),True) \
      .add("votes_helpful",IntegerType(),True) \
      .add("votes_funny",IntegerType(),True) \
      .add("weighted_vote_score",FloatType(),True) \
      .add("comment_count",IntegerType(),True) \
      .add("steam_purchase",BooleanType(),True) \
      .add("received_for_free",BooleanType(),True) \
      .add("written_during_early_access",BooleanType(),True) \
      .add("author_steamid",IntegerType(),True) \
      .add("author_num_games_owned",IntegerType(),True) \
      .add("author_num_reviews",IntegerType(),True) \
      .add("author_playtime_forever",FloatType(),True) \
      .add("author_playtime_last_two_weeks",FloatType(),True) \
      .add("author_playtime_at_review",FloatType(),True) \
      .add("author_last_played",IntegerType(),True)

In [31]:
df_non_null = spark.read.format("csv") \
      .option("header", True) \
      .option("encoding", "utf-8") \
      .schema(non_null_schema) \
      .load(NON_NULL_DATASET_PATH)

In [17]:
df_non_null.select(LANGUAGE).distinct().count()

713

In [25]:
df_non_null.where(df_non_null[REVIEW].isNull()==False).count()

21714875

In [26]:
df_non_null.where(df_non_null[RECOMMENDED].isNull()==False).count()

16672507

In [27]:
df_non_null.where(df_non_null[APP_ID].isNull()==False).count()

21747682

In [32]:
df_non_null.where(df_non_null[AUTHOR_STEAMID].isNull()==False).count()

18709

In [33]:
df_non_null.where(df_non_null[COMMENT_COUNT].isNull()==False).count()

16715506

In [34]:
df_non_null.where(df_non_null[VOTES_FUNNY].isNull()==False).count()

16729943

In [35]:
df_non_null.where(df_non_null[VOTES_HELPFUL].isNull()==False).count()

16705650

In [37]:
df_non_null.where(df_non_null[TIMESTAMP_CREATED].isNull()==False).count()

16675508

In [38]:
df_non_null.where(df_non_null[TIMESTAMP_UPDATED].isNull()==False).count()

16709690

In [39]:
df_non_null.where(df_non_null[LANGUAGE].isNull()==False).count()

21750747

In [40]:
df_non_null.where(df_non_null[STEAM_PURCHASE].isNull()==False).count()

16678562

In [41]:
df_non_null.where(df_non_null[RECEIVED_FOR_FREE].isNull()==False).count()

16713547

In [42]:
df_non_null.where(df_non_null[WRITTEN_DURING_EARLY_ACCESS].isNull()==False).count()

16732525

In [43]:
df_non_null.where(df_non_null[AUTHOR_NUM_GAMES_OWNED].isNull()==False).count()

16686141

In [44]:
df_non_null.where(df_non_null[AUTHOR_NUM_REVIEWS].isNull()==False).count()

16719280

In [45]:
df_non_null.where(df_non_null[AUTHOR_PLAYTIME_FOREVER].isNull()==False).count()

16752281

In [46]:
df_non_null.where(df_non_null[AUTHOR_PLAYTIME_LAST_TWO_WEEKS].isNull()==False).count()

16758126

In [47]:
df_non_null.where(df_non_null[AUTHOR_PLAYTIME_AT_REVIEW].isNull()==False).count()

16743208

In [48]:
df_non_null.where(df_non_null[AUTHOR_LAST_PLAYED].isNull()==False).count()

17940

# Non null final

In [58]:
df_non_null_final = df_non_null.drop(AUTHOR_LAST_PLAYED, AUTHOR_STEAMID)

In [62]:
df_non_null_final = df_non_null_final.dropna(how="any")

In [64]:
df_non_null_final.repartition(1).write.options(header='True', delimiter=',').csv(FINAL_DATASET)

In [73]:
FINAL_DATASET = "data/steam_reviews_final.csv"

In [74]:
df_non_null_final = spark.read.format("csv") \
      .option("header", True) \
      .option("encoding", "utf-8") \
      .schema(non_null_schema) \
      .load(FINAL_DATASET)

In [75]:
df_non_null_final.count()

16618478

In [76]:
df_non_null_final.where(df_non_null_final[APP_ID].isNull()==False).count()

16618478

In [77]:
df_non_null_final.where(df_non_null_final[RECOMMENDED].isNull()==False).count()

16618478

In [78]:
df_non_null_final.where(df_non_null_final[REVIEW].isNull()==False).count()

16618478

# Sampling

In [18]:
df_english_dataset = df_non_null.where(df_non_null[LANGUAGE] == "english")

In [19]:
df_english_dataset.count()

9635437

In [22]:
df_english_dataset.groupBy(RECOMMENDED).count().show()

+-----------+-------+
|recommended|  count|
+-----------+-------+
|       null|2369027|
|       true|6596099|
|      false| 670311|
+-----------+-------+



In [20]:
df_non_english_dataset = df_non_null.where(df_non_null[LANGUAGE] != "english")

In [21]:
df_non_english_dataset.count()

12115310

In [54]:
df_non_english_dataset.groupBy(RECOMMENDED).count().show()

+-----------+--------+
|recommended|   count|
+-----------+--------+
|       null| 4810145|
|       true|10007504|
|      false| 1897153|
+-----------+--------+



In [None]:
# 100k, 50k, 10k

In [None]:
# small sample for english dataset balanced around RECOMMENDED

In [None]:
# small sample for english dataset balanced around APP_ID

In [None]:
# small sample for non-english dataset balanced around RECOMMENDED

In [None]:
# small sample for non-english dataset balanced around APP_ID

# Detect Encoding

In [36]:
from chardet import detect

In [37]:
type(x)

bytes

In [42]:
with open(DATASET_PATH, "rb") as infile:
    # for _ in range(30):
    x = infile.read(400000)

In [43]:
detect(x)

{'encoding': 'Windows-1254',
 'confidence': 0.3287225310697952,
 'language': 'Turkish'}

# Ending

In [None]:
spark.close()