# Steam Review Dataset EDA (Exploratory Data Analysis)
This notebook performs exploratory data analysis on a 47 GB dataset of Steam game reviews, using PySpark on Google Cloud Dataproc.

In [None]:
# Echo the SparkSession object as the cell output (nothing is assigned or modified here).
spark

## 1. Imports and Google Cloud Storage Setup

In [None]:
from pyspark.sql.functions import col, isnan, count, udf, length, when
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from google.cloud import storage
from io import StringIO, BytesIO
import gzip
# Pandas display settings: render floats with two decimal places and allow
# printed output to be up to 1000 characters wide.
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.width = 1000

In [None]:
# Note: spark is automatically initialized in GCP Dataproc clusters.
# `sc` is not defined in the visible cells -- presumably it is the SparkContext
# that the Dataproc kernel pre-creates alongside `spark` (TODO confirm).
# Set the log level to ERROR.
sc.setLogLevel("ERROR")

In [None]:
# Storage locations used by this notebook.
# REPLACE the value of `bucket` with the name of your own GCS bucket.
bucket = 'gs://whatever-your-bucket-name-is'
landing_folder = bucket + "/landing/all_reviews.csv"  # raw CSV that gets loaded below
cleaned_folder = bucket + "/cleaned/"                 # folder path for cleaned data

## 2. Define Schema & Load Dataset from GCS

In [None]:
#preset schema for predictors since inferSchema was only giving me string
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, FloatType
# Explicit schema for the reviews CSV, written as (column name, Spark type) pairs
# in file order. The trailing comments give an example value for each column.
_steam_columns = [
    ("recommendationid", LongType()),                  # Large integer (148919893)
    ("appid", IntegerType()),                          # Small integer (10)
    ("game", StringType()),                            # Text ("Counter-Strike")
    ("author_steamid", LongType()),                    # Large integer (76561199036724879)
    ("author_num_games_owned", IntegerType()),         # Small integer (0)
    ("author_num_reviews", IntegerType()),             # Small integer (3)
    ("author_playtime_forever", IntegerType()),        # Integer (197)
    ("author_playtime_last_two_weeks", IntegerType()), # Integer (197)
    ("author_playtime_at_review", IntegerType()),      # Integer (197)
    ("author_last_played", LongType()),                # Unix timestamp (1698336369)
    ("language", StringType()),                        # Text ("russian")
    ("review", StringType()),                          # Text ("старость")
    ("timestamp_created", LongType()),                 # Unix timestamp (1698336397)
    ("timestamp_updated", LongType()),                 # Unix timestamp (1698336397)
    ("voted_up", IntegerType()),                       # Small integer (1 as true)
    ("votes_up", IntegerType()),                       # Small integer (0)
    ("votes_funny", IntegerType()),                    # Small integer (0)
    ("weighted_vote_score", FloatType()),              # Float (0.0)
    ("comment_count", IntegerType()),                  # Small integer (0)
    ("steam_purchase", IntegerType()),                 # Small integer (1 as true)
    ("received_for_free", IntegerType()),              # Small integer (0 as false)
    ("written_during_early_access", IntegerType()),    # Small integer (0 as false)
    ("hidden_in_steam_china", IntegerType()),          # Small integer (1 as true)
    ("steam_china_location", StringType()),
]

# Every column is declared nullable (third StructField argument is True).
steamSchema = StructType(
    [StructField(name, dtype, True) for name, dtype in _steam_columns]
)

In [None]:
# Load the landing CSV into a DataFrame using the preset schema; the first row
# of the file is treated as the header.
# NOTE(review): only `header` and `schema` are set, so the CSV reader's default
# quoting / single-line parsing applies. If review text contains embedded
# newlines or quote characters, rows may be mis-parsed -- confirm against the file.
sdf = spark.read.csv(
    path=landing_folder,
    schema=steamSchema,
    header=True,
)

In [None]:
# Preview the first two records, printed vertically (one field per line).
sdf.show(2, vertical=True)

In [None]:
# Print the DataFrame schema to check that the preset schema was applied and
# to list the available variables.
sdf.printSchema()

In [None]:
# List the distinct values of the language column (to be encoded later).
distinct_languages = sdf.select('language').distinct()
distinct_languages.show()

In [None]:
# Number of records in the Steam review DataFrame (shown as the cell output).
sdf.count()

In [None]:
# Summary statistics for the DataFrame, printed vertically; the min/max rows
# also show the range of the Unix-timestamp columns.
sdf.summary().show(vertical=True)

In [None]:
# Quick summary (count, min, max) of the raw review column.
review_column = sdf.select('review')
review_column.summary('count', 'min', 'max').show()

In [None]:
# Names of the columns to inspect for missing values, one per line, in the
# same order as the schema definition.
column_list = """
    recommendationid
    appid
    game
    author_steamid
    author_num_games_owned
    author_num_reviews
    author_playtime_forever
    author_playtime_last_two_weeks
    author_playtime_at_review
    author_last_played
    language
    review
    timestamp_created
    timestamp_updated
    voted_up
    votes_up
    votes_funny
    weighted_vote_score
    comment_count
    steam_purchase
    received_for_free
    written_during_early_access
    hidden_in_steam_china
    steam_china_location
""".split()

In [None]:
# Count the missing values in each column and print one line per column.
# A value counts as missing when it is NULL or, for floating-point columns, NaN.
#
# isnan() is applied only to float/double columns. On a string column Spark
# must first cast the text to a double, so a literal value such as "NaN" in a
# review would be reported as missing; integer columns can never hold NaN.
float_columns = {name for name, dtype in sdf.dtypes if dtype in ("float", "double")}

missing_counts = []
for c in column_list:
    is_missing = col(c).isNull()
    if c in float_columns:
        is_missing = is_missing | isnan(col(c))
    # when() yields a non-null literal only for missing rows; count() tallies them.
    missing_counts.append(count(when(is_missing, c)).alias(c))

sdf.select(missing_counts).show(vertical=True)

## 3. Review Text Preprocessing

In [None]:
def ascii_only(mystring):
    """Return ``mystring`` with every non-ASCII character stripped out.

    Falsy input (``None`` or an empty string) yields ``None``. A non-empty
    string made up solely of non-ASCII characters yields ``""``.
    """
    if not mystring:
        return None
    # Encoding with errors='ignore' silently drops anything outside ASCII.
    ascii_bytes = mystring.encode('ascii', errors='ignore')
    return ascii_bytes.decode('ascii')
    
# Wrap ascii_only as a Spark UDF so it can be applied to DataFrame columns.
# No return type is passed, so udf()'s default return type is used.
ascii_udf = udf(ascii_only)

In [None]:
# Add a clean_review column: the review text passed through the ASCII-only UDF.
sdf = sdf.withColumn(
    'clean_review',
    ascii_udf(col('review')),
)

In [None]:
# Basic statistics (count, min, max) for the clean_review column.
clean_review_column = sdf.select('clean_review')
clean_review_column.summary('count', 'min', 'max').show()

In [None]:
# Add clean_review_length: the character length of clean_review for each record.
sdf = sdf.withColumn(
    'clean_review_length',
    length('clean_review'),
)

In [None]:
# Check the cleaned reviews: count, minimum and maximum of clean_review_length.
sdf.select('clean_review_length').summary('count', 'min', 'max').show()

## 4. Variable Distributions for Text Cleaning & Sentiment Modeling

In [None]:
# Number of reviews per language, most frequent first.
language_counts = sdf.groupby('language').count()
language_counts.orderBy('count', ascending=False).show()

In [None]:
# Number of reviews per game, most frequent first.
game_counts = sdf.groupby('game').count()
game_counts.orderBy('count', ascending=False).show()

In [None]:
# Number of reviews per voted_up value (the target variable); only the two
# most frequent values are displayed.
voted_up_counts = sdf.groupby('voted_up').count()
voted_up_counts.orderBy('count', ascending=False).show(2)

In [None]:
# Draw a 0.5% random sample (fixed seed) of the language and voted_up columns
# and bring it into pandas to show overall trends.
language_and_vote = sdf.select('language', 'voted_up')
df_sample = language_and_vote.sample(fraction=0.005, seed=42).toPandas()

In [None]:
# Keep only the sampled rows whose voted_up equals 1 (positive reviews) for plotting.
is_positive = df_sample['voted_up'] == 1
df_sample_1 = df_sample[is_positive]

In [None]:
# Count the positive reviews per language, largest count first.
positive_by_language = df_sample_1.groupby('language')['voted_up']
lan_votes = positive_by_language.count().sort_values(ascending=False)

In [None]:
# Reset the index of lan_votes so that 'language' becomes a regular column,
# then keep the first 10 rows (lan_votes is sorted in descending order, so
# these are the top 10 languages).
lan_votes_df = lan_votes.reset_index()
top_lan_votes = lan_votes_df.head(10)

In [None]:
# Bar chart of the top 10 languages by number of positive (voted_up) reviews
# in the sample: shows which language leaves the most positive reviews.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=top_lan_votes,
    x='language',
    y='voted_up',
    color='skyblue',
    edgecolor='black',
    ax=ax,
)
ax.set_title('Top 10 languages by Total Voted up')
ax.set_ylabel('Counts Voted up')
# Tilt the language names on the x axis.
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
fig.tight_layout()
plt.show()

In [None]:
# Draw a 0.5% random sample (fixed seed) of the game and votes_funny columns
# and bring it into pandas for plotting.
game_and_funny = sdf.select('game', 'votes_funny')
df_sample_2 = game_and_funny.sample(fraction=0.005, seed=42).toPandas()

In [None]:
# Keep the sampled reviews that were voted funny, i.e. received at least one
# funny vote, for plotting.
# The filter is `> 0` rather than `== 1`: votes_funny is loaded as an integer
# column, and an equality test would drop every review holding two or more
# funny votes. Rows with a missing votes_funny are excluded either way.
received_funny_vote = df_sample_2['votes_funny'] > 0
df_sample_2_1 = df_sample_2[received_funny_vote]

In [None]:
# Count the filtered reviews per game, largest count first.
funny_by_game = df_sample_2_1.groupby('game')['votes_funny']
fun_votes = funny_by_game.count().sort_values(ascending=False)

In [None]:
# Reset the index of fun_votes so that 'game' becomes a regular column, then
# keep the first 10 rows (fun_votes is sorted in descending order, so these
# are the top 10 games).
fun_votes_df = fun_votes.reset_index()
top_fun_votes = fun_votes_df.head(10)

In [None]:
# Bar chart of the top 10 games by the votes_funny count computed above:
# shows which game has the most votes for being funny in the sample.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=top_fun_votes,
    x='game',
    y='votes_funny',
    color='red',
    edgecolor='black',
    ax=ax,
)
ax.set_title('Top 10 games by count of votes_funny')
ax.set_ylabel('Total Votes Funny')
# Tilt the game names on the x axis.
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
fig.tight_layout()
plt.show()
    