In [1]:
import os
import sys

curr_dir = os.getcwd()
pkg_dir = os.path.join(os.path.dirname(curr_dir), "steam_sales", "steam_etl")
sys.path.append(pkg_dir)

In [2]:
import itertools
import json
import math
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from db import get_db
from settings import Path
from sqlalchemy import text

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 100)

### Fetch Data from Database

In [3]:
with open(os.path.join(Path.sql_queries, "get_all_game_data.sql"), "r") as f:
    query = text(f.read())


with get_db() as db:
    result = db.execute(query)

data = result.fetchall()
columns = result.keys()
game_data = pd.DataFrame(data, columns=columns)


game_data.head()

Unnamed: 0,name,appid,required_age,controller_support,dlc,requirements,platform,metacritic,categories,genres,recommendations,achievements,release_date,coming_soon,english,developer,publisher,price,description,year,month,day,positive_ratings,negative_ratings,owners_in_millions,average_forever,median_forever,languages,steamspy_tags
0,Counter-Strike,10,0,0,0,"{""minimum"": ""\r\n\t\t\tMinimum: 500 mhz proces...",mac;linux;windows,88,Multi-player;PvP;Online PvP;Shared/Split Scree...,Action,150423,0,2000-11-01,0,1,Valve,Valve,9.99,Play the world's number 1 online action game. ...,2000.0,11.0,1.0,232593,6098,10.0 - 20.0,16697,219,"English, French, German, Italian, Spanish - Sp...","{""FPS"": 4902, ""PvP"": 910, ""1980s"": 279, ""1990'..."
1,Team Fortress Classic,20,0,0,0,"{""minimum"": ""\r\n\t\t\tMinimum: 500 mhz proces...",mac;linux;windows,0,Multi-player;PvP;Online PvP;Shared/Split Scree...,Action,6133,0,1999-04-01,0,1,Valve,Valve,4.99,One of the most popular online action games of...,1999.0,4.0,1.0,7181,1089,5.0 - 10.0,1078,16,"English, French, German, Italian, Spanish - Sp...","{""FPS"": 327, ""Mod"": 39, ""Co-op"": 98, ""Funny"": ..."
2,Day of Defeat,30,0,0,0,"{""minimum"": ""\r\n\t\t\tMinimum: 500 mhz proces...",mac;linux;windows,79,Multi-player;Valve Anti-Cheat enabled;Family S...,Action,4074,0,2003-05-01,0,1,Valve,Valve,4.99,Enlist in an intense brand of Axis vs. Allied ...,2003.0,5.0,1.0,6166,671,5.0 - 10.0,187,41,"English, French, German, Italian, Spanish - Spain","{""FPS"": 798, ""War"": 158, ""Co-op"": 36, ""Retro"":..."
3,Deathmatch Classic,40,0,0,0,"{""minimum"": ""\r\n\t\t\tMinimum: 500 mhz proces...",mac;linux;windows,0,Multi-player;PvP;Online PvP;Shared/Split Scree...,Action,2149,0,2001-06-01,0,1,Valve,Valve,4.99,Enjoy fast-paced multiplayer gaming with Death...,2001.0,6.0,1.0,2479,522,5.0 - 10.0,66,11,"English, French, German, Italian, Spanish - Sp...","{""FPS"": 150, ""Gore"": 20, ""Co-op"": 16, ""Retro"":..."
4,Half-Life: Opposing Force,50,0,0,0,"{""minimum"": ""\r\n\t\t\tMinimum: 500 mhz proces...",mac;linux;windows,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,19244,0,1999-11-01,0,1,Gearbox Software,Valve,4.99,Return to the Black Mesa Research Facility as ...,1999.0,11.0,1.0,21457,1098,2.0 - 5.0,431,122,"English, French, German, Korean","{""FPS"": 919, ""Gore"": 58, ""Co-op"": 43, ""Retro"":..."


In [7]:
game_data["description"].iloc[10000 - 4 - 1]

'Prismata\'s new free-to-play edition is\nradically fair\n, featuring\nno card packs, no grinding, and absolutely no pay-to-win\n. We steadfastly refuse to sell items that provide advantages in competitive multiplayer. Instead, you can support us by purchasing cosmetics or premium single-player content. Thank you!\nPrepare yourself for the ultimate strategy game.\nPrismata—the gripping debut from Lunarch Studios—radically redefines the strategy genre by combining elements of real-time strategy, card games, and tabletop strategy games to create something entirely new and unique.\nCompeting against other humans or ruthlessly efficient AI, you\'ll plan devastating attacks through a series of quick-fire turns. Will you outwit your opponents by striking at the perfect moment? Or will you fall victim to a devious trap?\nGame Modes\nCampaign:\nan enthralling 40-mission story-based single player adventure\nCasual Match:\nplay friendly games against other humans or one of 30 different AI person

### Preprocess Data

Some values of geners and categories need to be translated to english

In [7]:
from fuzzywuzzy import process


def get_unique(series):
    """
    Returns a set of unique values from a series of strings.

    Parameters:
    series (pandas.Series): A series of strings.

    Returns:
    set: A set of unique values extracted from the series.

    """
    return set(list(itertools.chain(*series.apply(lambda x: [c for c in x.split(";")]))))

In [11]:
geners = get_unique(game_data["genres"])
geners

{'Accounting',
 'Action',
 'Adventure',
 'Adventure games',
 'Animation & Modeling',
 'Audio Production',
 'Casual',
 'Design & Illustration',
 'Early Access',
 'Education',
 'Free To Play',
 'Free to Play',
 'Game Development',
 'Gore',
 'Indie',
 'Massively Multiplayer',
 'Movie',
 'Nudity',
 'Photo Editing',
 'RPG',
 'Racing',
 'Role',
 'Role Playing (RPG)',
 'Sexual Content',
 'Short',
 'Simulation',
 'Simulators',
 'Software Training',
 'Sports',
 'Strategy',
 'Utilities',
 'Video Production',
 'Violent',
 'Web Publishing',
 'action'}

In [12]:
def standardize_genre(value, genre_list):
    # Convert to lowercase for consistent comparison
    value_lower = value.lower()

    # Define common patterns
    if "rpg" in value_lower or "role playing" in value_lower or "role" in value_lower:
        return "RPG"
    if "simulation" in value_lower or "simulators" in value_lower:
        return "Simulation"
    if "adventure" in value_lower:
        return "Adventure"


# Function to standardize multiple genres
def standardize_multiple_genres(genres_str, genre_list):
    genres = genres_str.split(";")
    standardized_genres = [standardize_genre(genre.strip(), genre_list) for genre in genres]
    return ";".join(sorted(set(standardized_genres)))  # Use sorted(set()) to remove duplicates and sort

    # Find the best match from the list of unique genres
    match, score = process.extractOne(value, genre_list)
    return match


# Apply the standardization function to the Genres column
game_data["genres"] = game_data["genres"].apply(lambda x: standardize_multiple_genres(x, geners))
geners = get_unique(game_data["genres"])
geners

TypeError: sequence item 0: expected str instance, NoneType found

In [6]:
categories = get_unique(game_data["categories"])
categories

{' Co-op',
 ' Family Sharing',
 ' Multiplayer',
 ' Partial controller support',
 ' Remote play on tablet',
 ' Steam Achievements',
 ' Steam trading cards',
 ' Valve Anti-Cheat enabled',
 ' co-op',
 ' cross-platform multiplayer',
 ' in-app purchases',
 ' multiplayer',
 ' online co-op',
 ' partial controller support',
 'Captions available',
 'Co-op',
 'Collectible cards',
 'Commentary available',
 'Controller (Full)',
 'Controller (partial)',
 'Cross-Platform Multiplayer',
 'Cross-platform multiplayer',
 'Family Library',
 'Family Library Sharing',
 'Family Sharing',
 'Full Controller Support',
 'Full controller support',
 'HDR available',
 'In-App Purchases',
 'In-app purchases',
 'Includes Level Editor',
 'Includes Source SDK',
 'Includes level editor',
 'LAN Co-op',
 'LAN PvP',
 'LAN co-op',
 'Library sharing',
 'MMO',
 'Mods',
 'Mods (require HL2)',
 'Motion detection on controller',
 'Multi-player',
 'Multiplayer',
 'Multiple',
 'Online Co-op',
 'Online PvP',
 'Online co-op',
 'Part

Creating a rating score from positive and negative ratings using [SteamDB's](https://steamdb.info/blog/steamdb-rating/) method

In [29]:
def calc_rating(row):
    """
    Calculate the rating score for a given row.

    Parameters:
    - row: A dictionary representing a row of data with 'positive_ratings' and 'negative_ratings' keys.

    Returns:
    - score: The calculated rating score as a percentage.

    """
    pos = row["positive_ratings"]
    neg = row["negative_ratings"]

    total_reviews = pos + neg

    if total_reviews > 0:
        average = pos / total_reviews
        score = average - (average * 0.5) * 2 ** (-math.log10(total_reviews + 1))
        return score * 100
    else:
        return 0.0


game_data["total_ratings"] = game_data["positive_ratings"] + game_data["negative_ratings"]
game_data["review_score"] = game_data["positive_ratings"] / game_data["total_ratings"]
game_data["rating"] = game_data.apply(calc_rating, axis=1)

In [30]:
game_data.head()

Unnamed: 0,name,appid,required_age,controller_support,dlc,requirements,platform,metacritic,categories,genres,recommendations,achievements,release_date,coming_soon,english,developer,publisher,price,description,year,month,day,positive_ratings,negative_ratings,owners_in_millions,average_forever,median_forever,languages,steamspy_tags,total_ratings,rating_ratio,rating
0,Counter-Strike,10,0,0,0,"{""minimum"": ""\r\n\t\t\tMinimum: 500 mhz proces...",mac;linux;windows,88,Multi-player;PvP;Online PvP;Shared/Split Scree...,Action,150423,0,2000-11-01,0,1,Valve,Valve,9.99,Play the world's number 1 online action game. ...,2000.0,11.0,1.0,232593,6098,10.0 - 20.0,16697,219,"English, French, German, Italian, Spanish - Sp...","{""FPS"": 4902, ""PvP"": 910, ""1980s"": 279, ""1990'...",238691,0.974452,96.273469
1,Team Fortress Classic,20,0,0,0,"{""minimum"": ""\r\n\t\t\tMinimum: 500 mhz proces...",mac;linux;windows,0,Multi-player;PvP;Online PvP;Shared/Split Scree...,Action,6133,0,1999-04-01,0,1,Valve,Valve,4.99,One of the most popular online action games of...,1999.0,4.0,1.0,7181,1089,5.0 - 10.0,1078,16,"English, French, German, Italian, Spanish - Sp...","{""FPS"": 327, ""Mod"": 39, ""Co-op"": 98, ""Funny"": ...",8270,0.868319,83.958848
2,Day of Defeat,30,0,0,0,"{""minimum"": ""\r\n\t\t\tMinimum: 500 mhz proces...",mac;linux;windows,79,Multi-player;Valve Anti-Cheat enabled;Family S...,Action,4074,0,2003-05-01,0,1,Valve,Valve,4.99,Enlist in an intense brand of Axis vs. Allied ...,2003.0,5.0,1.0,6166,671,5.0 - 10.0,187,41,"English, French, German, Italian, Spanish - Spain","{""FPS"": 798, ""War"": 158, ""Co-op"": 36, ""Retro"":...",6837,0.901858,87.025811
3,Deathmatch Classic,40,0,0,0,"{""minimum"": ""\r\n\t\t\tMinimum: 500 mhz proces...",mac;linux;windows,0,Multi-player;PvP;Online PvP;Shared/Split Scree...,Action,2149,0,2001-06-01,0,1,Valve,Valve,4.99,Enjoy fast-paced multiplayer gaming with Death...,2001.0,6.0,1.0,2479,522,5.0 - 10.0,66,11,"English, French, German, Italian, Spanish - Sp...","{""FPS"": 150, ""Gore"": 20, ""Co-op"": 16, ""Retro"":...",3001,0.826058,78.897492
4,Half-Life: Opposing Force,50,0,0,0,"{""minimum"": ""\r\n\t\t\tMinimum: 500 mhz proces...",mac;linux;windows,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,19244,0,1999-11-01,0,1,Gearbox Software,Valve,4.99,Return to the Black Mesa Research Facility as ...,1999.0,11.0,1.0,21457,1098,2.0 - 5.0,431,122,"English, French, German, Korean","{""FPS"": 919, ""Gore"": 58, ""Co-op"": 43, ""Retro"":...",22555,0.951319,92.8047


### EDA

In [None]:
def categorize_year(year):
    if year < 2020:
        return "Before 2020"
    elif 2020 <= year <= 2022:
        return "2020-2022"
    else:
        return "After 2022"


game_data["year"] = game_data["year"].fillna(0).astype(int)
game_data["Region"] = game_data["year"].apply(categorize_year)

# Calculate the frequency of each year
yearly_counts = game_data.groupby(["Region", "year"]).size().reset_index(name="Frequency")

# Plotting using Seaborn
plt.figure(figsize=(12, 6))
sns.barplot(data=yearly_counts, x="year", y="Frequency", hue="Region")
plt.title("Game Release by Year")
plt.xlabel("Year")
plt.ylabel("Frequency")
plt.xticks(rotation=45)
plt.show()

Tags

In [None]:
tags = col_row_df["tags"]
parsed_tags = tags.apply(lambda x: literal_eval(x) if x else {})

unique_tags = set(itertools.chain(*parsed_tags))

print("Number of unique tags:", len(unique_tags))

# Create a DataFrame with 15 columns and 30 rows
num_columns = 15
num_rows = 30

unique_tags = sorted(list(unique_tags))

# Reshape the list into the desired DataFrame shape
ut = [unique_tags[i * num_columns : (i + 1) * num_columns] for i in range(num_rows)]

# Create the DataFrame
utdf = pd.DataFrame(ut)
utdf

In [None]:
tagc = Counter()

for tag_list in parsed_tags:
    tagc.update(tag_list.keys())

tagc

Languages

In [None]:
langs = col_row_df["languages"]
langs = langs.apply(lambda x: x.split(", ") if x else [])

langc = Counter()

for l in langs:
    langc.update(l)

langc