In [33]:
import polars as pl
import pandas as pd
import numpy as np
import os

# Load the Parquet file into a Polars DataFrame
polarsReviews = pl.read_parquet('../all_reviews_processed.parquet')

# Display the first 10 rows to understand the data structure
print(polarsReviews.head(10))


shape: (10, 9)
┌────────────┬───────┬────────────┬──────────┬───┬────────────┬────────────┬───────────┬───────────┐
│ recommenda ┆ appid ┆ game       ┆ voted_up ┆ … ┆ author_pla ┆ author_pla ┆ author_la ┆ timestamp │
│ tionid     ┆ ---   ┆ ---        ┆ ---      ┆   ┆ ytime_last ┆ ytime_at_r ┆ st_played ┆ _created  │
│ ---        ┆ i64   ┆ str        ┆ i64      ┆   ┆ _two_weeks ┆ eview      ┆ ---       ┆ ---       │
│ i64        ┆       ┆            ┆          ┆   ┆ ---        ┆ ---        ┆ i64       ┆ i64       │
│            ┆       ┆            ┆          ┆   ┆ i64        ┆ i64        ┆           ┆           │
╞════════════╪═══════╪════════════╪══════════╪═══╪════════════╪════════════╪═══════════╪═══════════╡
│ 148919893  ┆ 10    ┆ Counter-St ┆ 1        ┆ … ┆ 197        ┆ 197        ┆ 169833636 ┆ 169833639 │
│            ┆       ┆ rike       ┆          ┆   ┆            ┆            ┆ 9         ┆ 7         │
│ 148919350  ┆ 10    ┆ Counter-St ┆ 1        ┆ … ┆ 37         ┆ 441        ┆

In [34]:
# Check for missing values (nulls) in each column
polarsReviews.select(pl.all().is_null().sum()).to_dicts()[0]


{'recommendationid': 0,
 'appid': 0,
 'game': 0,
 'voted_up': 0,
 'author_playtime_forever': 8,
 'author_playtime_last_two_weeks': 8,
 'author_playtime_at_review': 0,
 'author_last_played': 8,
 'timestamp_created': 0}

In [35]:
polarsReviews = polarsReviews.with_columns(
    pl.from_epoch(pl.col("timestamp_created"), time_unit="ms").alias("timestamp_created")
)

polarsReviews.head(10)

recommendationid,appid,game,voted_up,author_playtime_forever,author_playtime_last_two_weeks,author_playtime_at_review,author_last_played,timestamp_created
i64,i64,str,i64,i64,i64,i64,i64,datetime[ms]
148919893,10,"""Counter-Strike""",1,197,197,197,1698336369,1970-01-20 15:45:36.397
148919350,10,"""Counter-Strike""",1,441,37,441,1698335809,1970-01-20 15:45:35.821
148913051,10,"""Counter-Strike""",1,1440,1440,1313,1698338635,1970-01-20 15:45:29.862
148912714,10,"""Counter-Strike""",1,1636,83,1612,1698341834,1970-01-20 15:45:29.555
148912575,10,"""Counter-Strike""",1,197,41,197,1698329401,1970-01-20 15:45:29.419
148906148,10,"""Counter-Strike""",1,1685,70,1649,1698323046,1970-01-20 15:45:21.840
148905699,10,"""Counter-Strike""",1,11,0,11,1695752100,1970-01-20 15:45:21.259
148900627,10,"""Counter-Strike""",1,45119,503,45119,1698157431,1970-01-20 15:45:14.419
148899121,10,"""Counter-Strike""",1,1271,973,1202,1698338304,1970-01-20 15:45:12.060
148898785,10,"""Counter-Strike""",1,721,0,721,1660831719,1970-01-20 15:45:11.451


In [36]:
# Number of unique games (Ensuring that there is a broad range of games to capture..... Hoping a large chunk of recent AAA games are captured)
polarsReviews.n_unique(subset=["appid"])

105893

In [37]:
# Grabbing Games DF For Publisher Info
json_path = "../games.json"
df = pd.read_json(json_path)
df = df.T
df.head()

Unnamed: 0,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,...,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,peak_ccu,tags
20200,Galactic Bowling,"Oct 21, 2008",0,19.99,0,Galactic Bowling is an exaggerated and stylize...,Galactic Bowling is an exaggerated and stylize...,Galactic Bowling is an exaggerated and stylize...,,https://cdn.akamai.steamstatic.com/steam/apps/...,...,,6,11,0 - 20000,0,0,0,0,0,"{'Indie': 22, 'Casual': 21, 'Sports': 21, 'Bow..."
655370,Train Bandit,"Oct 12, 2017",0,0.99,0,THE LAW!! Looks to be a showdown atop a train....,THE LAW!! Looks to be a showdown atop a train....,THE LAW!! Looks to be a showdown atop a train....,,https://cdn.akamai.steamstatic.com/steam/apps/...,...,,53,5,0 - 20000,0,0,0,0,0,"{'Indie': 109, 'Action': 103, 'Pixel Graphics'..."
1732930,Jolt Project,"Nov 17, 2021",0,4.99,0,Jolt Project: The army now has a new robotics ...,Jolt Project: The army now has a new robotics ...,"Shoot vehicles, blow enemies with a special at...",,https://cdn.akamai.steamstatic.com/steam/apps/...,...,,0,0,0 - 20000,0,0,0,0,0,[]
1355720,Henosis™,"Jul 23, 2020",0,5.99,0,HENOSIS™ is a mysterious 2D Platform Puzzler w...,HENOSIS™ is a mysterious 2D Platform Puzzler w...,HENOSIS™ is a mysterious 2D Platform Puzzler w...,,https://cdn.akamai.steamstatic.com/steam/apps/...,...,,3,0,0 - 20000,0,0,0,0,0,"{'2D Platformer': 161, 'Atmospheric': 154, 'Su..."
1139950,Two Weeks in Painland,"Feb 3, 2020",0,0.0,0,ABOUT THE GAME Play as a hacker who has arrang...,ABOUT THE GAME Play as a hacker who has arrang...,Two Weeks in Painland is a story-driven game a...,,https://cdn.akamai.steamstatic.com/steam/apps/...,...,,50,8,0 - 20000,0,0,0,0,0,"{'Indie': 42, 'Adventure': 41, 'Nudity': 22, '..."


In [38]:
column_name_dict = {
"name": "Name",
"release_date": "Release date",
"required_age": "Required age",
"price": "Price",
"dlc_count": "DLC count",
"detailed_description": "Detailed description",
"about_the_game": "About the game",
"short_description": "Short description",
"reviews": "Reviews",
"header_image": "Header image",
"website": "Website",
"support_url": "Support url",
"support_email": "Support email",
"windows": "Windows",
"mac": "Mac",
"linux": "Linux",
"metacritic_score": "Metacritic score",
"metacritic_url": "Metacritic url",
"achievements": "Achievements",
"recommendations": "Recommendations",
"notes": "Notes",
"supported_languages": "Supported languages",
"full_audio_languages": "Full audio languages",
"packages": "Packages",
"developers": "Developers",
"publishers": "Publishers",
"categories": "Categories",
"genres": "Genres",
"screenshots": "Screenshots",
"movies": "Movies",
"user_score": "User score",
"score_rank": "Score rank",
"positive": "Positive",
"negative": "Negative",
"estimated_owners": "Estimated owners",
"average_playtime_forever": "Average playtime forever",
"average_playtime_2weeks": "Average playtime two weeks",
"median_playtime_forever": "Median playtime forever",
"median_playtime_2weeks": "Median playtime two weeks",
"peak_ccu": "Peak CCU",
"tags": "Tags"
}

In [39]:
def convert_dict_to_string(dict_object):
    # Used for converting the .json data into the format used in the .csv file
    # i.e. dict of "Tag: tag_id" into string with comma-separated tags
    if len(dict_object) == 0:
        return np.nan
    key_list = list(dict_object.keys())
    keys_string = ",".join(key_list)
    return keys_string

def read_convert_json_dataset():
    _df = pd.read_json(json_path)
    _df = _df.T
    _df['AppID'] = _df.index
    _df.rename(columns=column_name_dict, inplace=True)

    # Convert the dict/array columns
    for col in _df.columns.values:
        if isinstance(_df.loc[546560][col], dict):
            _df[col] = _df[col].apply(lambda entries: convert_dict_to_string(entries))
        if isinstance(_df.loc[546560][col], list):
            if isinstance(_df.loc[546560][col][0], str): # Avoids trying to convert the "Packages" column
                _df[col] = _df[col].apply(lambda entries: ",".join(entries))
    _df = _df.reset_index().set_index("AppID")
    return _df

df = read_convert_json_dataset()

df.to_csv("../cleanedgames.csv") # Saving the converted dataframe

In [40]:
# Exploring Publisher / Games Dataset
polarsGames = pl.read_csv("../cleanedgames.csv")



In [41]:
# Check for missing values (nulls) in each column
polarsGames.select(pl.all().is_null().sum()).to_dicts()[0]

{'AppID': 0,
 'index': 0,
 'Name': 6,
 'Release date': 0,
 'Required age': 0,
 'Price': 0,
 'DLC count': 0,
 'Detailed description': 4848,
 'About the game': 4870,
 'Short description': 4779,
 'Reviews': 87285,
 'Header image': 0,
 'Website': 54673,
 'Support url': 51463,
 'Support email': 16021,
 'Windows': 0,
 'Mac': 0,
 'Linux': 0,
 'Metacritic score': 0,
 'Metacritic url': 93457,
 'Achievements': 0,
 'Recommendations': 0,
 'Notes': 81936,
 'Supported languages': 4831,
 'Full audio languages': 57017,
 'Packages': 0,
 'Developers': 4873,
 'Publishers': 5099,
 'Categories': 5913,
 'Genres': 4841,
 'Screenshots': 2895,
 'Movies': 7891,
 'User score': 0,
 'Score rank': 97366,
 'Positive': 0,
 'Negative': 0,
 'Estimated owners': 0,
 'Average playtime forever': 0,
 'Average playtime two weeks': 0,
 'Median playtime forever': 0,
 'Median playtime two weeks': 0,
 'Peak CCU': 0,
 'Tags': 29763}

In [None]:
# Rename 'appid' to 'AppID' in df
polarsReviews = polarsReviews.rename({"appid": "AppID"})

# Select the relevant columns and drop rows with null values in them
filtered_polarsGames = polarsGames.select([
    "AppID", "Publishers", "Developers", "Categories", "Genres"
]).filter(
    pl.col("Publishers").is_not_null() &
    pl.col("Developers").is_not_null() &
    pl.col("Categories").is_not_null() &
    pl.col("Genres").is_not_null()
)


# Perform the inner join on 'AppID'
result = filtered_polarsGames.join(polarsReviews, on="AppID", how="inner")

# Display the result
result.head()



AppID,Publishers,Developers,Categories,Genres,recommendationid,game,voted_up,author_playtime_forever,author_playtime_last_two_weeks,author_playtime_at_review,author_last_played,timestamp_created
i64,str,str,str,str,i64,str,i64,i64,i64,i64,i64,datetime[ms]
10,"""Valve""","""Valve""","""Multi-player,PvP,Online PvP,Sh…","""Action""",148919893,"""Counter-Strike""",1,197,197,197,1698336369,1970-01-20 15:45:36.397
10,"""Valve""","""Valve""","""Multi-player,PvP,Online PvP,Sh…","""Action""",148919350,"""Counter-Strike""",1,441,37,441,1698335809,1970-01-20 15:45:35.821
10,"""Valve""","""Valve""","""Multi-player,PvP,Online PvP,Sh…","""Action""",148913051,"""Counter-Strike""",1,1440,1440,1313,1698338635,1970-01-20 15:45:29.862
10,"""Valve""","""Valve""","""Multi-player,PvP,Online PvP,Sh…","""Action""",148912714,"""Counter-Strike""",1,1636,83,1612,1698341834,1970-01-20 15:45:29.555
10,"""Valve""","""Valve""","""Multi-player,PvP,Online PvP,Sh…","""Action""",148912575,"""Counter-Strike""",1,197,41,197,1698329401,1970-01-20 15:45:29.419


In [43]:
csv_output_file = "../FinalBaseDataset.parquet"

# Write the cleaned DataFrame to a new CSV file
result.write_parquet(csv_output_file)

print(f"Cleaned CSV saved to {csv_output_file}")

Cleaned CSV saved to ../FinalBaseDataset.parquet


In [None]:
result.n_unique(subset=["AppID"])

#Still Plenty of Games to work with!

70776