In [1]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Move up one level from 'notebooks/' to the project root.
# NOTE: this depends on the kernel's working directory; it resolves correctly
# only when the notebook is run from inside 'notebooks/'.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Define file paths
games_path = os.path.join(BASE_DIR, "data/raw/games.csv")
recommendations_path = os.path.join(BASE_DIR, "data/raw/recommendations.csv")
users_path = os.path.join(BASE_DIR, "data/raw/users.csv")


def load_raw_csv(path):
    """Read one raw CSV file into a pandas DataFrame.

    Args:
        path: Location of the CSV file on disk.

    Returns:
        The parsed DataFrame.

    Raises:
        FileNotFoundError: If ``path`` does not exist. Failing here, with the
            offending path in the message, is clearer than returning None and
            hitting an AttributeError in a later cell.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"CSV file not found: {path}")
    return pd.read_csv(path)


# Load CSV files into Pandas DataFrames
games_pd = load_raw_csv(games_path)
recommendations_pd = load_raw_csv(recommendations_path)
users_pd = load_raw_csv(users_path)

# DATA EXPLORATION 

1. **users.csv** dataset

In [3]:
#@title First 10 rows
users_pd.head(n=10)

Unnamed: 0,user_id,products,reviews
0,7360263,359,0
1,14020781,156,1
2,8762579,329,4
3,4820647,176,4
4,5167327,98,2
5,5664667,145,5
6,5889167,447,2
7,7281762,1083,1
8,7445952,273,1
9,7462927,51,1


In [4]:
#@title Shape
# (rows, columns) of the users table; this cell does not show the dtypes.
users_pd.shape

(14306064, 3)

In [5]:
#@title Identify columns with missing values
# Work on a copy so the loaded frame stays untouched.
users_pd_cp = users_pd.copy()
# Keep the name of every column that holds at least one null value.
users_cols_with_nan = [
    name for name, has_nan in users_pd_cp.isna().any().items() if has_nan
]
print(f"Columns with missing values: {users_cols_with_nan}")

Columns with missing values: []


2. **games.csv** dataset

In [6]:
#@title First 5 rows
# Quick look at the parsed columns and a few sample values.
games_pd.head(n=5)

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck
0,13500,Prince of Persia: Warrior Within™,2008-11-21,True,False,False,Very Positive,84,2199,9.99,9.99,0.0,True
1,22364,BRINK: Agents of Change,2011-08-03,True,False,False,Positive,85,21,2.99,2.99,0.0,True
2,113020,Monaco: What's Yours Is Mine,2013-04-24,True,True,True,Very Positive,92,3722,14.99,14.99,0.0,True
3,226560,Escape Dead Island,2014-11-18,True,False,False,Mixed,61,873,14.99,14.99,0.0,True
4,249050,Dungeon of the ENDLESS™,2014-10-27,True,True,False,Very Positive,88,8784,11.99,11.99,0.0,True


In [None]:
#@title Shape
# (rows, columns) of the games table.
games_pd.shape

(50872, 13)

In [None]:
#@title Data type
# One summary of each column's dtype and non-null count, plus memory usage.
games_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50872 entries, 0 to 50871
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   app_id          50872 non-null  int64  
 1   title           50872 non-null  object 
 2   date_release    50872 non-null  object 
 3   win             50872 non-null  bool   
 4   mac             50872 non-null  bool   
 5   linux           50872 non-null  bool   
 6   rating          50872 non-null  object 
 7   positive_ratio  50872 non-null  int64  
 8   user_reviews    50872 non-null  int64  
 9   price_final     50872 non-null  float64
 10  price_original  50872 non-null  float64
 11  discount        50872 non-null  float64
 12  steam_deck      50872 non-null  bool   
dtypes: bool(4), float64(3), int64(3), object(3)
memory usage: 3.7+ MB


In [None]:
#@title Identify columns with missing values
# Work on a copy so the loaded frame stays untouched.
games_pd_cp = games_pd.copy()
# Keep the name of every column that holds at least one null value.
games_cols_with_nan = [
    name for name, has_nan in games_pd_cp.isna().any().items() if has_nan
]
print(f"Columns with missing values: {games_cols_with_nan}")

Columns with missing values: []


3. **recommendations.csv** dataset

In [None]:
#@title First 5 rows
# Quick look at the parsed columns and a few sample values.
recommendations_pd.head(n=5)

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id
0,975370,0,0,2022-12-12,True,36.3,51580,0
1,304390,4,0,2017-02-17,False,11.5,2586,1
2,1085660,2,0,2019-11-17,True,336.5,253880,2
3,703080,0,0,2022-09-23,True,27.4,259432,3
4,526870,0,0,2021-01-10,True,7.9,23869,4


In [None]:
#@title Shape
# (rows, columns) of the recommendations table.
recommendations_pd.shape

(41154794, 8)

In [None]:
#@title Identify columns with missing values
# Work on a copy so the loaded frame stays untouched.
recommendations_pd_cp = recommendations_pd.copy()
# Keep the name of every column that holds at least one null value.
recommendations_cols_with_nan = [
    name
    for name, has_nan in recommendations_pd_cp.isna().any().items()
    if has_nan
]
print(f"Columns with missing values: {recommendations_cols_with_nan}")

Columns with missing values: []


# Data Overview

### **users.csv**

| Index | Column | Data Type | Missing Values |
|---|---|---|---|
| 0 | user_id | int64 | No |
| 1 | products | int64 | No |
| 2 | reviews | int64 | No |

**Shape:** (14306064, 3)  
**Memory Usage:** 327.4 MB


### **games.csv**

| Index | Column | Data Type | Missing Values |
|---|---|---|---|
| 0 | app_id | int64 | No |
| 1 | title | object | No |
| 2 | date_release | object | No |
| 3 | win | bool | No |
| 4 | mac | bool | No |
| 5 | linux | bool | No |
| 6 | rating | object | No |
| 7 | positive_ratio | int64 | No |
| 8 | user_reviews | int64 | No |
| 9 | price_final | float64 | No |
| 10 | price_original | float64 | No |
| 11 | discount | float64 | No |
| 12 | steam_deck | bool | No |

**Shape:** (50872, 13)  
**Memory Usage:** 3.7+ MB


### **recommendations.csv**

| Index | Column | Data Type | Missing Values |
|---|---|---|---|
| 0 | app_id | int64 | No |
| 1 | helpful | int64 | No |
| 2 | funny | int64 | No |
| 3 | date | object | No |
| 4 | is_recommended | bool | No |
| 5 | hours | float64 | No |
| 6 | user_id | int64 | No |
| 7 | review_id | int64 | No |

**Shape:** (41154794, 8)  
**Memory Usage:** 2.2+ GB


### **Relationships**

* **users** 1:N **recommendations** (One user can have many recommendations)
* **games** 1:N **recommendations** (One game can have many recommendations)
* **recommendations** N:1 **users** (Many recommendations belong to one user)
* **recommendations** N:1 **games** (Many recommendations belong to one game)



### **Overall**

The datasets have been loaded and their basic characteristics are summarized above. Further analysis will be conducted in the following sections.

# MERGING games.csv and games_metadata.json

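- Joining the game metadata (description and tags) onto the games table on `app_id` gives a single table to work with in later steps; the result is saved to `data/interim/games_merged.csv`.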

In [4]:
# Merge games.csv with games_metadata.json on app_id and save the result.
# BASE_DIR and games_path are defined in the configuration cell near the top
# of the notebook; os, json and pandas are imported in the first cell.

# Define file paths
metadata_path = os.path.join(BASE_DIR, "data/raw/games_metadata.json")
output_path = os.path.join(BASE_DIR, "data/interim/games_merged.csv")

# Fail early, naming the missing file, before doing any work.
if not os.path.exists(games_path):
    raise FileNotFoundError(f"CSV file not found: {games_path}")
if not os.path.exists(metadata_path):
    raise FileNotFoundError(f"JSON file not found: {metadata_path}")

# Load CSV file
df_games = pd.read_csv(games_path)

# Load JSON file with NDJSON handling
with open(metadata_path, "r", encoding="utf-8") as f:
    try:
        metadata = json.load(f)  # Try loading as standard JSON
    except json.JSONDecodeError:
        # Not a single JSON document: assume NDJSON (one object per line).
        f.seek(0)  # Reset file pointer
        # Skip blank lines (for example a trailing empty line at the end of
        # the file); json.loads raises on an empty string.
        metadata = [json.loads(line) for line in f if line.strip()]

df_metadata = pd.DataFrame(metadata)

# Identify the common column (update if necessary)
common_column = "app_id"  # Change this if your dataset has a different key

# Left join: every row of games.csv is kept; games without metadata get NaN.
# NOTE(review): if the metadata ever contained duplicate app_id values this
# join would duplicate game rows. Consider validate="many_to_one" once
# key uniqueness has been confirmed.
df_merged = df_games.merge(df_metadata, on=common_column, how="left")

# Save the merged data
# NOTE(review): the 'tags' column appears to hold Python lists; to_csv writes
# them as their string representation, so they need parsing when read back.
os.makedirs(os.path.dirname(output_path), exist_ok=True)  # Ensure directory exists
df_merged.to_csv(output_path, index=False)

print(f"Merged file saved at: {output_path}")


Merged file saved at: c:\Users\User\Documents\GitHub\Game-Recommender-System\data/interim/games_merged.csv


In [6]:
# Preview the first 5 rows of the merged table.
df_merged.head(n=5)


Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck,description,tags
0,13500,Prince of Persia: Warrior Within™,2008-11-21,True,False,False,Very Positive,84,2199,9.99,9.99,0.0,True,Enter the dark underworld of Prince of Persia ...,"[Action, Adventure, Parkour, Third Person, Gre..."
1,22364,BRINK: Agents of Change,2011-08-03,True,False,False,Positive,85,21,2.99,2.99,0.0,True,,[Action]
2,113020,Monaco: What's Yours Is Mine,2013-04-24,True,True,True,Very Positive,92,3722,14.99,14.99,0.0,True,Monaco: What's Yours Is Mine is a single playe...,"[Co-op, Stealth, Indie, Heist, Local Co-Op, St..."
3,226560,Escape Dead Island,2014-11-18,True,False,False,Mixed,61,873,14.99,14.99,0.0,True,Escape Dead Island is a Survival-Mystery adven...,"[Zombies, Adventure, Survival, Action, Third P..."
4,249050,Dungeon of the ENDLESS™,2014-10-27,True,True,False,Very Positive,88,8784,11.99,11.99,0.0,True,Dungeon of the Endless is a Rogue-Like Dungeon...,"[Roguelike, Strategy, Tower Defense, Pixel Gra..."
