In [8]:
import re
import json
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix


Data Loading

In [9]:
games_df = pd.read_csv('datasets/games.csv')
users_df = pd.read_csv('datasets/users.csv')

with open('datasets/games_metadata.json', 'r', encoding='utf-8') as file:
    games_metadata = json.load(file)

Data Preprocessing

In [10]:
games_df = games_df.drop(['rating', 'user_reviews', 'price_original', 'price_final', 'discount', 'steam_deck'], axis=1)

# games_df.rename(columns={'price_final': 'price'}, inplace='True')

In [11]:
# merge the dataframe and metadata

games_metadata_df = pd.DataFrame(games_metadata)

games_df = pd.merge(games_df, games_metadata_df[['app_id', 'description', 'tags']], on='app_id', how='left')

games_df


Unnamed: 0,app_id,title,date_release,win,mac,linux,positive_ratio,description,tags
0,13500,Prince of Persia: Warrior Within™,2008-11-21,True,False,False,84,Enter the dark underworld of Prince of Persia ...,"[Action, Adventure, Parkour, Third Person, Gre..."
1,22364,BRINK: Agents of Change,2011-08-03,True,False,False,85,,[Action]
2,113020,Monaco: What's Yours Is Mine,2013-04-24,True,True,True,92,Monaco: What's Yours Is Mine is a single playe...,"[Co-op, Stealth, Indie, Heist, Local Co-Op, St..."
3,226560,Escape Dead Island,2014-11-18,True,False,False,61,Escape Dead Island is a Survival-Mystery adven...,"[Zombies, Adventure, Survival, Action, Third P..."
4,249050,Dungeon of the ENDLESS™,2014-10-27,True,True,False,88,Dungeon of the Endless is a Rogue-Like Dungeon...,"[Roguelike, Strategy, Tower Defense, Pixel Gra..."
...,...,...,...,...,...,...,...,...,...
50867,2296380,I Expect You To Die 3: Cog in the Machine,2023-09-28,True,False,False,96,,[]
50868,1272080,PAYDAY 3,2023-09-21,True,False,False,38,,[]
50869,1402110,Eternights,2023-09-11,True,False,False,89,,[]
50870,2272250,Forgive Me Father 2,2023-10-19,True,False,False,95,Embark on a journey into the darkest nightmare...,"[Early Access, FPS, Action, Retro, First-Perso..."


In [12]:
# convert release date to year
games_df['year'] = pd.to_datetime(games_df['date_release']).dt.year
games_df = games_df.drop(['date_release'], axis=1)

# one hot encoding on the categorical values
games_df.rename(columns={'win': 'os_win', 'linux': 'os_linux', 'mac': 'os_mac'}, inplace='True')

games_df

Unnamed: 0,app_id,title,os_win,os_mac,os_linux,positive_ratio,description,tags,year
0,13500,Prince of Persia: Warrior Within™,True,False,False,84,Enter the dark underworld of Prince of Persia ...,"[Action, Adventure, Parkour, Third Person, Gre...",2008
1,22364,BRINK: Agents of Change,True,False,False,85,,[Action],2011
2,113020,Monaco: What's Yours Is Mine,True,True,True,92,Monaco: What's Yours Is Mine is a single playe...,"[Co-op, Stealth, Indie, Heist, Local Co-Op, St...",2013
3,226560,Escape Dead Island,True,False,False,61,Escape Dead Island is a Survival-Mystery adven...,"[Zombies, Adventure, Survival, Action, Third P...",2014
4,249050,Dungeon of the ENDLESS™,True,True,False,88,Dungeon of the Endless is a Rogue-Like Dungeon...,"[Roguelike, Strategy, Tower Defense, Pixel Gra...",2014
...,...,...,...,...,...,...,...,...,...
50867,2296380,I Expect You To Die 3: Cog in the Machine,True,False,False,96,,[],2023
50868,1272080,PAYDAY 3,True,False,False,38,,[],2023
50869,1402110,Eternights,True,False,False,89,,[],2023
50870,2272250,Forgive Me Father 2,True,False,False,95,Embark on a journey into the darkest nightmare...,"[Early Access, FPS, Action, Retro, First-Perso...",2023


In [13]:
# merge the text columns
games_df['tags'] = games_df['tags'].apply(lambda x: ', '.join(x))

games_df['description_with_tags'] = games_df['description'] + games_df['tags']

games_df = games_df.drop(['description', 'tags'], axis=1)

# preprocess text 
def remove_non_alphabetic_characters(df):
    pattern = r'[^a-zA-Z\s]'
    df['description_with_tags'] = df['description_with_tags'].apply(lambda x: re.sub(pattern, ' ', x))
    df['title'] = df['title'].apply(lambda x: re.sub(pattern, ' ', x))
    return df

def convert_to_lower(df):
    df['description_with_tags'] = df['description_with_tags'].str.lower()
    df['title'] = df['title'].str.lower()
    return df

games_df = remove_non_alphabetic_characters(games_df)
games_df = convert_to_lower(games_df)

games_df

Unnamed: 0,app_id,title,os_win,os_mac,os_linux,positive_ratio,year,description_with_tags
0,13500,prince of persia warrior within,True,False,False,84,2008,enter the dark underworld of prince of persia ...
1,22364,brink agents of change,True,False,False,85,2011,action
2,113020,monaco what s yours is mine,True,True,True,92,2013,monaco what s yours is mine is a single playe...
3,226560,escape dead island,True,False,False,61,2014,escape dead island is a survival mystery adven...
4,249050,dungeon of the endless,True,True,False,88,2014,dungeon of the endless is a rogue like dungeon...
...,...,...,...,...,...,...,...,...
50867,2296380,i expect you to die cog in the machine,True,False,False,96,2023,
50868,1272080,payday,True,False,False,38,2023,
50869,1402110,eternights,True,False,False,89,2023,
50870,2272250,forgive me father,True,False,False,95,2023,embark on a journey into the darkest nightmare...


In [14]:
games_df.to_csv('dataset.csv', index=False)

In [15]:
# Separate features
text_data = games_df['description_with_tags']
numeric_data = games_df[['positive_ratio', 'year']]
categorical_data = games_df[['os_win', 'os_mac',  'os_linux']]


In [16]:
# preprocess text features using tf-idf
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(text_data)

In [17]:
# preprocess numerical features - standard scaling and then convert to sparse matrix
scaler = StandardScaler()
scaled_numeric = scaler.fit_transform(numeric_data)
scaled_numeric_sparse = csr_matrix(scaled_numeric)


In [18]:
# preprocess categorical data - convert already one hot encoded data (0 or 1) into sparse matrix
categorical_data = categorical_data.astype(int)
categorical_data_sparse = csr_matrix(categorical_data)

In [19]:
# Combine all features in horizontal stack
combined_features = hstack([tfidf_matrix, scaled_numeric_sparse, categorical_data_sparse])

print(combined_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1487206 stored elements and shape (50872, 43398)>
  Coords	Values
  (0, 12120)	0.10850476791337127
  (0, 8940)	0.18812173378739436
  (0, 40064)	0.13617153438131152
  (0, 29308)	0.41940916733820116
  (0, 27899)	0.3506077270381994
  (0, 41702)	0.12625834871069555
  (0, 37329)	0.12033653609738065
  (0, 34689)	0.17390273969173972
  (0, 33600)	0.1086326591761654
  (0, 8294)	0.13562872620199093
  (0, 191)	0.12582863658643276
  (0, 32826)	0.16623680568190052
  (0, 38552)	0.1857087135544481
  (0, 17937)	0.14838032377157775
  (0, 8817)	0.21069312804682094
  (0, 18372)	0.14983538920354753
  (0, 18522)	0.18992219328840657
  (0, 13275)	0.11184169039584937
  (0, 33440)	0.13673038845497232
  (0, 10437)	0.1506071910426809
  (0, 31607)	0.17846227484350963
  (0, 11744)	0.15313180493332534
  (0, 27534)	0.11052038695782188
  (0, 5675)	0.1576913400850952
  (0, 24994)	0.07001475043802376
  :	:
  (50870, 11284)	0.09011444513234562
  (50870, 179)	

In [20]:
# inspect your csr (compressed sparse row) matrix 
dense_array = combined_features.toarray()

print(dense_array)

[[0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 1. 1.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]]


Model Building - Task 3
For a given game, find top 5 similar games based on description