DATA PREPARATION FOR CONTENT-BASED FILTERING

In [3]:
import re
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix


Data Loading

In [4]:
games_df = pd.read_csv('../datasets/games.csv')

with open('../datasets/games_metadata.json', 'r', encoding='utf-8') as file:
    games_metadata = json.load(file)

Data Preprocessing

In [5]:
games_df = games_df.drop(['rating', 'user_reviews', 'price_original', 'price_final', 'discount', 'steam_deck'], axis=1)

# games_df.rename(columns={'price_final': 'price'}, inplace='True')

In [6]:
# merge the dataframe and metadata

games_metadata_df = pd.DataFrame(games_metadata)

games_df = pd.merge(games_df, games_metadata_df[['app_id', 'description', 'tags']], on='app_id', how='left')

games_df


Unnamed: 0,app_id,title,date_release,win,mac,linux,positive_ratio,description,tags
0,13500,Prince of Persia: Warrior Within‚Ñ¢,2008-11-21,True,False,False,84,Enter the dark underworld of Prince of Persia ...,"[Action, Adventure, Parkour, Third Person, Gre..."
1,22364,BRINK: Agents of Change,2011-08-03,True,False,False,85,,[Action]
2,113020,Monaco: What's Yours Is Mine,2013-04-24,True,True,True,92,Monaco: What's Yours Is Mine is a single playe...,"[Co-op, Stealth, Indie, Heist, Local Co-Op, St..."
3,226560,Escape Dead Island,2014-11-18,True,False,False,61,Escape Dead Island is a Survival-Mystery adven...,"[Zombies, Adventure, Survival, Action, Third P..."
4,249050,Dungeon of the ENDLESS‚Ñ¢,2014-10-27,True,True,False,88,Dungeon of the Endless is a Rogue-Like Dungeon...,"[Roguelike, Strategy, Tower Defense, Pixel Gra..."
...,...,...,...,...,...,...,...,...,...
50867,2296380,I Expect You To Die 3: Cog in the Machine,2023-09-28,True,False,False,96,,[]
50868,1272080,PAYDAY 3,2023-09-21,True,False,False,38,,[]
50869,1402110,Eternights,2023-09-11,True,False,False,89,,[]
50870,2272250,Forgive Me Father 2,2023-10-19,True,False,False,95,Embark on a journey into the darkest nightmare...,"[Early Access, FPS, Action, Retro, First-Perso..."


In [7]:
# convert release date to year
games_df['year'] = pd.to_datetime(games_df['date_release']).dt.year
games_df = games_df.drop(['date_release'], axis=1)

# one hot encoding on the categorical values
games_df.rename(columns={'win': 'os_win', 'linux': 'os_linux', 'mac': 'os_mac'}, inplace='True')

games_df

Unnamed: 0,app_id,title,os_win,os_mac,os_linux,positive_ratio,description,tags,year
0,13500,Prince of Persia: Warrior Within‚Ñ¢,True,False,False,84,Enter the dark underworld of Prince of Persia ...,"[Action, Adventure, Parkour, Third Person, Gre...",2008
1,22364,BRINK: Agents of Change,True,False,False,85,,[Action],2011
2,113020,Monaco: What's Yours Is Mine,True,True,True,92,Monaco: What's Yours Is Mine is a single playe...,"[Co-op, Stealth, Indie, Heist, Local Co-Op, St...",2013
3,226560,Escape Dead Island,True,False,False,61,Escape Dead Island is a Survival-Mystery adven...,"[Zombies, Adventure, Survival, Action, Third P...",2014
4,249050,Dungeon of the ENDLESS‚Ñ¢,True,True,False,88,Dungeon of the Endless is a Rogue-Like Dungeon...,"[Roguelike, Strategy, Tower Defense, Pixel Gra...",2014
...,...,...,...,...,...,...,...,...,...
50867,2296380,I Expect You To Die 3: Cog in the Machine,True,False,False,96,,[],2023
50868,1272080,PAYDAY 3,True,False,False,38,,[],2023
50869,1402110,Eternights,True,False,False,89,,[],2023
50870,2272250,Forgive Me Father 2,True,False,False,95,Embark on a journey into the darkest nightmare...,"[Early Access, FPS, Action, Retro, First-Perso...",2023


In [8]:
# merge the text columns
games_df['tags'] = games_df['tags'].apply(lambda x: ', '.join(x))

games_df['description_with_tags'] = games_df['description'] + games_df['tags']

games_df = games_df.drop(['description', 'tags'], axis=1)

def clean_spaces(text):
    # Remove multiple spaces and strip leading/trailing spaces
    return re.sub(r'\s+', ' ', text).strip()

games_df['title'] = games_df['title'].apply(clean_spaces)
games_df['description_with_tags'] = games_df['description_with_tags'].apply(clean_spaces)

games_df

Unnamed: 0,app_id,title,os_win,os_mac,os_linux,positive_ratio,year,description_with_tags
0,13500,Prince of Persia: Warrior Within‚Ñ¢,True,False,False,84,2008,Enter the dark underworld of Prince of Persia ...
1,22364,BRINK: Agents of Change,True,False,False,85,2011,Action
2,113020,Monaco: What's Yours Is Mine,True,True,True,92,2013,Monaco: What's Yours Is Mine is a single playe...
3,226560,Escape Dead Island,True,False,False,61,2014,Escape Dead Island is a Survival-Mystery adven...
4,249050,Dungeon of the ENDLESS‚Ñ¢,True,True,False,88,2014,Dungeon of the Endless is a Rogue-Like Dungeon...
...,...,...,...,...,...,...,...,...
50867,2296380,I Expect You To Die 3: Cog in the Machine,True,False,False,96,2023,
50868,1272080,PAYDAY 3,True,False,False,38,2023,
50869,1402110,Eternights,True,False,False,89,2023,
50870,2272250,Forgive Me Father 2,True,False,False,95,2023,Embark on a journey into the darkest nightmare...


In [9]:
# convert to label encoding to reduce dimensions of my feature set 
def label_encode_os(row):
    if row['os_win'] and not row['os_mac'] and not row['os_linux']:
        return 0  # Windows only
    elif not row['os_win'] and row['os_mac'] and not row['os_linux']:
        return 1  # Mac only
    elif not row['os_win'] and not row['os_mac'] and row['os_linux']:
        return 2  # Linux only
    elif row['os_win'] and row['os_mac'] and not row['os_linux']:
        return 3  # Windows + Mac
    elif row['os_win'] and not row['os_mac'] and row['os_linux']:
        return 4  # Windows + Linux
    elif not row['os_win'] and row['os_mac'] and row['os_linux']:
        return 5  # Mac + Linux
    elif row['os_win'] and row['os_mac'] and row['os_linux']:
        return 6  # Windows + Mac + Linux

games_df['os_label'] = games_df.apply(label_encode_os, axis=1)

games_df['os_label'] = games_df['os_label'].fillna(-1).astype(int)

games_df = games_df.drop(['os_win', 'os_mac', 'os_linux'], axis=1)

In [10]:
games_df.replace('', np.nan, inplace=True)

games_df

Unnamed: 0,app_id,title,positive_ratio,year,description_with_tags,os_label
0,13500,Prince of Persia: Warrior Within‚Ñ¢,84,2008,Enter the dark underworld of Prince of Persia ...,0
1,22364,BRINK: Agents of Change,85,2011,Action,0
2,113020,Monaco: What's Yours Is Mine,92,2013,Monaco: What's Yours Is Mine is a single playe...,6
3,226560,Escape Dead Island,61,2014,Escape Dead Island is a Survival-Mystery adven...,0
4,249050,Dungeon of the ENDLESS‚Ñ¢,88,2014,Dungeon of the Endless is a Rogue-Like Dungeon...,3
...,...,...,...,...,...,...
50867,2296380,I Expect You To Die 3: Cog in the Machine,96,2023,,0
50868,1272080,PAYDAY 3,38,2023,,0
50869,1402110,Eternights,89,2023,,0
50870,2272250,Forgive Me Father 2,95,2023,Embark on a journey into the darkest nightmare...,0


In [11]:
games_df = games_df.dropna()

games_df

Unnamed: 0,app_id,title,positive_ratio,year,description_with_tags,os_label
0,13500,Prince of Persia: Warrior Within‚Ñ¢,84,2008,Enter the dark underworld of Prince of Persia ...,0
1,22364,BRINK: Agents of Change,85,2011,Action,0
2,113020,Monaco: What's Yours Is Mine,92,2013,Monaco: What's Yours Is Mine is a single playe...,6
3,226560,Escape Dead Island,61,2014,Escape Dead Island is a Survival-Mystery adven...,0
4,249050,Dungeon of the ENDLESS‚Ñ¢,88,2014,Dungeon of the Endless is a Rogue-Like Dungeon...,3
...,...,...,...,...,...,...
50845,2455060,Taboo Trial,94,2023,"In the rogue action game ""Taboo Trial"", you wi...",0
50847,1138640,Hometopia,61,2023,"Build better, together üè° Hometopia is a seriou...",0
50848,2515460,Northgard - Kernev Clan of the Stoat,67,2023,"Strategy, Indie, Simulation",6
50857,1687000,Fading Afternoon,79,2023,Seiji Maruyama is a middle-aged yakuza recentl...,0


In [12]:
games_df.to_csv('../datasets/output.csv', index=False)

In [13]:
# preprocess text 
def remove_non_alphabetic_characters(df):
    pattern = r'[^a-zA-Z\s]'
    df['description_with_tags'] = df['description_with_tags'].apply(lambda x: re.sub(pattern, ' ', x))
    df['title'] = df['title'].apply(lambda x: re.sub(pattern, ' ', x))
    return df

def convert_to_lower(df):
    df['description_with_tags'] = df['description_with_tags'].str.lower()
    df['title'] = df['title'].str.lower()
    return df

games_df = remove_non_alphabetic_characters(games_df)
games_df = convert_to_lower(games_df)

games_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['description_with_tags'] = df['description_with_tags'].apply(lambda x: re.sub(pattern, ' ', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['title'] = df['title'].apply(lambda x: re.sub(pattern, ' ', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['description_with_tags'] = df['descr

Unnamed: 0,app_id,title,positive_ratio,year,description_with_tags,os_label
0,13500,prince of persia warrior within,84,2008,enter the dark underworld of prince of persia ...,0
1,22364,brink agents of change,85,2011,action,0
2,113020,monaco what s yours is mine,92,2013,monaco what s yours is mine is a single playe...,6
3,226560,escape dead island,61,2014,escape dead island is a survival mystery adven...,0
4,249050,dungeon of the endless,88,2014,dungeon of the endless is a rogue like dungeon...,3
...,...,...,...,...,...,...
50845,2455060,taboo trial,94,2023,in the rogue action game taboo trial you wi...,0
50847,1138640,hometopia,61,2023,build better together hometopia is a seriou...,0
50848,2515460,northgard kernev clan of the stoat,67,2023,strategy indie simulation,6
50857,1687000,fading afternoon,79,2023,seiji maruyama is a middle aged yakuza recentl...,0


In [14]:
def clean_spaces(text):
    # Remove multiple spaces and strip leading/trailing spaces
    return re.sub(r'\s+', ' ', text).strip()

games_df['title'] = games_df['title'].apply(clean_spaces)
games_df['description_with_tags'] = games_df['description_with_tags'].apply(clean_spaces)

games_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_df['title'] = games_df['title'].apply(clean_spaces)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_df['description_with_tags'] = games_df['description_with_tags'].apply(clean_spaces)


Unnamed: 0,app_id,title,positive_ratio,year,description_with_tags,os_label
0,13500,prince of persia warrior within,84,2008,enter the dark underworld of prince of persia ...,0
1,22364,brink agents of change,85,2011,action,0
2,113020,monaco what s yours is mine,92,2013,monaco what s yours is mine is a single player...,6
3,226560,escape dead island,61,2014,escape dead island is a survival mystery adven...,0
4,249050,dungeon of the endless,88,2014,dungeon of the endless is a rogue like dungeon...,3
...,...,...,...,...,...,...
50845,2455060,taboo trial,94,2023,in the rogue action game taboo trial you will ...,0
50847,1138640,hometopia,61,2023,build better together hometopia is a seriously...,0
50848,2515460,northgard kernev clan of the stoat,67,2023,strategy indie simulation,6
50857,1687000,fading afternoon,79,2023,seiji maruyama is a middle aged yakuza recentl...,0


In [15]:
games_df.to_csv('../datasets/training_set.csv', index=False)

In [16]:
games_main_df = pd.read_csv('../datasets/training_set.csv')

games_main_df

Unnamed: 0,app_id,title,positive_ratio,year,description_with_tags,os_label
0,13500,prince of persia warrior within,84,2008,enter the dark underworld of prince of persia ...,0
1,22364,brink agents of change,85,2011,action,0
2,113020,monaco what s yours is mine,92,2013,monaco what s yours is mine is a single player...,6
3,226560,escape dead island,61,2014,escape dead island is a survival mystery adven...,0
4,249050,dungeon of the endless,88,2014,dungeon of the endless is a rogue like dungeon...,3
...,...,...,...,...,...,...
49638,2455060,taboo trial,94,2023,in the rogue action game taboo trial you will ...,0
49639,1138640,hometopia,61,2023,build better together hometopia is a seriously...,0
49640,2515460,northgard kernev clan of the stoat,67,2023,strategy indie simulation,6
49641,1687000,fading afternoon,79,2023,seiji maruyama is a middle aged yakuza recentl...,0
