In [20]:
import pandas as pd
import numpy as np
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from scipy import sparse
import re

In [21]:
df = pd.read_csv('games.csv')
df.sample(10)

Unnamed: 0.1,Unnamed: 0,Title,Release_Date,Developers,Summary,Platforms,Genres,Rating,Plays,Playing,Backlogs,Wishlist,Lists,Reviews
4888,4888,Little Nemo: The Dream Master,"Sep 01, 1990",['Capcom'],Little Nemo: The Dream Master is a platform ga...,['NES'],"['Adventure', 'Platform']",3.3,276,2,79,37,79,16
6117,6117,Puffy no P.S. I Love You,TBD,[],Fan disc of the band Puffy (AKA Puffy Ami Yumi),[],['Music'],,0,0,1,2,3,0
50198,50198,Sword Art Online: Fatal Bullet - Dissonance of...,"Jan 18, 2019","['Dimps', 'Bandai Namco Entertainment']",Dissonance of the Nexus Expansion takes place ...,"['Windows PC', 'PlayStation 4', 'Xbox One']",['RPG'],,2,0,2,2,0,0
14726,14726,Neighbourhood Necromancer,"May 12, 2017",['Choice of Games'],Command the undead to take revenge on the subu...,"['Windows PC', 'Mac', 'Linux']","['Adventure', 'Indie', 'RPG']",,2,0,0,1,0,0
12314,12314,Belief & Betrayal,TBD,[],A point and click adventure game influenced he...,[],['Adventure'],,1,0,1,0,0,0
46397,46397,Footsies,"Jul 11, 2018",['HiFight'],FOOTSIES is a 2D fighting game where players c...,"['Windows PC', 'Android', 'iOS']",['Fighting'],3.4,92,1,9,16,14,7
36505,36505,Martian Gothic: Unification,"May 04, 2000",[],Martian Gothic : Unification is one of the num...,"['Windows PC', 'PlayStation']",['Adventure'],2.3,30,1,75,39,45,3
51012,51012,Togges,"Dec 07, 2022","['Thunderful', 'Regular Studio']",Togges is a collect-a-thon 3D Platformer about...,"['Windows PC', 'PlayStation 4', 'Xbox One', 'P...","['Adventure', 'Indie', 'Platform', 'Puzzle']",3.3,6,2,7,22,12,1
13537,13537,Kingdom Hearts Re:Chain of Memories,"Dec 02, 2008",['Square Enix'],Kingdom Hearts Re:Chain of Memories is a full ...,"['Windows PC', 'PlayStation 4', 'PlayStation 3...","['Adventure', 'RPG']",2.8,291,12,62,6,51,29
14628,14628,Treasure Hunter,"May 17, 2018",[],Dive into the exciting world of finding long f...,['Windows PC'],"['Indie', 'Simulator']",,3,0,0,0,0,0


In [22]:
df.shape

(60000, 14)

In [23]:
df.columns

Index(['Unnamed: 0', 'Title', 'Release_Date', 'Developers', 'Summary',
       'Platforms', 'Genres', 'Rating', 'Plays', 'Playing', 'Backlogs',
       'Wishlist', 'Lists', 'Reviews'],
      dtype='object')

In [24]:

df.drop(columns=['Rating','Lists','Plays','Playing',"Backlogs",'Wishlist','Reviews'], inplace=True)


In [25]:
df.sample(5)

Unnamed: 0.1,Unnamed: 0,Title,Release_Date,Developers,Summary,Platforms,Genres
36618,36618,Puzzle of Jellies,"Dec 10, 2013","['Qrostar', 'Jelly Crew']",Originally released as Jelly no Puzzle for Win...,['Android'],['Puzzle']
32427,32427,UuultraC,"Mar 24, 2022","['ADELTA', 'MangaGamer']",An 18+ BL visual novel set in 1970's Soshigaya...,['Windows PC'],"['Adventure', 'Indie', 'Visual Novel']"
9769,9769,Blindia,"Oct 31, 2018",[],Blindia is an abstract top-down shooter puzzle...,['Windows PC'],['Indie']
25924,25924,Yet Another World,"Nov 30, 2015",[],Yet Another World - funny hardcore platformer ...,"['Windows PC', 'Mac', 'Linux']","['Adventure', 'Indie']"
24936,24936,Attack UFO,"Dec 31, 1974",['Taito'],,['Arcade'],[]


In [26]:
df.isna().sum()


Unnamed: 0         0
Title              0
Release_Date       0
Developers         0
Summary         4954
Platforms          0
Genres             0
dtype: int64

In [27]:
duplicate_rows = df[df.duplicated()]
print(f"Number of duplicate rows: {duplicate_rows.shape[0]}")

Number of duplicate rows: 0


In [28]:

df.dropna(inplace=True)

In [29]:
df.isna().sum()

Unnamed: 0      0
Title           0
Release_Date    0
Developers      0
Summary         0
Platforms       0
Genres          0
dtype: int64

In [30]:
df.to_csv("modified_games.csv")

In [31]:
df = pd.read_csv("modified_games.csv")
df.drop(columns=['Unnamed: 0','Unnamed: 0.1'], inplace=True)
df.head(2)


Unnamed: 0,Title,Release_Date,Developers,Summary,Platforms,Genres
0,Elden Ring,"Feb 25, 2022","['FromSoftware', 'Bandai Namco Entertainment']","Elden Ring is a fantasy, action and open world...","['Windows PC', 'PlayStation 4', 'Xbox One', 'P...","['Adventure', 'RPG']"
1,The Legend of Zelda: Breath of the Wild,"Mar 03, 2017","['Nintendo', 'Nintendo EPD Production Group No...",The Legend of Zelda: Breath of the Wild is the...,"['Wii U', 'Nintendo Switch']","['Adventure', 'Puzzle']"


In [32]:
df = df.drop_duplicates(subset=['Title'], keep='first')


In [33]:
print(df.shape)
print(f"Number of unique games: {df['Title'].nunique()}")

(37669, 6)
Number of unique games: 37669


### Cleaning Text(Handling text that appears in the format of a python list)

In [34]:
def parse_data(text):
    if not isinstance(text, str):
        return []
    if text.startswith('[') and text.endswith(']'):
        try:
            text = text.replace("'", "\"")
            parsed = json.loads(text)
            if isinstance(parsed, list):
                return parsed
        except:
            genres = re.findall(r"'([^']*)'", text)
            if genres:
                return genres
    if ',' in text:
        return [g.strip() for g in text.split(',')]
    return [text]

In [35]:
print("Preparing game data...")
df['genres_list'] = df['Genres'].apply(parse_data)
df['developers_list'] = df['Developers'].apply(parse_data)
df['game_content'] = df['Summary'].fillna('') 
df.sample(5)

Preparing game data...


Unnamed: 0,Title,Release_Date,Developers,Summary,Platforms,Genres,genres_list,developers_list,game_content
51769,Blades of Time,"Mar 06, 2012","['Konami', 'Gaijin Entertainment']","Ayumi, the gorgeous gun and sword-wielding tre...","['Windows PC', 'Mac', 'Xbox 360', 'PlayStation...","['Adventure', 'Brawler', 'RPG', 'Shooter']","[Adventure, Brawler, RPG, Shooter]","[Konami, Gaijin Entertainment]","Ayumi, the gorgeous gun and sword-wielding tre..."
46947,Luminous Arc Infinity,"Aug 06, 2015","['Felistella', 'Marvelous']",The fourth game in the Luminous Arc series and...,['PlayStation Vita'],"['RPG', 'Turn Based Strategy']","[RPG, Turn Based Strategy]","[Felistella, Marvelous]",The fourth game in the Luminous Arc series and...
51559,Rooster,TBD,[],A 1994-1995 Amiga/PC top-down shooter.,[],['Shooter'],[Shooter],[],A 1994-1995 Amiga/PC top-down shooter.
52643,Full Moon,"Oct 20, 2009",['Bart Bonte'],"A rabbit wants fruit, but it's too dark to fin...",['Web browser'],['Puzzle'],[Puzzle],[Bart Bonte],"A rabbit wants fruit, but it's too dark to fin..."
16680,Narcissu 1st & 2nd,"Apr 24, 2015",['Sekai Project'],This is a story of disease and suffering; of m...,"['Windows PC', 'Mac', 'Linux']",['Indie'],[Indie],[Sekai Project],This is a story of disease and suffering; of m...


In [36]:
df['Content'] = df['Summary']
print("Building recommendation system...")
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(df['Content'])
sparse.save_npz('tfidf_matrix.npz', tfidf_matrix)

Building recommendation system...
