In [2]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline

import re
import tensorflow as tf
from tensorflow import keras
import pickle
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize, sent_tokenize
from nltk.tokenize import TreebankWordTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.metrics import classification_report

Link to Dataset:
[Dataset](https://www.kaggle.com/datasets/fronkongames/steam-games-dataset/data)

In [None]:
df = pd.read_csv('games.csv')
df.head(1)

Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Required age,Price,DLC count,About the game,Supported languages,...,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags,Screenshots,Movies
0,20200,Galactic Bowling,"Oct 21, 2008",0 - 20000,0,0,19.99,0,Galactic Bowling is an exaggerated and stylize...,['English'],...,0,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
1,655370,Train Bandit,"Oct 12, 2017",0 - 20000,0,0,0.99,0,THE LAW!! Looks to be a showdown atop a train....,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
2,1732930,Jolt Project,"Nov 17, 2021",0 - 20000,0,0,4.99,0,Jolt Project: The army now has a new robotics ...,"['English', 'Portuguese - Brazil']",...,0,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
3,1355720,Henosis™,"Jul 23, 2020",0 - 20000,0,0,5.99,0,HENOSIS™ is a mysterious 2D Platform Puzzler w...,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,0,Odd Critter Games,Odd Critter Games,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
4,1139950,Two Weeks in Painland,"Feb 3, 2020",0 - 20000,0,0,0.0,0,ABOUT THE GAME Play as a hacker who has arrang...,"['English', 'Spanish - Spain']",...,0,0,0,Unusual Games,Unusual Games,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...


In [4]:
df.shape

(85103, 39)

In [5]:
df.columns

Index(['AppID', 'Name', 'Release date', 'Estimated owners', 'Peak CCU',
       'Required age', 'Price', 'DLC count', 'About the game',
       'Supported languages', 'Full audio languages', 'Reviews',
       'Header image', 'Website', 'Support url', 'Support email', 'Windows',
       'Mac', 'Linux', 'Metacritic score', 'Metacritic url', 'User score',
       'Positive', 'Negative', 'Score rank', 'Achievements', 'Recommendations',
       'Notes', 'Average playtime forever', 'Average playtime two weeks',
       'Median playtime forever', 'Median playtime two weeks', 'Developers',
       'Publishers', 'Categories', 'Genres', 'Tags', 'Screenshots', 'Movies'],
      dtype='object')

In [6]:
df['Required age'].value_counts().shape

(19,)

In [7]:
df['User score'].value_counts().shape

# The User score is too much dominated towards 0, so better to remove it

(33,)

In [8]:
df['Metacritic score'].value_counts().shape

(73,)

In [9]:
df['Notes'].isnull().sum()

72082

In [10]:
columns = ['AppID', 'Name', 'Release date', 'Estimated owners', 'Peak CCU',
       'Required age', 'Price', 'DLC count', 'About the game',
       'Supported languages', 'Full audio languages', 'Reviews',
       'Header image', 'Website', 'Support url', 'Support email', 'Windows',
       'Mac', 'Linux', 'Metacritic score', 'Metacritic url', 'User score',
       'Positive', 'Negative', 'Score rank', 'Achievements', 'Recommendations',
       'Notes', 'Average playtime forever', 'Average playtime two weeks',
       'Median playtime forever', 'Median playtime two weeks', 'Developers',
       'Publishers', 'Categories', 'Genres', 'Tags', 'Screenshots', 'Movies']

useful_columns = ['AppID', 'Name', 'About the game', 'Reviews', 'Genres', 'Tags', 'Categories', 'Developers']

In [11]:
df1 = df[useful_columns]

In [12]:
df1.shape

(85103, 8)

In [13]:
df1.dtypes

AppID              int64
Name              object
About the game    object
Reviews           object
Genres            object
Tags              object
Categories        object
Developers        object
dtype: object

In [14]:
for i in useful_columns:
    print(f"{i} : {df1[i].isnull().sum()}")

AppID : 0
Name : 6
About the game : 3567
Reviews : 75360
Genres : 3555
Tags : 21100
Categories : 4598
Developers : 3587


In [15]:
df1.drop(columns=['Reviews'], inplace=True)
df1.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.drop(columns=['Reviews'], inplace=True)


(85103, 7)

In [16]:
df1.Tags[1]

'Indie,Action,Pixel Graphics,2D,Retro,Arcade,Score Attack,Minimalist,Comedy,Singleplayer,Fast-Paced,Casual,Funny,Parody,Difficult,Gore,Violent,Western,Controller,Blood'

In [17]:
df1.Genres[1]

'Action,Indie'

Tags have a lot of things about the game, which might be too much for our model. So, we will keep 'Genres' and remove 'Tags' + 'Tags' have a lot of null values compare to 'Genres'

In [18]:
df1.drop(columns=['Tags'], inplace=True)
df1.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.drop(columns=['Tags'], inplace=True)


(85103, 6)

In [19]:
df1.Categories[0]

'Single-player,Multi-player,Steam Achievements,Partial Controller Support'

In [20]:
df1.loc[:, 'Genres'] = df1['Genres'].apply(lambda x: x.split(',') if isinstance(x, str) else x)
df1.loc[:, 'Categories'] = df1['Categories'].apply(lambda x: x.split(',') if isinstance(x, str) else x)

In [21]:
df1.Genres[1]

['Action', 'Indie']

In [22]:
df1.dtypes

AppID              int64
Name              object
About the game    object
Genres            object
Categories        object
Developers        object
dtype: object

In [23]:
df1.dropna(inplace=True)
df1.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.dropna(inplace=True)


(80186, 6)

In [24]:
df1.loc[:, 'Genres'] = df1['Genres'].apply(lambda x: x[0])
df1.loc[:, 'Categories'] = df1['Categories'].apply(lambda x: x[0])

In [25]:
df1.head()

Unnamed: 0,AppID,Name,About the game,Genres,Categories,Developers
0,20200,Galactic Bowling,Galactic Bowling is an exaggerated and stylize...,Casual,Single-player,Perpetual FX Creative
1,655370,Train Bandit,THE LAW!! Looks to be a showdown atop a train....,Action,Single-player,Rusty Moyher
2,1732930,Jolt Project,Jolt Project: The army now has a new robotics ...,Action,Single-player,Campião Games
3,1355720,Henosis™,HENOSIS™ is a mysterious 2D Platform Puzzler w...,Adventure,Single-player,Odd Critter Games
4,1139950,Two Weeks in Painland,ABOUT THE GAME Play as a hacker who has arrang...,Adventure,Single-player,Unusual Games


In [26]:
df1.Categories.nunique()

25

In [27]:
df1.Genres.nunique()

27

In [28]:
df1.loc[:, 'About the game'] = (
    df1.loc[:, 'About the game'].astype(str) + " " +
    df1.loc[:, 'Genres'].astype(str) + " " +
    df1.loc[:, 'Categories'].astype(str) + " " +
    df1.loc[:, 'Developers'].astype(str)
)

In [29]:
df1.drop(columns=['Categories', 'Genres', 'Developers'], inplace=True)
df1.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.drop(columns=['Categories', 'Genres', 'Developers'], inplace=True)


Unnamed: 0,AppID,Name,About the game
0,20200,Galactic Bowling,Galactic Bowling is an exaggerated and stylize...
1,655370,Train Bandit,THE LAW!! Looks to be a showdown atop a train....
2,1732930,Jolt Project,Jolt Project: The army now has a new robotics ...
3,1355720,Henosis™,HENOSIS™ is a mysterious 2D Platform Puzzler w...
4,1139950,Two Weeks in Painland,ABOUT THE GAME Play as a hacker who has arrang...


In [30]:
stemmer = PorterStemmer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'</br>', ' ', text).strip()
    # tokens = word_tokenize(text)
    # text = ' '.join(stemmer.stem(word) for word in tokens if word not in stopwords)
    return text

In [31]:
df1.loc[:, 'About the game'] = df1.loc[:, 'About the game'].apply(clean_text)

In [32]:
# Path to the word tokenizer (usually Treebank tokenizer is used for word tokenization)
path = 'C:/Users/Ayush R/AppData/Roaming/nltk_data/tokenizers/punkt/english.pickle'

with open(path, 'rb') as f:
    tokenizer = pickle.load(f)

# Initialize Treebank Word Tokenizer (uses the punkt tokenizer for word tokenization)
word_tokenizer = TreebankWordTokenizer()

df1.loc[:, 'About the game'] = df1.loc[:, 'About the game'].apply(lambda x: word_tokenizer.tokenize(x))

In [33]:
ps = PorterStemmer()

def stem_words(text):
    if isinstance(text, str):  
        return " ".join([ps.stem(word) for word in text.split()])
    return text  

df1.loc[:, 'About the game'] = df1.loc[:, 'About the game'].apply(stem_words)

In [34]:
df1.loc[:, 'Name'] = df1.loc[:, 'Name'].apply(clean_text)

In [35]:
df1.loc[:, 'About the game'] = df1.loc[:, 'About the game'].apply(lambda x: ' '.join(x))

In [36]:
df1.head()

Unnamed: 0,AppID,Name,About the game
0,20200,galactic bowling,galactic bowling is an exaggerated and stylize...
1,655370,train bandit,the law looks to be a showdown atop a train th...
2,1732930,jolt project,jolt project the army now has a new robotics p...
3,1355720,henosis,henosis is a mysterious 2d platform puzzler wh...
4,1139950,two weeks in painland,about the game play as a hacker who has arrang...


In [37]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=3000,stop_words='english')

In [38]:
vectors = cv.fit_transform(df1['About the game']).toarray()
vectors.shape

(80186, 3000)

In [39]:
#Yaha par similarity matrix ki jo dimension hai vo (80186, 80186) hogi which will be very big. So, agar similarity matrix banana hai, toh best hai ki samples le lo.

from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD

# Reduce dimensions to 300 components
svd = TruncatedSVD(n_components=300, random_state=42)
reduced_vectors = svd.fit_transform(vectors)

# Fit the NearestNeighbors model with reduced dimensions
ann_model = NearestNeighbors(metric='cosine', algorithm='auto', n_neighbors=10)
ann_model.fit(reduced_vectors)

# Find the nearest neighbors for each sample
distances, indices = ann_model.kneighbors(reduced_vectors)
print(indices) 

[[    0  9890 18348 ...  7189 37721 16466]
 [    1 66082 58798 ... 78521  7450 52534]
 [    2 68063 40121 ... 36598 38973 25294]
 ...
 [80183 61644 70678 ... 36569  1533 36173]
 [80184 33022 63471 ... 74916 45854 23080]
 [80185 60908 78810 ... 77534  2284 62715]]


In [40]:
def recommend(game):
    game_index = df1[df1['Name'] == game].index[0]
    similar_games_indices = indices[game_index, 1:6]  
    for idx in similar_games_indices:
        print(df1.iloc[idx].Name)

In [41]:
df1.head()

Unnamed: 0,AppID,Name,About the game
0,20200,galactic bowling,galactic bowling is an exaggerated and stylize...
1,655370,train bandit,the law looks to be a showdown atop a train th...
2,1732930,jolt project,jolt project the army now has a new robotics p...
3,1355720,henosis,henosis is a mysterious 2d platform puzzler wh...
4,1139950,two weeks in painland,about the game play as a hacker who has arrang...


In [42]:
recommend("galactic bowling")

multiversus
tetris effect connected
wonder wickets
vtree beach volleyball
magical strings


In [43]:
pickle.dump(df1.to_dict(),open('game_dict.pkl','wb'))   

In [44]:
from joblib import dump

dump(ann_model, 'ann_model.joblib')          # Save the trained NearestNeighbors model
dump(indices, 'indices.joblib')              # Save the precomputed nearest neighbors indices

['indices.joblib']

In [45]:
dump(cv, 'count_vectorizer.joblib')  # Save the CountVectorizer model
dump(svd, 'svd.joblib')               # Save the TruncatedSVD model

['svd.joblib']

# DONE