CONTENT-BASED FILTERING

Using Cosine Similarity

In [1]:
import re
import json
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the pre-processed training set; every later feature matrix is built
# from this frame, so its row order defines the row order of those matrices.
training_csv = 'datasets/training_set.csv'
games_df = pd.read_csv(training_csv)

# Bare last expression: shows the (truncated) frame as the cell output.
games_df

Unnamed: 0,app_id,title,positive_ratio,year,description_with_tags,os_label
0,13500,prince of persia warrior within,84,2008,enter the dark underworld of prince of persia ...,0
1,22364,brink agents of change,85,2011,action,0
2,113020,monaco what s yours is mine,92,2013,monaco what s yours is mine is a single player...,6
3,226560,escape dead island,61,2014,escape dead island is a survival mystery adven...,0
4,249050,dungeon of the endless,88,2014,dungeon of the endless is a rogue like dungeon...,3
...,...,...,...,...,...,...
49638,2455060,taboo trial,94,2023,in the rogue action game taboo trial you will ...,0
49639,1138640,hometopia,61,2023,build better together hometopia is a seriously...,0
49640,2515460,northgard kernev clan of the stoat,67,2023,strategy indie simulation,6
49641,1687000,fading afternoon,79,2023,seiji maruyama is a middle aged yakuza recentl...,0


In [3]:
# Split the frame into the free-text column (vectorized with TF-IDF below)
# and the numeric columns (standard-scaled below).
numeric_columns = ['positive_ratio', 'year', 'os_label']
text_data = games_df.loc[:, 'description_with_tags']
numeric_data = games_df.loc[:, numeric_columns]

In [4]:
# Turn the description/tag text into TF-IDF features over word 1- to 3-grams.
# English stop words are removed. min_df=0.01 discards any n-gram that occurs
# in fewer than 1% of the documents, which is the binding constraint here:
# the printed shape further down shows 555 terms, well below max_features.
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    min_df=0.01,
    max_features=10000,
    stop_words='english',
)
tfidf_matrix = vectorizer.fit_transform(text_data)

In [5]:
# Standard-scale the numeric columns, then wrap the result in a CSR matrix so
# it can be stacked next to the sparse TF-IDF matrix.
# NOTE(review): os_label looks like a categorical code; scaling it treats the
# codes as ordered quantities - confirm that is intended.
scaler = StandardScaler()
scaler.fit(numeric_data)
scaled_numeric = scaler.transform(numeric_data)
scaled_numeric_sparse = csr_matrix(scaled_numeric)

In [6]:
# Place the text features and the scaled numeric features side by side
# (column-wise), one row per game.
# NOTE(review): the similarity cells visible below are fed tfidf_matrix only,
# not combined_features - confirm whether the numeric columns should take part.
feature_blocks = (tfidf_matrix, scaled_numeric_sparse)
combined_features = hstack(feature_blocks)

In [7]:
# Sanity check: (number of games, number of TF-IDF terms kept).
n_docs, n_terms = tfidf_matrix.shape
print((n_docs, n_terms))

(49643, 555)


In [8]:
# Sanity check: (number of games, number of scaled numeric columns).
n_rows, n_numeric = scaled_numeric_sparse.shape
print((n_rows, n_numeric))

(49643, 3)


In [9]:
# TensorFlow's dense matmul needs a dense input: materialize the sparse
# TF-IDF matrix as a NumPy array first, then hand it to TensorFlow as a
# float32 tensor.
tfidf_dense = tfidf_matrix.toarray()
tfidf_tensor = tf.convert_to_tensor(value=tfidf_dense, dtype=tf.float32)

In [10]:
# Compute cosine similarity
def cosine_similarity_tf(tfidf_tensor):
    """Return the pairwise cosine-similarity matrix of the rows of a tensor.

    Args:
        tfidf_tensor: 2-D float tensor with one feature vector per row.

    Returns:
        A square tensor whose entry [i, j] is the cosine similarity between
        row i and row j. A row with zero L2 norm (a document that contains no
        in-vocabulary term) gets similarity 0 to every row, including itself,
        instead of NaN.
    """
    # Per-row L2 norm, kept as a column so it broadcasts across the features.
    norm = tf.norm(tfidf_tensor, axis=1, keepdims=True)

    # A plain division yields 0/0 = NaN for all-zero rows, and that NaN then
    # spreads to the whole row and column of the similarity matrix.
    # divide_no_nan returns 0 wherever the denominator is 0.
    tfidf_normalized = tf.math.divide_no_nan(tfidf_tensor, norm)

    # Dot products of unit-length rows are the cosine similarities.
    similarity_matrix = tf.matmul(tfidf_normalized, tfidf_normalized, transpose_b=True)

    return similarity_matrix

In [11]:
# Build the full game-by-game similarity matrix on the first GPU.
# The result is dense float32 with one row and one column per game; at the
# shape printed below (49643 x 49643) that is roughly 9.9 GB of memory.
compute_device = '/GPU:0'
with tf.device(compute_device):
    similarity_matrix = cosine_similarity_tf(tfidf_tensor)

print(similarity_matrix.shape)

(49643, 49643)


USER INPUT 

For testing, pick a game title from `datasets/output.csv` and assign it to `input_title`.

In [15]:
games_df_main = pd.read_csv('datasets/output.csv')

# The similarity matrix was built from datasets/training_set.csv, but titles
# are looked up in datasets/output.csv. A row position found here is only
# meaningful if both files list the same games in the same order.
# NOTE(review): only the row count can be checked here - confirm that the
# row order of the two files really matches.
if len(games_df_main) != similarity_matrix.shape[0]:
    raise ValueError(
        f"datasets/output.csv has {len(games_df_main)} rows but the similarity "
        f"matrix has {similarity_matrix.shape[0]}; the two are not row-aligned"
    )

# Input game and index
input_title = "RIOT: Civil Unrest"  # USER INPUT GOES HERE

# Use the row *position* (not the index label) of the first exact title match,
# because it is later used to index the similarity matrix and with .iloc.
title_positions = np.flatnonzero((games_df_main['title'] == input_title).to_numpy())
if title_positions.size == 0:
    raise ValueError(f"Title {input_title!r} was not found in datasets/output.csv")
input_index = int(title_positions[0])

print(input_index)

55


In [16]:
# Row `input_index` of the matrix holds the similarity of the chosen game
# to every game (itself included).
similarity_scores = similarity_matrix[input_index, :]

Results of Top 5 Similar Games to your input game

In [17]:
# Get top N similar games (excluding the input game itself)
N = 5

# Ask for one extra candidate so that N remain once the input game is removed.
top_values, top_indices = tf.math.top_k(similarity_scores, k=N + 1)

# Drop the input game by its row index rather than by assuming it is ranked
# first (ties at the top score could put another game ahead of it), and apply
# the same selection to the scores so that scores and indices stay paired.
keep_positions = np.flatnonzero(top_indices.numpy() != int(input_index))[:N]
top_values = tf.gather(top_values, keep_positions)
top_indices = tf.gather(top_indices, keep_positions)

# Convert to numpy for indexing into the pandas dataframe
top_indices_np = top_indices.numpy()
top_values_np = top_values.numpy()

# Get details of top similar games from the games dataframe
top_similar_games = games_df_main.iloc[top_indices_np]

# Print the titles of the top similar games
print(f"TOP {N} SIMILAR GAMES:")
print(top_similar_games['title'])

print()
print('------------------------------------------------------------------------------------------------------')
print(f'SIMILARITY SCORES FOR TOP {N} SIMILAR GAMES:')
# row_position is a row position in games_df_main, not a Steam app id.
for row_position, score in zip(top_indices_np, top_values_np):
    print(f"Item index: {row_position} \nGame: {games_df_main.iloc[row_position]['title']} \nSimilarity score: {score}")
    print()
    



TOP 5 SIMILAR GAMES:
17470                  Prison Architect - Undead
13168           Prison Architect - Perfect Storm
37689    Prison Architect - Cleared For Transfer
1013                 Tank Warfare: Longstop Hill
34993                       Battlefleet Engineer
Name: title, dtype: object

------------------------------------------------------------------------------------------------------
SIMILARITY SCORES FOR TOP 5 SIMILAR GAMES:
Item index: 17470 
Game: Prison Architect - Undead 
Similarity score: 1.0

Item index: 13168 
Game: Prison Architect - Perfect Storm 
Similarity score: 0.607552170753479

Item index: 37689 
Game: Prison Architect - Cleared For Transfer 
Similarity score: 0.5513043403625488

Item index: 1013 
Game: Tank Warfare: Longstop Hill 
Similarity score: 0.5513043403625488

Item index: 34993 
Game: Battlefleet Engineer 
Similarity score: 0.525551974773407

