In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tabulate import tabulate

#Load dataframe
df = pd.read_csv('formattedSteamGames.csv')

# Splitting categories and genres into lists
df['categories'] = df['categories'].fillna('')
df['genres'] = df['genres'].fillna('')
df['steamspy_tags'] = df['steamspy_tags'].fillna('')

df['combined_features'] = df['categories'].fillna('') + ';' + df['genres'].fillna('') + ';' + df['steamspy_tags'].fillna('')

# Initialize the TfidfVectorizer with a custom tokenizer
vectorizer = TfidfVectorizer(stop_words='english', tokenizer=lambda x: x.split(';'), token_pattern=None)
tfidf_matrix = vectorizer.fit_transform(df['combined_features'])

# Calculate the cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [2]:
def get_recommendations(game_name):
    #Get the index of the game that matches the title
    game_index = df[df['name'] == game_name].index[0]

    #Get the pairwise similarity scores
    sim_scores = list(enumerate(cosine_sim[game_index]))

    #Sort the games based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    #Get Game Names and Game Distances separately since we will need data in that format
    gameNames = []
    gameDistances = []
    for name, distance in sim_scores[1:11]:
        gameNames.append(df['name'][name])
        gameDistances.append(distance)

    #Create list of distances and game names which will be used in response to create table
    distancesAndGameNames = [gameDistances, gameNames]

    return distancesAndGameNames

In [3]:
# Enter game name and get recommendation.
# TODO: Create that user can enter game instead of this 
input_game = "Dota 2"
returnedGamesAndDistances = get_recommendations(input_game)

#Format response in table format.
formattedResultList = []
for similarity, name in zip(returnedGamesAndDistances[0], returnedGamesAndDistances[1]):
    formattedResultList.append([name, "{:.2f}".format((similarity) * 100)])

#Print similar games
print(tabulate(formattedResultList, headers=['Game Name', 'Similarity  (%)'], tablefmt="outline"))

+-----------------------------------+-------------------+
| Game Name                         |   Similarity  (%) |
| Awesomenauts - the 2D moba        |             67.17 |
| Games of Glory                    |             67.12 |
| Strife®                           |             64.68 |
| Prime World                       |             64.15 |
| Iron League                       |             63.47 |
| Heroes of SoulCraft - Arcade MOBA |             61.83 |
| SMITE®                            |             60.66 |
| Vainglory                         |             60.08 |
| Bloodline Champions               |             56.21 |
| Immortal Empire                   |             56.02 |
+-----------------------------------+-------------------+


# Calculate Diversity

In [None]:
#Utility matrix calculated 
formattedSimilaritiesDf = pd.DataFrame(cosine_sim, columns=df['name'], index=df['name']).reset_index()

In [None]:
#Diversity
#NOTE: We are using 1-cosine_sim in this case since we calculated cosine_similarity and not the cosine distance
diversity = 1 - cosine_sim.mean()
print("Calculated diversity: " + str(diversity))

Calculated diversity: 0.7982115286979914
