In [1]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer
from scipy import spatial
from tabulate import tabulate

def get_distance(game1, game2):
    #Take array index 0 to 29 values for genres_similarity since these are genres.
    #List starts with index 0, so we are taking 0 to 29 since 29th will not be taken 
    genres_similarity = spatial.distance.cosine(game1[:29], game2[:29])
    
    #Take 29 to 58 values for categories_similarity since these are categories
    categories_similarity = spatial.distance.cosine(game1[29:58], game2[29:58])

    #Take 58 to 397 values for Steam Tags
    tags_similarity = spatial.distance.cosine(game1[58:397], game2[58:397])

    #Custom indices (TEI, PAI, ACI, GNI) parameters
    time_engaged_similarity = spatial.distance.cosine(game1[397:407], game2[397:407])
    player_acclaim_similarity = spatial.distance.cosine(game1[407:417], game2[407:417])
    adopters_choice_similarity = spatial.distance.cosine(game1[417:427], game2[417:427])
    game_nexus_similarity = spatial.distance.cosine(game1[427:437], game2[427:437])
    
    # Apply weights
    weighted_distance = (0.05 * tags_similarity) + (0.3 * genres_similarity) + (0.05 * categories_similarity) + (0.6 * game_nexus_similarity)

    return weighted_distance

# --------------------------------------- Recommender --------------------------------------- #
#Load dataframe
df = pd.read_csv('formattedSteamGames.csv')

# Splitting categories and genres into lists
df['categories'] = df['categories'].fillna('').str.split(';')
df['genres'] = df['genres'].fillna('').str.split(';')
df['steamspy_tags'] = df['steamspy_tags'].fillna('').str.split(';')

# Creating one-hot encoded vectors for categories and genres
mlb = MultiLabelBinarizer()
categories_encoded = pd.DataFrame(mlb.fit_transform(df['categories']), columns=mlb.classes_, index=df.index)
genres_encoded = pd.DataFrame(mlb.fit_transform(df['genres']), columns=mlb.classes_, index=df.index)
steamspy_tags_encoded = pd.DataFrame(mlb.fit_transform(df['steamspy_tags']), columns=mlb.classes_, index=df.index)
time_engaged_category_encoded = pd.get_dummies(df['time_engaged_category'], prefix='tec', dtype=int)
player_acclaim_category_encoded = pd.get_dummies(df['player_acclaim_category'], prefix='pac', dtype=int)
adopters_choice_category_encoded = pd.get_dummies(df['adopters_choice_category'], prefix='acc', dtype=int)
game_nexus_category_encoded = pd.get_dummies(df['game_nexus_category'], prefix='gnc', dtype=int)

# Concatenating encoded columns with original DataFrame
df_encoded = pd.concat([df['name'], categories_encoded, genres_encoded, steamspy_tags_encoded, time_engaged_category_encoded, player_acclaim_category_encoded, adopters_choice_category_encoded, game_nexus_category_encoded], axis=1)

# Creating feature vectors by combining encoded categories and genres
feature_vectors = pd.concat([categories_encoded, genres_encoded, steamspy_tags_encoded, time_engaged_category_encoded, player_acclaim_category_encoded, adopters_choice_category_encoded, game_nexus_category_encoded], axis=1)

# Instantiate and fit the KNN model
knn_model = NearestNeighbors(n_neighbors=11, metric=get_distance)
knn_model.fit(feature_vectors.values)


# Function to get recommendations for a given game name
def get_recommendations(game_name):

    #For the game name we passed to function, we will find it's index from DataFrame
    game_index = df[df['name'] == game_name].index[0]

    #Get distances and indices for selected game index.
    distances, indices = knn_model.kneighbors([feature_vectors.iloc[game_index]])
     
    #Get simillar game names and exclude the input game itself
    similar_game_names = df.iloc[indices[0][1:]]['name'].astype(str).values.tolist()
    
    #Get simillar game distances and exclude the input game itself
    similar_game_distances = distances[0][1:].tolist()

    #Create list of distances and game names which will be used in response to create table
    distancesAndGameNames = [similar_game_distances, similar_game_names]

    return distancesAndGameNames


In [2]:
# Enter game name and get recommendation.
# TODO: Create that user can enter game instead of this 
selected_game = "Dota 2"
returnedGamesAndDistances = get_recommendations(selected_game)

#Format response in table format.
formattedResultList = []
for similarity, name in zip(returnedGamesAndDistances[0], returnedGamesAndDistances[1]):
    formattedResultList.append([name, "{:.2f}".format((1 - similarity) * 100)])

#Print similar games
print(tabulate(formattedResultList, headers=['Game Name', 'Similarity  (%)'], tablefmt="outline"))

+----------------------------------+-------------------+
| Game Name                        |   Similarity  (%) |
| Line of Sight                    |             84.6  |
| Warframe                         |             84.27 |
| Counter-Strike: Global Offensive |             84.13 |
| Dropzone                         |             83.7  |
| Atlas Reactor                    |             82.63 |
| Creativerse                      |             82.61 |
| Killing Floor 2                  |             82.53 |
| Awesomenauts - the 2D moba       |             82.36 |
| Insurgency                       |             82.23 |
| Team Fortress 2                  |             82.12 |
+----------------------------------+-------------------+


# Calculate Diversity

In [None]:
# Create an empty utility matrix with the same index and columns as our feature_vectors
num_games = len(feature_vectors)
utility_matrix = pd.DataFrame(index=feature_vectors.index, columns=feature_vectors.index)

In [None]:
# Iterate through each pair of games and calculate the distance using get_distance function
# NOTE: This job is very difficult and time consuming for running. Run it on some cloud provider
for i in range(num_games):
    for j in range(num_games):
        if i != j:
            game1 = feature_vectors.iloc[i].values
            game2 = feature_vectors.iloc[j].values

            # Calculate distance using your get_distance function
            dist = get_distance(game1, game2)

            # Assign the distance to the utility matrix
            utility_matrix.iloc[i, j] = dist

In [None]:
#Calculate diversity
diversity = utility_matrix.mean().mean()
print(diversity)