In [39]:
import pandas as pd
import numpy as np
import scipy as sp
import os
from scipy import sparse
import pickle
from collections import Counter
from sklearn.preprocessing import MinMaxScaler

from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [40]:
# Load the games table from the data directory next to this notebook.
# A forward slash is used as the separator: the previous '\g' was an invalid
# string escape and only resolved as a path separator on Windows.
df = pd.read_csv('../data/games.csv')
df.columns

Index(['AppID', 'Name', 'Release date', 'Estimated owners', 'Peak CCU',
       'Required age', 'Price', 'DLC count', 'About the game',
       'Supported languages', 'Full audio languages', 'Reviews',
       'Header image', 'Website', 'Support url', 'Support email', 'Windows',
       'Mac', 'Linux', 'Metacritic score', 'Metacritic url', 'User score',
       'Positive', 'Negative', 'Score rank', 'Achievements', 'Recommendations',
       'Notes', 'Average playtime forever', 'Average playtime two weeks',
       'Median playtime forever', 'Median playtime two weeks', 'Developers',
       'Publishers', 'Categories', 'Genres', 'Tags', 'Screenshots', 'Movies'],
      dtype='object')

In [41]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Required age,Price,DLC count,About the game,Supported languages,...,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags,Screenshots,Movies
0,20200,Galactic Bowling,"Oct 21, 2008",0 - 20000,0,0,19.99,0,Galactic Bowling is an exaggerated and stylize...,['English'],...,0,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
1,655370,Train Bandit,"Oct 12, 2017",0 - 20000,0,0,0.99,0,THE LAW!! Looks to be a showdown atop a train....,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
2,1732930,Jolt Project,"Nov 17, 2021",0 - 20000,0,0,4.99,0,Jolt Project: The army now has a new robotics ...,"['English', 'Portuguese - Brazil']",...,0,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
3,1355720,Henosis™,"Jul 23, 2020",0 - 20000,0,0,5.99,0,HENOSIS™ is a mysterious 2D Platform Puzzler w...,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,0,Odd Critter Games,Odd Critter Games,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
4,1139950,Two Weeks in Painland,"Feb 3, 2020",0 - 20000,0,0,0.0,0,ABOUT THE GAME Play as a hacker who has arrang...,"['English', 'Spanish - Spain']",...,0,0,0,Unusual Games,Unusual Games,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...


In [42]:
# Parse 'Release date' strings such as "Oct 21, 2008"; values that do not
# match the format are coerced to NaT instead of raising.
df['Release date'] = pd.to_datetime(
    df['Release date'],
    format='%b %d, %Y',
    errors='coerce',
)

# Keep only the rows whose 'Peak CCU' is strictly positive.
select_cond = df['Peak CCU'] > 0
df = df.loc[select_cond]

In [43]:
# Count the missing values in each column.
df.isna().sum()

AppID                             0
Name                              0
Release date                     47
Estimated owners                  0
Peak CCU                          0
Required age                      0
Price                             0
DLC count                         0
About the game                   24
Supported languages               0
Full audio languages              0
Reviews                       16461
Header image                      0
Website                        8047
Support url                    8325
Support email                  3947
Windows                           0
Mac                               0
Linux                             0
Metacritic score                  0
Metacritic url                17946
User score                        0
Positive                          0
Negative                          0
Score rank                    20908
Achievements                      0
Recommendations                   0
Notes                       

In [44]:
# (rows, columns) of the current frame.
df.shape

(20926, 39)

In [45]:
# Minimum fraction of non-null values a column needs in order to be kept.
percent = .90

# Fraction of non-null values per column (kept for inspection).
column_percentages = df.count() / len(df)

# Drop every column that has fewer than `percent` of its values present.
df = df.dropna(
    axis=1,
    thresh=len(df) * percent,
)
df.shape

(20926, 32)

In [46]:
# Count the missing values in each remaining column.
df.isna().sum()

AppID                            0
Name                             0
Release date                    47
Estimated owners                 0
Peak CCU                         0
Required age                     0
Price                            0
DLC count                        0
About the game                  24
Supported languages              0
Full audio languages             0
Header image                     0
Windows                          0
Mac                              0
Linux                            0
Metacritic score                 0
User score                       0
Positive                         0
Negative                         0
Achievements                     0
Recommendations                  0
Average playtime forever         0
Average playtime two weeks       0
Median playtime forever          0
Median playtime two weeks        0
Developers                       0
Publishers                      68
Categories                     305
Genres              

In [47]:
# Drop every row that still contains a missing value. The result is rebound
# rather than mutated in place: `df` was derived from a boolean-filtered
# slice, where in-place mutation can raise SettingWithCopyWarning, and
# rebinding keeps the cell safe to re-run.
df = df.dropna()
df.shape

(17728, 32)

In [48]:
# Labels of the columns whose dtype is `object`.
column_object = df.columns[(df.dtypes == 'object').values]

In [49]:
# Object columns listed separately as multi-label; they are removed from
# `column_object` below.
multi_label_columns = [
    'Genres',
    'Supported languages',
    'Full audio languages',
    'Categories',
]
column_object = column_object.drop(labels=multi_label_columns)
column_object

Index(['Name', 'Estimated owners', 'About the game', 'Header image',
       'Developers', 'Publishers', 'Tags', 'Screenshots', 'Movies'],
      dtype='object')

In [50]:
# Plain Python list of every object-dtype column label.
li = [label for label in df.dtypes[df.dtypes == 'object'].keys()]
li

['Name',
 'Estimated owners',
 'About the game',
 'Supported languages',
 'Full audio languages',
 'Header image',
 'Developers',
 'Publishers',
 'Categories',
 'Genres',
 'Tags',
 'Screenshots',
 'Movies']

In [51]:
# Disabled: splitting the comma-separated 'Categories' / 'Genres' strings
# into lists.
# df['Categories'] = df['Categories'].str.split(',')
# df['Genres'] = df['Genres'].str.split(',')

# Count the missing values in each column.
df.isna().sum()

AppID                         0
Name                          0
Release date                  0
Estimated owners              0
Peak CCU                      0
Required age                  0
Price                         0
DLC count                     0
About the game                0
Supported languages           0
Full audio languages          0
Header image                  0
Windows                       0
Mac                           0
Linux                         0
Metacritic score              0
User score                    0
Positive                      0
Negative                      0
Achievements                  0
Recommendations               0
Average playtime forever      0
Average playtime two weeks    0
Median playtime forever       0
Median playtime two weeks     0
Developers                    0
Publishers                    0
Categories                    0
Genres                        0
Tags                          0
Screenshots                   0
Movies  

In [52]:
# Vectorizer that is fitted on one combined text document per game.
tf = TfidfVectorizer()

# String views of the columns that feed the combined document.
(
    text_data_about,
    text_data_genre,
    text_data_categories,
    text_data_developers,
    text_data_title,
) = (
    df[column].astype(str)
    for column in ('About the game', 'Genres', 'Categories', 'Developers', 'Name')
)

# Space-joined document: description, genres, developers, categories, title.
text_data = (
    text_data_about
    + ' ' + text_data_genre
    + ' ' + text_data_developers
    + ' ' + text_data_categories
    + ' ' + text_data_title
)

# TF-IDF features per game and the pairwise cosine similarity between all games.
tfidf_matrix = tf.fit_transform(text_data)
similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [53]:
# # The dataframe is already saved in data, so nothing more is needed there.
# # Save the TF-IDF matrix and the similarity matrix now.

# sparse.save_npz("../data/tfidif_matrix.npz", tfidf_matrix)
# #your_matrix_back = sparse.load_npz("yourmatrix.npz")

# type(similarity_matrix)

In [54]:
# # type(similarity_matrix)
# np.save("../data/similarity_matrix.npz", similarity_matrix)

In [59]:
# Persist the fitted TF-IDF vectorizer with pickle so it can be reloaded
# without refitting.
with open('../data/tf_vectorizer.pk1', mode='wb') as file:
    pickle.dump(tf, file)

In [18]:

# df_reset = df.reset_index(drop=True)
# df_reset.to_csv("../data/cosine.csv")

In [35]:
def CosineGameRecommended(gamename:str, tfidf_vectorizer, similarity_matrix, df, recommended_games:int=5):
    """Return the games most similar to ``gamename``.

    Similarity scores are read from the row of ``similarity_matrix`` that
    belongs to the requested game, so the function depends only on its
    arguments and not on notebook-level variables.

    Parameters
    ----------
    gamename : str
        Exact value to look up in ``df['Name']``. If several rows share the
        name, the first one is used.
    tfidf_vectorizer
        Accepted for backward compatibility with existing callers; it is not
        used because the scores come from ``similarity_matrix``.
    similarity_matrix : array-like of shape (len(df), len(df))
        Pairwise similarity between the rows of ``df``; row/column ``i`` must
        correspond to the ``i``-th row of ``df`` by position.
    df : pandas.DataFrame
        Frame with a ``'Name'`` column. Its index labels are irrelevant; rows
        are addressed by position.
    recommended_games : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (column label ``0``) holding the recommended game
        names, most similar first. The queried game itself is never included.

    Raises
    ------
    IndexError
        If ``gamename`` does not occur in ``df['Name']``.
    ValueError
        If ``recommended_games`` is negative or ``similarity_matrix`` is not a
        square matrix matching the number of rows in ``df``.
    """
    if recommended_games < 0:
        raise ValueError("recommended_games must be non-negative")

    name_column = df['Name']
    names = name_column.to_numpy()

    # Positional lookup so that a non-contiguous index on `df` is harmless.
    matches = np.flatnonzero((name_column == gamename).to_numpy())
    if matches.size == 0:
        raise IndexError(f"game {gamename!r} not found in df['Name']")
    query_pos = int(matches[0])

    scores_table = np.asarray(similarity_matrix)
    expected_shape = (len(names), len(names))
    if scores_table.shape != expected_shape:
        raise ValueError(
            f"similarity_matrix has shape {scores_table.shape}, expected {expected_shape}"
        )
    similarity_scores = scores_table[query_pos]

    # Rank by descending score. The query row is removed explicitly rather
    # than assuming it is the single best match, which breaks on ties
    # (e.g. duplicate games with identical text).
    ranked = np.argsort(-similarity_scores, kind='stable')
    top_indices = ranked[ranked != query_pos][:recommended_games]

    top_games = names[top_indices].tolist()

    return pd.DataFrame(top_games)

In [36]:
# Example query: recommendations for one title with the default list length.
recommendations = CosineGameRecommended(
    gamename='PUBG: BATTLEGROUNDS',
    tfidf_vectorizer=tf,
    similarity_matrix=similarity_matrix,
    df=df,
)
print("Top Recommendations:", recommendations)