In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import WordNetLemmatizer
from sklearn.metrics.pairwise import linear_kernel
from string import printable

# Load Data

In [2]:
main = pd.read_csv('steam_clean.csv')
#main.head()

In [3]:
desc = pd.read_csv('steam_description_data_clean.csv')
#desc.head(3)

In [4]:
#rename the ID column so I can merge on the same id
main.rename(columns={'appid':'steam_appid'}, inplace=True)

In [5]:
#merge the two dataset on the steam_appid
dataset = pd.merge(main, desc, on='steam_appid')

In [6]:
dataset.head(3)

Unnamed: 0,steam_appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,...,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price,detailed_description,about_the_game,short_description
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,...,0,124534,3339,17612,317,10000000-20000000,7.19,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,...,0,3318,633,277,62,5000000-10000000,3.99,One of the most popular online action games of...,One of the most popular online action games of...,One of the most popular online action games of...
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,...,0,3416,398,187,34,5000000-10000000,3.99,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...


In [7]:
#create a PD series with all text data in one column
#X = pd.Series(dataset[['name','platforms', 'categories', 'genres', 'detailed_description', 'about_the_game']].fillna('').values.tolist()).str.join(' ')

# Data preprocess

In [8]:
X = dataset[['name','platforms', 'categories', 'genres', 'detailed_description', 'about_the_game']]

#replace all ; with a blank space
X['categories'] = X['categories'].str.replace(';',' ')
X['platforms'] = X['platforms'].str.replace(';',' ')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [9]:
cols = ['platforms', 'categories', 'genres', 'detailed_description', 'about_the_game']

#creating a new column with all data
X['all_data'] =  X[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [10]:
#X['all_data'][0]

# Create model

In [11]:
all_data = X['all_data']

In [12]:
vector_words = TfidfVectorizer(stop_words='english')

In [13]:
tfidf_matrix = vector_words.fit_transform(all_data)

In [14]:
#tfidf_matrix.shape

In [15]:
#print(vector_words.get_feature_names())

In [16]:
#using linear_kernel to calculate the Cosine similarity between two vectors
sim = linear_kernel(tfidf_matrix,tfidf_matrix)

In [17]:
#create a new Series where the index is the title of the game
indices = pd.Series(X.index, index=X['name']).drop_duplicates()

In [18]:
def get_recommendation(name, sim=sim, indices=indices):
    
    #get the id for the game title
    game_id = indices[name]
    
    #create a new list where the similarity scores for a game is saved, the list contains the
    #scores for the game we asked for.
    sim_scores = list(enumerate(sim[game_id]))
    
    #sort the list
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    #Takes the top 10 games in the list
    sim_scores = sim_scores[1:10]
    
    #get the id of the (10)games that has the best similarity scores
    game_indices = [i[0] for i in sim_scores]
    
    #return the name of the games
    return indices.iloc[game_indices]
    

In [19]:
get_recommendation("Fallout 3")

name
Fallout 3: Game of the Year Edition            481
Fallout Shelter                              13197
Fallout 4                                     5916
Dungeons & Dragons: Chronicles of Mystara     1618
H.I.S.T.O.R.Y T.O.R.C.H.K.A                  10758
Wasteland 2: Director's Cut                   1794
Subspace Continuum                            4974
StarShip Constructor                         12507
NaN                                          20411
dtype: int64