In [3]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, average_precision_score
from sklearn.mixture import GaussianMixture
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior() 
from tensorflow.python.framework import ops
from collections import Counter
import re
import pathlib
import pickle

Instructions for updating:
non-resource variables are not supported in the long term


In [4]:
path = '/home/jewelle/data_bootcamp/LHL-final-project/data/'

df = pd.read_csv(path + 'final.csv')

In [5]:
df.head()

Unnamed: 0,userID,game,action,hours_played,all_reviews,developer,publisher,popular_tags,game_details,genre
0,151603712,Fallout 4,purchase,1.0,"Mostly Positive,(90,387),- 70% of the 90,387 u...",Bethesda Game Studios,"Bethesda Softworks,Bethesda Softworks","Open World,Post-apocalyptic,Exploration,Single...","Single-player,Steam Achievements,Full controll...",RPG
1,151603712,Fallout 4,play,87.0,"Mostly Positive,(90,387),- 70% of the 90,387 u...",Bethesda Game Studios,"Bethesda Softworks,Bethesda Softworks","Open World,Post-apocalyptic,Exploration,Single...","Single-player,Steam Achievements,Full controll...",RPG
2,87445402,Fallout 4,purchase,1.0,"Mostly Positive,(90,387),- 70% of the 90,387 u...",Bethesda Game Studios,"Bethesda Softworks,Bethesda Softworks","Open World,Post-apocalyptic,Exploration,Single...","Single-player,Steam Achievements,Full controll...",RPG
3,87445402,Fallout 4,play,83.0,"Mostly Positive,(90,387),- 70% of the 90,387 u...",Bethesda Game Studios,"Bethesda Softworks,Bethesda Softworks","Open World,Post-apocalyptic,Exploration,Single...","Single-player,Steam Achievements,Full controll...",RPG
4,25096601,Fallout 4,purchase,1.0,"Mostly Positive,(90,387),- 70% of the 90,387 u...",Bethesda Game Studios,"Bethesda Softworks,Bethesda Softworks","Open World,Post-apocalyptic,Exploration,Single...","Single-player,Steam Achievements,Full controll...",RPG


In [6]:
df.loc[(df['action'] == 'purchase') & (df['hours_played'] == 1.0), 'hours_played'] = 0
df = df.sort_values(['userID', 'game', 'hours_played'])
clean_df = df.drop_duplicates(['userID', 'game'], keep = 'last')

In [7]:
clean_df.drop(['action'], axis=1, inplace=True)

In [8]:
clean_df.shape

(56137, 9)

In [9]:
clean_df.head()

Unnamed: 0,userID,game,hours_played,all_reviews,developer,publisher,popular_tags,game_details,genre
51896,5250,Alien Swarm,4.9,"Very Positive,(18,439),- 94% of the 18,439 use...",Valve,"Valve,Valve","Free to Play,Co-op,Action,Multiplayer,Aliens,O...","Single-player,Multi-player,Co-op,Steam Achieve...",Action
58359,5250,Counter-Strike,0.0,"Overwhelmingly Positive,(66,438),- 96% of the ...",Valve,"Valve,Valve","Action,FPS,Multiplayer,Shooter,Classic,Team-Ba...","Multi-player,Online Multi-Player,Local Multi-P...",Action
59534,5250,Day of Defeat,0.0,"Very Positive,(2,022),- 86% of the 2,022 user ...",Valve,"Valve,Valve","FPS,World War II,Multiplayer,Shooter,Action,Wa...","Multi-player,Valve Anti-Cheat enabled",Action
60143,5250,Deathmatch Classic,0.0,"Very Positive,(953),- 80% of the 953 user revi...",Valve,"Valve,Valve","Action,FPS,Classic,Multiplayer,Shooter,First-P...","Multi-player,Online Multi-Player,Local Multi-P...",Action
12808,5250,Dota 2,0.2,"Very Positive,(1,015,621),- 85% of the 1,015,6...",Valve,"Valve,Valve","Free to Play,MOBA,Multiplayer,Strategy,e-sport...","Multi-player,Co-op,Steam Trading Cards,Steam W...","Action,Free to Play,Strategy"


In [10]:
#check for missing values in clean_df
total = clean_df.isnull().sum().sort_values(ascending=False)
percent = (clean_df.isnull().sum()/clean_df.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(6)

Unnamed: 0,Total,Percent
publisher,890,0.015854
genre,352,0.00627
developer,196,0.003491
all_reviews,178,0.003171
popular_tags,102,0.001817
game_details,80,0.001425


In [11]:
steam_clean = clean_df.dropna(how='any', subset=['publisher', 'genre', 'developer', 
                                                 'all_reviews', 'popular_tags', 'game_details'])

In [12]:
#check for missing values in steam_clean
total = steam_clean.isnull().sum().sort_values(ascending=False)
percent = (steam_clean.isnull().sum()/steam_clean.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head()

Unnamed: 0,Total,Percent
genre,0,0.0
game_details,0,0.0
popular_tags,0,0.0
publisher,0,0.0
developer,0,0.0


In [13]:
#steam refunds games played for less than 2 hours, so logical to remove games with fewer hours palyed than 2.0
steam_df = steam_clean[steam_clean['hours_played'] > 2.0] 

In [14]:
#also filter for games with user count >20
steam_df.groupby('game')['userID'].count()

steam_train = pd.DataFrame(steam_df[steam_df.groupby('game').userID.transform('count')>19])

In [15]:
steam_train.dtypes

userID            int64
game             object
hours_played    float64
all_reviews      object
developer        object
publisher        object
popular_tags     object
game_details     object
genre            object
dtype: object

In [16]:
isinstance(steam_train, pd.DataFrame)

True

In [17]:
steam_train.head()

Unnamed: 0,userID,game,hours_played,all_reviews,developer,publisher,popular_tags,game_details,genre
51896,5250,Alien Swarm,4.9,"Very Positive,(18,439),- 94% of the 18,439 use...",Valve,"Valve,Valve","Free to Play,Co-op,Action,Multiplayer,Aliens,O...","Single-player,Multi-player,Co-op,Steam Achieve...",Action
27686,5250,Portal 2,13.6,"Overwhelmingly Positive,(104,354),- 98% of the...",Valve,"Valve,Valve","Puzzle,Co-op,First-Person,Sci-fi,Comedy,Single...","Single-player,Co-op,Steam Achievements,Full co...","Action,Adventure"
34318,76767,Banished,24.0,"Very Positive,(23,931),- 89% of the 23,931 use...",Shining Rock Software LLC,"Shining Rock Software LLC,Shining Rock Softwar...","City Builder,Strategy,Simulation,Survival,Indi...","Single-player,Steam Achievements","Indie,Simulation,Strategy"
58286,76767,Counter-Strike,365.0,"Overwhelmingly Positive,(66,438),- 96% of the ...",Valve,"Valve,Valve","Action,FPS,Multiplayer,Shooter,Classic,Team-Ba...","Multi-player,Online Multi-Player,Local Multi-P...",Action
27625,76767,Portal 2,15.0,"Overwhelmingly Positive,(104,354),- 98% of the...",Valve,"Valve,Valve","Puzzle,Co-op,First-Person,Sci-fi,Comedy,Single...","Single-player,Co-op,Steam Achievements,Full co...","Action,Adventure"


In [18]:
n_users = len(steam_train.userID.unique())
n_games = len(steam_train.game.unique())

print('There are {0} users and {1} games in the data'.format(n_users, n_games))

There are 6521 users and 174 games in the data


In [19]:
steam_train = steam_train.rename(columns={'popular_tags': 'tags'})

In [20]:
# remove spaces and special character from game name in dataset
for i, row in steam_train.iterrows():
    clean = re.sub('[^A-Za-z0-9]+', '', row["game"])
    clean = clean.lower()
    steam_train.at[i, 'ID'] = clean

In [21]:
def clean_data(x):
    if isinstance(x, str):
        return x.replace(" ", "")
    else:
        print(x)
        return x

In [22]:
steam_train.loc[:, 'genre'] = steam_train['genre'].apply(clean_data)
steam_train.loc[:, 'game_details'] = steam_train['game_details'].apply(clean_data)
steam_train.loc[:, 'tags'] = steam_train['tags'].apply(clean_data)
steam_train.loc[:, 'publisher'] = steam_train['publisher'].apply(clean_data)
steam_train.loc[:, 'developer'] = steam_train['developer'].apply(clean_data)

In [23]:
# create some column containing a mix of different information

steam_train["genre_publisher_developer"] = steam_train['genre'] + steam_train['publisher'] + steam_train['developer']
steam_train["genre_tags_developer"] = steam_train['genre'] + steam_train['tags'] + steam_train['developer']
steam_train["genre_tags_game_details"] = steam_train['genre'] + steam_train['tags'] + steam_train['game_details']
steam_train["genre_publisher_developer_game_details"] = steam_train['genre'] + steam_train['publisher'] + steam_train['developer'] + steam_train['game_details']

In [24]:
for i, row in steam_train.iterrows():
    if type(row["all_reviews"]) == str:

        # extract % of positive reviews
        x = re.findall(r'- [0,1,2,3,4,5,6,7,8,9]*%', row["all_reviews"])
        if len(x) != 0:
            steam_train.at[i, 'percentage_positive_review'] = x[0].translate({ord(i): None for i in '- %'})

        # extract qualification of reviews
        reviewParse = row["all_reviews"].split(",")
        if 'user reviews' in reviewParse[0]:
            steam_train.at[i, 'review_qualification'] = ""
        else:
            steam_train.at[i, 'review_qualification'] = reviewParse[0]

In [25]:
steam_train.drop(['all_reviews'], axis=1, inplace=True)

In [26]:
steam_train.head()

Unnamed: 0,userID,game,hours_played,developer,publisher,tags,game_details,genre,ID,genre_publisher_developer,genre_tags_developer,genre_tags_game_details,genre_publisher_developer_game_details,percentage_positive_review,review_qualification
51896,5250,Alien Swarm,4.9,Valve,"Valve,Valve","FreetoPlay,Co-op,Action,Multiplayer,Aliens,Onl...","Single-player,Multi-player,Co-op,SteamAchievem...",Action,alienswarm,"ActionValve,ValveValve","ActionFreetoPlay,Co-op,Action,Multiplayer,Alie...","ActionFreetoPlay,Co-op,Action,Multiplayer,Alie...","ActionValve,ValveValveSingle-player,Multi-play...",94,Very Positive
27686,5250,Portal 2,13.6,Valve,"Valve,Valve","Puzzle,Co-op,First-Person,Sci-fi,Comedy,Single...","Single-player,Co-op,SteamAchievements,Fullcont...","Action,Adventure",portal2,"Action,AdventureValve,ValveValve","Action,AdventurePuzzle,Co-op,First-Person,Sci-...","Action,AdventurePuzzle,Co-op,First-Person,Sci-...","Action,AdventureValve,ValveValveSingle-player,...",98,Overwhelmingly Positive
34318,76767,Banished,24.0,ShiningRockSoftwareLLC,"ShiningRockSoftwareLLC,ShiningRockSoftwareLLC","CityBuilder,Strategy,Simulation,Survival,Indie...","Single-player,SteamAchievements","Indie,Simulation,Strategy",banished,"Indie,Simulation,StrategyShiningRockSoftwareLL...","Indie,Simulation,StrategyCityBuilder,Strategy,...","Indie,Simulation,StrategyCityBuilder,Strategy,...","Indie,Simulation,StrategyShiningRockSoftwareLL...",89,Very Positive
58286,76767,Counter-Strike,365.0,Valve,"Valve,Valve","Action,FPS,Multiplayer,Shooter,Classic,Team-Ba...","Multi-player,OnlineMulti-Player,LocalMulti-Pla...",Action,counterstrike,"ActionValve,ValveValve","ActionAction,FPS,Multiplayer,Shooter,Classic,T...","ActionAction,FPS,Multiplayer,Shooter,Classic,T...","ActionValve,ValveValveMulti-player,OnlineMulti...",96,Overwhelmingly Positive
27625,76767,Portal 2,15.0,Valve,"Valve,Valve","Puzzle,Co-op,First-Person,Sci-fi,Comedy,Single...","Single-player,Co-op,SteamAchievements,Fullcont...","Action,Adventure",portal2,"Action,AdventureValve,ValveValve","Action,AdventurePuzzle,Co-op,First-Person,Sci-...","Action,AdventurePuzzle,Co-op,First-Person,Sci-...","Action,AdventureValve,ValveValveSingle-player,...",98,Overwhelmingly Positive


In [27]:
listGames = steam_train['game'].unique()

listGames

array(['Alien Swarm', 'Portal 2', 'Banished', 'Counter-Strike',
       'Far Cry 3', 'Left 4 Dead 2', 'Torchlight II', 'Worms Reloaded',
       '7 Days to Die', 'BioShock Infinite', 'Borderlands 2',
       'Dishonored', "Don't Starve", 'Dungeons of Dredmor', 'Dying Light',
       'Fallout 4', "Garry's Mod", 'Hotline Miami', 'Killing Floor 2',
       'Legend of Grimrock', 'Magicka', 'Mark of the Ninja',
       'Mass Effect 2', 'Max Payne 3', 'Mortal Kombat X',
       'Natural Selection 2', 'Orcs Must Die! 2', 'PAYDAY 2',
       'Pillars of Eternity', 'Prison Architect', 'RAGE', 'Rogue Legacy',
       'Rust', 'Shadowrun Returns', 'Starbound', 'State of Decay',
       'Team Fortress 2', 'Terraria', 'The Forest', 'The Wolf Among Us',
       'Thief', 'Tomb Raider', 'Torchlight', 'Tropico 4', 'Unturned',
       'Half-Life 2', 'War Thunder', 'Alan Wake', 'DayZ',
       'Duke Nukem Forever', 'F.E.A.R. 3', 'Killing Floor', 'Left 4 Dead',
       'Mass Effect', 'Portal', 'Red Faction Guerrilla Ste

In [28]:
n_recommendation = 5

col_names = list(map(str, range(1, n_recommendation + 1)))
col_names = ['userID'] + col_names

indices = pd.Series(steam_train.index, index=steam_train['game']).drop_duplicates()

In [29]:
def get_sim_games(title, cosine_sim):

    if title not in listGames:
        return []

    # Get the index of the game that matches the name
    idx = indices[title]

    # if there's 2+ games with same name
    if type(idx) is pd.Series:
        return []

    # Get the pairwise similarity scores of all games with that game
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the games based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the most similar games
    sim_scores = sim_scores[1:n_recommendation + 1]

    # Get the game indices
    game_indices = [i[0] for i in sim_scores]

    # Return the top most similar games
    return steam_train['game'].iloc[game_indices].tolist()

In [30]:
def make_recommendation_for_user(user_id, game_list, game_user_have):
#    if type(game_list) is not list or len(game_list) == 0:
        # return empty one
#    return pd.DataFrame(data=[[user_id] + [""] * n_recommendation], columns=col_names)

    # get reviews of game recommendation, remove the games the user already has and order them by reviews
    recommendation_reviews = steam_train.loc[steam_train['game'].isin(game_list)]
    recommendation_reviews = recommendation_reviews.loc[~recommendation_reviews['game'].isin(game_user_have)]
    recommendation_reviews = recommendation_reviews.sort_values(by="percentage_positive_review", ascending=False)

    if len(recommendation_reviews.index) < n_recommendation:
        return steam_train(data=[[user_id] + recommendation_reviews["game"].tolist() +
                               [""] * (n_recommendation - len(recommendation_reviews.index))],
                         columns=col_names)
    else:
        return steam_train(data=[[user_id] + recommendation_reviews["game"].tolist()[0:n_recommendation]],
                         columns=col_names)

In [31]:
def generate_recommendation(column_name, location_output_file):
    recommendationByUserData = list(steam_train.columns.values)
    print (recommendationByUserData)

    # Compute the Cosine Similarity matrix using the column
    count = CountVectorizer(stop_words='english')
    count_matrix = count.fit_transform(steam_train[column_name])
    cosine_sim_matrix = cosine_similarity(count_matrix, count_matrix)

    previousId = ""
    listSuggestion = list()
    listGamesUserHas = list()

    # loop on all rows and get recommendations for user
    for j, row in steam_train.iterrows():
        if previousId != row["userID"]:
            previousId = row["userID"]
            recommendationByUserData = pd.concat([recommendationByUserData,
                                               make_recommendation_for_user(previousId, listSuggestion, listGamesUserHas)],
                                              ignore_index=True)
            listSuggestion = list()
            listGamesUserHas = list()
        listGamesUserHas.extend([row["game"]])
        listSuggestion.extend(get_sim_games(row["game"], cosine_sim_matrix))

    # add the last element for the last user
    recommendationByUserData = pd.concat([recommendationByUserData,
                                       make_recommendation_for_user(previousId, listSuggestion, listGamesUserHas)],
                                      ignore_index=True)

    recommendationByUserData.to_csv(location_output_file, index=False)

In [None]:
generate_recommendation('tags', path + 'content_based_recommender_output_tags.csv')

['userID', 'game', 'hours_played', 'developer', 'publisher', 'tags', 'game_details', 'genre', 'ID', 'genre_publisher_developer', 'genre_tags_developer', 'genre_tags_game_details', 'genre_publisher_developer_game_details', 'percentage_positive_review', 'review_qualification']


In [None]:
from sklearn.metrics.pairwise import linear_kernel

tf = TfidfVectorizer(analyzer='word', stop_words='english')
tfidf_matrix = tf.fit_transform(steam_train['tags'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

results = {}

In [88]:
for idx, row in steam_train.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
    similar_items = [(cosine_similarities[idx][i], steam_train['game'][i]) for i in similar_indices]

    results[row['game']] = similar_items[1:]
    
print('done!')

Index(['userID', 'game', 'hours_played', 'developer', 'publisher', 'tags',
       'game_details', 'genre', 'ID', 'genre_publisher_developer',
       'genre_tags_developer', 'genre_tags_game_details',
       'genre_publisher_developer_game_details', 'percentage_positive_review',
       'review_qualification'],
      dtype='object')

In [1]:
def game(id):
    return steam_train.loc[steam_train['game'] == game]['tags'].tolist()[0].split(' - ')[0]

# Just reads the results out of the dictionary.
def recommend(game_id, num):
    print("Recommending " + str(num) + " products similar to " + game(game_id) + "...")
    print("-------")
    recs = results[game_id][:num]
    for rec in recs:
        print("Recommended: " + game(rec[1]) + " (score:" + str(rec[0]) + ")")

recommend(game_id=11, num=5)

NameError: name 'steam_train' is not defined