In [2]:
pip install lightfm

Defaulting to user installation because normal site-packages is not writeable
Collecting lightfm
  Downloading lightfm-1.16.tar.gz (310 kB)
     ------------------------------------ 310.1/310.1 kB 399.9 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'





Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py): started
  Building wheel for lightfm (setup.py): finished with status 'done'
  Created wheel for lightfm: filename=lightfm-1.16-cp39-cp39-win_amd64.whl size=421262 sha256=95edd7c5b07faec896bea7096edd6d81b21b12d572ee0dd75f9bb9592540e919
  Stored in directory: c:\users\pc\appdata\local\pip\cache\wheels\b5\0d\a5\aed8bf2edc20773ee85f7eeb108d6babf2732675c0d6170aaa
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.16


In [2]:
pip install psycopg2

Defaulting to user installation because normal site-packages is not writeable
Collecting psycopg2
  Downloading psycopg2-2.9.5-cp39-cp39-win_amd64.whl (1.2 MB)
     ---------------------------------------- 1.2/1.2 MB 550.8 kB/s eta 0:00:00
Installing collected packages: psycopg2
Successfully installed psycopg2-2.9.5
Note: you may need to restart the kernel to use updated packages.




In [3]:
import pandas as pd
import psycopg2
from sqlalchemy import create_engine


def create_sqlalchemy_connection(conn_str_file):
    sqlalchemy_conn_str = open(conn_str_file,'r').read()
    sqlalchemy_conn = create_engine(sqlalchemy_conn_str)
    return sqlalchemy_conn

def query_best_parameters(conn_str_file, model_number=None):
    sqlalchemy_conn = create_sqlalchemy_connection(conn_str_file)
    if model_number != None:
        best_parameters = pd.read_sql('SELECT * FROM validation_metrics WHERE model_number = {} ORDER BY auc DESC LIMIT 1'.format(str(model_number)), 
                                  sqlalchemy_conn).to_dict(orient='records')[0]
    if model_number == None:
        best_parameters = pd.read_sql('SELECT * FROM validation_metrics ORDER BY auc DESC LIMIT 1'.format(str(model_number)), 
                                  sqlalchemy_conn).to_dict(orient='records')[0]
    del best_parameters['index']
    del best_parameters['auc']
    del best_parameters['epochs']
    del best_parameters['model_number']
    del best_parameters['free']
    return best_parameters

In [4]:
import json
import pickle as pkl
import operator
import time
from collections import Counter
from itertools import product
import random

import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
import scipy
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold

from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k
from lightfm.evaluation import auc_score
from lightfm.evaluation import reciprocal_rank

"""%run '../lib/cookbook/recsys.py'
%run '../lib/cookbook/generic_preprocessing.py'
%run '../lib/utility.py'"""

from IPython.display import HTML ## Setting display options for Ipython Notebook



In [8]:
import json

def eval_and_redump(text):
    file = open('../data/'+text+'.json','r',encoding="utf8")
    write = open('../data/'+text+'_fixed.json','w',encoding="utf8")
    array = []
    for line in file:
        line_dict = eval(line)
        array.append(line_dict)
    json.dump(array,write)
    return None


eval_and_redump('australian_user_reviews')
eval_and_redump('australian_users_items')
eval_and_redump('bundle_data')
eval_and_redump('steam_new')
eval_and_redump('steam_games')


# Data Preprocessing

In [5]:
def filter_by_hours_played(path,minutes_played=None,percentile=None):
    with open(path) as f:
        df_items = json.load(f)    
    parsed_items = json_normalize(data=df_items, 
                                  record_path='items', 
                                  meta=['items_count','steam_id','user_id','user_url'])
    if minutes_played != None:
        parsed_items = parsed_items[parsed_items.playtime_forever > minutes_played]
    if percentile != None:
        quantiles = parsed_items.groupby('item_name')['playtime_forever'].quantile(q=percentile)
        parsed_items = pd.merge(parsed_items, pd.DataFrame(quantiles), how='left', left_on='item_name', right_index=True)
        parsed_items = parsed_items.loc[parsed_items['playtime_forever_x'] > parsed_items['playtime_forever_y']]
    return parsed_items

In [6]:
def isNaN(num):
   return num != num

def cleanup_price(price):
   if isinstance(price, str):
       if price.find("Free") > 0 or price.find("Demo") > 0:
           return 0
   elif isNaN(price) == True:
       return 0
   else:
       return round(price)

In [7]:
def build_list(df, games, free_games=False):
    game_ids = []
    if free_games == False:
        game_ids = []
        for i in games:
            if 'price' in i.keys() and 'id' in i.keys():
                price = cleanup_price(i['price'])
                if price != None and price > 0:
                   game_ids.append(i['id'])
        df = df.loc[df['item_id'].isin(game_ids)]
        user_items = df[['user_id','item_name']].values.tolist()
    if free_games == True:
        for i in games:
            if 'id' in i.keys():
                game_ids.append(i['id'])
        df = df.loc[df['item_id'].isin(game_ids)]
        user_items = df[['user_id','item_name']].values.tolist()
    return user_items, game_ids

In [8]:
def build_df(user_item):
    df = pd.DataFrame(user_item[0],columns=['user','item'])
    df = df.drop_duplicates(['user','item'])
    df['own'] = 1
    df = df.pivot(index='user',columns='item',values='own')
    df = df.fillna(0)
    return df

In [9]:
## Filter by most owned games

def filter_top_n(user_item_df, n=1000):
    top_n = user_item_df.sum().nlargest(n).index
    user_top_ngames = user_item_df[top_n].stack().reset_index()
    user_top_ngames = user_top_ngames.rename(columns={0:'rating'})
    return user_top_ngames

In [10]:
def list_games(user_item_df):
    games=pd.DataFrame()
    games['item']=user_item_df.item.drop_duplicates()
    return games

In [11]:
def filter_top_n_for_filtered(user_item_df, games):
    for column in games.item:
        if column not in user_item_df.columns:
            user_item_df[column] = 0.0
    return filter_top_n(user_item_df[games['item'].tolist()],10000)

In [12]:
filtered_hours_items = filter_by_hours_played('../data/australian_users_items_fixed.json', minutes_played=30)
filtered_hours_0_items = filter_by_hours_played('../data/australian_users_items_fixed.json', minutes_played=0)
filtered_percentile_items = filter_by_hours_played('../data/australian_users_items_fixed.json', percentile=0.10)
full_aussie_items = filter_by_hours_played('../data/australian_users_items_fixed.json')
aussie_reviews = json.load(open('../data/australian_user_reviews_fixed.json','r'))
steam_games = json.load(open('../data/steam_games_fixed.json','r'))

  parsed_items = json_normalize(data=df_items,
  parsed_items = json_normalize(data=df_items,
  parsed_items = json_normalize(data=df_items,
  parsed_items = json_normalize(data=df_items,


### All Games

In [13]:
filtered_hours_user_item = build_list(filtered_hours_items, steam_games, free_games=True)
filtered_hours_0_user_items = build_list(filtered_hours_0_items, steam_games, free_games=True)
filtered_percentile_user_item = build_list(filtered_percentile_items, steam_games, free_games=True)
full_user_item = build_list(full_aussie_items, steam_games, free_games=True)

In [14]:
filtered_hours_user_item_df = build_df(filtered_hours_user_item)
filtered_hours_0_user_item_df = build_df(filtered_hours_0_user_items)
filtered_percentile_user_item_df = build_df(filtered_percentile_user_item)
full_user_item_df = build_df(full_user_item)

In [15]:
user_top_games = filter_top_n(full_user_item_df,1000)
games = list_games(user_top_games)
user_top_games_filtered_hours = filter_top_n_for_filtered(filtered_hours_user_item_df, games)
user_top_games_filtered_hours_0 = filter_top_n_for_filtered(filtered_hours_0_user_item_df, games)
user_top_games_filtered_percentile = filter_top_n_for_filtered(filtered_percentile_user_item_df, games)

In [16]:
pkl.dump(user_top_games,open('../data/preprocessed_data/all_games/user_top_games.pkl', 'wb'))
pkl.dump(user_top_games_filtered_hours,open('../data/preprocessed_data/all_games/user_top_games_filtered_hours.pkl', 'wb'))
pkl.dump(user_top_games_filtered_hours_0,open('../data/preprocessed_data/all_games/user_top_games_filtered_hours_0.pkl', 'wb'))
pkl.dump(user_top_games_filtered_percentile,open('../data/preprocessed_data/all_games/user_top_games_filtered_percentile.pkl', 'wb'))
pkl.dump(games,open('../data/preprocessed_data/all_games/games.pkl', 'wb'))

### No Free Games

In [17]:
filtered_hours_user_item = build_list(filtered_hours_items, steam_games, free_games=False)
filtered_hours_0_user_items = build_list(filtered_hours_0_items, steam_games, free_games=False)
filtered_percentile_user_item = build_list(filtered_percentile_items, steam_games, free_games=False)
full_user_item = build_list(full_aussie_items, steam_games, free_games=False)

filtered_hours_user_item_df = build_df(filtered_hours_user_item)
filtered_hours_0_user_item_df = build_df(filtered_hours_0_user_items)
filtered_percentile_user_item_df = build_df(filtered_percentile_user_item)
full_user_item_df = build_df(full_user_item)

In [18]:
user_top_games = filter_top_n(full_user_item_df,1000)
games = list_games(user_top_games)
user_top_games_filtered_hours = filter_top_n_for_filtered(filtered_hours_user_item_df, games)
user_top_games_filtered_hours_0 = filter_top_n_for_filtered(filtered_hours_0_user_item_df, games)
user_top_games_filtered_percentile = filter_top_n_for_filtered(filtered_percentile_user_item_df, games)

In [19]:
pkl.dump(user_top_games,open('../data/preprocessed_data/no_free_games/user_top_games.pkl', 'wb'))
pkl.dump(user_top_games_filtered_hours,open('../data/preprocessed_data/no_free_games/user_top_games_filtered_hours.pkl', 'wb'))
pkl.dump(user_top_games_filtered_hours_0,open('../data/preprocessed_data/no_free_games/user_top_games_filtered_hours_0.pkl', 'wb'))
pkl.dump(user_top_games_filtered_percentile,open('../data/preprocessed_data/no_free_games/user_top_games_filtered_percentile.pkl', 'wb'))
pkl.dump(games,open('../data/preprocessed_data/no_free_games/games.pkl', 'wb'))