In [1]:
import time
import os
import zipfile
import csv
import requests
import json
from itertools import islice

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score
from lightfm.cross_validation import random_train_test_split 

from scipy.sparse import coo_matrix as sp

import warnings # warings 제거
warnings.filterwarnings(action='ignore')



In [2]:
# 초기설정시 기본 환경설정

In [3]:
# 데이터 로드 (type : dataframe)

In [4]:
with open("D:/steam_data/games/steam_dataset/appinfo/store_data/steam_store_data.json", 'r') as game_data:
    game_data = pd.read_json(game_data)

In [5]:
with open('parsed_data_overten_reset.csv', 'r') as review_data:
    review_data = pd.read_csv(review_data)

In [6]:
with open('score_data_overten_pos_neg.csv', 'r') as score_data: # ndcg
    score_data = pd.read_csv(score_data)

# 1. Data structure and embedding




> ## Data loading, preprocessing




**user_features** : user의 특성(예시: 성별, 지역, 선호유형)

**item_features**: item의 특성(예시: 제목, 가격, 장르, 국가, 게시 시간)

**interactions** : user-item간의 data(평점)

> ## 데이터프레임 형태 변형 및 확인

In [7]:
game_data = game_data.transpose() # 행렬 뒤집기

In [8]:
game_data = game_data[['steam_appid', 'name']] 
review_data = review_data[['author', 'appid', 'playtime_forever', 'playtime_at_review', 'voted_up', 'votes_up']] 

In [9]:
# 변수명 명시적으로 변경함

ratings = review_data 
games = game_data 

In [10]:
ratings['author'] = ratings['author'].astype('str') # ratings['author']의 타입이 numpy.int64 이므로 str로 변환

In [11]:
ratings

Unnamed: 0,author,appid,playtime_forever,playtime_at_review,voted_up,votes_up
0,76561198976638577,10,104.0,10,True,0
1,76561198391972225,10,426.0,426,True,1
2,76561198414408167,10,386.0,199,True,0
3,76561198376936422,10,165.0,184,True,0
4,76561198905733625,10,28.0,40,True,0
...,...,...,...,...,...,...
14916131,76561198284162593,999660,21.0,21,False,4
14916132,76561198348027001,999660,291.0,200,True,0
14916133,76561198250100526,999660,117.0,27,True,0
14916134,76561198847043627,999660,606.0,99,True,2




> ## Data embedding
위의 데이터를 embedding dataset으로 build



In [12]:
# 데이터프레임 중복 확인

In [13]:
users = ratings[['author']]
users.drop_duplicates(inplace = True)

In [14]:
users

Unnamed: 0,author
0,76561198976638577
1,76561198391972225
2,76561198414408167
3,76561198376936422
4,76561198905733625
...,...
11141226,76561198250534406
11158103,76561198061690430
11186956,76561198363670762
11223767,76561198356586449


In [15]:
ratings.drop_duplicates(inplace = True) # 14916136 -> 13828950로 중복 행 제거

In [16]:
items = ratings[['appid']]
items.drop_duplicates(inplace = True)

In [17]:
items.shape[0] # 총 게임 개수는 51927인데 반해 리뷰에 있는 게임 개수는 11566(따라서 interactions의 item 개수는 11566)

11566

In [18]:
games.drop_duplicates(inplace = True) # 51936 -> 51927로 중복 행 제거

In [19]:
# user-item 사이의 interactions matrix 생성

In [20]:
playtime_summed = ratings.groupby('author')["playtime_forever"].sum()

In [21]:
playtime_summed.reset_index()

Unnamed: 0,author,playtime_forever
0,76561197960267615,36960.0
1,76561197960267984,116432.0
2,76561197960268765,240727.0
3,76561197960269149,141111.0
4,76561197960269155,45904.0
...,...,...
746238,76561199370201573,10356.0
746239,76561199372600134,2595.0
746240,76561199374305625,12221.0
746241,76561199375222483,5588.0


In [22]:
games = pd.read_csv("game_data.csv")

In [23]:
unique_games = games.set_index('appid').loc[items['appid']]
unique_games = unique_games.reset_index()

In [24]:
item_meta = unique_games[['appid', 'game_names', 'required_age', 'is_free', 'developers', 'publishers', 'release_date', 'recommendations']]
#item_meta = games
item_meta[item_meta.columns[1:]].values.flatten()

array(['Counter-Strike', 0, False, ..., 'SNK CORPORATION', 2020, 143],
      dtype=object)

In [25]:
dataset = Dataset()
dataset.fit(users=users['author'], items=items['appid'], user_features = ["playtime_forever"],
            item_features = item_meta[item_meta.columns[1:]].values.flatten())

In [26]:
ratings_source = ratings[['author', 'appid']]
ratings_source.drop_duplicates(inplace = True)
ratings_source = ratings_source.values.tolist()

In [27]:
interactions, weights = dataset.build_interactions(ratings_source)

In [28]:
# user_features 빌드

In [29]:
u_ = playtime_summed.reset_index()['author']
f_ = playtime_summed.reset_index()['playtime_forever']
user_features_source = [[u, f] for u, f in zip(u_, f_)]

In [30]:
user_features_source = [[i[0], {"playtime_forever":i[1]}] for i in user_features_source]

In [31]:
user_features=dataset.build_user_features(user_features_source)

In [32]:
item_features_source = [(item_meta['appid'][i],
                        {"game_names":item_meta['game_names'][i],
                         "required_age": item_meta['required_age'][i],
                         "is_free": item_meta['is_free'][i],
                         "developers": item_meta['developers'][i],
                         "publishers": item_meta['publishers'][i],
                         "release_date": item_meta['release_date'][i],
                         "recommendations": item_meta['recommendations'][i]
                        }) for i in range(item_meta.shape[0])]

In [33]:
item_features_source = [(item_meta['appid'][i],
                        [item_meta['game_names'][i],
                         item_meta['required_age'][i],
                         item_meta['is_free'][i],
                         item_meta['developers'][i],
                         item_meta['publishers'][i],
                         item_meta['release_date'][i],
                         item_meta['recommendations'][i]
                        ]) for i in range(item_meta.shape[0])]

In [34]:
item_features_source[0]

(10, ['Counter-Strike', 0, False, 'Valve', 'Valve', 2000, 121279])

In [35]:
game_features=dataset.build_item_features(item_features_source)

# 2. LightFM Model Function



> ## Model fit, optimize



loss function 으로  WARP ,  정확도는 precision@k 를 사용하여 evalutation 을 해준다.

In [36]:
# 모델 생성 및 학습시킴

In [None]:
model = LightFM(loss='warp') 
train, test = random_train_test_split(interactions=interactions, test_percentage=0.25) # test_percentage 0.2 -> 0.25
model.fit(train, item_features=game_features,
          user_features=user_features, epochs = 10, num_threads = 4, verbose=True) # 에포크 2회 -> 10회

In [37]:
# precision at k 방식으로 평가
train, test = random_train_test_split(interactions=interactions, test_percentage=0.25) # test_percentage 0.2 -> 0.25

In [56]:
def test_data(testmodel_, itemf, userf):
    train_precision_5 = precision_at_k(testmodel_, train, k=5, item_features=itemf,
                                 user_features=userf, num_threads = 4).mean()
    train_precision_10 = precision_at_k(testmodel_, train, k=10, item_features=itemf,
                                 user_features=userf, num_threads = 4).mean()
    test_precision_10 = precision_at_k(testmodel_, test, k=10, item_features=itemf,
                                user_features=userf, num_threads = 4).mean()
    test_precision_5 = precision_at_k(testmodel_, test, k=5, item_features=itemf,
                                user_features=userf, num_threads = 4).mean()
    print( 'Precision: \ntrain \n  5: %.4f, 10: %.4f\ntest \n  5: %.4f, 10: %.4f'
      % ( train_precision_5,train_precision_10, test_precision_5, test_precision_10 ))
    # auc 방식으로 평가
    train_auc = auc_score(testmodel_, train, item_features=itemf,
                      user_features=userf).mean()
    test_auc = auc_score(testmodel_, test, item_features=itemf,
                     user_features=userf).mean()
    print( 'AUC: train %.4f, test %.4f.' 
      % ( train_auc, test_auc))

In [39]:
model = LightFM(loss='warp') 
model.fit(train, item_features=game_features,
          user_features=user_features, epochs = 10, num_threads = 4, verbose=True) # 에포크 2회 -> 10회

Epoch: 100%|███████████████████████████████████████████████████████████████████████████| 10/10 [07:57<00:00, 47.75s/it]


<lightfm.lightfm.LightFM at 0x205b8b10fd0>

In [40]:
print('item, user')
test_data(model, game_features, user_features)

item, user
Precision: 
train 
  5: 0.1182, 10: 0.1033
test 
  5: 0.0405, 10: 0.0355
AUC: train 0.9263, test 0.9253.


In [41]:
model_ = LightFM(loss='warp') 
model_.fit(train, item_features=None,
          user_features=user_features, epochs = 10, num_threads = 4, verbose=True) # 에포크 2회 -> 10회

Epoch: 100%|███████████████████████████████████████████████████████████████████████████| 10/10 [03:59<00:00, 23.97s/it]


<lightfm.lightfm.LightFM at 0x205b8b10d90>

In [58]:
print('Only user')
test_data(model_, None, user_features)

Only user
Precision: 
train 
  5: 0.1178, 10: 0.1033
test 
  5: 0.0405, 10: 0.0355
AUC: train 0.9263, test 0.9253.


In [54]:
model__ = LightFM(loss='warp') 
model__.fit(train, item_features=game_features, epochs = 10, num_threads = 4, verbose=True) # 에포크 2회 -> 10회

Epoch: 100%|███████████████████████████████████████████████████████████████████████████| 10/10 [06:57<00:00, 41.79s/it]


<lightfm.lightfm.LightFM at 0x206ac603a90>

In [57]:
print('Only item')
test_data(model__, game_features, None)

Only item
Precision: 
train 
  5: 0.1835, 10: 0.1565
test 
  5: 0.0570, 10: 0.0481
AUC: train 0.9695, test 0.9515.


In [51]:
model___ = LightFM(loss='warp') 
model___.fit(train, item_features=None,
          user_features=None, epochs = 10, num_threads = 4, verbose=True) # 에포크 2회 -> 10회

Epoch: 100%|███████████████████████████████████████████████████████████████████████████| 10/10 [03:15<00:00, 19.51s/it]


<lightfm.lightfm.LightFM at 0x206ac50bc70>

In [59]:
print('no features')
test_data(model___, None, None)

no features
Precision: 
train 
  5: 0.1900, 10: 0.1625
test 
  5: 0.0593, 10: 0.0501
AUC: train 0.9738, test 0.9560.


In [91]:
from sklearn.metrics import ndcg_score


추천에서는 성능 지표로 precision@k 를 많이 사용하고 

Precision@5 = 0.05 (test) 는 순위로 rank 가 된 게임 중 (k=) 5번째 내에 관심 있을 게임이 나올 정확도가 평균 5% 라는 것입니다. 수치가 매우 낮아보이지만, user/content 수가 많은 real 환경에서 5%를 넘기는 어렵다.
​

AUC는 모델의 전체 적인 성능을 나타내는 지표이다.

​



> ## Predict score & Recommend games 



In [200]:
users['author'] = users['author'].reset_index().drop(columns=['index'])

In [163]:
def recommendation(model, data, to_recommend): # 인자: model, interactions, to_recommend
    labels = items[['appid']]
    n_users, n_items = data.shape

    scores = model.predict(to_recommend, np.arange(n_items)) 
    labels['scores'] = scores.tolist()
    labels.sort_values(by = ['scores'], ascending = False,inplace = True)
    
    return labels



*   group에 author id를 입력하면 모든 games 의 예측 평점을 계산하고
ex) user 1 -> group = [0], user 2 = group = [1] 
*   list_no에 원하는 게임 수를 입력하면 높은순으로 games(id:score)를 리턴합니다.

In [201]:
#author = str(input('유저 입력: '))
author = str(76561198995116208)
idx = int(users.loc[users['author']== author].index[0]) # predict의 첫번째 인자가 user의 index이므로 변환
print(idx)

424204


In [214]:
known_positives = ratings[ratings['author']==author]
known_positives = known_positives['appid']
known_positives = known_positives.values.tolist()
for kp in known_positives:
  print('name:', games.loc[games['appid'] == kp].iloc[0][1])

name: Fall Guys: Ultimate Knockout
name: eFootball PES 2021 SEASON UPDATE
name: Dying Light
name: The Witcher® 3: Wild Hunt
name: Sniper Ghost Warrior 3
name: Friday the 13th: The Game
name: Age of History II
name: Yet Another Zombie Defense HD
name: Stick Fight: The Game
name: CHANGE: A Homeless Survival Experience


In [217]:
list_no = 15 # 임의로 15개 추천

top_items = recommendation(model, interactions, idx)

x=0
while x<list_no:
    appid = top_items.iloc[x][0]
    if appid not in known_positives:
        print('name:', games.loc[games['appid']==appid].iloc[0][1], '  score:', round(top_items.iloc[x][1], 3)) # 소수점 넷째자리에서 반올림
    x+=1

name: F.E.A.R.   score: 0.5
name: Euro Truck Simulator 2   score: 0.48
name: Far Cry® 5   score: 0.472
name: Tom Clancy's Rainbow Six® Siege   score: 0.468
name: DayZ   score: 0.461
name: Divinity: Original Sin 2 - Definitive Edition   score: 0.458
name: PUBG: BATTLEGROUNDS   score: 0.456
name: Monster Hunter: World   score: 0.442
name: Cities: Skylines   score: 0.427
name: Risk of Rain 2   score: 0.427
name: ARK: Survival Evolved   score: 0.42
name: Counter-Strike: Global Offensive   score: 0.416
name: Grand Theft Auto V   score: 0.416
name: Plants vs. Zombies GOTY Edition   score: 0.412


In [213]:
known_positives.values.tolist()

[['76561198995116208', 1097150, 1037.0, 335, True, 1],
 ['76561198995116208', 1259970, 7806.0, 205, True, 0],
 ['76561198995116208', 239140, 1124.0, 556, True, 0],
 ['76561198995116208', 292030, 521.0, 107, True, 3],
 ['76561198995116208', 368070, 39.0, 38, False, 0],
 ['76561198995116208', 438740, 3685.0, 302, True, 5],
 ['76561198995116208', 603850, 1700.0, 522, True, 9],
 ['76561198995116208', 674750, 283.0, 17, True, 0],
 ['76561198995116208', 674940, 1835.0, 974, True, 1],
 ['76561198995116208', 926140, 199.0, 176, True, 0]]

In [72]:
# ndcg 점수 평가
from sklearn.metrics import ndcg_score
from tqdm.notebook import tqdm

In [None]:
# 현재 이 부분에서 전체 ineractions matrix에 대한 score를 추출하고, score_data를 ineractions matrix의 최대/최소 범위에 대해 정규화해서
# scikit-learn의 ndcg_score에 넣어 ndcg 평가를 수행할 계획이었음. 

In [60]:
score_data['author'] = score_data['author'].astype('str')

In [61]:
score_data.drop_duplicates(inplace = True)

In [66]:
items.reset_index(inplace = True)

In [76]:
items.drop(columns=['index'], inplace=True)

In [77]:
items

Unnamed: 0,appid
0,10
1,100
2,1000010
3,1000030
4,1000080
...,...
11561,998930
11562,99900
11563,999020
11564,999220


In [88]:
def get_ndcg(test_model, user_f, item_f):
    n_users = 100 # interactions.shape[0]에서 세션 다운(램에 제한 존재) 문제 때문에 임의로 100개 유저 기준으로 테스트
    k = 5
    ndcg_s = []

    for i in tqdm(range(n_users)):
      user = users.iloc[i][0]
      appid_for_top_k = score_data.loc[score_data['author'] == user].sort_values('scores', ascending = False)['appid'][:k] # user의 score_data가 존재하는 앱 추출
      n_items = []
      for j in appid_for_top_k:
        n_items.append(items.loc[items['appid'] == j].index[0])
      # scores_by_recsys, scores_by_true의 범위가 다르므로 정규화 진행
      scores_by_recsys = np.reshape(test_model.predict(i, np.asarray(n_items, dtype = np.int32), user_features=user_f, item_features=item_f), (-1, k))
      scores_by_true = np.reshape(np.asarray(score_data.loc[score_data['author'] == user].sort_values('scores', ascending = False)['scores'][:k], dtype = np.float32),  (-1, k))

      norms_by_recsys = np.linalg.norm(scores_by_recsys)
      norms_by_true = np.linalg.norm(scores_by_true)

      scores_by_recsys = scores_by_recsys / 10
      scores_by_true = scores_by_true

      ndcg_s.append(ndcg_score(scores_by_true, scores_by_recsys))
    ndcg_s = sum(ndcg_s) / n_users
    return ndcg_s

In [89]:
print('both features')
print(get_ndcg(model, user_features, game_features))

both features


  0%|          | 0/100 [00:00<?, ?it/s]

0.9669108207186489


In [93]:
print('only user')
print(get_ndcg(model_, user_features, None))

only user


  0%|          | 0/100 [00:00<?, ?it/s]

0.9678637781297236


In [95]:
print('only item')
print(get_ndcg(model__, None, game_features))

only item


  0%|          | 0/100 [00:00<?, ?it/s]

0.9620796194687113


In [96]:
print('no features')
print(get_ndcg(model___, None, None))

no features


  0%|          | 0/100 [00:00<?, ?it/s]

0.965200218997247


In [142]:
scored_data['scores'] = (scored_data['scores']-10)/10

In [145]:
len(scores)

11566