In [30]:
%pip install lightfm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import time
import os
import zipfile
import csv
import requests
import json
from itertools import islice

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score
from lightfm.cross_validation import random_train_test_split 

from scipy.sparse import coo_matrix as sp

import warnings # warings 제거
warnings.filterwarnings(action='ignore')

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 초기설정시 기본 환경설정

In [None]:
%cd drive/MyDrive

[Errno 2] No such file or directory: 'drive/MyDrive'
/content


In [None]:
%mkdir steam 

In [None]:
%cd steam

/content/drive/MyDrive/steam


In [None]:
!unzip 'steam.zip' 

Archive:  steam.zip
  inflating: parsed_data.csv         
  inflating: parsed_data_only_games.csv  
  inflating: parsed_data_overten.csv  
  inflating: steam_store_data.json   
  inflating: Steam_store_data.pptx   
  inflating: ~$Steam_store_data.pptx  


In [None]:
# 데이터 로드 (type : dataframe)

In [21]:
with open('/content/drive/MyDrive/steam/steam_store_data.json', 'r') as game_data:
    game_data = pd.read_json(game_data)

In [22]:
with open('/content/drive/MyDrive/steam/parsed_data_overten_reset.csv', 'r') as review_data:
    review_data = pd.read_csv(review_data)

In [None]:
with open('/content/drive/MyDrive/steam/score_data_overten_pos_neg.csv', 'r') as score_data: # ndcg
    score_data = pd.read_csv(score_data)

# 1. Data structure and embedding




> ## Data loading, preprocessing




**user_features** : user의 특성(예시: 성별, 지역, 선호유형)

**item_features**: item의 특성(예시: 제목, 가격, 장르, 국가, 게시 시간)

**interactions** : user-item간의 data(평점)

> ## 데이터프레임 형태 변형 및 확인

In [23]:
game_data = game_data.transpose() # 행렬 뒤집기

In [24]:
game_data = game_data[['steam_appid', 'name']] 
review_data = review_data[['author', 'appid', 'playtime_forever', 'playtime_at_review', 'voted_up', 'votes_up']] 

In [25]:
# 변수명 명시적으로 변경함

ratings = review_data 
games = game_data 

In [26]:
ratings['author'] = ratings['author'].astype('str') # ratings['author']의 타입이 numpy.int64 이므로 str로 변환



> ## Data embedding
위의 데이터를 embedding dataset으로 build



In [None]:
# 데이터프레임 중복 확인

In [27]:
users = ratings[['author']]
users.drop_duplicates(inplace = True)

In [100]:
users.shape[0]

746243

In [28]:
ratings.drop_duplicates(inplace = True) # 14916136 -> 13828950로 중복 행 제거

In [29]:
items = ratings[['appid']]
items.drop_duplicates(inplace = True)

In [104]:
items.shape[0] # 총 게임 개수는 51927인데 반해 리뷰에 있는 게임 개수는 11566(따라서 interactions의 item 개수는 11566)

11566

In [31]:
games.drop_duplicates(inplace = True) # 51936 -> 51927로 중복 행 제거

In [None]:
# user-item 사이의 interactions matrix 생성

In [33]:
uf = []

playtime_forever = ratings[['playtime_forever']]
playtime_forever.drop_duplicates(inplace = True)

for i in range(playtime_forever.shape[0]):
  uf.append('playtime_forever:{0}'.format(playtime_forever.iloc[i][0]))

In [34]:
dataset = Dataset()
dataset.fit(users=users['author'], items=items['appid'],user_features = uf)

In [35]:
ratings_source = ratings[['author', 'appid']]
ratings_source.drop_duplicates(inplace = True)
ratings_source = ratings_source.values.tolist()

In [36]:
interactions, weights = dataset.build_interactions(ratings_source)

In [None]:
# user_features 빌드

In [38]:
user_features_source = ratings[['author', 'playtime_forever']]
user_features_source.drop_duplicates(inplace = True)

In [39]:
user_features_source = user_features_source[['author', 'playtime_forever']].values.tolist()

In [40]:
user_features_source = [[i[0], ['playtime_forever:{0}'.format(i[1])]] for i in user_features_source]

In [41]:
user_features=dataset.build_user_features(user_features_source)

# 2. LightFM Model Function



> ## Model fit, optimize



loss function 으로  WARP ,  정확도는 precision@k 를 사용하여 evalutation 을 해준다.

In [None]:
# 모델 생성 및 학습시킴

In [42]:
model = LightFM(loss='warp') 
train, test = random_train_test_split(interactions=interactions, test_percentage=0.25) # test_percentage 0.2 -> 0.25
model.fit(train, item_features=None, user_features=user_features, epochs = 10, num_threads = 4) # 에포크 2회 -> 10회

<lightfm.lightfm.LightFM at 0x7fe64b39b1f0>

In [None]:
# precision at k 방식으로 평가

In [None]:
train_precision = precision_at_k(model, train, k=5, item_features=None,user_features=user_features, num_threads = 4).mean()

KeyboardInterrupt: ignored

In [None]:
test_precision = precision_at_k(model, test, k=10, item_features=None,user_features=user_features, num_threads = 4).mean()

In [None]:
print( 'Precision: train %.4f, test %.4f' 
      % ( train_precision,test_precision ))

In [None]:
# auc 방식으로 평가

In [None]:
train_auc = auc_score(model, train, item_features=None,user_features=user_features).mean()

In [None]:
test_auc = auc_score(model, test, item_features=None,user_features=user_features).mean()

In [None]:
print( 'AUC: train %.4f, test %.4f.' 
      % ( train_auc, test_auc))

In [14]:
%pip install scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [15]:
from sklearn.metrics import ndcg_score


추천에서는 성능 지표로 precision@k 를 많이 사용하고 

Precision@5 = 0.05 (test) 는 순위로 rank 가 된 게임 중 (k=) 5번째 내에 관심 있을 게임이 나올 정확도가 평균 5% 라는 것입니다. 수치가 매우 낮아보이지만, user/content 수가 많은 real 환경에서 5%를 넘기는 어렵다.
​

AUC는 모델의 전체 적인 성능을 나타내는 지표이다.

​



> ## Predict score & Recommend games 



In [130]:
labels = items[['appid']]

In [140]:
def recommendation(model, data, to_recommend): # 인자: model, interactions, to_recommend

    n_users, n_items = data.shape

    scores = model.predict(to_recommend, np.arange(n_items)) 
    labels['scores'] = scores.tolist()
    labels.sort_values(by = ['scores'], ascending = False,inplace = True)
    
    return labels



*   group에 author id를 입력하면 모든 games 의 예측 평점을 계산하고
ex) user 1 -> group = [0], user 2 = group = [1] 
*   list_no에 원하는 게임 수를 입력하면 높은순으로 games(id:score)를 리턴합니다.

In [118]:
author = str(input('유저 입력: '))

idx = int(users.loc[users['author']== author].index[0]) # predict의 첫번째 인자가 user의 index이므로 변환

유저 입력: 76561198905733625


In [165]:
list_no = 5 # 임의로 5개 추천

top_items = recommendation(model, interactions, idx)

for x in range(list_no):
  print('name:', games.loc[games['steam_appid'] == top_items.iloc[x][0]].iloc[0][1], '  score:', round(top_items.iloc[x][1], 3)) # 소수점 넷째자리에서 반올림

name: Serious Sam 2   score: 2.392
name: Atonement: Scourge of Time   score: 2.159
name: LEGRAND LEGACY: Tale of the Fatebounds   score: 1.994
name: STAR WARS™ - Dark Forces   score: 1.946
name: Lost Horizon   score: 1.896


In [None]:
# ndcg 점수 평가

In [None]:
# 현재 이 부분에서 전체 ineractions matrix에 대한 score를 추출하고, score_data를 ineractions matrix의 최대/최소 범위에 대해 정규화해서
# scikit-learn의 ndcg_score에 넣어 ndcg 평가를 수행할 계획이었음. 

In [37]:
n_users, n_items = interactions.shape
scores = model.predict(np.arange(n_items), np.arange(n_items)) 

NameError: ignored

In [None]:
# 추가로 정확도 개선, predict_rank()를 이용해 새로운 유저의 리뷰 데이터에 대해서도 추천이 가능하게 만들어야함.

In [1]:
!git remote add origin 

fatal: not a git repository (or any of the parent directories): .git
