In [2]:
%pip install lightfm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import time
import os
import zipfile
import csv
import requests
import json
from itertools import islice

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score
from lightfm.cross_validation import random_train_test_split 

from scipy.sparse import coo_matrix as sp

import warnings # warings 제거
warnings.filterwarnings(action='ignore')

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 초기설정시 기본 환경설정

In [None]:
%cd drive/MyDrive

In [None]:
%mkdir steam 

In [None]:
%cd steam

/content/drive/MyDrive/steam


In [None]:
!unzip 'steam.zip' 

Archive:  steam.zip
  inflating: parsed_data.csv         
  inflating: parsed_data_only_games.csv  
  inflating: parsed_data_overten.csv  
  inflating: steam_store_data.json   
  inflating: Steam_store_data.pptx   
  inflating: ~$Steam_store_data.pptx  


In [None]:
# 데이터 로드 (type : dataframe)

In [4]:
with open('/content/drive/MyDrive/steam/steam_store_data.json', 'r') as game_data:
    game_data = pd.read_json(game_data)

In [5]:
with open('/content/drive/MyDrive/steam/parsed_data_overten_reset.csv', 'r') as review_data:
    review_data = pd.read_csv(review_data)

# 1. Data structure and embedding




> ## Data loading, preprocessing




**user_features** : user의 특성(예시: 성별, 지역, 선호유형)

**item_features**: item의 특성(예시: 제목, 가격, 장르, 국가, 게시 시간)

**interactions** : user-item간의 data(평점)

> ## 데이터프레임 형태 변형 및 확인

In [6]:
game_data = game_data.transpose() # 행렬 뒤집기

In [None]:
game_data.head()

Unnamed: 0,type,name,steam_appid,required_age,is_free,dlc,detailed_description,about_the_game,short_description,supported_languages,...,background,background_raw,content_descriptors,controller_support,achievements,legal_notice,drm_notice,price_overview,ext_user_account_notice,demos
570,game,Dota 2,570,0,True,"[1241930, 652720]",<strong>The most-played game on Steam.</strong...,<strong>The most-played game on Steam.</strong...,"Every day, millions of players worldwide enter...","Bulgarian, Czech, Danish, Dutch, English<stron...",...,https://cdn.akamai.steamstatic.com/steam/apps/...,https://cdn.akamai.steamstatic.com/steam/apps/...,"{'ids': [], 'notes': None}",,,,,,,
730,game,Counter-Strike: Global Offensive,730,0,True,[1766730],Counter-Strike: Global Offensive (CS: GO) expa...,Counter-Strike: Global Offensive (CS: GO) expa...,Counter-Strike: Global Offensive (CS: GO) expa...,"Czech, Danish, Dutch, English<strong>*</strong...",...,https://cdn.akamai.steamstatic.com/steam/apps/...,https://cdn.akamai.steamstatic.com/steam/apps/...,"{'ids': [2, 5], 'notes': 'Includes intense vio...",full,"{'total': 167, 'highlighted': [{'name': 'Someo...",,,,,
578080,game,PUBG: BATTLEGROUNDS,578080,0,True,,"<strong>LAND, LOOT, SURVIVE!</strong><br>Play ...","<strong>LAND, LOOT, SURVIVE!</strong><br>Play ...",Play PUBG: BATTLEGROUNDS for free. Land on str...,"English, Korean, Simplified Chinese, French, G...",...,https://cdn.akamai.steamstatic.com/steam/apps/...,https://cdn.akamai.steamstatic.com/steam/apps/...,"{'ids': [2, 5], 'notes': None}",,"{'total': 37, 'highlighted': [{'name': 'Last S...","© 2017 KRAFTON, Inc.<br />\r\nPUBG: BATTLEGROU...",,,,
1063730,game,New World,1063730,0,False,,<h1>New World - Deluxe Edition</h1><p><img sr...,"Explore a thrilling, open-world MMO filled wit...","Explore a thrilling, open-world MMO filled wit...","English<strong>*</strong>, French<strong>*</st...",...,https://cdn.akamai.steamstatic.com/steam/apps/...,https://cdn.akamai.steamstatic.com/steam/apps/...,"{'ids': [2, 5], 'notes': 'This Game may contai...",,"{'total': 133, 'highlighted': [{'name': 'Repai...","© 2021, Amazon.com, Inc. or its affiliates. Al...",Easy Anti-Cheat,"{'currency': 'USD', 'initial': 3999, 'final': ...",,
440,game,Team Fortress 2,440,0,True,[629330],"<p><strong>""The most fun you can have online""<...","<p><strong>""The most fun you can have online""<...",Nine distinct classes provide a broad range of...,"English<strong>*</strong>, Danish, Dutch, Finn...",...,https://cdn.akamai.steamstatic.com/steam/apps/...,https://cdn.akamai.steamstatic.com/steam/apps/...,"{'ids': [2, 5], 'notes': 'Includes cartoon vio...",,"{'total': 520, 'highlighted': [{'name': 'Head ...",,,,,


In [None]:
game_data.columns

In [7]:
game_data = game_data[['steam_appid', 'name']] 

In [8]:
review_data = review_data[['author', 'appid', 'playtime_forever', 'playtime_at_review', 'voted_up', 'votes_up']] 

In [None]:
review_data.head()

Unnamed: 0,author,appid,playtime_forever,playtime_at_review,voted_up,votes_up
0,76561198976638577,10,104.0,10,True,0
1,76561198391972225,10,426.0,426,True,1
2,76561198414408167,10,386.0,199,True,0
3,76561198376936422,10,165.0,184,True,0
4,76561198905733625,10,28.0,40,True,0


In [9]:
# 변수명 명시적으로 변경함

ratings = review_data 
games = game_data 

In [None]:
# ratings['author']의 타입이 numpy.int64 이므로 str로 변환

In [10]:
ratings['author'] = ratings['author'].astype('str') 



> ## Data embedding
위의 데이터를 embedding dataset으로 build



In [None]:
# 데이터프레임 중복 확인

In [None]:
ratings.shape[0]

In [11]:
ratings.drop_duplicates(inplace = True) # 14916136 -> 13828950로 중복 행 제거

In [None]:
games.shape[0]

In [12]:
games.drop_duplicates(inplace = True) # 51936 -> 51927로 중복 행 제거

In [None]:
# user-item 사이의 interactions matrix 생성

In [37]:
uf = []

playtime_forever = ratings[['playtime_forever']]
playtime_forever.drop_duplicates(inplace = True)

for i in range(playtime_forever.shape[0]):
  uf.append('playtime_forever:{0}'.format(playtime_forever.iloc[i][0]))

In [43]:
dataset = Dataset()
dataset.fit(users=ratings['author'],items=ratings['appid'],user_features = uf)

In [14]:
ratings_source = ratings[['author', 'appid']].values.tolist() # interactions 빌드에 필요한 컬럼만 추출

In [44]:
interactions, weights = dataset.build_interactions(ratings_source)

In [None]:
# user_features 빌드

In [16]:
user_features_source = ratings[['author', 'playtime_forever']]
user_features_source.drop_duplicates(inplace = True)

In [26]:
user_features_source = user_features_source[['author', 'playtime_forever']].values.tolist()

In [29]:
user_features_source = [[i[0], ['playtime_forever:{0}'.format(i[1])]] for i in user_features_source]

In [45]:
user_features=dataset.build_user_features(user_features_source)

# 2. LightFM Model Function



> ## Model fit, optimize



loss function 으로  WARP ,  정확도는 precision@k 를 사용하여 evalutation 을 해준다.

In [None]:
# 모델 생성 및 학습시킴

In [68]:
model = LightFM(loss='warp') 
train, test = random_train_test_split(interactions=interactions, test_percentage=0.25) # test_percentage 0.2 -> 0.25
model.fit(train, item_features=None, user_features=user_features, epochs = 10, num_threads = 4) # 에포크 2회 -> 10회

<lightfm.lightfm.LightFM at 0x7f36f6e3c250>

In [None]:
# precision at k 방식으로 평가

In [69]:
train_precision = precision_at_k(model, train, k=5, item_features=None,user_features=user_features, num_threads = 4).mean()

In [71]:
test_precision = precision_at_k(model, test, k=10, item_features=None,user_features=user_features, num_threads = 4).mean()

In [72]:
print( 'Precision: train %.4f, test %.4f' 
      % ( train_precision,test_precision ))

Precision: train 0.1286, test 0.0376


In [None]:
# auc 방식으로 평가

In [None]:
train_auc = auc_score(model, train, item_features=None,user_features=user_features).mean()

In [None]:
test_auc = auc_score(model, test, item_features=None,user_features=user_features).mean()

In [None]:
print( 'AUC: train %.4f, test %.4f.' 
      % ( train_auc, test_auc))

In [None]:
# ndcg_score 평가 방법 추가(아직 적용 안함)

In [63]:
%pip install scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [64]:
from sklearn.metrics import ndcg_score

추천에서는 성능 지표로 precision@k 를 많이 사용하고 

Precision@5 = 0.05 (test) 는 순위로 rank 가 된 게임 중 (k=) 5번째 내에 관심 있을 게임이 나올 정확도가 평균 5% 라는 것입니다. 수치가 매우 낮아보이지만, user/content 수가 많은 real 환경에서 5%를 넘기는 어렵다.
​

AUC는 모델의 전체 적인 성능을 나타내는 지표이다.

​

In [None]:
# git 명령어

In [159]:
!git init

Reinitialized existing Git repository in /content/.git/


In [None]:
!git add .

hint: You've added another git repository inside your current repository.
hint: Clones of the outer repository will not contain the contents of
hint: the embedded repository and will not know how to obtain it.
hint: If you meant to add a submodule, use:
hint: 
hint: 	git submodule add <url> drive/MyDrive/LightFM
hint: 
hint: If you added this path by mistake, you can remove it from the
hint: index with:
hint: 
hint: 	git rm --cached drive/MyDrive/LightFM
hint: 
hint: See "git help submodule" for more information.
error: open("drive/MyDrive/제목 없는 프레젠테이션 (1).gslides"): Operation not supported
error: unable to index file drive/MyDrive/제목 없는 프레젠테이션 (1).gslides
fatal: adding files failed


In [None]:
!git commit -m "[Feature] 추천시스템 구현 수정"

In [None]:
!git push origin main



> ## Predict score & Recommend games 



In [120]:
# labels = np.array(ratings['appid'])

# def recommendation(model, data, to_recommend):

#     n_users, n_items = data.shape

#     # build a structure to store user scores for each item
#     all_scores = np.empty(shape=(0, n_items))
#     scores = model.predict(to_recommend, np.arange(n_items), item_features = None, user_features = user_features) 
#     top_items_for_user = labels[np.argsort(-scores)] #argsort():오름차순인덱스반환
#     all_scores = np.vstack((all_scores, scores))

#     #compute the average rating for each item in the group
#     item_averages = np.mean(all_scores.astype(np.float), axis=0)
#     top_items_for_group = labels[np.argsort(-item_averages)]
#     top_items_scores=item_averages[np.argsort(-item_averages)]

#     return  top_items_for_user, top_items_scores



*   group에 author id를 입력하면 모든 games 의 예측 평점을 계산하고
ex) user 1 -> group = [0], user 2 = group = [1] 
*   list_no에 원하는 게임 수를 입력하면 높은순으로 games(id:score)를 리턴합니다.

In [121]:
# to_recommend = str(input())
# list_no = 5 # precision at k 토대로 k인 5개로 임의로 정함

# top_items_for_group, top_items_scores = recommendation(model, interactions, to_recommend)

# recommend_steam=pd.DataFrame(columns=games.columns)
# for x in list[:list_no]: 
#   recommend_steam=recommend_steam.append(games[games['steam_appid']== x], ignore_index=True)

# recommend_steam['predict_score']= top_items_scores[:list_no]
# recommend_steam

76561198976638577


TypeError: ignored