In [1]:
def readJSON(path):
    for l in open(path, 'r'):
        d = eval(l)
        h = d['hours_transformed']
        u = d['userID']
        g = d['gameID']
        yield u,g,h,d

data = []

for uid, gid, hour, d in readJSON("train.json"):
    data.append({"user_id":uid, "game_id":gid})

data

[{'user_id': 'u55351001', 'game_id': 'g35322304'},
 {'user_id': 'u70666506', 'game_id': 'g49368897'},
 {'user_id': 'u18612571', 'game_id': 'g73495588'},
 {'user_id': 'u34283088', 'game_id': 'g68047320'},
 {'user_id': 'u16220374', 'game_id': 'g51234623'},
 {'user_id': 'u01499286', 'game_id': 'g25723374'},
 {'user_id': 'u73063505', 'game_id': 'g58025004'},
 {'user_id': 'u29223775', 'game_id': 'g69033010'},
 {'user_id': 'u44401308', 'game_id': 'g46446145'},
 {'user_id': 'u45027672', 'game_id': 'g02903254'},
 {'user_id': 'u33908704', 'game_id': 'g66086214'},
 {'user_id': 'u27998358', 'game_id': 'g21544048'},
 {'user_id': 'u36214177', 'game_id': 'g86787099'},
 {'user_id': 'u73747744', 'game_id': 'g23131507'},
 {'user_id': 'u97936673', 'game_id': 'g65055990'},
 {'user_id': 'u25365202', 'game_id': 'g45124396'},
 {'user_id': 'u08631099', 'game_id': 'g74380807'},
 {'user_id': 'u52584928', 'game_id': 'g46738138'},
 {'user_id': 'u09520763', 'game_id': 'g65420497'},
 {'user_id': 'u04893836', 'game

In [2]:
import pandas as pd

df = pd.DataFrame(data)

df

Unnamed: 0,user_id,game_id
0,u55351001,g35322304
1,u70666506,g49368897
2,u18612571,g73495588
3,u34283088,g68047320
4,u16220374,g51234623
...,...,...
174995,u77930643,g88529278
174996,u01497310,g89122297
174997,u38860000,g59833534
174998,u17742070,g03262233


In [3]:
df.head()

Unnamed: 0,user_id,game_id
0,u55351001,g35322304
1,u70666506,g49368897
2,u18612571,g73495588
3,u34283088,g68047320
4,u16220374,g51234623


In [4]:
from sklearn.preprocessing import LabelEncoder

# 범주형 변수 라벨 인코딩
user_encoder = LabelEncoder()
game_encoder = LabelEncoder()

df['user_id'] = user_encoder.fit_transform(df['user_id'])
df['game_id'] = game_encoder.fit_transform(df['game_id'])
df

Unnamed: 0,user_id,game_id
0,3684,871
1,4751,1209
2,1242,1800
3,2317,1652
4,1069,1244
...,...,...
174995,5214,2167
174996,91,2181
174997,2600,1466
174998,1177,89


In [5]:
import numpy as np
from sklearn.model_selection import train_test_split

# 학습용과 테스트용 데이터로 분리
train_data, test_data = train_test_split(df, test_size=0.1, random_state=42)


In [7]:
from lightfm.data import Dataset

# Dataset 객체 생성 및 사용자, 게임, 상호작용 데이터 추가
dataset = Dataset()
dataset.fit((x['user_id'] for _, x in df.iterrows()),
            (x['game_id'] for _, x in df.iterrows()))

(interactions, _) = dataset.build_interactions(((x['user_id'], x['game_id']) for _, x in train_data.iterrows()))
(test_interactions, _) = dataset.build_interactions(((x['user_id'], x['game_id']) for _, x in test_data.iterrows()))

In [8]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, auc_score

# BPR 모델 학습
model = LightFM(learning_rate=0.05, loss='bpr')
model.fit(interactions, epochs=30, num_threads=2)

# 모델 평가
train_precision = precision_at_k(model, interactions, k=10).mean()
test_precision = precision_at_k(model, test_interactions, k=10).mean()
test_auc = auc_score(model, test_interactions).mean()

print(f'Precision at k=10 (train): {train_precision}')
print(f'Precision at k=10 (test): {test_precision}')
print(f'AUC score (test): {test_auc}')

Precision at k=10 (train): 0.1983601599931717
Precision at k=10 (test): 0.006465960294008255
AUC score (test): 0.5935676097869873


In [10]:
from collections import Counter

# 상위 70% 게임 선택
game_count = Counter(x['game_id'] for _, x in df.iterrows())
total_played = sum(game_count.values())

return_game_set = set()
count_sum = 0

for gid, play_count in game_count.most_common():
    return_game_set.add(gid)
    count_sum += play_count
    if count_sum > total_played * 7 / 10:
        break

In [11]:
# 특정 사용자와 게임에 대한 플레이 여부 예측 함수
def predict_play(user_id, game_id):
    try:
        # 예측 점수 계산
        score = model.predict(user_id, game_id)
        return score > 0  # 점수가 0보다 크면 플레이할 가능성이 높음
    except KeyError:
        return False  # 사용자 또는 게임이 데이터셋에 없는 경우 False 반환

In [13]:
with open("pairs_Played.csv", "r") as f1:
    f1.readline()  # 첫 번째 줄 (열 이름) 건너뛰기

    with open("fourteen_coin.csv", "w") as f2:
        f2.write("ID,Label\n")  # 첫 번째 줄에 열 이름 추가

        for line in f1:
            row_id, uid, gid = line.strip().split(",")
            uid = user_encoder.fit_transform(uid)
            git = game_encoder.fit_transform(gid)
            play_prediction = predict_play(uid, gid)
            # if gid in return_game_set:
            #     f2.write(f"{row_id},1\n")
            # else:
            #     f2.write(f"{row_id},0\n")


ValueError: Expected the number of user IDs (1) to equal the number of item IDs (9)