In [1]:
import heapq
import multiprocessing
import os.path
import pickle
import random
import time
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from IPython.display import clear_output

from data.worker import worker

warnings.filterwarnings("ignore")

## 1. 데이터 가공
`./data/userId_problemId.csv` 파일이 필요합니다 (`userId`, `problemId` 컬럼 포함).

In [2]:
# Load the raw solve records, keep only the two id columns, and preview them.
df = pd.read_csv("./data/userId_problemId.csv")[['userId', 'problemId']]
df.head()

Unnamed: 0,userId,problemId
0,sos0911,1000
1,sos0911,1001
2,sos0911,1002
3,sos0911,1003
4,sos0911,1005


### 1) userId remapping

In [3]:
def remap_user():
    """Assign a dense integer id to every distinct user and save the mapping.

    Reads ``./data/userId_problemId.csv``, sorts the distinct ``userId``
    values and writes ``./data/user_list.csv`` with two columns:
    ``user_id`` (the original id) and ``remap_id`` (its 0-based position in
    the sorted order).
    """
    user_problem_df = pd.read_csv("./data/userId_problemId.csv").loc[:, ['userId', 'problemId']]
    unique_sorted_user = sorted(user_problem_df['userId'].unique().tolist())

    # Build the table in one shot. Appending one row at a time with pd.concat
    # copies the whole frame on every iteration (quadratic) and leaves the
    # dtype of remap_id to how concat treats the empty seed frame.
    user_df = pd.DataFrame({
        "user_id": unique_sorted_user,
        "remap_id": range(len(unique_sorted_user)),
    })

    user_df.to_csv("./data/user_list.csv", index=False)
    print('user_list file saved successfully')

### 2) problemId remapping

In [4]:
def remap_problem():
    """Assign a dense integer id to every distinct problem and save the mapping.

    Reads ``./data/userId_problemId.csv``, sorts the distinct ``problemId``
    values and writes ``./data/problem_list.csv`` with two columns:
    ``problem_id`` (the original id) and ``remap_id`` (its 0-based position
    in the sorted order).
    """
    user_problem_df = pd.read_csv("./data/userId_problemId.csv").loc[:, ['userId', 'problemId']]
    unique_sorted_problem = sorted(user_problem_df['problemId'].unique().tolist())

    # Build the table in one shot. Appending one row at a time with pd.concat
    # copies the whole frame on every iteration (quadratic) and leaves the
    # column dtypes to how concat treats the empty seed frame.
    problem_df = pd.DataFrame({
        "problem_id": unique_sorted_problem,
        "remap_id": range(len(unique_sorted_problem)),
    })

    problem_df.to_csv("./data/problem_list.csv", index=False)
    print('problem_list file saved successfully')

### 3) Convert userId, problemId

In [5]:
def remap_user_problem():
    """Rewrite every (userId, problemId) record using the remapped integer ids.

    Reads the raw records and the two mapping tables
    (``./data/user_list.csv``, ``./data/problem_list.csv``), converts the
    records in rounds of 10,000 rows split over up to four worker processes
    (2,500 rows each) and saves the concatenated result to
    ``./data/userId_problemId_remap.csv``.

    Each worker is expected to store its converted frame in the shared dict
    under its worker index ``j``; this is inferred from how the results are
    read back below, so confirm it against ``data.worker.worker``.

    :raises RuntimeError: if a worker process exits with a non-zero code
    """
    rows_per_worker = 2500
    workers_per_round = 4
    rows_per_round = rows_per_worker * workers_per_round

    user_problem_df = pd.read_csv("./data/userId_problemId.csv").loc[:, ['userId', 'problemId']]
    user_df = pd.read_csv("./data/user_list.csv")
    problem_df = pd.read_csv("./data/problem_list.csv")

    n_rows = user_problem_df['userId'].size
    # Ceiling division: the progress bar gets one slot per round instead of a
    # width hard-coded for one particular dataset size.
    n_rounds = -(-n_rows // rows_per_round)

    # The empty seed frame keeps the expected columns when there are no rows.
    # Chunks are collected and concatenated once at the end, not per round.
    converted_frames = [pd.DataFrame({'userId': [], 'problemId': []})]

    # The context manager shuts the manager's server process down afterwards.
    with multiprocessing.Manager() as manager:
        for round_idx, i in enumerate(range(0, n_rows, rows_per_round)):
            clear_output(wait=True)
            print('Loading: [{}]'.format('-' * round_idx + '>' + '-' * (n_rounds - 1 - round_idx)))

            return_dict = manager.dict()
            jobs = []

            for j in range(workers_per_round):
                start = i + rows_per_worker * j
                stop = min(n_rows, start + rows_per_worker)
                p = multiprocessing.Process(
                    target=worker,
                    args=(j, start, user_df, problem_df, user_problem_df[start:stop], return_dict),
                )
                jobs.append(p)
                p.start()
                if stop >= n_rows:
                    break

            failed_exit_codes = []
            for proc in jobs:
                proc.join()
                # exitcode has to be read before close() releases the handle.
                if proc.exitcode != 0:
                    failed_exit_codes.append(proc.exitcode)
                proc.close()
            if failed_exit_codes:
                # Without this check a crashed worker's rows would be lost silently.
                raise RuntimeError(
                    'remap worker(s) failed in the round starting at row {} (exit codes: {})'.format(
                        i, failed_exit_codes))

            for j in range(len(return_dict.keys())):
                converted_frames.append(return_dict[j])

    user_problem_remap_df = pd.concat(converted_frames)
    user_problem_remap_df.to_csv("./data/userId_problemId_remap.csv", index=False)
    print('userId_problemId_remap file saved successfully')

In [6]:
# Write ./data/user_list.csv (original user id -> remap_id).
remap_user()

user_list file saved successfully


In [7]:
# Write ./data/problem_list.csv (original problem id -> remap_id).
remap_problem()

problem_list file saved successfully


multiprocessing을 사용하여 30분 이상 걸리던 작업을 13분으로 단축

In [8]:
# Convert all records to remapped ids and write ./data/userId_problemId_remap.csv.
remap_user_problem()

Loading: [----------------------------------------------------------------------------------->]
userId_problemId_remap file saved successfully


### train data, test data 생성

In [45]:
def making_data(seed=None):
    """Split each user's problems 80/20 into ``train.txt`` and ``test.txt``.

    Reads ``./data/userId_problemId_remap.csv``. For every user,
    ``int(0.8 * n)`` of the ``n`` recorded problems are sampled for training;
    the problems not sampled (as a set, so duplicates collapse) go to the test
    split. Both files get one line per user, in ascending user id order, of
    the form ``<userId> <problemId> <problemId> ...``.

    :param seed: optional seed that makes the split reproducible. ``None``
        (the default) draws from the shared ``random`` module state.
    """
    user_problem_remap_df = pd.read_csv("./data/userId_problemId_remap.csv").loc[:, ['userId', 'problemId']]
    rng = random if seed is None else random.Random(seed)

    train_by_user, test_by_user = {}, {}
    # One grouping pass instead of re-filtering the frame per user;
    # sort=False visits users in order of first appearance.
    for user_id, user_rows in user_problem_remap_df.groupby('userId', sort=False):
        problem_list = user_rows['problemId'].tolist()
        train_problem_list = rng.sample(problem_list, int(len(problem_list) * 0.8))
        test_problem_list = list(set(problem_list) - set(train_problem_list))

        train_by_user[user_id] = train_problem_list
        test_by_user[user_id] = test_problem_list

    def _write_split(path, problems_by_user):
        """Write one '<user> <problem> ...' line per user, sorted by user id."""
        with open(path, "w") as f:
            for user_id in sorted(problems_by_user):
                problem_list = [*map(int, problems_by_user[user_id])]
                # Ids are written as plain integers so the lines can be read
                # back with int() on every whitespace-separated token.
                f.write(str(int(user_id)) + ' ' + ' '.join(map(str, problem_list)) + '\n')

    _write_split("./data/train.txt", train_by_user)
    _write_split("./data/test.txt", test_by_user)

    print('train.txt, test.txt saved successfully')

In [46]:
# Write ./data/train.txt and ./data/test.txt from the remapped records.
making_data()

train.txt, test.txt saved successfully


# NGCF 모델 관련 코드

### 전역변수 설정

In [9]:
# Shared state that data_load() and the training/evaluation functions fill in.
train_items = {}
test_set = {}
matrix = None
exist_users = []
global_epoch_value = 0
result_arr = []

# Dataset sizes, all zero until data_load() has run.
n_users = n_items = n_train = n_test = 0

# Hyper-parameters.
total_epoch = 20
embed_size = 64
batch_size = 1024
layer_size = [64] * 3

### NGCF 모델

In [10]:
class NGCF(nn.Module):
    """Graph collaborative-filtering model over a dense normalised adjacency.

    Every user and item has a learnable base embedding. Each propagation
    layer adds a transformed copy of a node's own embedding (``W_gc``) to a
    transformed message aggregated through ``norm_adj`` (``W_bi``). The base
    embedding and the L2-normalised output of every layer are concatenated
    into the final representation.
    """

    def __init__(self, n_user, n_item, norm_adj, emb_size, batch_size, layer_size, decay=1e-5):
        """
        :param n_user: number of users (the first ``n_user`` rows of the joint table)
        :param n_item: number of items (the rows after the users)
        :param norm_adj: square ``(n_user + n_item)`` normalised adjacency tensor
        :param emb_size: width of the base embeddings
        :param batch_size: batch size, used to scale the regularisation term
        :param layer_size: list with the output width of each propagation layer
        :param decay: weight of the embedding regulariser in ``BPR_Loss``
        """
        super().__init__()
        self.n_user = n_user
        self.n_item = n_item
        self.norm_adj = norm_adj
        self.emb_size = emb_size
        self.batch_size = batch_size
        self.layer_size = layer_size
        self.decay = decay

        self.embedding_dict, self.weight_dict = self.init_weight()

    def init_weight(self):
        """Create Xavier-initialised embeddings and per-layer weight matrices.

        :return: ``(embedding_dict, weight_dict)`` as ``nn.ParameterDict`` objects
        """
        embedding_dict = nn.ParameterDict({
            'user_emb': nn.Parameter(nn.init.xavier_uniform_(torch.empty(self.n_user, self.emb_size))),
            'item_emb': nn.Parameter(nn.init.xavier_uniform_(torch.empty(self.n_item, self.emb_size)))
        })

        weight_dict = nn.ParameterDict()
        layers = [self.emb_size] + self.layer_size
        for i in range(len(self.layer_size)):
            weight_dict['W_gc_%d' % i] = nn.Parameter(nn.init.xavier_uniform_(torch.empty(layers[i], layers[i + 1])))
            weight_dict['W_bi_%d' % i] = nn.Parameter(nn.init.xavier_uniform_(torch.empty(layers[i], layers[i + 1])))

        return embedding_dict, weight_dict

    def rating(self, u_g_embeddings, pos_i_g_embeddings):
        """Score every given user against every given item by dot product.

        :return: tensor of shape ``(number of users, number of items)``
        """
        return torch.matmul(u_g_embeddings, pos_i_g_embeddings.t())

    def forward(self, users, pos_items, neg_items):
        """Propagate embeddings through all layers and look up the requested rows.

        :param users: user indices
        :param pos_items: item indices treated as positives
        :param neg_items: item indices treated as negatives
        :return: ``(user, positive item, negative item)`` embedding tensors
        """
        ego_embeddings = torch.cat([self.embedding_dict['user_emb'], self.embedding_dict['item_emb']], 0)
        all_embeddings = [ego_embeddings]

        for i in range(len(self.layer_size)):
            # Self connection.
            sum_embeddings = torch.matmul(ego_embeddings, self.weight_dict['W_gc_%d' % i])

            # Neighbourhood message: norm_adj^T @ ego. This is the same
            # product the earlier code formed as (ego^T @ norm_adj)^T.
            side_embeddings = torch.matmul(self.norm_adj.t(), ego_embeddings)
            # The message gets its own weight W_bi. The earlier code reused
            # W_gc here, so the W_bi parameters never received a gradient.
            bi_embeddings = torch.matmul(side_embeddings, self.weight_dict['W_bi_%d' % i])

            ego_embeddings = F.leaky_relu(sum_embeddings + bi_embeddings, negative_slope=0.2)
            norm_embeddings = F.normalize(ego_embeddings, p=2, dim=1)
            all_embeddings += [norm_embeddings]

        all_embeddings = torch.cat(all_embeddings, 1)
        u_g_embeddings = all_embeddings[:self.n_user, :]
        i_g_embeddings = all_embeddings[self.n_user:, :]

        u_g_embeddings = u_g_embeddings[users, :]
        pos_i_g_embeddings = i_g_embeddings[pos_items, :]
        neg_i_g_embeddings = i_g_embeddings[neg_items, :]

        return u_g_embeddings, pos_i_g_embeddings, neg_i_g_embeddings

    def BPR_Loss(self, users, pos_items, neg_items):
        """Bayesian personalised ranking loss plus an embedding regulariser.

        Despite their names, the three arguments are the embedding tensors
        returned by ``forward``, one row per sampled triple.

        :return: scalar tensor ``mf_loss + emb_loss``
        """
        pos_scores = torch.sum(torch.mul(users, pos_items), dim=1)
        neg_scores = torch.sum(torch.mul(users, neg_items), dim=1)

        maxi = F.logsigmoid(pos_scores - neg_scores)

        mf_loss = -1 * torch.mean(maxi)

        regularizer = (torch.norm(users) ** 2 + torch.norm(pos_items) ** 2 + torch.norm(neg_items) ** 2) / 2
        emb_loss = self.decay * regularizer / self.batch_size

        return mf_loss + emb_loss

### 전처리

In [11]:
def data_load():
    """Load ``train.txt`` / ``test.txt`` into the module-level state.

    Each non-blank line has the form ``<userId> <itemId> <itemId> ...``.
    After the call:

    * ``exist_users`` lists the user id of every training line,
    * ``train_items`` / ``test_set`` map user id -> list of item ids,
    * ``n_users`` / ``n_items`` are the largest ids seen plus one,
    * ``n_train`` / ``n_test`` count the interactions in each file,
    * ``matrix`` is an ``(n_users, n_items)`` tensor with 1 at every training
      interaction and 0 elsewhere.

    All of this state is rebuilt from scratch, so calling the function again
    does not duplicate users or inflate the counters.
    """
    global n_users, n_items, n_train, n_test, matrix

    train_file = './data/train.txt'
    test_file = './data/test.txt'

    exist_users.clear()
    train_items.clear()
    test_set.clear()
    n_train, n_test = 0, 0
    max_user_id, max_item_id = 0, 0
    train_rows = []

    with open(train_file) as f:
        for line in f:
            ids = [*map(int, line.split())]
            if not ids:
                continue  # blank line
            user_id, items = ids[0], ids[1:]

            exist_users.append(user_id)
            train_items[user_id] = items
            train_rows.append((user_id, items))

            max_user_id = max(max_user_id, user_id)
            # A user may have no items on the line; max() over the items
            # alone would raise on that empty list.
            max_item_id = max([max_item_id, *items])
            n_train += len(items)

    with open(test_file) as f:
        for line in f:
            ids = [*map(int, line.split())]
            if not ids:
                continue  # blank line
            user_id, items = ids[0], ids[1:]

            test_set[user_id] = items

            # Count test-only users too, so their ids stay within n_users.
            max_user_id = max(max_user_id, user_id)
            max_item_id = max([max_item_id, *items])
            n_test += len(items)

    n_users = max_user_id + 1
    n_items = max_item_id + 1

    matrix = torch.zeros((n_users, n_items))
    for user_id, items in train_rows:
        if items:
            # One indexed assignment per user instead of one per interaction.
            matrix[user_id, items] = 1

### 학습할 때 필요한 함수

In [12]:
def sample():
    """Draw one training batch of (user, positive item, negative item) triples.

    ``batch_size`` users are drawn from the users in ``exist_users`` that
    have at least one training item: without replacement when there are
    enough of them, with replacement otherwise. For each user one positive
    item is picked from ``train_items[user]`` and one negative item is drawn
    uniformly from ``range(n_items)`` until it is not a training item of that
    user.

    :return: ``(users, pos_items, neg_items)``, three lists of equal length
    :raises ValueError: if no user has training items, or a sampled user
        already has every item (no negative item exists)
    """
    # Users without positives cannot form a triple; with none present this is
    # the same list as exist_users, in the same order.
    candidates = [user for user in exist_users if train_items[user]]
    if not candidates:
        raise ValueError('no user with training items to sample from')

    # Compare against the pool that is actually sampled, not n_users.
    if batch_size <= len(candidates):
        users = random.sample(candidates, batch_size)
    else:
        users = [random.choice(candidates) for _ in range(batch_size)]

    pos_items, neg_items = [], []
    for user in users:
        pos_item_list = train_items[user]
        pos_items.append(pos_item_list[np.random.randint(0, len(pos_item_list))])

        # Set membership is O(1); the list was scanned twice per attempt before.
        seen_items = set(pos_item_list)
        if len(seen_items) >= n_items:
            # Rejection sampling below could never succeed.
            raise ValueError('user {} has no item left to use as a negative'.format(user))

        while True:
            neg_id = np.random.randint(0, n_items)
            if neg_id not in seen_items:
                neg_items.append(neg_id)
                break
    return users, pos_items, neg_items


def get_norm_adj():
    """Build the row-normalised adjacency of the user-item graph.

    The ``(n_users + n_items)`` square adjacency has ``matrix`` in its upper
    right block and ``matrix.T`` in its lower left block. Every row is then
    divided by its row sum; rows that sum to zero are left as zeros.

    :return: tensor of shape ``(n_users + n_items, n_users + n_items)``
    """
    n_nodes = n_users + n_items
    adj_mat = torch.zeros([n_nodes, n_nodes])
    adj_mat[:n_users, n_users:] = matrix
    adj_mat[n_users:, :n_users] = matrix.T

    degree = adj_mat.sum(1)
    # Reciprocal of the row sums, keeping 0 (not inf) for isolated nodes.
    inv_degree = torch.where(degree != 0, 1.0 / degree, torch.zeros_like(degree))

    # Scaling rows by broadcasting gives the same values as multiplying by a
    # diagonal matrix, without building that matrix or doing a dense product.
    return adj_mat * inv_degree.unsqueeze(1)


### 평가할 때 필요한 함수

In [13]:
def ranklist_by_heapq(user_pos_test, test_items, rating, Ks):
    """Rank candidate items by score and mark which top-``Ks`` ones are hits.

    :param user_pos_test: the user's held-out (test) items
    :param test_items: candidate item ids to rank
    :param rating: scores indexable by item id
    :param Ks: number of top-ranked items to keep
    :return: list of 1/0 flags, one per top-ranked item (best first), where 1
        means the item is in ``user_pos_test``
    """
    scores = {item: rating[item] for item in test_items}
    top_items = heapq.nlargest(Ks, scores, key=scores.get)

    # During the last epoch the ranked list itself is kept in result_arr.
    if global_epoch_value == total_epoch - 1:
        result_arr.append(top_items)

    return [1 if item in user_pos_test else 0 for item in top_items]


def get_performance(r, Ks):
    """Return the mean of the first ``Ks`` entries of the 0/1 hit list ``r``.

    If ``r`` has fewer than ``Ks`` entries, the mean is taken over the
    entries that exist.
    """
    top_hits = np.asarray(r)[:Ks]
    return np.mean(top_hits)


def test_one_user(x, y, Ks=100):
    """Compute the top-``Ks`` hit rate for a single user.

    Items the user has in ``train_items`` are removed from the candidates,
    the rest are ranked by score, and the share of top-ranked items found in
    ``test_set[user]`` is returned.

    :param x: the user's scores, indexable by item id
    :param y: the user id
    :param Ks: how many top-ranked items to evaluate (default 100)
    :return: mean of the hit flags over the top-ranked items
    """
    rating, user = x, y

    # A user with no training interactions simply has nothing to exclude.
    training_items = train_items.get(user, [])

    user_pos_test = test_set[user]
    all_items = set(range(n_items))
    test_items = list(all_items - set(training_items))
    r = ranklist_by_heapq(user_pos_test, test_items, rating, Ks)

    return get_performance(r, Ks)


def test(model, users_to_test):
    """Evaluate the model and return the mean per-user top-K hit rate.

    Users are scored against every item in batches of ``2 * batch_size``.
    The per-user values from ``test_one_user`` are averaged over the users,
    so the result is a precision between 0 and 1 rather than a sum that grows
    with the number of users.

    :param model: model exposing ``model(users, items, neg_items)`` and ``rating``
    :param users_to_test: list of user ids to evaluate
    :return: mean of the per-user results; 0.0 when there are no users
    """
    u_batch_size = batch_size * 2

    test_users = users_to_test
    n_test_users = len(test_users)
    if n_test_users == 0:
        return 0.0

    result = 0
    item_batch = range(n_items)

    # Evaluation needs no gradients; skipping them avoids building a graph
    # for every forward pass.
    with torch.no_grad():
        # Stepping by batch size never produces an empty trailing batch.
        for start in range(0, n_test_users, u_batch_size):
            user_batch = test_users[start:start + u_batch_size]
            u_g_embedding, pos_i_g_embedding, _ = model(user_batch, item_batch, [])
            rate_batch = model.rating(u_g_embedding, pos_i_g_embedding).detach().cpu().numpy()

            for i, user in enumerate(user_batch):
                result += test_one_user(rate_batch[i], user)

    return result / n_test_users

### 학습 실행 

In [14]:
def get_result(epoch_value = 200, flag = True):
    '''
    Train the model (or load a cached result) and return the per-user rankings.

    :param epoch_value: number of training epochs
    :param flag: if True, return the result cached in ./data/rank.pkl when that
        file exists; if False, always train from scratch
    :return: 2-D list with one ranked list of (remapped) problem ids per
        evaluated user, highest score first, in the order of the test set
    '''
    # result_arr must be the module-level list: ranklist_by_heapq appends the
    # rankings to that global, so a local list here would stay empty.
    global n_users, n_items, n_train, n_test, total_epoch, embed_size, batch_size, global_epoch_value, result_arr

    rank_file = './data/rank.pkl'

    n_users, n_items, n_train, n_test = 0, 0, 0, 0
    # Empty the shared containers so a second call does not pile users and
    # items on top of the previous load.
    exist_users.clear()
    train_items.clear()
    test_set.clear()
    data_load()

    total_epoch = epoch_value
    embed_size = 64
    batch_size = 1024
    layer_size = [64, 64, 64]
    result_arr = []

    if flag and os.path.isfile(rank_file):
        print('file exist')
        # Open the same file that was just checked for. pickle can execute
        # code while loading, so this must only ever be a file written by the
        # dump at the end of this function.
        with open(rank_file, 'rb') as f:
            result_arr = pickle.load(f)
        return result_arr
    else:
        print('file does not exist')

    # Built only when training is actually needed.
    norm_adj = get_norm_adj()

    model = NGCF(n_users, n_items, norm_adj, embed_size, batch_size, layer_size)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    n_batch = n_train // batch_size + 1

    for epoch in range(total_epoch):
        global_epoch_value = epoch
        time1 = time.time()

        loss = 0.0
        for _ in range(n_batch):
            optimizer.zero_grad()

            users, pos_items, neg_items = sample()
            u_g_embedding, pos_i_g_embedding, neg_i_g_embedding = model(users, pos_items, neg_items)
            batch_loss = model.BPR_Loss(u_g_embedding, pos_i_g_embedding, neg_i_g_embedding)

            batch_loss.backward()
            optimizer.step()

            # Accumulate a plain float, not a tensor tied to the autograd graph.
            loss += batch_loss.item()

        time2 = time.time()
        print(f'Epoch: {epoch}, loss: {loss}, time: {int(time2 - time1)}')

        # Rankings are only recorded during an evaluation in the last epoch,
        # so always evaluate there even if it is not a multiple of 10.
        is_last_epoch = epoch == total_epoch - 1
        if (epoch + 1) % 10 == 0 or is_last_epoch:
            users_to_test = list(test_set.keys())
            ret = test(model, users_to_test)
            print(f'Precision: {ret}')

    with open(rank_file, 'wb') as f:
        pickle.dump(result_arr, f, protocol=pickle.HIGHEST_PROTOCOL)

    return result_arr

### 학습된 결과 얻기

In [15]:
# Run get_result() with its defaults and print how many ranked lists it returned.
print(len(get_result()))

FileNotFoundError: [Errno 2] No such file or directory: './data/train.txt'

## 결과 출력

1. 유저가 이미 푼 문제는 제외
2. 유저 id와 문제 id가 매핑되어 있으므로 원래대로 바꾸어줌