In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.utils import shuffle
from scipy import sparse
import matplotlib.pyplot as plt
from ast import literal_eval
%matplotlib inline

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


# 1. 학습
## 1-1. 데이터 로드

In [34]:
import os
import pandas as pd
import numpy as np

class Loader:
    def __init__(self, DIR, data_no, neg_no):
        self.DIR = DIR
        self.neg_no = neg_no
        self.data_no = data_no
  
    def load_dataset(self):
        data_no = str(self.data_no)
        self.DATASET_DIR = self.DIR + 'data' + data_no

        # 데이터 로드
        self.train = pd.read_csv(os.path.join(self.DATASET_DIR, 'd'+data_no+'_train.csv'))
        self.valid_X = pd.read_csv(os.path.join(self.DATASET_DIR , 'd'+data_no+'_validation_X.csv'))
        self.valid_y = pd.read_csv(os.path.join(self.DATASET_DIR , 'd'+data_no+'_validation_y.csv'))
        self.test_X = pd.read_csv(os.path.join(self.DATASET_DIR , 'd'+data_no+'_test_X.csv'))
        self.test_y = pd.read_csv(os.path.join(self.DATASET_DIR , 'd'+data_no+'_test_y.csv'))
        self.users = pd.read_csv(os.path.join(self.DATASET_DIR , 'd'+data_no+'_users.csv'))
        self.problems = pd.read_csv(os.path.join(self.DATASET_DIR , 'd'+data_no+'_problems.csv'))
        
        # 변수
        self.users_no = len(self.users)
        self.prob_no = len(self.problems)
        self.userid2idx = {row[1]:row[0] for row in self.users.values}
        self.useridx2id = {row[0]:row[1] for row in self.users.values}
        self.probid2idx = {row[1]:row[0] for row in self.problems.values}
        self.probidx2id = {row[0]:row[1] for row in self.problems.values}
        self.useridx2level = {i:row[1] for i, row in enumerate(self.users.values)}
        self.probidx2level = {i:row[1] for i, row in enumerate(self.problems.values)}
        return self.users_no, self.prob_no
    
    def formatting(self, case):
        # 모델 input 형식에 맞게 바꾸기
        if case == 0:
          train = self.train_formating(self.train)
          return train
        elif case == 1: # valid
          train = self.train_formating(self.valid_X)
          neg = self.get_negative_sampling(self.valid_y)
          return train, (self.valid_y, neg)
        else: # test
          train = self.train_formating(self.test_y)
          neg = self.get_negative_sampling(self.test_y)
          return train, (self.test_y, neg)
  
    # train foramting
    def train_formating(self, dataframe):
        userId, probId, entry = [], [], []
        checked = set([tuple(x) for x in dataframe.values])

        neg_checked = set()

        for up in checked:
            u, p = up[0], up[1]
            # nonzero
            userId.append(u)
            probId.append(p)
            entry.append(1)

            # zero: negative sampling
            userId, probId, entry = self.negative_sampling(u, checked, neg_checked, userId, probId, entry)
        return userId, probId, entry

    def get_negative_sampling(self, dataframe):
        
        totalProb = set(self.problems['problemId'].tolist())
        dataframe['problemId'] = dataframe['problemId'].apply(lambda x: literal_eval(x))
        neg = dataframe['problemId'].apply(lambda x: list(totalProb-set(x)))
        df_neg = pd.concat([dataframe['handle'], neg], axis=1)
        
        '''userId, probId, entry = [], [], []
        checked = set([tuple(x) for x in dataframe.values])
        neg_checked = set()
        for up in checked:
            u = up[0]
            # zero: negative sampling
            userId, probId, entry = self.negative_sampling(u, checked, neg_checked, userId, probId, entry)
        df_neg = pd.DataFrame(list(zip(userId, probId)), columns = ['handle', 'problemId'])'''
        return df_neg

    # negative sampling
    def negative_sampling(self, u, checked, neg_checked, user, prob, entry):
        for n in range(self.neg_no):
            flag = False
            for t in range(20):
                p = np.random.randint(self.prob_no)
                if (u,p) not in checked and (u,p) not in neg_checked:
                    neg_checked.add((u,p))
                    flag = True
                    break
            if flag:
                user.append(u)
                prob.append(p)
                entry.append(0)
                
        return user, prob, entry

In [35]:
DIR = '/content/drive/MyDrive/(22-1)캡스톤/recomm/data/preprocessed/'
loader1 = Loader(DIR, 1, 4)
loader1.load_dataset()

(24032, 2661)

In [10]:
train = loader1.formatting(0)

In [36]:
valid_tr, valid_te  = loader1.formatting(1)
valid_tr_X, valid_tr_y = valid_tr
valid_te_y, valid_te_X = valid_te
test_tr, test_te  = loader1.formatting(2)
test_tr_X, test_tr_y = test_tr
test_te_y, test_te_X = test_te

In [39]:
valid_tr

Unnamed: 0,handle,problemId
0,kkjh9909,"[16430, 8393, 10718, 1000, 1001, 2557, 2558, 9..."
1,nureeee,"[10757, 20492, 18108, 8393, 1000, 2557, 20499,..."
2,rkwkrkwk1029,"[10171, 10172, 10430, 8393, 10718, 1000, 1001,..."
3,gksdudrms0,"[8393, 10869, 2557, 10430, 2753, 1008, 9498, 2..."
4,kbm12,"[1000, 1001, 10171, 10172, 2557, 10718, 2739]"
...,...,...
2398,winfinity,"[1000, 1001, 10998, 10171, 2557, 10718, 1008, ..."
2399,whitewater22,"[18825, 1000]"
2400,heygwangjin,"[18108, 10718, 10926, 2557, 10430, 2525, 2480,..."
2401,nav7latte,"[1550, 6749, 7287, 8370, 8393, 5337, 5338, 533..."


In [None]:
train_usr, train_prb, train_entry = shuffle(train[0], train[1], train[2])
train_usr = np.array(train_usr).reshape(-1,1)
train_prb = np.array(train_prb).reshape(-1,1)
train_entry = np.array(train_entry).reshape(-1,1)

##1-2. 모델 학습

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd

class MF(tf.keras.Model):
    def __init__(self, user_no, prob_no, K=4):
        super(MF, self).__init__()
        # 변수
        self.user_no = user_no
        self.prob_no = prob_no
        self.K = K

        # 레이어
        input_user = tf.keras.layers.Input(shape=(1,), dtype='int32') # 사용자 index 
        input_prob =  tf.keras.layers.Input(shape=(1,), dtype='int32') # 문제 index 
        embedding_user = tf.keras.layers.Flatten()(tf.keras.layers.Embedding(user_no, K)(input_user)) # [사용자 index, 잠재요인]
        embedding_prob =  tf.keras.layers.Flatten()(tf.keras.layers.Embedding(prob_no, K)(input_prob)) # [문제 index, 잠재요인]
        matmul =  tf.keras.layers.Multiply()([embedding_user, embedding_prob]) # new [사용자 index, 문제 index]
        output =  tf.keras.layers.Dense(1)(matmul)

        # 모델
        self.model = tf.keras.Model(inputs=[input_user, input_prob], outputs=output)

    def get_model(self):
        return self.model

    def save_model(self, DIR):
        self.model.save(DIR)
        
    def level_filtering(self, dataframe, userlevel_map, problevel_map, k):
  
        user = dataframe['handle'].to_numpy()
        prob = dataframe['problemId'].to_numpy()
        pred = dataframe['pred'].to_numpy()
        
        limit = min(k*10, len(pred))
        idx = np.argpartition(-pred, limit)[:limit]
        
        candidates = dataframe.iloc[idx]
        print(candidates)
        problevel = candidates['problemId'].apply(lambda x: problevel_map[x]).to_numpy()
        maxlevel = candidates['handle'].apply(lambda x: userlevel_map[x]).to_numpy()
        lam = np.mean(candidates['pred'].to_numpy())/100
        dist = np.abs(problevel-maxlevel)*lam
        candidates['pred']+=dist
          
        top_idx = np.argsort(-candidates['pred'].to_numpy())[:k]
        top_k_data = candidates.iloc[top_idx]
        
        return top_k_data

In [None]:
import numpy as np

def recall_at_k(X_pred, heldout, k=100):
    idx = np.argpartition(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.arange(X_pred.shape[0])[:, np.newaxis], idx[:, :k]] = True
    X_true_binary = (heldout > 0)

    tmp = (np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)).astype(np.float32)
    recall = tmp / np.minimum(k, X_true_binary.sum(axis=1))
    return recall
  
def hit_rate_at_k(X_pred, heldout, k=100):
    idx = np.argpartition(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.arange(X_pred.shape[0])[:, np.newaxis], idx[:, :k]] = True
    X_true_binary = (heldout > 0)

    tmp = np.logical_and(X_true_binary, X_pred_binary)
    hits = np.sum(tmp, axis=1)
    hits = np.count_nonzero(hits)
    return hits

In [None]:
#mf = mf.MF(loader1.users_no, loader1.prob_no)
mf = MF(loader1.users_no, loader1.prob_no)
model = mf.get_model()

In [None]:
optimizer = tf.keras.optimizers.SGD(learning_rate=1e-2)
model.compile(optimizer=optimizer, loss='mse')

In [None]:
#train_N = len(train_usr)-1
#valid_N = len(valid_X)-1
valid_N = len(set(valid_X['handle']))-1
valid_id = np.array(list(set(valid_X['handle'])))
id2idx = {v[0]:i for i,v in enumerate(valid_X.values)}
epochs = 1
batch_size = 32
rc_vad = []
hr_vad = []
best_eval = -1
best_epoch = -1

MODEL_DIR = '/content/drive/MyDrive/(22-1)캡스톤/recomm/Recommendation/model/MF/best_model'

In [None]:
model = tf.keras.models.load_model(MODEL_DIR)

In [None]:
for epoch in range(epochs):
  '''for i in range(0, train_N, batch_size):
    idxlist = range(i, min(i+batch_size, train_N))
    model.fit([train_usr[idxlist], train_prb[idxlist]], train_entry[idxlist],verbose=0)'''
  
  hit_rate = 0
  recall = []
  for i in range(0, valid_N, batch_size):
    idxlist = range(i, min(i+batch_size, valid_N))
    valid_batch = valid_id[idxlist]
    print(len(valid_batch))
    if len(valid_batch) <= 0:
      continue
    valid_batch = valid_X.iloc[loader1.get_idx(valid_X, valid_batch)]
    print(valid_batch)
    
    valid_X_usr = np.array(valid_batch['handle'].tolist()).reshape(-1,1)
    valid_X_prb = np.array(valid_batch['problemId'].tolist()).reshape(-1,1)

    X_pred = model.predict([valid_X_usr, valid_X_prb])

    valid_batch['pred'] = X_pred
    filtered = mf.level_filtering(valid_batch, loader1.user2level, loader1.prob2level, 30)
    print(filtered)

    pred = sparse.csr_matrix((filtered['pred'], \
                             (filtered['handle'], filtered['problemId'])),\
                             dtype='float64', shape=(loader1.users_no, loader1.prob_no)).toarray()

    valid_y_batch = valid_y.iloc[list(set(loader1.get_idx(valid_y, valid_batch['handle'].tolist())))]
    print(valid_y_batch)
    heldout = sparse.csr_matrix((np.ones_like(valid_y_batch['handle']), \
                             (valid_y_batch['handle'], valid_y_batch['problemId'])),\
                             dtype='float64', shape=(loader1.users_no, loader1.prob_no)).toarray()

    recall.append(me.recall_at_k(pred, heldout, k=30))
    hit_rate += me.hit_rate_at_k(pred, heldout, k=30)
    break

  recall_ = np.concatenate(recall)
  recall_[np.isnan(recall_)]=0 
  recall_ = recall_.mean()
  rc_vad.append(recall_)
  print("epoch[", epoch, "] recall: ", recall_)

  hit_rate_ = hit_rate/loader1.users_no
  hr_vad.append(hit_rate_)
  print("epoch[", epoch, "] hit rate: ", hit_rate_)

  '''if hit_rate_ > best_eval:
    model.save(MODEL_DIR)
    best_epoch = epoch
    best_eval = hit_rate_
    print(epoch, best_eval)'''

32
       handle  problemId
0        8194          0
1        8194          1
2        8194          2
3        8194          3
4        8194          4
...       ...        ...
84855   16538       2656
84856   16538       2657
84857   16538       2658
84858   16538       2659
84859   16538       2660

[84860 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


       handle  problemId      pred
30246    8255       1101  0.106815
40839    8287       1084  0.106654
40838    8287       1083  0.106453
40152    8287        394  0.106620
6465     8214       1164  0.106634
...       ...        ...       ...
42409    8287       2658  0.106350
5510     8214        204  0.106359
62386   16510       1408  0.106359
62497   16510       1520  0.106356
29665    8255        518  0.106360

[300 rows x 3 columns]
       handle  problemId      pred
31212    8255       2073  0.117341
30799    8255       1660  0.117221
31184    8255       2045  0.117188
30318    8255       1173  0.117147
31502    8255       2363  0.115114
41688    8287       1937  0.115108
29916    8255        770  0.115103
41304    8287       1551  0.115038
10238   16409       2298  0.114977
31471    8255       2332  0.114899
29786    8255        640  0.114892
42409    8287       2658  0.114870
30065    8255        919  0.114340
65287   16509       1660  0.114257
29494    8255        346  0.114

  recall = tmp / np.minimum(k, X_true_binary.sum(axis=1))


epoch[ 0 ] recall:  3.906827933126202e-05
epoch[ 0 ] hit rate:  0.0002496671105193076


In [None]:
  recall_ = np.concatenate(recall)
  recall_[np.isnan(recall_)]=0 
  recall_ = recall_.mean()
  rc_vad.append(recall_)
  print("epoch[", epoch, "] recall: ", recall_)

  hit_rate_ = hit_rate/loader1.users_no
  hr_vad.append(hit_rate_)
  print("epoch[", epoch, "] hit rate: ", hit_rate_)

epoch[ 0 ] recall:  3.906827933126202e-05
epoch[ 0 ] hit rate:  0.0002496671105193076


# 1-3. test

In [None]:
test_N = len(test_X)-1
hit_rate = 0
recall = []

for i in range(0, test_N, batch_size):
  idxlist = range(i, min(i+batch_size, valid_N))
  test_batch = test_X.iloc[loader1.get_idx(test_X, idxlist)]
  if len(test_batch) <= 0:
    continue

  test_X_usr = np.array(test_batch['handle'].tolist()).reshape(-1,1)
  test_X_prb = np.array(test_batch['problemId'].tolist()).reshape(-1,1)

  X_pred = model.predict([test_X_usr, test_X_prb])
  test_batch['pred'] = X_pred
  filtered = mf.level_filtering(test_batch, loader1.user2level, loader1.prob2level, 30)

  pred = sparse.csr_matrix((filtered['pred'], \
                             (filtered['handle'], filtered['problemId'])),\
                             dtype='float64', shape=(loader1.users_no, loader1.prob_no)).toarray()

  test_y_batch = test_y.iloc[loader1.get_idx(test_y, test_batch['handle'].tolist())]

  heldout = sparse.csr_matrix((np.ones_like(test_y_batch['handle']), \
                             (test_y_batch['handle'], test_y_batch['problemId'])),\
                             dtype='float64', shape=(loader1.users_no, loader1.prob_no)).toarray()

  recall.append(me.recall_at_k(pred, heldout, k=30))
  hit_rate += me.hit_rate_at_k(pred, heldout, k=30)

recall_ = np.concatenate(recall)
recall_[np.isnan(recall_)]=0 
recall_ = recall_.mean()
rc_vad.append(recall_)
print("test recall: ", recall_)

hit_rate_ = hit_rate/loader1.users_no
hr_vad.append(hit_rate_)
print("test hit rate: ", hit_rate_)

# 2. 결과

In [None]:
import Result as rs # 경로 추가

if __name__ == '__main__':
  MODEL_DIR = '/content/drive/MyDrive/(22-1)캡스톤/recomm/Recommendation/model/MF/best_model' # 경로 변경
  DATA_DIR = '/content/drive/MyDrive/(22-1)캡스톤/recomm/data' # 경로 변경
  result = rs.Result(MODEL_DIR, DATA_DIR, "1", 1024)
  print(result.get_result("beoms"))

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import ast

class Result:
    def __init__(self, MODEL_DIR, DATASET_DIR, data_no, batch_size):
        self.model = tf.keras.models.load_model(MODEL_DIR)
        
        users = pd.read_csv(os.path.join(DATASET_DIR+'/preprocessed/data1', 'd'+data_no+'_users.csv'))
        problems =  pd.read_csv(os.path.join(DATASET_DIR+'/preprocessed/data1', 'd'+data_no+'_problems.csv'))
        solvedProblem = pd.read_csv(os.path.join(DATASET_DIR+'/raw_data', 'solvedProblem.csv'))

        self.N = len(problems['problemId'])
        self.id2idx_usr = {u[0]:i for i, u in enumerate(users.values)}
        self.id2prblist = {u[1]:u[3] for u in solvedProblem.values}
        self.idx2id_prb = {i:p[0] for i, p in enumerate(problems.values)}
        self.id2idx_prb = {p[0]:i for i, p in enumerate(problems.values)}
        self.batch_size = batch_size
  
    def get_result(self, id):
        entry = []
        probs = list(set(range(0, self.N)))
        for i in range(0, self.N, self.batch_size):
            idxlist = probs[i:min(i+self.batch_size, self.N)]

            input_p = np.array(idxlist).reshape(-1, 1)
            input_u = np.array([self.id2idx_usr[id]]*len(idxlist)).reshape(-1, 1)

            entry.append(self.model.predict([input_u, input_p]))
        # 필터링 추가
        
        entry = np.concatenate(entry)
        entry = np.array(entry).reshape(1, -1)[0]
        pos = list(set(ast.literal_eval(self.id2prblist[id])))
        ids = [self.id2idx_prb[id] for id in pos]
        entry[ids] = -np.inf
        top_idx = np.argpartition(-entry, 30)
        return [self.idx2id_prb[i] for i in top_idx]

In [None]:
MODEL_DIR = '/content/drive/MyDrive/(22-1)캡스톤/recomm/Recommendation/model/MF/best_model'
DATA_DIR = '/content/drive/MyDrive/(22-1)캡스톤/recomm/data'
result = Result(MODEL_DIR, DATA_DIR, "1", 1024)

In [None]:
print(result.get_result("beoms"))

[1612, 19759, 21735, 20413, 20733, 9469, 2476, 14173, 20001, 2720, 2410, 4176, 22279, 11448, 1890, 1544, 9558, 14445, 14496, 15592, 2529, 15890, 6550, 9947, 9950, 2525, 20186, 10353, 20301, 9536, 4084, 1654, 23901, 1100, 15887, 1620, 1924, 14645, 2845, 14606, 1182, 16113, 24498, 17203, 18787, 4883, 15662, 13416, 16162, 15810, 11660, 9711, 23365, 9228, 16546, 15802, 4806, 11724, 1977, 8674, 16189, 10799, 15973, 15178, 6588, 21921, 17124, 6811, 10093, 10156, 4411, 16489, 6122, 10932, 10539, 23758, 2605, 9575, 19939, 2628, 18222, 18156, 1302, 1303, 16175, 10598, 5338, 11265, 17626, 19945, 9290, 2082, 10703, 11501, 12525, 2052, 10469, 15881, 16478, 21623, 9325, 12845, 18268, 10808, 20360, 11759, 4659, 2294, 9550, 14494, 4118, 1965, 5554, 21301, 7583, 9625, 10434, 24542, 22966, 15680, 1942, 4706, 5176, 13118, 1439, 10818, 19771, 4335, 3010, 9329, 11663, 9948, 9094, 14183, 8393, 24544, 9664, 23925, 15803, 2711, 10410, 5671, 16430, 5074, 18229, 15048, 10845, 18111, 1535, 13229, 20540, 9414, 1

# 3. 난이도 필터링

In [None]:
user_level = {i:row[1] for i, row in enumerate(loader1.users.values)}
prob_level = {i:row[1] for i, row in enumerate(loader1.problems.values)}

In [None]:
def level_filtering(self, dataframe, userlevel_map, problevel_map, k):
  
        user = dataframe['handle'].to_numpy()
        prob = dataframe['problemId'].to_numpy()
        pred = dataframe['pred'].to_numpy()
        
        limit = min(k*4, len(pred))
        idx = np.argpartition(-pred, limit)[:limit]
        
        candidates = dataframe.iloc[idx]
        problevel = candidates['problemId'].apply(lambda x: problevel_map[x]).to_numpy()
        maxlevel = candidates['handle'].apply(lambda x: userlevel_map[x]).to_numpy()
        dist = np.abs(problevel-maxlevel)
          
        top_idx = np.argpartition(dist, k)[:limit]
        top_k_data = dataframe.iloc[top_idx]
        
        return top_k_data

In [None]:
for i in loader1.train['handle'].values:
  print(i)
  break

In [None]:
train_N = len(train_usr)-1
valid_N = len(valid_X)-1
epochs = 1
batch_size = 1024
rc_vad = []
hr_vad = []
best_eval = -1
best_epoch = -1

MODEL_DIR = '/content/drive/MyDrive/(22-1)캡스톤/recomm/Recommendation/model/MF/best_model'
model = tf.keras.models.load_model(MODEL_DIR)

In [None]:
for epoch in range(epochs):
  hit_rate = 0
  recall = []
  for i in range(0, valid_N, batch_size):
    print(i)
    idxlist = range(i, min(i+batch_size, valid_N))
    valid_batch = valid_X.iloc[loader1.get_idx(valid_X, idxlist)]
    if len(valid_batch) <= 0:
      continue

    valid_X_usr = np.array(valid_batch['handle'].tolist()).reshape(-1,1)
    valid_X_prb = np.array(valid_batch['problemId'].tolist()).reshape(-1,1)

    X_pred = model.predict([valid_X_usr, valid_X_prb])
    valid_batch['pred'] = X_pred
    filtered = level_filtering(valid_batch, user_level, prob_level, 30)

    pred = sparse.csr_matrix((filtered['pred'], \
                             (filtered['handle'], filtered['problemId'])),\
                             dtype='float64', shape=(loader1.users_no, loader1.prob_no)).toarray()

    valid_y_batch = valid_y[0].iloc[loader1.get_idx(valid_y[0], valid_batch['handle'].tolist())]
    heldout = sparse.csr_matrix((np.ones_like(valid_y_batch['handle']), \
                             (valid_y_batch['handle'], valid_y_batch['problemId'])),\
                             dtype='float64', shape=(loader1.users_no, loader1.prob_no)).toarray()

    recall.append(me.recall_at_k(pred, heldout, k=30))
    hit_rate += me.hit_rate_at_k(pred, heldout, k=30)

  recall_ = np.concatenate(recall)
  recall_[np.isnan(recall_)]=0 
  recall_ = recall_.mean()
  rc_vad.append(recall_)
  print(epoch, recall_)

  hit_rate_ = hit_rate/loader1.users_no
  hr_vad.append(hit_rate_)
  print(epoch, hit_rate_)

  if hit_rate_ > best_eval:
    model.save(MODEL_DIR)
    best_epoch = epoch
    best_eval = hit_rate_
    print(epoch, best_eval)

In [None]:
prob_level[2993]