In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.utils import shuffle
from scipy import sparse
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
class Loader:
  def __init__(self, DIR, data_no, neg_no):
    self.DIR = DIR
    self.neg_no = neg_no
    self.data_no = data_no
  
  def load_dataset(self):
    data_no = str(self.data_no)
    self.DATASET_DIR = self.DIR + 'data' + data_no

    # 데이터 로드
    self.train = pd.read_csv(os.path.join(self.DATASET_DIR, 'd'+data_no+'_train.csv'))
    self.valid_X = pd.read_csv(os.path.join(self.DATASET_DIR , 'd'+data_no+'_validation_X.csv'))
    self.valid_y = pd.read_csv(os.path.join(self.DATASET_DIR , 'd'+data_no+'_validation_y.csv'))
    self.test_X = pd.read_csv(os.path.join(self.DATASET_DIR , 'd'+data_no+'_test_X.csv'))
    self.test_y = pd.read_csv(os.path.join(self.DATASET_DIR , 'd'+data_no+'_test_y.csv'))
    self.users = pd.read_csv(os.path.join(self.DATASET_DIR , 'd'+data_no+'_users.csv'))
    self.problems = pd.read_csv(os.path.join(self.DATASET_DIR , 'd'+data_no+'_problems.csv'))
    self.users_no = len(self.users)
    self.prob_no = len(self.problems)
    return self.users_no, self.prob_no
    
    
  def formatting(self, case):
    # 모델 input 형식에 맞게 바꾸기
    # case: 0 train
    if case == 0:
      train = self.train_formating(self.train)
      return train
    elif case == 1: # valid
      #valid_X = self.test_formating(self.valid_X)
      neg = self.get_negative_sampling(self.valid_y)
      return self.valid_X, (self.valid_y, neg)
    else: # test
      #test_X = self.test_formating(self.test_X)
      neg = self.get_negative_sampling(self.test_y)
      return self.test_X, (self.test_y, neg)
  
  def idx_to_id(self, idx, dataframe):
    return dataframe.iloc[idx, 0]

  # train foramting
  def train_formating(self, dataframe):
    userId, probId, entry = [], [], []
    checked = set([tuple(x) for x in dataframe.values])
    print(len(checked))
    neg_checked = set()

    for up in checked:
      u, p = up[0], up[1]
      # nonzero
      userId.append(u)
      probId.append(p)
      entry.append(1)

      # zero: negative sampling
      userId, probId, entry = self.negative_sampling(u, checked, neg_checked, userId, probId, entry)

    return userId, probId, entry
  

  def get_negative_sampling(self, dataframe):
    userId, probId, entry = [], [], []
    checked = set([tuple(x) for x in dataframe.values])
    neg_checked = set()
    for up in checked:
      u = up[0]
      # zero: negative sampling
      userId, probId, entry = self.negative_sampling(u, checked, neg_checked, userId, probId, entry)
    
    df_neg = pd.DataFrame(list(zip(userId, probId)))

    return df_neg

  # negative sampling
  def negative_sampling(self, u, checked, neg_checked, user, prob, entry):
    for n in range(self.neg_no):
      flag = False
      for t in range(20):
        p = np.random.randint(self.prob_no)
        if (u,p) not in checked and (u,p) not in neg_checked:
          neg_checked.add((u,p))
          flag = True
          break
      if flag:
        user.append(u)
        prob.append(p)
        entry.append(0)   
    return user, prob, entry

  def get_idx(self, dataframe, idxlist):
    idx = []
    for id in idxlist:
      for d in dataframe.index[dataframe['handle']==id].tolist():
        idx.append(d)
    return idx

In [3]:
class MF(tf.keras.Model):
  def __init__(self, user_no, prob_no, K=4):
    super(MF, self).__init__()
    # 변수
    self.user_no = user_no
    self.prob_no = prob_no
    self.K = K
    
    # 레이어
    input_user = tf.keras.layers.Input(shape=(1,), dtype='int32')
    input_prob =  tf.keras.layers.Input(shape=(1,), dtype='int32')
    embedding_user = tf.keras.layers.Flatten()(tf.keras.layers.Embedding(user_no, K)(input_user))
    embedding_prob =  tf.keras.layers.Flatten()(tf.keras.layers.Embedding(prob_no, K)(input_prob))
    matmul =  tf.keras.layers.Multiply()([embedding_user, embedding_prob])
    output =  tf.keras.layers.Dense(1)(matmul)

    # 모델
    self.model = tf.keras.Model(inputs=[input_user, input_prob], outputs=output)

  def get_model(self):
    return self.model

  def save_model(self, DIR):
    self.model.save(DIR)

In [4]:
def recall_at_k(X_pred, heldout, k=100):
  idx = np.argpartition(-X_pred, k, axis=1)
  X_pred_binary = np.zeros_like(X_pred, dtype=bool)
  X_pred_binary[np.arange(X_pred.shape[0])[:, np.newaxis], idx[:, :k]] = True
  X_true_binary = (heldout > 0)

  tmp = (np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)).astype(np.float32)
  recall = tmp / np.minimum(k, X_true_binary.sum(axis=1))
  return recall

In [16]:
def hit_rate_at_k(X_pred, heldout, k=100):
  idx = np.argpartition(-X_pred, k, axis=1)
  X_pred_binary = np.zeros_like(X_pred, dtype=bool)
  X_pred_binary[np.arange(X_pred.shape[0])[:, np.newaxis], idx[:, :k]] = True
  X_true_binary = (heldout > 0)

  tmp = np.logical_and(X_true_binary, X_pred_binary)
  hits = np.sum(tmp, axis=1)
  hits = np.count_nonzero(hits)
  return hits

In [5]:
'''def level_filtering(X, dataloader, k=500):
  # max_level과 가까운 문제들에 가중치 부여
  idx = np.argpartition(-X['pred'], k, axis=1)
  max_level = X.iloc[idx, 0]
  return'''

"def level_filtering(X, dataloader, k=500):\n  # max_level과 가까운 문제들에 가중치 부여\n  idx = np.argpartition(-X['pred'], k, axis=1)\n  max_level = X.iloc[idx, 0]\n  return"

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 1. 데이터 로드

In [7]:
DIR = '/content/drive/MyDrive/(22-1)캡스톤/recomm/data/preprocessed/'
loader1 = Loader(DIR, 1, 4)
loader1.load_dataset()

(24032, 2661)

In [8]:
train = loader1.formatting(0)
valid_X, valid_y  = loader1.formatting(1)
test_X, test_y  = loader1.formatting(2)

664975


In [9]:
train_usr, train_prb, train_entry = shuffle(train[0], train[1], train[2])
train_usr = np.array(train_usr).reshape(-1,1)
train_prb = np.array(train_prb).reshape(-1,1)
train_entry = np.array(train_entry).reshape(-1,1)

In [10]:
'''valid_usr, valid_prb, valid_entry = valid_X
valid_usr = np.array(valid_usr).reshape(-1,1)
valid_prb = np.array(valid_prb).reshape(-1,1)
valid_entry = np.array(valid_entry).reshape(-1,1)'''

'valid_usr, valid_prb, valid_entry = valid_X\nvalid_usr = np.array(valid_usr).reshape(-1,1)\nvalid_prb = np.array(valid_prb).reshape(-1,1)\nvalid_entry = np.array(valid_entry).reshape(-1,1)'

In [11]:
"""valid_user_pos, valid_prob_pos, valid_user_neg, valid_prob_neg = valid_y
valid_user_pos = np.array(valid_user_pos).reshape(-1,1)
valid_prob_pos = np.array(valid_prob_pos).reshape(-1,1)
valid_user_neg = np.array(valid_user_neg).reshape(-1,1)
valid_prob_neg = np.array(valid_prob_neg).reshape(-1,1)"""

'valid_user_pos, valid_prob_pos, valid_user_neg, valid_prob_neg = valid_y\nvalid_user_pos = np.array(valid_user_pos).reshape(-1,1)\nvalid_prob_pos = np.array(valid_prob_pos).reshape(-1,1)\nvalid_user_neg = np.array(valid_user_neg).reshape(-1,1)\nvalid_prob_neg = np.array(valid_prob_neg).reshape(-1,1)'

#2. 모델

In [12]:
mf = MF(loader1.users_no, loader1.prob_no)
model = mf.get_model()

In [13]:
optimizer = tf.keras.optimizers.SGD(learning_rate=1e-2)
model.compile(optimizer=optimizer, loss='mse')

##2-1. 학습

In [22]:
train_N = len(train_usr)-1
valid_N = len(valid_X)-1
epochs = 1
batch_size = 1024
rc_vad = []
hr_vad = []
best_eval = -1
best_epoch = -1

MODEL_DIR = '/content/drive/MyDrive/(22-1)캡스톤/recomm/Recommendation/model/MF/best_model'

In [42]:
for epoch in range(epochs):
  '''for i in range(0, train_N, batch_size):
    idxlist = range(i, min(i+batch_size, train_N))
    model.fit([train_usr[idxlist], train_prb[idxlist]], train_entry[idxlist],verbose=1)'''
  
  hit_rate = 0
  recall = []
  for i in range(0, valid_N, batch_size):
    idxlist = range(i, min(i+batch_size, valid_N))
    valid_batch = valid_X.iloc[loader1.get_idx(valid_X, idxlist)]
    if len(valid_batch) <= 0:
      continue

    valid_X_usr = np.array(valid_batch['handle'].tolist()).reshape(-1,1)
    valid_X_prb = np.array(valid_batch['problemId'].tolist()).reshape(-1,1)

    X_pred = model.predict([valid_X_usr, valid_X_prb])
    valid_batch['pred'] = X_pred

    valid_y_batch = valid_y[0].iloc[loader1.get_idx(valid_y[0], valid_batch['handle'].tolist())]

    pred = sparse.csr_matrix((valid_batch['pred'], \
                             (valid_batch['handle'], valid_batch['problemId'])),\
                             dtype='float64', shape=(loader1.users_no, loader1.prob_no)).toarray()
    heldout = sparse.csr_matrix((np.ones_like(valid_y_batch['handle']), \
                             (valid_y_batch['handle'], valid_y_batch['problemId'])),\
                             dtype='float64', shape=(loader1.users_no, loader1.prob_no)).toarray()

    recall.append(recall_at_k(pred, heldout, k=20))
    hit_rate += hit_rate_at_k(pred, heldout, k=20)

  recall_ = np.concatenate(recall)
  recall_[np.isnan(recall_)]=0 
  recall_ = recall_.mean()
  rc_vad.append(recall_)
  print(epoch, recall_)

  hit_rate_ = hit_rate/loader1.users_no
  hr_vad.append(hit_rate_)
  print(epoch, hit_rate_)

  if recall_ > best_eval:
    model.save(MODEL_DIR)
    best_epoch = epoch
    best_eval = recall_
    print(epoch, best_eval)

[[   3]
 [   3]
 [   3]
 ...
 [1008]
 [1008]
 [1008]]


ValueError: ignored

In [92]:
class Result:
  def __init__(self, model, problems):
    self.model = model
    self.problems = problems
  
  def get_result(self, id):
    id = np.array(id).reshape(-1, 1)
    entry = []
    N = len(self.problems['problemId'])
    for i in range(0, N, batch_size):
      idxlist = range(i, min(i+batch_size, N))
      probs = np.array(idxlist).reshape(-1, 1)
      ids = np.array([id]*len(idxlist)).reshape(-1, 1)
      entry.append(model.predict([ids, probs]))

    entry = np.concatenate(entry)
    entry = np.array(entry).reshape(1, -1)
    ## 이미 푼 문제는 제외하고 리턴해야함--> 수정필요
    top_idx = np.argpartition(-entry, 50, axis=1)
    return self.problems.iloc[top_idx[0],0].values

In [93]:
result = Result(model, loader1.problems)
print(result.get_result(15616))

[10934 20361  1421 ...  1002 24751 10935]
