In [28]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.utils import shuffle

In [51]:
class Loader:
  def __init__(self, DIR, data_no, neg_no):
    self.DIR = DIR
    self.neg_no = neg_no
    self.data_no = data_no
  
  def load_dataset(self):
    data_no = str(self.data_no)
    self.DATASET_DIR = self.DIR + 'data' + data_no

    # 데이터 로드
    self.train = pd.read_csv(os.path.join(self.DATASET_DIR, 'd'+data_no+'_train.csv'))
    self.valid_X = pd.read_csv(os.path.join(self.DATASET_DIR , 'd'+data_no+'_validation_X.csv'))
    self.valid_y = pd.read_csv(os.path.join(self.DATASET_DIR , 'd'+data_no+'_validation_y.csv'))
    self.test_X = pd.read_csv(os.path.join(self.DATASET_DIR , 'd'+data_no+'_test_X.csv'))
    self.test_y = pd.read_csv(os.path.join(self.DATASET_DIR , 'd'+data_no+'_test_y.csv'))
    self.users = pd.read_csv(os.path.join(self.DATASET_DIR , 'd'+data_no+'_users.csv'))
    self.problems = pd.read_csv(os.path.join(self.DATASET_DIR , 'd'+data_no+'_problems.csv'))
    self.users_no = len(self.users)
    self.prob_no = len(self.problems)
    return self.users_no, self.prob_no
    
    
  def formatting(self, case):
    # 모델 input 형식에 맞게 바꾸기
    # case: 0 train
    if case == 0:
      train = self.input_formating(self.train)
      return train
    elif case == 1: # valid
      valid_X = self.input_formating(self.valid_X)
      valid_y = self.eval_formating(self.valid_y)
      return valid_X, valid_y
    else: # test
      test_X = self.input_formating(self.test_X)
      test_y = self.eval_formating(self.test_y)
      return test_X, test_y
  
  def idx_to_id(self, idx, dataframe):
    return dataframe.iloc[idx, 0]

  # input foramting
  def input_formating(self, dataframe):
    userId, probId, entry = [], [], []
    checked = set([tuple(x) for x in dataframe.values])
    print(len(checked))
    neg_checked = set()

    for up in checked:
      u, p = up[0], up[1]
      # nonzero
      userId.append(u)
      probId.append(p)
      entry.append(1)

      # zero: negative sampling
      userId, probId, entry = self.negative_sampling(u, checked, neg_checked, userId, probId, entry)

    return userId, probId, entry
  
  # for evaluation
  def eval_formating(self, dataframe):
    userId, probId, entry = [], [], []
    checked = set([tuple(x) for x in dataframe.values])
    neg_checked = set()
    for up in checked:
      u = up[0]
      # zero: negative sampling
      userId, probId, entry = self.negative_sampling(u, checked, neg_checked, userId, probId, entry)
    
    pos_userId = dataframe['handle'].tolist()
    pos_probId = dataframe['problemId'].tolist()

    # pos userId, pos probId, neg userId, neg probId
    return pos_userId, pos_probId, userId, probId

  # negative sampling
  def negative_sampling(self, u, checked, neg_checked, user, prob, entry):
    for n in range(self.neg_no):
      flag = False
      for t in range(20):
        p = np.random.randint(self.prob_no)
        if (u,p) not in checked and (u,p) not in neg_checked:
          neg_checked.add((u,p))
          flag = True
          break
      if flag:
        user.append(u)
        prob.append(p)
        entry.append(0)   
    return user, prob, entry

In [52]:
class MF(tf.keras.Model):
  def __init__(self, user_no, prob_no, K=4):
    super(MF, self).__init__()
    # 변수
    self.user_no = user_no
    self.prob_no = prob_no
    self.K = K
    
    # 레이어
    input_user = tf.keras.layers.Input(shape=(1,), dtype='int32')
    input_prob =  tf.keras.layers.Input(shape=(1,), dtype='int32')
    embedding_user = tf.keras.layers.Flatten()(tf.keras.layers.Embedding(user_no, K)(input_user))
    embedding_prob =  tf.keras.layers.Flatten()(tf.keras.layers.Embedding(prob_no, K)(input_prob))
    matmul =  tf.keras.layers.Multiply()([embedding_user, embedding_prob])
    output =  tf.keras.layers.Dense(1)(matmul)

    # 모델
    self.model = tf.keras.Model(inputs=[input_user, input_prob], outputs=output)
  
  def get_model(self):
    return self.model

  def save_model(self, DIR, model):
    model.save(DIR)

In [None]:
# recall
def recall_top_k

In [53]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [54]:
DIR = '/content/drive/MyDrive/(22-1)캡스톤/recomm/data/preprocessed/'
loader1 = Loader(DIR, 1, 4)
loader1.load_dataset()

(24032, 2661)

In [55]:
train = loader1.formatting(0)

664975


In [56]:
valid_X, valid_y  = loader1.formatting(1)
test_X, test_y  = loader1.formatting(2)

73933
73933


In [57]:
train_usr, train_prb, train_entry = shuffle(train[0], train[1], train[2])

In [58]:
train_usr = np.array(train_usr).reshape(-1,1)
train_prb = np.array(train_prb).reshape(-1,1)
train_entry = np.array(train_entry).reshape(-1,1)

In [69]:
mf = MF(loader1.users_no, loader1.prob_no)
model = mf.get_model()
optimizer = tf.keras.optimizers.SGD(learning_rate=1e-2)
model.compile(optimizer=optimizer, loss='mse')
model.fit([train_usr, train_prb], train_entry, epochs=1, batch_size=1, verbose=1)

  17910/3321642 [..............................] - ETA: 1:16:02 - loss: 0.1595

KeyboardInterrupt: ignored

In [65]:
test_usr, test_prb, test_entry = test_X
test_usr = np.array(test_usr).reshape(-1,1)
test_prb = np.array(test_prb).reshape(-1,1)
test_entry = np.array(test_entry).reshape(-1,1)

In [67]:
prediction = model.predict([test_usr, test_prb])

In [68]:
prediction

array([[0.19900951],
       [0.19921537],
       [0.19933659],
       ...,
       [0.19934607],
       [0.19993792],
       [0.19905385]], dtype=float32)