 # MF-SGD (Matrix Factorization - Stochastic Gradient Descent)

SVD에서 쓰인 행렬분해를 이용하고, 확률적 경사하강(SGD) 기법으로 오차를 줄이는 방향으로 학습한다.  
SVD의 행렬분해에서는 null 값이 존재하면 안 되기에 평균값, 최빈값 등으로 빈 값을 채웠으나, 여기서는 임베딩을 랜덤값으로 초기화한 뒤 관측된 평점에 대한 오차를 줄이는 방향으로 학습한다.  
결국 데이터가 sparse할수록, 임의로 채운 값에 의존하던 이전 모델들에 비해 성능이 더 잘 나오게 된다.  
(참고: 아래 구현은 매 반복마다 전체 학습 데이터로 기울기를 계산하는 모멘텀 기반 경사하강이다.)

In [1]:
import pandas as pd
import numpy as np
from google.colab import files
from collections import Counter
from sklearn.model_selection import train_test_split
from scipy import sparse

In [2]:
# Upload the data files into the Colab runtime; the cell output below lists
# movies.csv, ratings.csv and users.csv as saved.
files.upload();

Saving movies.csv to movies.csv
Saving ratings.csv to ratings.csv
Saving users.csv to users.csv


In [28]:
# Load the ratings table from the uploaded ratings.csv.
rating = pd.read_csv('ratings.csv')

In [29]:
# Preview the first rows of the ratings table.
rating.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16


train시 embedding layer 필요, df.pivot으로 해결되는지 추후 실험 필요


train_test_split 수행

train data의 user_id, item_id를 0부터 시작하는 연속된 정수 id로 변환하는 것이 목적이다(원본 id에는 빠진 값이 있을 수 있음, svd 함수 내 설명 참고).


In [30]:
# Drop the timestamp column; the cells below only use user_id, item_id and rating.
rating = rating.drop(columns = ['timestamp'])

In [49]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the ratings as the test split.
rating_train, rating_test = train_test_split(rating, test_size=0.2)

# Keep only the three modelling columns and renumber the rows from 0,
# discarding the shuffled original index.
rating_train = rating_train[['user_id', 'item_id', 'rating']].reset_index(drop=True)
rating_test = rating_test[['user_id', 'item_id', 'rating']].reset_index(drop=True)

In [50]:
def encode_column(column):
  """Encode the values of a column as consecutive integer ids.

  Ids start at 0 and follow the order in which ``column.unique()`` returns
  the distinct values.

  Args:
    column: a pandas Series (anything offering ``unique()`` and iteration
      over its values).

  Returns:
    A tuple ``(key_to_id, encoded, n_unique)`` where ``key_to_id`` maps each
    distinct value to its id, ``encoded`` is a NumPy array holding the id of
    every element of ``column`` in order, and ``n_unique`` is the number of
    distinct values.
  """
  distinct = column.unique()
  key_to_id = dict(zip(distinct, range(len(distinct))))
  encoded = np.array([key_to_id[value] for value in column])
  return key_to_id, encoded, len(distinct)

In [51]:
def encode_df(rating):
  """Return a copy of ``rating`` whose user and item ids are consecutive integers.

  The input frame is not modified. Working on a copy keeps the raw ids
  available to the caller, so calling this function again on the same frame
  produces the same raw-id -> encoded-id mappings instead of mappings built
  from ids that were already encoded.

  Args:
    rating: DataFrame with at least ``user_id`` and ``item_id`` columns.

  Returns:
    A tuple ``(encoded, num_user, num_item, user_ids, item_ids)``:
    ``encoded`` is the re-encoded copy of ``rating``; ``num_user`` and
    ``num_item`` are the numbers of distinct users and items; ``user_ids``
    and ``item_ids`` are dicts mapping each raw id to its encoded id.
  """
  encoded = rating.copy()
  item_ids, encoded_items, num_item = encode_column(encoded['item_id'])
  user_ids, encoded_users, num_user = encode_column(encoded['user_id'])
  encoded['item_id'] = encoded_items
  encoded['user_id'] = encoded_users
  return encoded, num_user, num_item, user_ids, item_ids

In [52]:
# Encode the training split and report how many distinct users and items it has.
rating_df, num_user, num_item, user_ids, item_ids = encode_df(rating_train)
print("Number of Users : ", num_user)
print("Number of Items : ", num_item)
# Preview the encoded frame.
rating_df.head()

Number of Users :  943
Number of Items :  1651


Unnamed: 0,user_id,item_id,rating
0,0,0,3
1,1,1,5
2,2,2,2
3,3,3,4
4,4,4,5


User and Item embeddings

In [53]:
def create_embeddings(n, K):
  """Create a random embedding matrix of shape ``(n, K)``.

  Each entry is a ``np.random.random`` sample multiplied by 5 and divided
  by ``K``.

  Args:
    n: number of rows (one per user or per item).
    K: number of columns (values per embedding).

  Returns:
    NumPy array of shape ``(n, K)``.
  """
  uniform = np.random.random((n, K))
  return 5 * uniform / K

In [54]:
def create_sparse_matrix(df, rows, columns, column_name = 'rating'):
  """Build a SciPy CSC sparse utility matrix from a ratings frame.

  Args:
    df: DataFrame with ``user_id`` (row index), ``item_id`` (column index)
      and ``column_name`` (cell value) columns.
    rows: number of rows of the resulting matrix.
    columns: number of columns of the resulting matrix.
    column_name: name of the column that supplies the cell values.

  Returns:
    ``scipy.sparse.csc_matrix`` of shape ``(rows, columns)``.
  """
  row_index = df['user_id'].values
  col_index = df['item_id'].values
  cell_values = df[column_name].values
  return sparse.csc_matrix((cell_values, (row_index, col_index)), shape=(rows, columns))

In [55]:
# Build the sparse utility matrix from the frame already encoded by the earlier
# encode_df call. encode_df is deliberately not re-run here: a second call would
# overwrite user_ids / item_ids, which encode_new_data later needs to translate
# the raw ids of the test split.
Y = create_sparse_matrix(rating_df, num_user, num_item)

In [56]:
# Show the utility matrix in dense form for inspection.
Y.todense()

matrix([[3, 0, 0, ..., 0, 0, 0],
        [0, 5, 0, ..., 0, 0, 0],
        [0, 0, 2, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

예측값 함수

In [57]:
def predict(df, emb_user, emb_item):
  """Add a ``prediction`` column holding u_i . v_j for every row of ``df``.

  For each row, the user and item embeddings are looked up by id, multiplied
  element-wise and summed. Only the listed (user, item) pairs are scored, so
  the full U * V^T matrix is never built.

  Args:
    df: DataFrame with encoded ``user_id`` and ``item_id`` columns. It is
      modified in place: the ``prediction`` column is added or overwritten.
    emb_user: user embedding matrix, indexed by encoded user id.
    emb_item: item embedding matrix, indexed by encoded item id.

  Returns:
    The same ``df`` object, now carrying the ``prediction`` column.
  """
  item_vectors = emb_item[df['item_id']]
  user_vectors = emb_user[df['user_id']]
  elementwise = np.multiply(item_vectors, user_vectors)
  df['prediction'] = np.sum(elementwise, axis=1)
  return df

비용함수

In [58]:
# Coefficient of the 2*lmbda*embedding terms added in gradient().
lmbda = 0.0002

In [74]:
def cost(df, emb_user, emb_item):
  """Root-mean-squared error between the ratings in ``df`` and the predictions.

  Both the ratings and the predictions are placed into sparse matrices of
  shape ``(emb_user.shape[0], emb_item.shape[0])``. The squared differences
  are summed and divided by the number of rows of ``df``.

  Args:
    df: DataFrame with encoded ``user_id``, ``item_id`` and ``rating``
      columns; ``predict`` adds or overwrites its ``prediction`` column.
    emb_user: user embedding matrix.
    emb_item: item embedding matrix.

  Returns:
    The RMSE as a float.
  """
  n_users, n_items = emb_user.shape[0], emb_item.shape[0]
  actual = create_sparse_matrix(df, n_users, n_items)
  scored = predict(df, emb_user, emb_item)
  estimated = create_sparse_matrix(scored, n_users, n_items, 'prediction')
  residual = actual - estimated
  squared_error = np.sum(residual.power(2))
  return np.sqrt(squared_error / df.shape[0])

Gradient Descent(경사하강)

In [75]:
def gradient(df, emb_user, emb_item):
  """Compute the gradients applied to the user and item embeddings.

  With ``delta = Y - predicted`` (sparse, one entry per row of ``df``) and
  ``N = df.shape[0]``:

    grad_user = (-2 / N) * delta   . emb_item + 2 * lmbda * emb_user
    grad_item = (-2 / N) * delta^T . emb_user + 2 * lmbda * emb_item

  ``lmbda`` is read from module scope.

  Args:
    df: DataFrame with encoded ``user_id``, ``item_id`` and ``rating``
      columns; ``predict`` adds or overwrites its ``prediction`` column.
    emb_user: user embedding matrix.
    emb_item: item embedding matrix.

  Returns:
    Tuple ``(grad_user, grad_item)``.
  """
  n_users, n_items = emb_user.shape[0], emb_item.shape[0]
  observed = create_sparse_matrix(df, n_users, n_items)
  scored = predict(df, emb_user, emb_item)
  estimated = create_sparse_matrix(scored, n_users, n_items, 'prediction')
  residual = observed - estimated
  scale = -2 / df.shape[0]
  # residual is a scipy sparse matrix, so .dot is the matrix product.
  grad_user = scale * residual.dot(emb_item) + 2 * lmbda * emb_user
  grad_item = scale * residual.T.dot(emb_user) + 2 * lmbda * emb_item
  return grad_user, grad_item

In [76]:
def gradient_descent(df, emb_user, emb_item, iterations = 2000, learning_rate=0.01, df_val = None):
  """Train the embeddings with gradient descent plus momentum.

  Every iteration computes the gradient on the whole of ``df``, folds it into
  exponentially weighted moving averages (``beta = 0.9``) and moves both
  embedding matrices against those averages. Every 50 iterations the training
  RMSE is printed, followed by the validation RMSE when ``df_val`` is given.

  Args:
    df: training DataFrame with encoded ``user_id``, ``item_id`` and
      ``rating`` columns.
    emb_user: initial user embedding matrix.
    emb_item: initial item embedding matrix.
    iterations: number of update steps.
    learning_rate: step size applied to the moving-average gradients.
    df_val: optional validation DataFrame, encoded with the same id mappings
      as ``df``.

  Returns:
    Tuple ``(emb_user, emb_item)`` holding the trained embeddings.
  """
  beta = 0.9
  # Seed the moving averages with the gradient at the starting point.
  v_user, v_item = gradient(df, emb_user, emb_item)
  for i in range(iterations):
    grad_user, grad_item = gradient(df, emb_user, emb_item)
    v_user = beta*v_user + (1-beta)*grad_user
    v_item = beta*v_item + (1-beta)*grad_item
    emb_user = emb_user - learning_rate*v_user
    emb_item = emb_item - learning_rate*v_item
    if (i+1) % 50 == 0:
      print('\niteration', i+1, ":")
      print("train rmse : ", cost(df, emb_user, emb_item))
      if df_val is not None:
        print('validation rmse : ', cost(df_val, emb_user, emb_item))
  return emb_user, emb_item



In [77]:
# Initialise both embedding matrices with 3 columns (K = 3), then train with a
# learning rate of 0.02; iterations keeps its default value.
emb_user = create_embeddings(num_user, 3)
emb_item = create_embeddings(num_item, 3)
emb_user, emb_item = gradient_descent(rating_df, emb_user, emb_item, learning_rate = 0.02)


iteration 50 :
train rmse :  2.1287788629884874

iteration 100 :
train rmse :  2.1115011012611005

iteration 150 :
train rmse :  2.0944499349009438

iteration 200 :
train rmse :  2.07762514179533

iteration 250 :
train rmse :  2.0610264569873795

iteration 300 :
train rmse :  2.0446535961830694

iteration 350 :
train rmse :  2.0285062454680176

iteration 400 :
train rmse :  2.012584051870826

iteration 450 :
train rmse :  1.9968866149078246

iteration 500 :
train rmse :  1.9814134790660167

iteration 550 :
train rmse :  1.9661641271812509

iteration 600 :
train rmse :  1.951137974669613

iteration 650 :
train rmse :  1.9363343645708524

iteration 700 :
train rmse :  1.9217525633633807

iteration 750 :
train rmse :  1.9073917575110106

iteration 800 :
train rmse :  1.893251050702173

iteration 850 :
train rmse :  1.8793294617429053

iteration 900 :
train rmse :  1.8656259230654029

iteration 950 :
train rmse :  1.8521392798144796

iteration 1000 :
train rmse :  1.8388682894747876

iter

In [67]:
def encode_new_data(val_df, user_ids, item_ids):
  """Encode a new ratings frame with the id mappings learned on the training data.

  Rows whose user or item is missing from the mappings are dropped, because
  no embedding exists for them. The remaining raw ids are replaced by their
  encoded ids. The input frame is not modified.

  Args:
    val_df: DataFrame with raw ``user_id`` and ``item_id`` columns.
    user_ids: dict mapping raw user id -> encoded user id.
    item_ids: dict mapping raw item id -> encoded item id.

  Returns:
    A new DataFrame containing only the rows with known users and items,
    with ``user_id`` and ``item_id`` replaced by their encoded values.
  """
  known = val_df['item_id'].isin(list(item_ids)) & val_df['user_id'].isin(list(user_ids))
  # Copy the filtered rows so the id columns are reassigned on an independent
  # frame rather than on a slice of the caller's data.
  encoded = val_df[known].copy()
  encoded['user_id'] = encoded['user_id'].map(user_ids)
  encoded['item_id'] = encoded['item_id'].map(item_ids)
  return encoded


In [68]:
# Encode the test split with the training id mappings and print the shape
# before and after, which shows how many rows were dropped.
print('before encoding :', rating_test.shape)
rating_test = encode_new_data(rating_test, user_ids, item_ids)
print('after encoding :', rating_test.shape)

before encoding : (19958, 4)
after encoding : (19958, 4)


In [79]:
# RMSE of the trained embeddings on the training frame and on the encoded test frame.
train_rmse = cost(rating_df, emb_user, emb_item)
val_rmse = cost(rating_test, emb_user, emb_item)
print(train_rmse, val_rmse)

1.6161028583886232 1.8396235580461877


In [48]:
# Display the test frame for inspection.
rating_test

Unnamed: 0,user_id,item_id,rating
89603,92,239,4
61260,457,193,5
35690,144,15,4
36326,402,455,3
10395,13,154,5
...,...,...,...
77011,772,322,4
84090,224,237,3
23792,497,227,2
14044,222,276,5
