<a href="https://colab.research.google.com/github/CP2J/cp2j/blob/ACJ-9-MF-SGD-/Recsys_MF_SGD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 # MF-SGD (Matrix Factorization - Stochastic Gradient Descent)

 SVD에서 쓰인 행렬분해를 이용, 확률적 경사하강 기법으로 오차를 줄이는 방향으로 학습한다.  
SVD의 행렬분해에서는 null값이 존재하면 안되기에 평균값, 최빈값 등을 사용했으나 여기서는 랜덤값 지정 후 오차를 줄이는 방향으로 학습.  
결국 데이터가 sparse 할 수록 임의값에 의존하던 이전 모델들에 비해 성능이 더 잘 나오게 된다.

In [1]:
import pandas as pd
import numpy as np
from google.colab import files
from collections import Counter
from sklearn.model_selection import train_test_split
from scipy import sparse

In [2]:
files.upload();

Saving movies.csv to movies.csv
Saving ratings.csv to ratings.csv
Saving users.csv to users.csv


In [3]:
rating = pd.read_csv('ratings.csv')

In [4]:
rating.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16


train시 embedding layer 필요, df.pivot으로 해결되는지 추후 실험 필요


train_test_split 수행

train data를 연속적 값으로 변환하는 목적(빠진 값이 있을 것, svd 함수 내 설명 참고)


In [5]:
rating = rating.drop(columns = ['timestamp'])

In [6]:
from sklearn.model_selection import train_test_split
rating_train, rating_test = train_test_split(rating, test_size = 0.2)

rating_train = rating_train.reset_index()[['user_id', 'item_id', 'rating']]
rating_test = rating_test.reset_index()[['user_id', 'item_id', 'rating']]

In [7]:
def encode_column(column):
  # 컬럼 을 연속 id로 인코딩
  # 열 내 고유값 키
  keys = column.unique()
  #enumerate = 리스트 내 넘버와 값
  key_to_id = {key:idx for idx, key in enumerate(keys)}
  return key_to_id, np.array([key_to_id[x] for x in column]), len(keys)

In [8]:
def encode_df(rating):
  #rating 데이터를 연속적인 user와 item id 로 재 배열
  #encode_column의 결과 값으로 들어온 df의 열 데이터를 바꿔서 반환
  item_ids, rating['item_id'], num_item = encode_column(rating['item_id'])
  user_ids, rating['user_id'], num_user = encode_column(rating['user_id'])
  return rating, num_user, num_item, user_ids, item_ids

In [9]:
rating_df, num_user, num_item, user_ids, item_ids = encode_df(rating_train)
print("Number of Users : ", num_user)
print("Number of Items : ", num_item)
rating_df.head()

Number of Users :  943
Number of Items :  1644


Unnamed: 0,user_id,item_id,rating
0,0,0,4
1,1,1,5
2,2,2,4
3,3,3,2
4,4,4,4


User and Item embeddings

In [10]:
def create_embeddings(n, K):
  # 랜덤한 값의 넘파이 행력 생성 함수 (n, K)
  # n = 아이템/유저의 수
  # K = embedding 안의 고유값 개수
  return 5* np.random.random((n, K)) / K

In [11]:
def create_sparse_matrix(df, rows, columns, column_name = 'rating'):
  # scipy를 이용해 Sparse utility matrix 생성 함수
  return sparse.csc_matrix((df[column_name].values, (df['user_id'].values, df['item_id'].values)),shape = (rows, columns))

In [12]:
rating_df, num_user, num_item, user_ids, item_ids = encode_df(rating_train)
Y = create_sparse_matrix(rating_df, num_user, num_item)

In [13]:
Y.todense()

matrix([[4, 0, 0, ..., 0, 0, 0],
        [5, 5, 0, ..., 0, 0, 0],
        [2, 0, 4, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

예측값 함수

In [14]:
def predict(df, emb_user, emb_item):
  # 행렬곱(U * V^T) 없이 예측값을 돌려주는 함수
  # embedding 끼리의 같은 위치에 있는 값들의 곱(elementwise multiplication)의 합으로 u_i* v_j의 합을 구함
  # 이걸로 U * V^T를 위한 행렬 생성이 필요없게 된다
  df['prediction'] = np.sum(np.multiply(emb_item[df['item_id']], emb_user[df['user_id']]), axis = 1)
  return df

비용함수

In [15]:
lmbda = 0.0002

In [16]:
def cost(df, emb_user, emb_item) :
  Y = create_sparse_matrix(df, emb_user.shape[0], emb_item.shape[0])
  predicted = create_sparse_matrix(predict(df, emb_user, emb_item), emb_user.shape[0], emb_item.shape[0], 'prediction')
  return np.sqrt(np.sum((Y-predicted).power(2))/df.shape[0])

Gradient Descent(경사하강)

In [17]:
def gradient(df, emb_user, emb_item):
  # embedding에 적용할 경사 설정
  Y = create_sparse_matrix(df, emb_user.shape[0], emb_item.shape[0])
  predicted = create_sparse_matrix(predict(df, emb_user, emb_item), emb_user.shape[0], emb_item.shape[0], 'prediction')
  delta = (Y-predicted)
  grad_user = (-2/df.shape[0])*(delta*emb_item) + 2*lmbda*emb_user
  grad_item = (-2/df.shape[0])*(delta.T*emb_user) + 2*lmbda*emb_item
  return grad_user, grad_item

In [18]:
def gradient_descent(df, emb_user, emb_item, iterations = 2000, learning_rate=0.01, df_val = None):
  Y = create_sparse_matrix(df, emb_user.shape[0], emb_item.shape[0])
  beta = 0.9
  grad_user, grad_item = gradient(df, emb_user, emb_item)
  v_user = grad_user
  v_item = grad_item
  for i in range(iterations):
    grad_user, grad_item = gradient(df, emb_user, emb_item)
    v_user = beta*v_user + (1-beta)*grad_user
    v_item = beta*v_item + (1-beta)*grad_item
    emb_user = emb_user - learning_rate*v_user
    emb_item = emb_item - learning_rate*v_item
    if (i+1) % 50 == 0:
      print('\niteration', i+1, ":")
      print("train rmse : ", cost(df, emb_user, emb_item))
      if df_val is not None:
        print('validation rmse : ', cost(df_val, emb_user, emb_item))
  return emb_user, emb_item



In [19]:
emb_user = create_embeddings(num_user, 3)
emb_item = create_embeddings(num_item, 3)
emb_user, emb_item = gradient_descent(rating_df, emb_user, emb_item, iterations = 3000, learning_rate = 0.02, df_val = rating_test)


iteration 50 :
train rmse :  2.1305402038198706


ValueError: ignored

In [None]:
def encode_new_data(val_df, user_ids, item_ids):
  val_df_chosen = val_df['item_id'].isin(item_ids.keys())&val_df['user_id'].isin(user_ids.keys())
  val_df = val_df[val_df_chosen]
  val_df['user_id'] = np.array([user_ids[x] for x in val_df['user_id']])
  val_df['item_id'] = np.array([item_ids[x] for x in val_df['item_id']])
  return val_df


In [None]:
print('before encoding :', rating_test.shape)
rating_test = encode_new_data(rating_test, user_ids, item_ids)
print('after encoding :', rating_test.shape)

In [None]:
train_rmse = cost(rating_df, emb_user, emb_item)
val_rmse = cost(rating_test, emb_user, emb_item)
print(train_rmse, val_rmse)

In [None]:
rating_test

Unnamed: 0,user_id,item_id,rating
89603,92,239,4
61260,457,193,5
35690,144,15,4
36326,402,455,3
10395,13,154,5
...,...,...,...
77011,772,322,4
84090,224,237,3
23792,497,227,2
14044,222,276,5


In [None]:
emb_user = create_embeddings(num_user, 3)
emb_item = create_embeddings(num_item, 3)
emb_user, emb_item = gradient_descent(rating_df, emb_user, emb_item, iterations = 5000, learning_rate = 0.2, df_val = rating_test)


iteration 50 :
train rmse :  1.922422954001111
validation rmse :  2.0310788591239994

iteration 100 :
train rmse :  1.7849806695069734
validation rmse :  1.9613069731916049

iteration 150 :
train rmse :  1.6691089258402203
validation rmse :  1.9016843247239776

iteration 200 :
train rmse :  1.5728958280847174
validation rmse :  1.8511289652591216

iteration 250 :
train rmse :  1.493703308756523
validation rmse :  1.8083671614313865

iteration 300 :
train rmse :  1.4286820958512916
validation rmse :  1.772129820110171

iteration 350 :
train rmse :  1.3751485681593045
validation rmse :  1.7412707477429945

iteration 400 :
train rmse :  1.330784635201821
validation rmse :  1.7148160772322905

iteration 450 :
train rmse :  1.2936956515304419
validation rmse :  1.6919683762573063

iteration 500 :
train rmse :  1.2623841258172752
validation rmse :  1.6720877907920328

iteration 550 :
train rmse :  1.2356873921839806
validation rmse :  1.6546653579647583

iteration 600 :
train rmse :  1.2127

 # 0410 추가본 - 다른 레퍼런스 참고

 https://big-dream-world.tistory.com/69

1. 분해한 P, Q 행렬 임의값으로 생성
2. P 행렬, Q 전치행렬 곱해서 예측행렬 생성, 실제 R 행렬과 차이 계산(R 행렬 내 존재하는 실제값들과의 차이) 
3. 차이 줄이는 방향으로 P, Q 행렬 업데이트
4. 반복하며 근사화


  ** 상단 코드와의 차이  
1. train_test_split 안함 - train값으로 안본 영화 평점 예측 목적
2. 코드 간소화
3. 추후 수정

In [20]:
from sklearn.metrics import mean_squared_error

In [21]:
rating.head()

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [23]:
rating_df = rating.pivot(index = 'user_id', columns = 'item_id', values = 'rating')
print(rating_df.shape)
rating_df.head()

(943, 1682)


item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [25]:
num_user = rating_df.shape[0]
num_item = rating_df.shape[1]
print(num_user, num_item)

943 1682


In [None]:
non_zeros = [(i, j, rating_df.iloc[i, j]) for i in range(num_user) for j in range(num_item) if rating_df.iloc[i, j] > 0]
# user 위치, item 위치, rating 값 튜플로 묶어 리스트 내 저장
non_zeros

In [None]:
def get_rmse(real_mat, P, Q, non_zeros):
  # real_df = 실제 유저 아이템 행렬
  # P, Q = 잠재요인, user와 item으로 분해된 잠재행렬, 이걸로 예측 행렬 생성
  # non_null = real_df 내 null값 아니었던 것(real_df 만으로 함수 내 해결할 수 있을것으로 보이나, 저장해서 반복 계산 시 효율성 재고)
  error = 0
  pred_mat = np.dot(P, Q.T)
  user_non_zero_ind = [non_zero[0] for non_zero in non_zeros]
  item_non_zero_ind = [non_zero[1] for non_zero in non_zeros]
  