<a href="https://colab.research.google.com/github/Boin-Kau/recommender-systems-using-python/blob/main/group_recommender_systems.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [58]:
import os
import pandas as pd

# Directory that holds the raw data files read below.
base_src = 'drive/MyDrive/RecoSys/Data/recommender_systems_data'

# Load u.user (pipe-separated, no header row, so names are supplied here).
u_user_src = os.path.join(base_src, 'u.user')
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(u_user_src,
                    sep='|',
                    names=u_cols,
                    encoding='latin-1')

# Load u.item (pipe-separated, no header row).
# The labels are written without stray whitespace so that every column can be
# addressed by its natural name (e.g. 'Animation', 'Film-Noir').
u_item_src = os.path.join(base_src, 'u.item')
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDB URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
movies = pd.read_csv(u_item_src,
                     sep='|',
                     names=i_cols,
                     encoding='latin-1')

# Load u.data (tab-separated, no header row).
u_data_src = os.path.join(base_src, 'u.data')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(u_data_src,
                      sep='\t',
                      names=r_cols,
                      encoding='latin-1')

# Drop the timestamp column from the ratings DataFrame.
ratings = ratings.drop('timestamp', axis=1)
# Keep only the movie_id and title columns of the movies DataFrame.
movies = movies[['movie_id', 'title']]

In [42]:
# 데이터 train, test set 분리
from sklearn.model_selection import train_test_split
import numpy as np

# x is a copy of the ratings table; y holds the user ids used as the
# stratification labels.
x = ratings.copy()
y = ratings['user_id']

# Hold out 25% of the rows for testing and train on the remaining 75%.
# Stratifying on y keeps the distribution of user ids in both parts in line
# with the full data set.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    test_size=0.25,
)

# Accuracy metric: root-mean-squared error (RMSE)
def RMSE(y_true,y_pred):
  """Return the root-mean-squared error between two value sequences.

  Both arguments are converted with ``np.array`` before being subtracted
  element-wise; the result is the square root of the mean squared difference.
  """
  actual = np.array(y_true)
  predicted = np.array(y_pred)
  squared_error = (actual - predicted) ** 2
  return np.sqrt(np.mean(squared_error))

# Per-model RMSE, evaluated on the testing data set
def score(model):
  """Return the RMSE of ``model`` over the rows of ``x_test``.

  ``model`` is called as ``model(user_id, movie_id)`` once for every row of
  the module-level ``x_test`` frame; its return values are compared against
  the ``rating`` column of that frame.
  """
  user_ids = x_test['user_id']
  movie_ids = x_test['movie_id']
  predictions = [model(user, movie) for user, movie in zip(user_ids, movie_ids)]
  actual = np.array(x_test['rating'])
  return RMSE(actual, np.array(predictions))

# Best-seller baseline: mean rating of every movie, computed on the training set
train_mean = x_train.groupby(['movie_id'])['rating'].mean()

def best_seller(user_id, movie_id):
  """Predict a rating as the movie's mean rating in the training set.

  ``user_id`` is accepted so the function matches the ``model(user, movie)``
  call made by ``score``; it does not influence the prediction.

  Returns the entry of ``train_mean`` for ``movie_id``, or 3.0 when the movie
  has no entry there.
  """
  try:
    return train_mean[movie_id]
  except KeyError:
    # Only a missing movie falls back to the default; any other error
    # (including an interrupt) is allowed to propagate.
    return 3.0

score(best_seller)

1.030796645547913

In [59]:
# Join the training ratings with the user table on the column they share
# (user_id)
merged_ratings = x_train.merge(users)
merged_ratings.head()

# Make user_id the index of the users table
users = users.set_index('user_id')

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,zip_code
0,422,201,4,26,M,entertainment,94533
1,422,286,5,26,M,entertainment,94533
2,422,458,3,26,M,entertainment,94533
3,422,217,3,26,M,entertainment,94533
4,422,250,5,26,M,entertainment,94533


In [76]:
# Mean training rating of every movie, split by the sex of the rater
g_mean = merged_ratings.groupby(['movie_id', 'sex'])['rating'].mean()

# Table of training ratings: one row per user_id, one column per movie_id,
# cells filled from the rating column
rating_matrix = x_train.pivot(values='rating',
                              index='user_id',
                              columns='movie_id')

In [78]:
# Gender-based recommendation
def cf_gender(user_id,movie_id):
  """Predict a rating as the movie's mean rating among same-sex users.

  Returns the ``g_mean`` entry for ``movie_id`` and the user's ``sex`` value.
  Falls back to 3.0 when ``movie_id`` is not a column of ``rating_matrix``
  or when ``g_mean`` has no entry for that movie and sex.
  """
  fallback = 3.0
  if movie_id not in rating_matrix.columns:
    return fallback
  gender = users.loc[user_id]['sex']
  # Slice of g_mean for this movie, indexed by sex
  per_gender = g_mean[movie_id]
  if gender not in per_gender.index:
    return fallback
  return per_gender[gender]

score(cf_gender)

1.0407507592820584