In [8]:
# Environment check: show which `python` executable the shell finds first on PATH.
!which python

/Users/darrenkwon/miniconda3/envs/camp/bin/python


In [9]:
# Environment check: print the version of the `python` found on the shell's PATH.
!python --version

Python 3.10.9


In [45]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt # https://matplotlib.org/stable/api/pyplot_summary.html#module-matplotlib.pyplot

%matplotlib inline

# from IPython.display import set_matplotlib_formats
# set_matplotlib_formats("retina") 
%config InlineBackend.figure_format='retina'

import warnings
warnings.filterwarnings('ignore')

print(f"{np.__version__}")
print(f"{pd.__version__}")
print(f"{mpl.__version__}")

1.23.5
1.5.3
3.7.1


In [46]:
import scipy as sp
import sympy
import sklearn

# Needed so that math expressions are rendered as LaTeX in the Jupyter notebook.
sympy.init_printing(use_latex='mathjax')

# Record the library versions next to the results they produced.
for lib in (sklearn, sympy, sp):
    print(lib.__version__)

1.1.3
1.11.1
1.10.0


In [58]:
import os

base_src = "./data/"

# users: '|'-separated; column names are supplied explicitly through `names`.
u_cols = ["user_id", "age", "sex", "occupation", "zip_code"]
u_user_src = os.path.join(base_src, "u.user")
users = pd.read_csv(u_user_src, names=u_cols, sep="|", encoding="latin-1")

# items: '|'-separated; five descriptive columns followed by nineteen genre columns.
i_cols = [
    "movie_id", "title", "release_date", "video_release_date", "imdb_url",
    "unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-fi", "Thriller", "War", "Western",
]
u_item_src = os.path.join(base_src, "u.item")
items = pd.read_csv(u_item_src, names=i_cols, sep="|", encoding="latin-1")

# ratings: tab-separated.
# NOTE(review): `u_cols` is rebound here, so after this cell it holds the
# ratings column names, not the users column names.
u_cols = ["user_id", "movie_id", "rating", "timestamp"]
u_data_src = os.path.join(base_src, "u.data")
ratings = pd.read_csv(u_data_src, names=u_cols, sep="\t", encoding="latin-1")

# Sanity check: (rows, columns) of each table.
users.shape, items.shape, ratings.shape

((943, 5), (1682, 24), (100000, 4))

In [60]:
# Summarize the users table: column dtypes, non-null counts and memory usage.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_id     943 non-null    int64 
 1   age         943 non-null    int64 
 2   sex         943 non-null    object
 3   occupation  943 non-null    object
 4   zip_code    943 non-null    object
dtypes: int64(2), object(3)
memory usage: 37.0+ KB


In [61]:
from sklearn.model_selection import train_test_split

# Fixed seed: without it the split is different on every run, so none of the
# RMSE values computed from x_train / x_test below could be reproduced.
SPLIT_SEED = 42

# `x` keeps every ratings column (including "rating") because later cells read
# user_id / movie_id / rating straight from x_train and x_test.
x = ratings.copy()
y = ratings["rating"]

# stratify=y: the split is stratified on the rating value.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.26,
    stratify=y,
    random_state=SPLIT_SEED,
)

In [62]:
def RMSE(y_true, y_pred):
    """Return the root-mean-square error between `y_true` and `y_pred`.

    Both arguments are converted with `np.array` and subtracted element-wise.
    """
    residuals = np.array(y_true) - np.array(y_pred)
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)

def score(model, test_set=None):
    """Evaluate a rating predictor and return its RMSE (a loss: lower is better).

    Parameters
    ----------
    model : callable
        Called as ``model(user_id, movie_id)`` for every row of the test set;
        must return the predicted rating.
    test_set : DataFrame, optional
        Table with "user_id", "movie_id" and "rating" columns. Defaults to the
        notebook-level ``x_test`` so existing ``score(model)`` calls still work.

    Returns
    -------
    float
        RMSE between the true ratings and the model's predictions.
    """
    frame = x_test if test_set is None else test_set
    # zip yields (user_id, movie_id) pairs lazily, one per test row.
    id_pairs = zip(frame["user_id"], frame["movie_id"])
    y_pred = np.array([model(user, movie) for user, movie in id_pairs])
    y_true = np.array(frame["rating"])
    return RMSE(y_true, y_pred)

# Mean rating of each movie_id, computed from the training split only.
train_mean = x_train["rating"].groupby(x_train["movie_id"]).mean()

def best_seller(user_id, movie_id, *, default_rating=3.0, movie_means=None):
    """Predict a rating as the movie's mean rating, ignoring the user.

    Parameters
    ----------
    user_id :
        Unused; present so the function matches the ``model(user_id, movie_id)``
        call made by ``score``.
    movie_id :
        Label looked up in the per-movie mean table.
    default_rating : float, keyword-only
        Returned when ``movie_id`` is missing from the table. The default 3.0 is
        the midpoint of the 1-5 rating scale.
    movie_means : Series, keyword-only, optional
        Mean rating indexed by movie_id. Defaults to the notebook-level
        ``train_mean``.

    Returns
    -------
    float
        The stored mean for ``movie_id``, or ``default_rating`` if it is missing.
    """
    means = train_mean if movie_means is None else movie_means
    try:
        return means.loc[movie_id]
    except KeyError:
        # A movie_id absent from the table raises KeyError on label lookup.
        # Only that case falls back; any other error is allowed to surface.
        return default_rating

# Baseline: RMSE of the per-movie-mean predictor on x_test (lower is better).
score(best_seller)

1.024070463298904

### 성별에 따른 예측값

In [68]:
# Attach each rater's profile columns (age, sex, ...) to the training ratings.
merged_ratings = x_train.merge(users, how="inner", on="user_id")

# Peek at five random rows of the merged table.
merged_ratings.sample(5)

Unnamed: 0,user_id,movie_id,rating,timestamp,age,sex,occupation,zip_code
23193,776,22,5,891628752,30,M,librarian,51157
32130,3,303,3,889236983,23,M,writer,32067
58594,214,180,5,892668130,26,F,librarian,11231
15302,193,1074,3,889126453,29,M,student,49931
12244,639,100,1,891240495,42,F,librarian,12065


In [None]:
# Index users by user_id so that users.loc[user_id] is a lookup by id.
# The guard makes the cell safe to re-run: once "user_id" has moved into the
# index, calling set_index("user_id") again would raise KeyError.
if users.index.name != "user_id":
    users = users.set_index("user_id")

In [73]:
# 영화, 성별에 따른 평균 평점
g_mean = merged_ratings[["movie_id", "sex", "rating"]].groupby(["movie_id", "sex"])["rating"].mean()
g_mean

movie_id  sex
1         F      3.823529
          M      3.889831
2         F      3.571429
          M      3.194805
3         F      2.875000
                   ...   
1676      M      2.000000
1677      F      3.000000
1678      M      1.000000
1681      M      3.000000
1682      M      3.000000
Name: rating, Length: 3034, dtype: float64

In [75]:
# Wide table of training ratings: one row per user_id, one column per movie_id.
# (user, movie) pairs that are absent from x_train show up as NaN.
rating_matrix = pd.pivot(x_train, index="user_id", columns="movie_id", values="rating")

rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1670,1671,1672,1674,1675,1676,1677,1678,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,,3.0,,5.0,4.0,1.0,5.0,,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [81]:
def cf_gender(user_id, movie_id, *, default_rating=3.0, user_table=None, gender_means=None):
    """Predict a rating as the movie's mean rating among raters of the user's gender.

    Parameters
    ----------
    user_id :
        Label looked up in the user table to find the user's "sex" value.
    movie_id :
        Movie whose gender-specific mean is requested.
    default_rating : float, keyword-only
        Returned when the movie has no entry at all, or no entry for the user's
        gender (some movies were rated by one gender only).
    user_table : DataFrame, keyword-only, optional
        Table with a "sex" column, looked up with ``.loc[user_id, "sex"]`` and
        therefore expected to be indexed by user_id. Defaults to the
        notebook-level ``users``.
    gender_means : Series, keyword-only, optional
        Mean rating indexed by (movie_id, sex). Defaults to the notebook-level
        ``g_mean``.

    Returns
    -------
    float
        The gender-specific mean rating, or ``default_rating`` if unavailable.

    Raises
    ------
    KeyError
        If the movie is known but ``user_id`` is not in the user table.
    """
    profiles = users if user_table is None else user_table
    means = g_mean if gender_means is None else gender_means

    try:
        # Partial-index lookup: the sub-Series of means for this movie, keyed by sex.
        by_gender = means.loc[movie_id]
    except KeyError:
        # Movie has no entry in the table at all.
        return default_rating

    # Single .loc call instead of chained indexing (.loc[user_id]["sex"]).
    gender = profiles.loc[user_id, "sex"]
    return by_gender.get(gender, default_rating)

In [83]:
score(cf_gender) # If anything, even less useful: RMSE is no better than the best_seller baseline.

1.0326378364064017