## Memory-based Collaborative Filtering

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

This notebook uses the MovieLens 100K dataset: https://www.kaggle.com/datasets/prajitdatta/movielens-100k-dataset

In [2]:
# Column labels handed to read_csv through names=.
u_cols = ["user_id", "age", "sex", "occupation", "zip_code"]
users = pd.read_csv(
    "../data/ml-100k/u.user",
    sep="|",
    names=u_cols,
    encoding="latin-1",
).set_index("user_id")

In [3]:
# Display the user table (indexed by user_id).
users

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213
...,...,...,...,...
939,26,F,student,33319
940,32,M,administrator,02215
941,20,M,student,97229
942,48,F,librarian,78209


In [4]:
# Column labels for u.item: five descriptive fields followed by one column
# per genre label.
i_cols = ["movie_id", "title", "release date", "video release date", "IMDbURL"] + [
    "unknown",
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]

movies = pd.read_csv(
    "../data/ml-100k/u.item",
    names=i_cols,
    sep="|",
    encoding="latin-1",
)

movies.head()

Unnamed: 0,movie_id,title,release date,video release date,IMDbURL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
# Only the id and title columns are kept from here on.
movies = movies.loc[:, ["movie_id", "title"]]

In [6]:
# Column labels for the tab-separated ratings file, passed through names=.
r_cols = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_csv(
    "../data/ml-100k/u.data",
    names=r_cols,
    sep="\t",
    encoding="latin-1",
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [7]:
# The timestamp column is not used by any model below.
# errors="ignore" keeps this cell safe to re-run: without it, a second run
# raises KeyError because the column has already been dropped.
ratings = ratings.drop(columns="timestamp", errors="ignore")

In [8]:
# Hold out 25% of the ratings for evaluation. user_id doubles as the
# stratification label, so the split is stratified per user.
X, y = ratings.copy(), ratings["user_id"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.25,
    random_state=42,
)

In [10]:
def baseline(user_id, movie_id):
    """Predict the same rating, 3.0, for every (user, movie) pair.

    Both arguments are accepted only so the function matches the
    ``model(user_id, movie_id)`` call shape; neither is read.
    """
    constant_rating = 3.0
    return constant_rating

In [11]:
def score(cf_model):
    """Return the mean squared error of ``cf_model`` on the held-out ratings.

    Parameters
    ----------
    cf_model : callable
        Called as ``cf_model(user_id, movie_id)`` for every row of ``X_test``;
        must return a numeric rating prediction.

    Returns
    -------
    float
        MSE between the predictions and ``X_test["rating"]``.
    """
    user_movie_pairs = zip(X_test["user_id"], X_test["movie_id"])
    y_pred = np.array([cf_model(user, movie) for user, movie in user_movie_pairs])
    y_true = np.array(X_test["rating"])

    # mean_squared_error returns the plain (non-rooted) MSE by default.
    # The former ``squared=True`` argument requested the same thing, but the
    # keyword is deprecated in scikit-learn 1.4 and removed in 1.6, where
    # passing it raises TypeError.
    return mean_squared_error(y_true, y_pred)

In [12]:
# Error of the constant-3.0 predictor on the held-out ratings.
score(baseline)

1.55956

In [13]:
# User x movie rating matrix built from the training split only;
# user/movie pairs without a training rating show up as NaN.
r_matrix = pd.pivot_table(
    X_train,
    index="user_id",
    columns="movie_id",
    values="rating",
)
r_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1673,1674,1676,1677,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [14]:
# Fraction of user/movie cells in r_matrix that hold a rating.
1 - r_matrix.isna().sum().sum() / r_matrix.size

0.04846642536849022

## Using mean rating

In [15]:
def cf_mean(user_id, movie_id):
    """Predict the movie's mean rating in ``r_matrix``.

    ``user_id`` is not used. Movies that have no column in ``r_matrix``
    get the fixed fallback rating 3.0.
    """
    if movie_id not in r_matrix.columns:
        return 3.0
    return r_matrix[movie_id].mean()

In [16]:
# Error of the per-movie mean predictor on the held-out ratings.
score(cf_mean)

1.0610699160960584

## Using weighted mean rating

In [17]:
# User-user cosine similarity over rating rows, with missing ratings
# treated as 0. With a single argument the matrix is compared against itself.
cosine_sim = cosine_similarity(r_matrix.fillna(0))

In [18]:
# Label both axes with user ids so similarities can be looked up by user_id.
cosine_sim = pd.DataFrame(
    data=cosine_sim,
    index=r_matrix.index,
    columns=r_matrix.index,
)

In [19]:
def cf_weighted_mean(user_id, movie_id):
    """Predict a rating as the similarity-weighted mean of other users' ratings.

    Every user with a rating for ``movie_id`` in ``r_matrix`` contributes
    that rating, weighted by their cosine similarity to ``user_id``.

    Parameters
    ----------
    user_id, movie_id
        Labels looked up in ``r_matrix.index`` / ``r_matrix.columns``.

    Returns
    -------
    float
        The weighted mean rating, or 3.0 when the user or movie is absent
        from ``r_matrix`` or when no usable similarity weight exists.
    """
    default_rating = 3.0

    if movie_id not in r_matrix.columns or user_id not in r_matrix.index:
        return default_rating

    movie_ratings = r_matrix[movie_id].dropna()
    weights = cosine_sim[user_id].loc[movie_ratings.index]
    weight_total = weights.sum()

    # Previously a small epsilon was added to the denominator. With all
    # similarities equal to zero that produced 0 / eps == 0.0, an out-of-range
    # rating, and the NaN fallback was never reached. Fall back explicitly.
    # `not (> 0)` also catches a NaN total.
    if not weight_total > 0:
        return default_rating

    prediction = np.dot(weights.to_numpy(), movie_ratings.to_numpy()) / weight_total
    return prediction if not np.isnan(prediction) else default_rating

In [20]:
# Error of the similarity-weighted mean predictor on the held-out ratings.
score(cf_weighted_mean)

1.0478847740931785

In [21]:
# Attach each user's demographic columns to their training ratings.
merged_df = X_train.merge(
    users, left_on="user_id", right_index=True
).reset_index(drop=True)

merged_df

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,zip_code
0,862,177,4,25,M,executive,13820
1,862,416,3,25,M,executive,13820
2,862,1093,5,25,M,executive,13820
3,862,168,4,25,M,executive,13820
4,862,568,3,25,M,executive,13820
...,...,...,...,...,...,...,...
74995,107,340,5,39,M,scientist,60466
74996,107,321,2,39,M,scientist,60466
74997,107,271,2,39,M,scientist,60466
74998,107,322,1,39,M,scientist,60466


In [22]:
# Mean training rating for every (movie_id, sex) pair.
gender_mean = merged_df.groupby(["movie_id", "sex"])["rating"].mean()

## Using per-movie mean rating by user gender

In [23]:
def cf_weighted_mean_gender(user_id, movie_id):
    """Predict the movie's mean rating among users of the same sex.

    Looks up ``gender_mean`` for (movie_id, sex of ``user_id``). When that
    entry is unavailable, falls back to the movie's mean in ``r_matrix``,
    and for movies without a column there to the mean of all per-movie means.
    """
    gender = users.loc[user_id].sex
    movie_known = movie_id in r_matrix.columns

    if movie_known and user_id in r_matrix.index:
        by_gender = gender_mean[movie_id]
        if gender in by_gender.index:
            estimate = by_gender[gender]
            if not np.isnan(estimate):
                return estimate

    if movie_known:
        return r_matrix[movie_id].mean()
    return r_matrix.mean().mean()

In [24]:
# Error of the per-gender movie-mean predictor on the held-out ratings.
score(cf_weighted_mean_gender)

1.0824333023843344

## Using per-movie mean rating by user gender and occupation

In [25]:
# Mean training rating for every (movie_id, sex, occupation) combination,
# kept as a one-column DataFrame named "rating".
gen_occ_mean = merged_df.groupby(["movie_id", "sex", "occupation"])[["rating"]].mean()

gen_occ_mean.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,rating
movie_id,sex,occupation,Unnamed: 3_level_1
1,F,administrator,3.9375
1,F,artist,5.0
1,F,educator,3.25
1,F,engineer,4.0
1,F,entertainment,4.0


In [40]:
def cf_weighted_mean_gender_occ(user_id, movie_id):
    """Predict the movie's mean rating among users of the same sex and occupation.

    Looks up ``gen_occ_mean`` for (movie_id, sex, occupation) of ``user_id``.
    When that entry is unavailable, falls back to the movie's mean in
    ``r_matrix``, and for movies without a column there to the mean of all
    per-movie means.
    """
    profile = users.loc[user_id]
    gender = profile.sex
    occ = profile.occupation
    movie_known = movie_id in r_matrix.columns

    # The membership checks run in this order so each .loc lookup only
    # happens once the previous level is known to exist.
    if (
        movie_known
        and user_id in r_matrix.index
        and gender in gen_occ_mean.loc[movie_id].index
        and occ in gen_occ_mean.loc[movie_id, gender].index
    ):
        estimate = gen_occ_mean.loc[movie_id, gender, occ].values[0]
        if not np.isnan(estimate):
            return estimate

    if movie_known:
        return r_matrix[movie_id].mean()
    return r_matrix.mean().mean()

In [41]:
# Error of the per-gender-and-occupation movie-mean predictor on the held-out ratings.
score(cf_weighted_mean_gender_occ)

1.2766510116760388