In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the ratings table from the project's input folder (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer="input/cleaned_data.csv")

In [3]:
# Keep only the columns this notebook works with
df = df.loc[:, ['user_id', 'movie_id', 'rating', 'movie_title']]

# A user may have rated the same title more than once: compute the mean rating
# per (user_id, movie_title) and substitute it for the individual ratings.
average_ratings = df.groupby(['user_id', 'movie_title'], as_index=False).agg({"rating": "mean"})
data = (
    df.merge(average_ratings, on=['user_id', 'movie_title'], suffixes=('', '_avg'))
      .drop(columns=['rating'])                   # discard the raw per-row rating
      .rename(columns={'rating_avg': 'rating'})   # the averaged value becomes `rating`
)

In [4]:
# De-duplicate: rows that agree on every column except movie_id collapse into one
# (the last occurrence is kept).
cleaned_data = data.drop_duplicates(
    subset=[col for col in data.columns if col != 'movie_id'],
    keep='last',
)
cleaned_data.nunique()

user_id         943
movie_id       1681
movie_title    1664
rating            9
dtype: int64

In [5]:
# Movie x user rating matrix: one row per title, one column per user.
# Cells with no rating are filled with 0.0.
matrix = cleaned_data.pivot_table(
    values='rating',
    index='movie_title',
    columns='user_id',
    fill_value=0.0,
)
# Pairwise cosine similarity between the rows (movies) of the matrix
item_similarity = cosine_similarity(matrix)
matrix

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),0,0,0,0,0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0,0,0
1-900 (1994),0,0,0,0,0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0,0,0
101 Dalmatians (1996),2,0,0,0,2,0,0,0,0,0,...,2.0,0,0,2,4,0,0,0,0,0
12 Angry Men (1957),5,0,0,0,0,4,4,0,0,5,...,0.0,0,0,0,0,0,0,0,0,0
187 (1997),0,0,2,0,0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Young Guns II (1990),0,0,0,0,0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0,0,3
"Young Poisoner's Handbook, The (1995)",0,0,0,0,0,0,3,0,0,0,...,0.0,0,5,0,0,0,0,0,0,0
Zeus and Roxanne (1997),0,0,0,0,0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0,0,0
unknown,4,0,0,0,4,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0,0,0


In [6]:
# Label both axes of the similarity array with the movie titles so it can be looked up by title
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    index=matrix.index,
    columns=matrix.index,
)

# Test

In [22]:
def get_similar_movies_byCF(df : pd.DataFrame, movie_title : str, user_id : int, num_recomm : int = 10,
                            similarity : "pd.DataFrame | None" = None) -> pd.Series:
    """Recommend movies similar to `movie_title` that `user_id` has not rated yet.

    Parameters
    ----------
    df : pd.DataFrame
        Ratings table; must contain the columns 'user_id' and 'movie_title'.
    movie_title : str
        Title to find neighbours for; must be a column of the similarity table.
    user_id : int
        User whose already-rated titles are excluded from the result.
    num_recomm : int, default 10
        Maximum number of recommendations to return.
    similarity : pd.DataFrame, optional
        Square title-by-title similarity table. When omitted, the notebook-level
        `item_similarity_df` is used.

    Returns
    -------
    pd.Series
        Similarity scores indexed by movie title, sorted in descending order.

    Raises
    ------
    KeyError
        If `movie_title` is not present in the similarity table.
    """
    # Fall back to the notebook-level table so existing three-argument calls keep working
    if similarity is None:
        similarity = item_similarity_df
    if movie_title not in similarity.columns:
        raise KeyError(f"'{movie_title}' is not in the similarity matrix")

    similar_movies = similarity[movie_title].sort_values(ascending=False)

    # Titles this user has already rated -- read from the `df` argument itself,
    # so the boolean mask and the frame it filters always share one index.
    user_rated = df.loc[df['user_id'] == user_id, 'movie_title']
    # Remove the watched titles, and the query movie itself (it is always its own
    # nearest neighbour and would otherwise be recommended when the user has not rated it).
    res = similar_movies.drop(user_rated, errors='ignore').drop(movie_title, errors='ignore')
    # Output
    print(f"Recommendation for moives silmlar with '{movie_title}'")
    print(f"You've already watched {len(user_rated)} movies those we don't recommend again!")
    return res.head(num_recomm)

# Demo: recommendations around one title for one user
target = "Jerry Maguire (1996)"
user = 900
get_similar_movies_byCF(df=cleaned_data, movie_title=target, user_id=user)

Recommendation for moives silmlar with 'Jerry Maguire (1996)'
You've already watched 45 movies those we don't recommend again!


movie_title
Toy Story (1995)                       0.624075
Time to Kill, A (1996)                 0.614067
Mr. Holland's Opus (1995)              0.607334
Ransom (1996)                          0.605525
Star Wars (1977)                       0.601816
Truth About Cats & Dogs, The (1996)    0.589499
Birdcage, The (1996)                   0.581532
Return of the Jedi (1983)              0.574760
Phenomenon (1996)                      0.572837
Primal Fear (1996)                     0.556389
Name: Jerry Maguire (1996), dtype: float64

In [9]:
# To recommend for this user, look at what they liked most (recommend movies similar to
# their high-rated ones) and avoid pushing low-rated ones >> weight by rating
cleaned_data.loc[
    cleaned_data['user_id'] == user,
    ['movie_title', 'rating'],
].sort_values(by='rating', ascending=False)

Unnamed: 0,movie_title,rating
92745,"Wild Bunch, The (1969)",5.0
9628,Jerry Maguire (1996),4.0
66170,Lone Star (1996),4.0
59614,Casablanca (1942),4.0
56602,Dr. Strangelove or: How I Learned to Stop Worr...,4.0
71244,Bound (1996),4.0
47646,Schindler's List (1993),4.0
72806,North by Northwest (1959),4.0
60947,Patton (1970),4.0
90489,High Noon (1952),4.0
