# Content-Based filtering
---
內容相似度：輸入一部電影，依電影類型 (genre) 特徵的餘弦相似度找出最相似的電影並推薦

概念 : 喜歡某電影，可能也喜歡類似該部的電影

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Movie catalogue: five descriptive columns followed by 19 genre columns
# (presumably 0/1 flags -- confirm against the dataset README).
item_col = [
    'movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
# The file is pipe-separated and latin-1 encoded; column names are supplied explicitly.
item_df = pd.read_csv(
    'input/ml-100k/u.item',
    sep="|",
    encoding='latin-1',
    names=item_col,
)

In [3]:
# Drop the columns that content similarity does not use.
contentBased_df = item_df.drop(columns=['release_date', 'video_release_date', 'IMDb_URL'])
print('原始電影數 :', len(contentBased_df))
# Remove duplicated movies: rows that share the same title and genre columns and differ
# only in movie_id (18 such rows per an earlier check). The last occurrence is kept and
# the index is renumbered from 0 so that labels equal row positions.
contentBased_df = (
    contentBased_df
    .drop_duplicates(subset=contentBased_df.columns.difference(['movie_id']), keep='last')
    .reset_index(drop=True)
)
print('去重後電影數 :', len(contentBased_df))

原始電影數 : 1682
去重後電影數 : 1664


In [4]:
# Cosine similarity between every pair of movies.
# Feature vectors are all columns except the id and the title (i.e. the genre columns).
features_matrix = contentBased_df.drop(columns=['movie_id', 'movie_title'])
# With a single argument the similarities are computed between all rows of the same matrix.
cosine_sim_features = cosine_similarity(features_matrix)

In [14]:
def get_similar_movies_bycontent(df, movie_id, seen=None, num_recom=10, sim_matrix=None):
    """Recommend the movies most similar to ``movie_id`` by cosine similarity.

    Parameters
    ----------
    df : pandas.DataFrame
        Movie catalogue with ``movie_id`` and ``movie_title`` columns. Row
        *positions* must line up with the rows of the similarity matrix.
    movie_id : int
        ``movie_id`` of the query movie.
    seen : iterable of int, optional
        ``movie_id`` values the user has already watched; these are never
        recommended. Defaults to no seen movies.
    num_recom : int, default 10
        Maximum number of recommendations to return.
    sim_matrix : 2-D array-like, optional
        Pairwise similarity matrix. Defaults to the notebook-level
        ``cosine_sim_features``.

    Returns
    -------
    pandas.DataFrame
        Columns ``movie_id``, ``movie_title`` and ``cosine_similarity`` (rounded
        to 4 decimals), most similar first. Empty when every candidate has
        already been seen.

    Raises
    ------
    IndexError
        If ``movie_id`` is not present in ``df``.
    ValueError
        If the similarity matrix and ``df`` have a different number of rows.
    """
    if sim_matrix is None:
        sim_matrix = cosine_sim_features
    seen = [] if seen is None else list(seen)
    seen_ids = set(seen)

    movie_ids = df['movie_id'].tolist()
    titles = df['movie_title'].tolist()
    if len(sim_matrix) != len(movie_ids):
        raise ValueError(
            f'similarity matrix has {len(sim_matrix)} rows but the catalogue has '
            f'{len(movie_ids)}; recompute the matrix from this DataFrame'
        )

    # Locate the query movie by its id. De-duplication leaves gaps in movie_id,
    # so "movie_id - 1" is not a valid row position.
    try:
        row_pos = movie_ids.index(movie_id)
    except ValueError:
        raise IndexError(f'movie_id {movie_id} is not present in the catalogue') from None

    scores = sim_matrix[row_pos]
    # Exclude the query movie by position rather than by dropping the top-ranked
    # entry: another movie with identical features also scores 1.0 and may sort first.
    # sorted() is stable, so ties stay in ascending row order.
    ranked = sorted(
        (pos for pos in range(len(movie_ids)) if pos != row_pos),
        key=lambda pos: scores[pos],
        reverse=True,
    )

    # Walk the ranking, skipping seen movies, until enough recommendations are collected.
    result = []
    for pos in ranked:
        if len(result) >= num_recom:
            break
        if movie_ids[pos] in seen_ids:
            continue
        result.append({
            'movie_id': movie_ids[pos],
            'movie_title': titles[pos],
            'cosine_similarity': round(scores[pos], 4)
        })

    # First title per id, used to echo the seen list back to the user.
    title_by_id = {}
    for mid, title in zip(movie_ids, titles):
        title_by_id.setdefault(mid, title)

    print(f'Recommendation for moives silmlar with "{titles[row_pos]}"')
    print(f"You've already watched {len(seen)} movies those we don't recommend again!")
    for i, m in enumerate(seen, 1):
        # A seen id may have been removed by de-duplication; report it instead of crashing.
        print(i, title_by_id.get(m, f'<movie_id {m} not in catalogue>'))

    return pd.DataFrame(result, columns=['movie_id', 'movie_title', 'cosine_similarity'])

# Test

In [16]:
# Usage: get_similar_movies_bycontent(catalogue, query movie_id,
#                                     seen movie_ids (default: none), number of recommendations (default: 10))
# movie_id 1 is "Toy Story (1995)".
# To simulate a user who has watched everything: seen = contentBased_df['movie_id'].values
seen = [700, 240, 18, 2]
get_similar_movies_bycontent(contentBased_df, movie_id=1, seen=seen)

Recommendation for moives silmlar with "Toy Story (1995)"
You've already watched 4 movies those we don't recommend again!
1 Miami Rhapsody (1995)
2 Beavis and Butt-head Do America (1996)
3 White Balloon, The (1995)
4 GoldenEye (1995)


Unnamed: 0,movie_id,movie_title,cosine_similarity
0,422,Aladdin and the King of Thieves (1996),1.0
1,95,Aladdin (1992),0.866
2,1219,"Goofy Movie, A (1995)",0.866
3,63,"Santa Clause, The (1994)",0.8165
4,94,Home Alone (1990),0.8165
5,102,"Aristocats, The (1970)",0.8165
6,138,D3: The Mighty Ducks (1996),0.8165
7,139,"Love Bug, The (1969)",0.8165
8,169,"Wrong Trousers, The (1993)",0.8165
9,189,"Grand Day Out, A (1992)",0.8165
