# 컨텐츠 기반 영화 추천 시스템을 만들어 보자

https://www.kaggle.com/tmdb/tmdb-movie-metadata

이 데이터셋은 영화에 대한 정보(예산, 장르, 평점, 런닝시간, 제목, 키워드 등등)와 영화 연출팀과 배우들에 대한 별도 정보로 구성되어 있으며 일반 사용자들이 이 정보를 입력하여 구성되었다 (위키와 흡사)

내가 좋아한 (리뷰를 좋게 준) 영화와 비슷한 영화들(컨텐츠 측면에서)을 찾아서 추천

## 입력 데이터 로딩

*   영화 정보는 tmdb_5000_movies.csv라는 파일에 있다. 이 파일만 사용할 예정


In [1]:
import pandas as pd
import numpy as np

In [2]:
movies = pd.read_csv("https://grepp-reco-test.s3.ap-northeast-2.amazonaws.com/tmdb_5000_movies.csv")

In [3]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [4]:
movies.shape

(4803, 20)

In [5]:
import json

def f(j):
    genres = []
    ar = json.loads(j)
    for a in ar:
        genres.append(a.get("name"))
    return " ".join(sorted(genres))

movies['genres_name'] = movies.apply(lambda x: f(x.genres), axis=1)

In [6]:
movies[['genres_name']].head()   # vs. movies['genres_name'].head()

Unnamed: 0,genres_name
0,Action Adventure Fantasy Science Fiction
1,Action Adventure Fantasy
2,Action Adventure Crime
3,Action Crime Drama Thriller
4,Action Adventure Science Fiction


In [7]:
movies['genres_name'].nunique()

638

In [8]:
movies.groupby('genres_name').size()

genres_name
                                            28
Action                                      21
Action Adventure                            15
Action Adventure Animation                   1
Action Adventure Animation Comedy Family     5
                                            ..
Science Fiction Thriller                    12
Thriller                                    23
War                                          2
War Western                                  1
Western                                     18
Length: 638, dtype: int64

## 여러 텍스트 필드들을 모아서 텍스트 유사도에 사용할 텍스트 필드 하나를 생성

In [9]:
for f in ['original_title','overview','genres_name']:
  movies[f] = movies[f].fillna('')

In [11]:
def combine_features(row):
	try:
		return row['original_title']+" "+row['overview']+" "+row["genres_name"]
	except:
		print ("Error:", row)

In [12]:
movies["combined_features"] = movies.apply(combine_features,axis=1)
movies = movies.reset_index()

In [13]:
movies["combined_features"].head()

0    Avatar In the 22nd century, a paraplegic Marin...
1    Pirates of the Caribbean: At World's End Capta...
2    Spectre A cryptic message from Bond’s past sen...
3    The Dark Knight Rises Following the death of D...
4    John Carter John Carter is a war-weary, former...
Name: combined_features, dtype: object

## TF-IDF 기반 벡터 생성 후 코사인 유사도로 영화들간의 유사도 계산

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
tfidfvectorizer = TfidfVectorizer(analyzer='word', stop_words='english', norm='l2')

In [16]:
tfidf_matrix = tfidfvectorizer.fit_transform(movies["combined_features"])

In [17]:
tfidf_matrix.shape    #  min_df 파라미터!!!

(4803, 22179)

In [18]:
cosine_sim = cosine_similarity(tfidf_matrix) # linear_kernel을 사용해도 동일함. tfidf 벡터가 생성될 때 L2 normalization이 되었기 때문

In [19]:
df_cosine_sim = pd.DataFrame(data = cosine_sim)
df_cosine_sim.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4793,4794,4795,4796,4797,4798,4799,4800,4801,4802
0,1.0,0.034109,0.013909,0.026299,0.029273,0.046984,0.003855,0.068451,0.020614,0.020293,...,0.0,0.0,0.026697,0.057602,0.0,0.003941,0.0,0.0,0.0,0.0
1,0.034109,1.0,0.014576,0.004304,0.047903,0.023093,0.00404,0.039814,0.021603,0.029597,...,0.016368,0.044292,0.006,0.0,0.0,0.00413,0.0,0.022717,0.016449,0.0
2,0.013909,0.014576,1.0,0.008018,0.009303,0.009417,0.003084,0.041851,0.023712,0.008672,...,0.022647,0.0,0.0,0.0,0.015928,0.007694,0.0,0.011636,0.0,0.0
3,0.026299,0.004304,0.008018,1.0,0.011748,0.007161,0.014069,0.027766,0.028148,0.158541,...,0.002655,0.00302,0.001183,0.004516,0.001678,0.007343,0.0,0.029067,0.038273,0.019549
4,0.029273,0.047903,0.009303,0.011748,1.0,0.007872,0.011027,0.054503,0.005454,0.019947,...,0.010447,0.0,0.0,0.01319,0.0,0.002636,0.0,0.004744,0.0,0.0


## 컨텐츠 기반 추천 함수 만들기

In [20]:
def get_title_from_index(df, index):
	return df[df.index == index]["original_title"].values[0]

def get_index_from_title(df, title):
	return df[df.original_title == title]["index"].values[0]

In [21]:
cosine_sim[0]

array([1.        , 0.03410854, 0.01390903, ..., 0.        , 0.        ,
       0.        ])

In [None]:
for cs in enumerate(cosine_sim[0]):
  print(cs)

In [23]:
def reco_top_similar_movies(movie_title, n=10):
  movie_index = get_index_from_title(movies, movie_title)
  similar_movies =  enumerate(cosine_sim[movie_index])
  sorted_similar_movies = sorted(similar_movies, key=lambda x:x[1], reverse=True)

  ret_movies = []
  i = 0
  for element in sorted_similar_movies:
    title = get_title_from_index(movies, element[0])
    ret_movies.append(title)
    i=i+1
    if i >= n:
      break
  return ret_movies

In [24]:
print(reco_top_similar_movies('Avatar', 5))

['Avatar', 'Apollo 18', 'The American', 'Obitaemyy Ostrov', 'The Matrix']


In [None]:
print(reco_top_similar_movies('Minions', 5))

['Minions', 'Despicable Me 2', 'Stuart Little 2', 'Stuart Little', 'Austin Powers: The Spy Who Shagged Me']


In [None]:
print(reco_top_similar_movies('Harry Potter and the Half-Blood Prince', 5))

['Harry Potter and the Half-Blood Prince', 'Harry Potter and the Goblet of Fire', 'Harry Potter and the Order of the Phoenix', 'Harry Potter and the Chamber of Secrets', 'Harry Potter and the Prisoner of Azkaban']


In [3]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class ContentBasedRecommender:
    def __init__(self, items, user_ratings):
        self.items = items
        self.user_ratings = user_ratings
        self.item_profiles = self.build_item_profiles()

    def build_item_profiles(self):
        item_profiles = {}
        for item, genres in self.items.items():
            profile = np.zeros(len(set([genre for genres in self.items.values() for genre in genres])))
            for genre in genres:
                profile[list(set(profile)).index(genre)] = 1
            item_profiles[item] = profile
        return item_profiles

    def get_user_profile(self):
        user_profile = np.zeros(len(set([genre for genres in self.items.values() for genre in genres])))
        for item, rating in self.user_ratings.items():
            user_profile += self.item_profiles[item] * rating
        return user_profile

    def recommend(self):
        user_profile = self.get_user_profile()
        recommendations = {}
        for item, profile in self.item_profiles.items():
            similarity = cosine_similarity([user_profile], [profile])[0][0]
            recommendations[item] = similarity
        sorted_recommendations = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)
        return [item for item, _ in sorted_recommendations]

# 예시 데이터
items = {
    'Movie1': ['Action', 'Adventure'],
    'Movie2': ['Drama', 'Romance'],
    'Movie3': ['Action', 'Comedy'],
    'Movie4': ['Drama', 'Thriller'],
    'Movie5': ['Comedy', 'Romance'],
}

user_ratings = {
    'Movie1': 5,
    'Movie2': 4,
    'Movie3': 3,
}

# 추천 시스템 사용
recommender = ContentBasedRecommender(items, user_ratings)
recommendations = recommender.recommend()
print("추천 영화 순서:", recommendations)

ValueError: 'Action' is not in list