In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# step1 데이터 로드 
data = pd.read_csv('../input_data/movie/tmdb_5000_movies.csv')

In [3]:
data.head(2)

# 많이 사용하는 중요 컬럼 리스트 (다른 컬럼은 그닥 사용 안됨)

# genres : 영화 장르
# keywords : 영화의 키워드
# original_language : 영화 언어
# title : 제목
# vote_average : 평점 평균
# vote_count : 평점 카운트
# popularity : 인기도
# overview : 개요 설명

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [4]:
data.shape

(4803, 20)

In [5]:
# step2) 전처리 
# 필요 컬럼만 걸러내기 
data = data[['id','genres', 'vote_average', 'vote_count','popularity','title',  'keywords', 'overview']]

In [6]:
# 2-1. weighted score 전처리 하기 

# 평점은 좀더 fair하게 재점수 매김. 예를들어 유저 10000명이 별점 매긴 평균이 3.0인애랑, 유저 3명이 별점 5점 매겨서 5점인거랑 다르잖아?
# 여기서 어떻게 매기는지 방법 설명해줌 (https://www.quora.com/How-does-IMDbs-rating-system-work)
# 그걸 임의의 기준점 잡아서 전처리 한 과정은 여기 설명되어있음(https://github.com/lsjsj92/recommender_system_with_Python/blob/master/002.%20recommender%20system%20basic%20with%20Python%20-%201%20content%20based%20filtering.ipynb)

tmp_m = data['vote_count'].quantile(0.89)
tmp_m
tmp_data = data.copy().loc[data['vote_count'] >= tmp_m]
tmp_data.shape
del tmp_data

m = data['vote_count'].quantile(0.9)
data = data.loc[data['vote_count'] >= m]
# data.head()

C = data['vote_average'].mean()

def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    
    return ( v / (v+m) * R ) + (m / (m + v) * C)

data['score'] = data.apply(weighted_rating, axis = 1)

data.head(5)
# data.shape

Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,keywords,overview,score
0,19995,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",7.2,11800,150.437577,Avatar,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...",7.168053
1,285,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",6.9,4500,139.082615,Pirates of the Caribbean: At World's End,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha...",6.918271
2,206647,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",6.3,4466,107.376788,Spectre,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",A cryptic message from Bond’s past sends him o...,6.493333
3,49026,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",7.6,9106,112.31295,The Dark Knight Rises,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",Following the death of District Attorney Harve...,7.492998
4,49529,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",6.1,2124,43.926995,John Carter,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","John Carter is a war-weary, former military ca...",6.500396


In [7]:
# 2-2. 장르/키워드 안에 여러 장르/키워드 들이 nosql로 들어있는걸 풀어서 하나의 string으로 풀기(스페이스바로 구분)
data['genres'] = data['genres'].apply(literal_eval)
data['keywords'] = data['keywords'].apply(literal_eval)

data['genres'] = data['genres'].apply(lambda x : [d['name'] for d in x]).apply(lambda x : " ".join(x))
data['keywords'] = data['keywords'].apply(lambda x : [d['name'] for d in x]).apply(lambda x : " ".join(x))

In [8]:
# 전처리 끝난 데이터를 별개의 .csv로 저장 
data.to_csv('../input_data/movie/tmdb_5000_movies_after_preprocessing.csv', index = False)

# 주의!
내용이 바뀌었다고 함 
위에 전처리한건 나중에 movielens 데이터 전처리 할 때 필요한 요소일 수 있으니 삭제하지 않고 유지함

아래는 새롭게 시작!

# Content 기반 컨텐츠 추천!

In [9]:
# step1) 데이터 로드하자
movie_data = pd.read_csv('../input_data/movie/movies_metadata.csv')
movie_data =  movie_data.loc[movie_data['original_language'] == 'en', :]
movie_data = movie_data[['id', 'title', 'original_language', 'genres']]

print(movie_data.shape)
movie_data.head()

(32269, 4)


  movie_data = pd.read_csv('../input_data/movie/movies_metadata.csv')


Unnamed: 0,id,title,original_language,genres
0,862,Toy Story,en,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,8844,Jumanji,en,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,15602,Grumpier Old Men,en,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,31357,Waiting to Exhale,en,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,11862,Father of the Bride Part II,en,"[{'id': 35, 'name': 'Comedy'}]"


In [10]:
movie_keyword = pd.read_csv('../input_data/movie/keywords.csv')
print(movie_keyword.shape)
movie_keyword.head()

(46419, 2)


Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [11]:
# 두 데이터 merge 
movie_data.id = movie_data.id.astype(int)
movie_keyword.id = movie_keyword.id.astype(int)
movie_data = pd.merge(movie_data, movie_keyword, on='id')
print(movie_data.shape)
movie_data.head()
# movie_data.tail()

(32852, 5)


Unnamed: 0,id,title,original_language,genres,keywords
0,862,Toy Story,en,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,Jumanji,en,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,Grumpier Old Men,en,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,Waiting to Exhale,en,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,Father of the Bride Part II,en,"[{'id': 35, 'name': 'Comedy'}]","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [12]:
# step2) 전처리 
# nosql 형태를 string 형태(spacebar로 구분)으로 merge 하기 (genre, keywords column)
movie_data['genres'] = movie_data['genres'].apply(literal_eval)
movie_data['genres'] = movie_data['genres'].apply(lambda x : [d['name'] for d in x]).apply(lambda x : " ".join(x))

In [13]:
movie_data['keywords'] = movie_data['keywords'].apply(literal_eval)
movie_data['keywords'] = movie_data['keywords'].apply(lambda x : [d['name'] for d in x]).apply(lambda x : " ".join(x))

print(movie_data.shape)
movie_data.head()

(32852, 5)


Unnamed: 0,id,title,original_language,genres,keywords
0,862,Toy Story,en,Animation Comedy Family,jealousy toy boy friendship friends rivalry bo...
1,8844,Jumanji,en,Adventure Fantasy Family,board game disappearance based on children's b...
2,15602,Grumpier Old Men,en,Romance Comedy,fishing best friend duringcreditsstinger old men
3,31357,Waiting to Exhale,en,Comedy Drama Romance,based on novel interracial relationship single...
4,11862,Father of the Bride Part II,en,Comedy,baby midlife crisis confidence aging daughter ...


In [14]:
# TF-IDF 벡터화
# 장르, 키워드를 하나로 합친 후, 하나의 TF-IDF로 벡터화 

In [15]:
tfidf_vector = TfidfVectorizer()
#tfidf_vector = TfidfVectorizer(ngram_range=(1,2))
tfidf_matrix = tfidf_vector.fit_transform(movie_data['genres'] + " " + movie_data['keywords']).toarray()
#tfidf_matrix = tfidf_vector.fit_transform(movie_data['genres']).toarray()
tfidf_matrix_feature = tfidf_vector.get_feature_names()

print(tfidf_matrix.shape)

tfidf_matrix = pd.DataFrame(tfidf_matrix, columns=tfidf_matrix_feature, index = movie_data.title)

tfidf_matrix.head()

(32852, 11437)




Unnamed: 0_level_0,077,10,11,13,1500s,15th,16th,17th,1812,18th,...,βάφτηκε,γη,κόκκινο,το,χώμα,миньоны,卧底肥妈,绝地奶霸,自然界大事件,超级妈妈
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jumanji,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Grumpier Old Men,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Waiting to Exhale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Father of the Bride Part II,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# step3)벡터 유사도 구하기 
# tf-idf vector를 코사인 유사도를 활용해서 유사도 값을 구해줍니다. 
# 이렇게 하면 영화 개수(n)만큼 n x n의 matirx 형태가 나오게 됩니다.

In [None]:
%%time
cosine_sim = cosine_similarity(tfidf_matrix) # CPU times: user 5min, sys: 11 s, total: 5min 11s
                                             # Wall time: 1min 5s
                                             # 겁나 오래걸려서 내 컴퓨터로는 안돌아가고 꺼짐 ㅠㅠ

In [None]:
cosine_sim.shape # (32852, 32852)

In [None]:
cosine_sim_df = pd.DataFrame(cosine_sim, index = movie_data.title, columns = movie_data.title)
print(cosine_sim_df.shape)
cosine_sim_df.head() # 저 위에 행:영화 이름, 열: 영화 이름, 서로 유사도 측정. 같은 영화는 유사도 1임. 대각선 다 1. 

In [None]:
# 위에 유사도 기반 추천 함수 

def genre_recommendations(target_title, matrix, items, k=10):
    recom_idx = matrix.loc[:, target_title].values.reshape(1, -1).argsort()[:, ::-1].flatten()[1:k+1]
    recom_title = items.iloc[recom_idx, :].title.values
    recom_genre = items.iloc[recom_idx, :].genres.values
    target_title_list = np.full(len(range(k)), target_title)
    target_genre_list = np.full(len(range(k)), items[items.title == target_title].genres.values)
    d = {
        'target_title':target_title_list,
        'target_genre':target_genre_list,
        'recom_title' : recom_title,
        'recom_genre' : recom_genre
    }
    return pd.DataFrame(d)

genre_recommendations('The Dark Knight Rises', cosine_sim_df, movie_data)