In [31]:
import pandas as pd
import numpy as np

In [32]:
df1 = pd.read_csv('data/tmdb_5000_credits.csv')
df2 = pd.read_csv('data/tmdb_5000_movies.csv')

In [33]:
df1.columns = ['id', 'title', 'cast', 'crew']
df_temp = df1[['id', 'cast', 'crew']]
df2 = df2.merge(df_temp, on='id')
df2['overview'].isnull().any()
True
df2['overview'] = df2['overview'].fillna('')
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df2['overview'])
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
indices = pd.Series(df2.index, index=df2['title']).drop_duplicates()
indices

title
Avatar                                         0
Pirates of the Caribbean: At World's End       1
Spectre                                        2
The Dark Knight Rises                          3
John Carter                                    4
                                            ... 
El Mariachi                                 4798
Newlyweds                                   4799
Signed, Sealed, Delivered                   4800
Shanghai Calling                            4801
My Date with Drew                           4802
Length: 4803, dtype: int64

In [34]:
indices['The Dark Knight Rises']

np.int64(3)

In [35]:
test_sim_scores = list(enumerate(cosine_sim[3]))
test_sim_scores

[(0, np.float64(0.024995115837672686)),
 (1, np.float64(0.0)),
 (2, np.float64(0.0)),
 (3, np.float64(0.9999999999999994)),
 (4, np.float64(0.010433403719159351)),
 (5, np.float64(0.005144601815810792)),
 (6, np.float64(0.012600632435462458)),
 (7, np.float64(0.02695427057891266)),
 (8, np.float64(0.0206522168853895)),
 (9, np.float64(0.13374009066555226)),
 (10, np.float64(0.0)),
 (11, np.float64(0.0)),
 (12, np.float64(0.0)),
 (13, np.float64(0.0)),
 (14, np.float64(0.0)),
 (15, np.float64(0.0040713339225121065)),
 (16, np.float64(0.021121093874993176)),
 (17, np.float64(0.0)),
 (18, np.float64(0.006768893195007469)),
 (19, np.float64(0.010765175685064705)),
 (20, np.float64(0.007178266390761149)),
 (21, np.float64(0.033380775071488206)),
 (22, np.float64(0.0)),
 (23, np.float64(0.0)),
 (24, np.float64(0.019238168304196286)),
 (25, np.float64(0.01701338816136818)),
 (26, np.float64(0.018845673291717255)),
 (27, np.float64(0.0)),
 (28, np.float64(0.008351231142809444)),
 (29, np.float

In [36]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = df2[df2['title']==title].index[0]  #영화 제목을 통해 영화의 index 값을 얻기
    sim_scores = list(enumerate(cosine_sim[idx]))  #cosine 유사도에서 idx에 해당하는 데이터를 [idx, 유사도] 형태로 얻기
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)  #cosine 유사도 기준으로 내림차순 정렬
    sim_scores = sim_scores[1: 11]  #자기 자신을 제외한 10개의 추천 영화를 Slicing
    movie_indices = [i[0] for i in sim_scores]  #추천 영화 목록 10개의 index 정보 추출
    titles = df2['title'].loc[movie_indices]  #index 정보를 통해 영화 제목 추출
    return titles

In [37]:
get_recommendations('Avatar')

3604                       Apollo 18
2130                    The American
634                       The Matrix
1341            The Inhabited Island
529                 Tears of the Sun
1610                           Hanna
311     The Adventures of Pluto Nash
847                         Semi-Pro
775                        Supernova
2628             Blood and Chocolate
Name: title, dtype: object

In [38]:
from ast import literal_eval
df2['genres'] = df2['genres'].apply(literal_eval)

In [39]:
df2.loc[0, 'genres']
[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]
features = ['cast', 'crew', 'keywords']
for feature in features:
    df2[feature] = df2[feature].apply(literal_eval)
def get_director(x):
    for i in x: 
        if i['job'] == 'Director':
            return i['name']
    return np.nan  #x값이 없는 경우 np.nan를 return 한다.
df2['director'] = df2['crew'].apply(get_director)
df2['director']

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4          Andrew Stanton
              ...        
4798     Robert Rodriguez
4799         Edward Burns
4800          Scott Smith
4801          Daniel Hsia
4802     Brian Herzlinger
Name: director, Length: 4803, dtype: object

In [40]:
filt = df2['director'].isnull()
df2[filt]

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
3661,0,"[{'id': 18, 'name': 'Drama'}]",,19615,[],en,Flying By,A real estate developer goes to his 25th high ...,1.546169,[],...,95.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,It's about the music,Flying By,7.0,2,"[{'cast_id': 1, 'character': 'George', 'credit...",[],
3670,0,"[{'id': 10751, 'name': 'Family'}]",,447027,[],en,Running Forever,After being estranged since her mother's death...,0.028756,"[{""name"": ""New Kingdom Pictures"", ""id"": 41671}]",...,88.0,[],Released,,Running Forever,0.0,0,[],[],
3729,3250000,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",http://www.paathefilm.com/,26379,[],en,Paa,He suffers from a progeria like syndrome. Ment...,2.126139,"[{""name"": ""A B Corp"", ""id"": 4502}]",...,133.0,"[{""iso_639_1"": ""hi"", ""name"": ""\u0939\u093f\u09...",Released,,Paa,6.6,19,"[{'cast_id': 1, 'character': 'Auro', 'credit_i...","[{'credit_id': '52fe44fec3a368484e042a29', 'de...",
3977,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,55831,"[{'id': 10183, 'name': 'independent film'}]",en,Boynton Beach Club,A handful of men and women of a certain age pi...,0.18887,[],...,105.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,Boynton Beach Club,6.8,3,"[{'cast_id': 1, 'character': 'Marilyn', 'credi...",[],
4068,0,[],,371085,[],en,Sharkskin,The Post War II story of Manhattan born Mike E...,0.027801,[],...,0.0,[],Released,,Sharkskin,0.0,0,[],[],
4105,2000000,[],,48382,[],en,"The Book of Mormon Movie, Volume 1: The Journey",The story of Lehi and his wife Sariah and thei...,0.031947,[],...,120.0,[],Released,"2600 years ago, one family began a remarkable ...","The Book of Mormon Movie, Volume 1: The Journey",5.0,2,"[{'cast_id': 1, 'character': 'Sam', 'credit_id...",[],
4118,0,[],,325140,[],en,Hum To Mohabbat Karega,"Raju, a waiter, is in love with the famous TV ...",0.001186,[],...,0.0,[],Released,,Hum To Mohabbat Karega,0.0,0,[],[],
4123,7000000,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",http://www.roadsideromeo.com/,20653,[],en,Roadside Romeo,This is the story of Romeo. A dude who was liv...,0.253595,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,93.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,,Roadside Romeo,6.7,3,"[{'cast_id': 1, 'character': 'Romeo', 'credit_...",[],
4247,1,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,361505,[],en,Me You and Five Bucks,"A womanizing yet lovable loser, Charlie, a wai...",0.094105,[],...,90.0,[],Released,"A story about second, second chances",Me You and Five Bucks,10.0,2,[],[],
4305,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,114065,[],en,Down & Out With The Dolls,"The raunchy, spunky tale of the rise and fall ...",0.002386,[],...,88.0,[],Released,Ain't Rock 'N' Roll a bitch.,Down & Out With The Dolls,0.0,0,[],[],


In [41]:
filt = df2['director'].isnull()
df2[filt]
def get_list(x):
  if isinstance(x, list):
    names = [i['name'] for i in x]
    if len(names) > 3:
        names = names[:3]
    return names
  return []
features = ['cast', 'keywords', 'genres']
for feature in features:
    df2[feature] = df2[feature].apply(get_list)
df2[['title', 'director', 'cast', 'keywords', 'genres']].head(3)

Unnamed: 0,title,director,cast,keywords,genres
0,Avatar,James Cameron,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[culture clash, future, space war]","[Action, Adventure, Fantasy]"
1,Pirates of the Caribbean: At World's End,Gore Verbinski,"[Johnny Depp, Orlando Bloom, Keira Knightley]","[ocean, drug abuse, exotic island]","[Adventure, Fantasy, Action]"
2,Spectre,Sam Mendes,"[Daniel Craig, Christoph Waltz, Léa Seydoux]","[spy, based on novel, secret agent]","[Action, Adventure, Crime]"


In [43]:
def clean_data(x):
  if isinstance(x, list):
    return [str.lower(i.replace(' ', '')) for i in x]
  else:
    if isinstance(x, str):
      return str.lower(x.replace(' ', ''))
    else:
      return ''
features = ['cast', 'keywords', 'genres', 'director']
for feature in features:
    df2[feature] = df2[feature].apply(clean_data)
df2[['title', 'director', 'cast', 'keywords', 'genres']].head(3)
df2[features].head(3)

Unnamed: 0,cast,keywords,genres,director
0,"[samworthington, zoesaldana, sigourneyweaver]","[cultureclash, future, spacewar]","[action, adventure, fantasy]",jamescameron
1,"[johnnydepp, orlandobloom, keiraknightley]","[ocean, drugabuse, exoticisland]","[adventure, fantasy, action]",goreverbinski
2,"[danielcraig, christophwaltz, léaseydoux]","[spy, basedonnovel, secretagent]","[action, adventure, crime]",sammendes


In [44]:
def create_soup(x):
  str = ' '.join(x['keywords']) + ' '
  str += ' '.join(x['cast']) + ' '
  str += (x['director']) + ' '
  str += ' '.join(x['genres'])
  return str
df2['soup'] = df2.apply(create_soup, axis=1)
df2['soup']

0       cultureclash future spacewar samworthington zo...
1       ocean drugabuse exoticisland johnnydepp orland...
2       spy basedonnovel secretagent danielcraig chris...
3       dccomics crimefighter terrorist christianbale ...
4       basedonnovel mars medallion taylorkitsch lynnc...
                              ...                        
4798    unitedstates–mexicobarrier legs arms carlosgal...
4799     edwardburns kerrybishé marshadietlein edwardb...
4800    date loveatfirstsight narration ericmabius kri...
4801       danielhenney elizacoupe billpaxton danielhsia 
4802    obsession camcorder crush drewbarrymore brianh...
Name: soup, Length: 4803, dtype: object

In [45]:
import pandas as pd
df = pd.read_csv('data/tmdb_5000_movies.csv')
df.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [46]:
df['overview'].isnull().any()
False
df['overview'] = df['overview'].fillna('')
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['overview'])
tfidf_matrix.shape

(4803, 20978)

In [47]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim.shape
(4803, 4803)
title= 'Avatar'
idx = df[df['title']==title].index[0]
idx

np.int64(0)

In [48]:
sim_scorse = list(enumerate(cosine_sim[idx]))
sim_scorse

[(0, np.float64(0.9999999999999998)),
 (1, np.float64(0.0)),
 (2, np.float64(0.0)),
 (3, np.float64(0.024995115837672686)),
 (4, np.float64(0.0)),
 (5, np.float64(0.030352543844312897)),
 (6, np.float64(0.0)),
 (7, np.float64(0.037580696903430855)),
 (8, np.float64(0.0)),
 (9, np.float64(0.0)),
 (10, np.float64(0.0)),
 (11, np.float64(0.019787662886189515)),
 (12, np.float64(0.0)),
 (13, np.float64(0.0)),
 (14, np.float64(0.0)),
 (15, np.float64(0.0)),
 (16, np.float64(0.0)),
 (17, np.float64(0.0)),
 (18, np.float64(0.0)),
 (19, np.float64(0.0)),
 (20, np.float64(0.0)),
 (21, np.float64(0.0)),
 (22, np.float64(0.0)),
 (23, np.float64(0.0)),
 (24, np.float64(0.0)),
 (25, np.float64(0.0)),
 (26, np.float64(0.03496958297818529)),
 (27, np.float64(0.047054527099890414)),
 (28, np.float64(0.0)),
 (29, np.float64(0.02518543163480528)),
 (30, np.float64(0.0)),
 (31, np.float64(0.06153392877569967)),
 (32, np.float64(0.0)),
 (33, np.float64(0.0)),
 (34, np.float64(0.0)),
 (35, np.float64(0.0))

In [49]:
sim_scorse=sorted(sim_scorse, key=lambda x:x[1], reverse=True)
sim_scorse=sim_scorse[1:11]
sim_scorse

[(3604, np.float64(0.1868100105621205)),
 (2130, np.float64(0.17015105119162663)),
 (634, np.float64(0.13230696630317224)),
 (1341, np.float64(0.12393264590871877)),
 (529, np.float64(0.12248959164893025)),
 (1610, np.float64(0.1046990303761563)),
 (311, np.float64(0.10091388184065114)),
 (847, np.float64(0.09799623121705055)),
 (775, np.float64(0.09447938436890334)),
 (2628, np.float64(0.09313854469530522))]

In [50]:
movie_indices = [i[0] for i in sim_scorse]
movie_indices

[3604, 2130, 634, 1341, 529, 1610, 311, 847, 775, 2628]

In [51]:
titles = df['title'].loc[movie_indices]
titles

3604                       Apollo 18
2130                    The American
634                       The Matrix
1341            The Inhabited Island
529                 Tears of the Sun
1610                           Hanna
311     The Adventures of Pluto Nash
847                         Semi-Pro
775                        Supernova
2628             Blood and Chocolate
Name: title, dtype: object

In [52]:
def get_recommendations(title, cosine_sim=cosine_sim):
  idx = df[df['title']==title].index[0]
  sim_scorse = list(enumerate(cosine_sim[idx]))
  sim_scorse=sorted(sim_scorse, key=lambda x:x[1], reverse=True)
  sim_scorse=sim_scorse[1:11]
  movie_indices = [i[0] for i in sim_scorse]
  titles = df['title'].loc[movie_indices]
  return titles

In [53]:
get_recommendations('Avatar')

3604                       Apollo 18
2130                    The American
634                       The Matrix
1341            The Inhabited Island
529                 Tears of the Sun
1610                           Hanna
311     The Adventures of Pluto Nash
847                         Semi-Pro
775                        Supernova
2628             Blood and Chocolate
Name: title, dtype: object

In [54]:
df['title']

0                                         Avatar
1       Pirates of the Caribbean: At World's End
2                                        Spectre
3                          The Dark Knight Rises
4                                    John Carter
                          ...                   
4798                                 El Mariachi
4799                                   Newlyweds
4800                   Signed, Sealed, Delivered
4801                            Shanghai Calling
4802                           My Date with Drew
Name: title, Length: 4803, dtype: object

In [55]:
movies=df[['id', 'title']].copy()
movies

Unnamed: 0,id,title
0,19995,Avatar
1,285,Pirates of the Caribbean: At World's End
2,206647,Spectre
3,49026,The Dark Knight Rises
4,49529,John Carter
...,...,...
4798,9367,El Mariachi
4799,72766,Newlyweds
4800,231617,"Signed, Sealed, Delivered"
4801,126186,Shanghai Calling


In [56]:
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.02160533, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.01488159, 0.        ,
        0.        ],
       ...,
       [0.        , 0.02160533, 0.01488159, ..., 1.        , 0.01609091,
        0.00701914],
       [0.        , 0.        , 0.        , ..., 0.01609091, 1.        ,
        0.01171696],
       [0.        , 0.        , 0.        , ..., 0.00701914, 0.01171696,
        1.        ]])

In [57]:
import pickle
pickle.dump(movies, open('data/movies.pickle', 'wb'))
pickle.dump(cosine_sim, open('data/cosine_sim.pickle', 'wb'))