In [2]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
# Directory holding the TMDB 5000 CSV exports. Set the TMDB_DATA_DIR environment
# variable to point somewhere else; the fallback is the notebook's original
# location, so behaviour is unchanged on the author's machine.
DATA_DIR = Path(os.environ.get('TMDB_DATA_DIR', '/Users/mac'))

df_credits = pd.read_csv(DATA_DIR / 'tmdb_5000_credits.csv')
# Positional rename of the four credits columns so the join key is called 'id'.
df_credits.columns = ['id','title','cast','crew']
df_movies = pd.read_csv(DATA_DIR / 'tmdb_5000_movies.csv')
# Inner join: a movie without credits (or credits without a movie row) would
# carry NaN in the columns that are later parsed with literal_eval, so only
# ids present in both tables are kept.
df_movies = df_movies.merge(df_credits[['id','cast','crew']], on='id', how='inner')
# Drop the numeric/date columns that the rest of the notebook does not use.
# Reassigning (rather than inplace=True) keeps the step a plain expression.
df_movies = df_movies.drop(columns=['budget','homepage','popularity', 'release_date', 'revenue', 'runtime', 'vote_average', 'vote_count'])
print('Dataset Shape: ',df_movies.shape)
df_movies.head(2)

FileNotFoundError: File b'/Users/mac/tmdb_5000_credits.csv' does not exist

In [109]:
# Null-count / dtype summary for the four metadata columns used below.
df_movies.loc[:, ['cast','crew','keywords','genres']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4803 entries, 0 to 4802
Data columns (total 4 columns):
cast        4803 non-null object
crew        4803 non-null object
keywords    4803 non-null object
genres      4803 non-null object
dtypes: object(4)
memory usage: 187.6+ KB


In [110]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Missing overviews become empty strings so the vectorizer only receives text.
overview = df_movies['overview'].fillna(value='')
# TF-IDF vectorizer with the built-in English stop-word list.
overview_tfidf = TfidfVectorizer(stop_words='english')
overview_tfidf_matrix = overview_tfidf.fit_transform(overview)
# Display (n_movies, n_terms).
overview_tfidf_matrix.shape

(4803, 20978)

In [111]:
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval
features = ['cast','crew','keywords','genres']


def _parse_feature(value):
    """Turn one raw cell of a metadata column into a Python object.

    Strings are parsed with ``literal_eval``. Values that are already lists
    are returned unchanged, which makes this cell safe to re-run after
    ``df_movies`` has been parsed once. Any other value (e.g. NaN from a
    missing entry) becomes an empty list instead of raising.
    """
    if isinstance(value, str):
        return literal_eval(value)
    if isinstance(value, list):
        return value
    return []


# Parse the metadata columns in df_movies itself, then take an independent
# copy of just those columns for the feature engineering that follows.
for feature in features:
    df_movies[feature] = df_movies[feature].apply(_parse_feature)
df_meta = df_movies[features].copy()

In [112]:
def get_crew(crews,job='Director'):
    """Return the names of all crew members whose job equals ``job``.

    Parameters
    ----------
    crews : list of dict
        Crew records; each is read for its ``'job'`` and ``'name'`` keys.
        Any non-list value (None, NaN, ...) is treated as "no crew", the
        same way ``get_list`` treats non-list input.
    job : str, default 'Director'
        Job title to match exactly.

    Returns
    -------
    list
        Matching names in their original order; empty when nothing matches.
        Records that are not dicts, or that lack ``'job'``/``'name'``, are
        skipped rather than raising.
    """
    if not isinstance(crews, list):
        return []
    return [
        member['name']
        for member in crews
        if isinstance(member, dict) and member.get('job') == job and 'name' in member
    ]

def get_list(casts,limit=3):
    """Return up to ``limit`` names taken from a list of records.

    Parameters
    ----------
    casts : list of dict
        Records read for their ``'name'`` key. Any non-list value yields an
        empty list, so the return type is always a list.
    limit : int, default 3
        Maximum number of names to keep, counted from the start.

    Returns
    -------
    list
        The first ``limit`` names. Records that are not dicts or have no
        ``'name'`` key are skipped.
    """
    if not isinstance(casts, list):
        return []
    names = [entry['name'] for entry in casts if isinstance(entry, dict) and 'name' in entry]
    # Slicing already copes with lists shorter than `limit`.
    return names[:limit]
# Directors and screenplay writers are both pulled from the raw crew records.
df_meta['director'] = df_meta['crew'].apply(get_crew, job='Director')
df_meta['writers'] = df_meta['crew'].apply(get_crew, job='Screenplay')
# Keywords and genres keep up to ten names each ...
for feature in ('keywords', 'genres'):
    df_meta[feature] = df_meta[feature].apply(get_list, limit=10)
# ... while cast falls back to get_list's default limit.
df_meta['cast'] = df_meta['cast'].apply(get_list)
df_meta.head(5)

Unnamed: 0,cast,crew,keywords,genres,director,writers
0,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{'credit_id': '52fe48009251416c750aca23', 'de...","[culture clash, future, space war, space colon...","[Action, Adventure, Fantasy, Science Fiction]",[James Cameron],[James Cameron]
1,"[Johnny Depp, Orlando Bloom, Keira Knightley]","[{'credit_id': '52fe4232c3a36847f800b579', 'de...","[ocean, drug abuse, exotic island, east india ...","[Adventure, Fantasy, Action]",[Gore Verbinski],"[Ted Elliott, Terry Rossio]"
2,"[Daniel Craig, Christoph Waltz, Léa Seydoux]","[{'credit_id': '54805967c3a36829b5002c41', 'de...","[spy, based on novel, secret agent, sequel, mi...","[Action, Adventure, Crime]",[Sam Mendes],"[John Logan, Robert Wade, Neal Purvis, Jez But..."
3,"[Christian Bale, Michael Caine, Gary Oldman]","[{'credit_id': '52fe4781c3a36847f81398c3', 'de...","[dc comics, crime fighter, terrorist, secret i...","[Action, Crime, Drama, Thriller]",[Christopher Nolan],"[Christopher Nolan, Jonathan Nolan]"
4,"[Taylor Kitsch, Lynn Collins, Samantha Morton]","[{'credit_id': '52fe479ac3a36847f813eaa3', 'de...","[based on novel, mars, medallion, space travel...","[Action, Adventure, Science Fiction]",[Andrew Stanton],"[Andrew Stanton, Michael Chabon, Mark Andrews]"


In [113]:
def clean_data(x):
    """Lower-case text and strip its spaces.

    A string is returned as a cleaned string, a list is returned as a new
    list with every element cleaned, and any other input yields ``''``.
    """
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    if not isinstance(x, list):
        return ''
    return [item.replace(" ", "").lower() for item in x]
# Normalise every name-bearing column with clean_data.
for feature in ('cast', 'keywords', 'director', 'writers', 'genres'):
    df_meta[feature] = df_meta[feature].apply(clean_data)
def create_soup(x):
    """Build one space-separated string from a row's token collections.

    The fields are joined in a fixed order: keywords, cast, writers,
    director, genres. Each field's tokens are space-joined first, then the
    five pieces are space-joined, so an empty field leaves a doubled space.
    """
    ordered_fields = ('keywords', 'cast', 'writers', 'director', 'genres')
    return ' '.join(' '.join(x[field]) for field in ordered_fields)
# Row-wise: each movie's cleaned tokens are merged into a single string.
df_meta['soup'] = df_meta.apply(create_soup, axis='columns')
# Preview the first five soups.
df_meta['soup'].head(5)

0    cultureclash future spacewar spacecolony socie...
1    ocean drugabuse exoticisland eastindiatradingc...
2    spy basedonnovel secretagent sequel mi6 britis...
3    dccomics crimefighter terrorist secretidentity...
4    basedonnovel mars medallion spacetravel prince...
Name: soup, dtype: object

In [114]:
from sklearn.feature_extraction.text import CountVectorizer
# Raw token counts over each movie's metadata soup.
meta = df_meta['soup']
count = CountVectorizer(stop_words='english')
meta_matrix = count.fit_transform(meta)
# (n_movies, n_distinct_tokens)
print(meta_matrix.shape)

(4803, 18128)


In [115]:
# Title -> row position in df_movies, i.e. the row/column position in the
# similarity matrices built from it. Only the first occurrence of a repeated
# title is kept, so indices[title] is always a single integer, not a Series.
indices = pd.Series(np.arange(len(df_movies)), index=df_movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]

from sklearn.metrics.pairwise import cosine_similarity
# Pairwise similarity of every movie against every other movie. With the
# second argument omitted, both functions compare the matrix with itself.
overview_cosine = linear_kernel(overview_tfidf_matrix)
meta_cosine = cosine_similarity(meta_matrix)

def get_recommendations(title, cosine_sim, top_n=10):
    """Return the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title, looked up in the module-level ``indices`` Series.
    cosine_sim : 2-D array-like
        Square similarity matrix whose row/column positions match the row
        positions of the module-level ``df_movies``.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        One column, '相似电影', holding the recommended titles from most to
        least similar, indexed 1..k where k <= ``top_n``.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title shared by several movies yields a Series; use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Stable descending sort: ties keep their original row order.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Remove the query movie by position instead of assuming it sorts first.
    # That assumption fails when its vector is all zeros (e.g. an empty
    # overview) or when another movie ties with it.
    movie_indices = [pos for pos, _ in ranked if pos != idx][:top_n]

    df_recommended = df_movies[['title']].iloc[movie_indices].rename(columns={'title': '相似电影'})
    # Number the rows from 1; sized to the result so short catalogues work.
    df_recommended.index = range(1, len(df_recommended) + 1)
    return df_recommended

In [128]:
get_recommendations("Frozen",overview_cosine)

Unnamed: 0,相似电影
1,Stardust
2,Ida
3,Leap Year
4,The Promise
5,Splash
6,Two Girls and a Guy
7,The Prince of Tides
8,Forrest Gump
9,Royal Kill
10,Black Snake Moan


In [129]:
get_recommendations("Frozen",meta_cosine)

Unnamed: 0,相似电影
1,Valiant
2,Return to Never Land
3,Pocahontas
4,Tangled
5,Atlantis: The Lost Empire
6,Alpha and Omega: The Legend of the Saw Tooth Cave
7,Aladdin
8,Enchanted
9,The Smurfs 2
10,Why I Did (Not) Eat My Father
