In [9]:
import pandas as pd
from pathlib import Path

DATA_DIR = Path("../data")

movies = pd.read_csv(DATA_DIR / "tmdb_5000_movies.csv")
credits = pd.read_csv(DATA_DIR / "tmdb_5000_credits.csv")

print("Movies:", movies.shape)
print("Credits:", credits.shape)

movies.head(3)


Movies: (4803, 20)
Credits: (4803, 4)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466


In [10]:
df = movies.merge(credits, on="title")
df = df[["movie_id", "title", "overview", "genres", "keywords", "cast", "crew"]]
df.head(3)


Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."


In [11]:
import ast

def parse_names(text):
    # converts string like "[{'id':..., 'name': 'Action'}]" -> ["Action", ...]
    try:
        items = ast.literal_eval(text)
        return [i["name"] for i in items]
    except:
        return []

df["genres"] = df["genres"].apply(parse_names)
df["keywords"] = df["keywords"].apply(parse_names)
df["cast"] = df["cast"].apply(parse_names).apply(lambda x: x[:3])  # top 3 actors

def get_director(text):
    try:
        crew_list = ast.literal_eval(text)
        for person in crew_list:
            if person.get("job") == "Director":
                return [person.get("name")]
        return []
    except:
        return []

df["crew"] = df["crew"].apply(get_director)

df.head(3)


Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]


In [12]:
def clean_list(lst):
    # "Sam Worthington" -> "samworthington"
    return [s.replace(" ", "").lower() for s in lst]

df["overview"] = df["overview"].fillna("").apply(lambda x: x.lower())

df["genres"] = df["genres"].apply(clean_list)
df["keywords"] = df["keywords"].apply(clean_list)
df["cast"] = df["cast"].apply(clean_list)
df["crew"] = df["crew"].apply(clean_list)

df["tags"] = df["overview"] + " " + \
             df["genres"].apply(lambda x: " ".join(x)) + " " + \
             df["keywords"].apply(lambda x: " ".join(x)) + " " + \
             df["cast"].apply(lambda x: " ".join(x)) + " " + \
             df["crew"].apply(lambda x: " ".join(x))

final_df = df[["movie_id", "title", "tags"]]
final_df.head(3)


Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
X = tfidf.fit_transform(final_df["tags"])

similarity = cosine_similarity(X)

def recommend(title, top_n=10):
    title = title.lower().strip()
    titles = final_df["title"].str.lower()

    if title not in set(titles):
        # show close matches
        suggestions = final_df[titles.str.contains(title, na=False)]["title"].head(10).tolist()
        return {"error": "Title not found", "suggestions": suggestions}

    idx = final_df[titles == title].index[0]
    scores = list(enumerate(similarity[idx]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)[1:top_n+1]

    return final_df.iloc[[i for i, _ in scores]][["title"]]


In [14]:
recommend("Avatar", top_n=10)


Unnamed: 0,title
3729,Falcon Rising
582,Battle: Los Angeles
3607,Apollo 18
47,Star Trek Into Darkness
539,Titan A.E.
942,The Book of Life
2405,Aliens
1916,Lifeforce
3537,Galaxina
557,Jarhead


In [16]:
type(recommend)


function