## Content-Based Recommendation System

In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the two source CSV files (movie metadata and cast/crew credits) in one pass
movies, credits = (
    pd.read_csv(csv_path)
    for csv_path in ('tmdb_5000_movies.csv', 'tmdb_5000_credits.csv')
)

In [3]:
# Name the four credits columns positionally so the frame exposes an `id` join key
credits.columns = ['id', 'title', 'cast', 'crew']

# Join on `id`; the `title` column present in both frames comes out as title_x / title_y
df = pd.merge(movies, credits, on='id')

In [4]:
# Preview the first five rows of the merged frame
df.head(5)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title_x,vote_average,vote_count,title_y,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


### Dropping the columns that won't be used to train the model

In [5]:
# Columns that do not feed the text features built later in the notebook
cols_to_drop = [
    'budget',
    'homepage',
    'id',
    'original_language',
    'popularity',
    'runtime',
    'spoken_languages',
    'status',
    'title_y',
    'tagline',
    'production_countries',
    'release_date',
    'revenue',
    'vote_average',
    'vote_count',
    'title_x',
]
df = df.drop(cols_to_drop, axis=1)

### Getting important data to train the model

In [6]:
# Columns whose cells hold a stringified list of dicts
features = ['genres', 'keywords', 'cast', 'crew', 'production_companies']

# Turn each cell into real Python objects; the value is passed through str() before parsing
for feature in features:
    parsed_column = df[feature].map(lambda raw: literal_eval(str(raw)))
    df[feature] = parsed_column

In [7]:
def get_directors_name(crew):
    """
    Return the name of the first crew member whose job is 'Director'.

    Parameters:
    crew (iterable of dict): Crew entries, each read through its 'job' and 'name' keys.

    Returns:
    The 'name' of the first entry with job == 'Director', or np.nan when no entry matches.
    """
    director_names = (member['name'] for member in crew if member['job'] == 'Director')
    # next() stops at the first match and falls back to NaN when the generator is empty
    return next(director_names, np.nan)

In [8]:
# Keep only the director's name from the crew list, then discard the raw crew column
df['director'] = df['crew'].map(get_directors_name)
df = df.drop('crew', axis=1)

In [9]:
def extract_names(lst):
    """
    Collect the 'name' values found among the first three entries of a list.

    Parameters:
    lst: Expected to be a list of dict-like entries; any non-list value is accepted.

    Returns:
    list: The 'name' of each of the first three entries that has one
    (entries without a 'name' key are skipped), or [] when `lst` is not a list.
    """
    if not isinstance(lst, list):
        return []

    names = []
    for entry in lst[:3]:
        if 'name' in entry:
            names.append(entry['name'])
    return names

In [10]:
# List-of-dict columns that get reduced to plain lists of names
columns_to_process = ['genres', 'keywords', 'cast', 'production_companies']
for column in columns_to_process:
    df[column] = df[column].map(extract_names)

In [11]:
# Preview the first five rows after the name extraction
df.head(5)

Unnamed: 0,genres,keywords,original_title,overview,production_companies,cast,director
0,"[Action, Adventure, Fantasy]","[culture clash, future, space war]",Avatar,"In the 22nd century, a paraplegic Marine is di...","[Ingenious Film Partners, Twentieth Century Fo...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",James Cameron
1,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island]",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Walt Disney Pictures, Jerry Bruckheimer Films...","[Johnny Depp, Orlando Bloom, Keira Knightley]",Gore Verbinski
2,"[Action, Adventure, Crime]","[spy, based on novel, secret agent]",Spectre,A cryptic message from Bond’s past sends him o...,"[Columbia Pictures, Danjaq, B24]","[Daniel Craig, Christoph Waltz, Léa Seydoux]",Sam Mendes
3,"[Action, Crime, Drama]","[dc comics, crime fighter, terrorist]",The Dark Knight Rises,Following the death of District Attorney Harve...,"[Legendary Pictures, Warner Bros., DC Entertai...","[Christian Bale, Michael Caine, Gary Oldman]",Christopher Nolan
4,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion]",John Carter,"John Carter is a war-weary, former military ca...",[Walt Disney Pictures],"[Taylor Kitsch, Lynn Collins, Samantha Morton]",Andrew Stanton


### Cleaning data for vectorization and model training

In [12]:
def preprocess_data(x):
    """
    Normalise text by removing spaces and lower-casing it.

    Parameters:
    x: A string, a list of strings, or any other value.

    Returns:
    str or list: The normalised string for a string input, a list of normalised
    strings for a list input, and '' for any other input type.
    """
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    # Anything that is neither a string nor a list collapses to an empty string
    return ''

In [13]:
# Text columns to normalise (spaces removed, lower-cased) before vectorization
columns_to_preprocess = ['cast', 'keywords', 'production_companies', 'director', 'genres']

for column in columns_to_preprocess:
    df[column] = df[column].map(preprocess_data)

In [14]:
def generate_combined_text(row):
    """
    Build the single text feature used for vectorization from one DataFrame row.

    Parameters:
    row: Mapping-like row with list values under 'keywords', 'cast', 'genres' and
         'production_companies', and a string under 'director'.

    Returns:
    str: Keywords, cast, director, genres and production companies, in that order,
    separated by single spaces (an empty list contributes an empty segment).
    """
    parts = [
        ' '.join(row['keywords']),
        ' '.join(row['cast']),
        row['director'],
        ' '.join(row['genres']),
        ' '.join(row['production_companies']),
    ]
    return ' '.join(parts)

In [15]:
# Build the combined text feature row by row
df['combined_text'] = df.apply(generate_combined_text, axis='columns')

### Getting the cosine similarity matrix

In [16]:
# Token counts over the combined text, with English stop words removed
vectorizer = CountVectorizer(stop_words='english')
cv_matrix = vectorizer.fit_transform(df['combined_text'])

# Called with one argument, cosine_similarity compares every row of cv_matrix with every other row
cosine_sim_matrix = cosine_similarity(cv_matrix)

### Getting the predictions

In [17]:
# Reset the index of the main df so that row labels equal row positions, then build a
# reverse mapping Series that links each movie title to its row position.
df_aux = df.reset_index()

# Take the positions from df_aux (the reset frame) rather than from df: the values are
# used as positional offsets into cosine_sim_matrix and df_aux.iloc, so they must be
# 0..n-1 even if df ever carries a non-default index.
indices = pd.Series(df_aux.index, index=df_aux['original_title'])

In [18]:
def recommend_movies(title, cosine_sim_matrix, num_recommendations=10):
    """
    Get recommended movies based on input movie title.

    Relies on two notebook-level objects: `indices` (Series mapping a title to its
    row position) and `df_aux` (DataFrame with an `original_title` column).

    Parameters:
    title (str): The title of the movie for which recommendations are needed.
    cosine_sim_matrix (numpy.ndarray): The cosine similarity matrix between movies.
    num_recommendations (int): The number of recommended movies to return.

    Returns:
    pandas.Series: Series containing the titles of recommended movies, ordered from
    most to least similar (ties keep their row order). The input movie itself is
    never part of the result.

    Raises:
    ValueError: If `title` is not present in `indices`.
    """

    # index of the chosen movie title
    index = indices.get(title)

    if index is None:
        raise ValueError("Movie not found")

    # When several movies share the same title the lookup yields a Series of
    # positions instead of a scalar; use the first match.
    if isinstance(index, pd.Series):
        index = index.iloc[0]
    index = int(index)

    # Scores of all other movies against the chosen one. The input movie is removed
    # by position rather than by assuming it sorts first: another movie with an
    # equally high score and a lower row index would otherwise be dropped in its place.
    scores = [
        (position, score)
        for position, score in enumerate(cosine_sim_matrix[index])
        if position != index
    ]
    scores.sort(key=lambda pair: pair[1], reverse=True)

    # Get the indices of the recommended movies
    movie_indices = [position for position, _ in scores[:num_recommendations]]

    # Return the titles of the recommended movies
    return df_aux['original_title'].iloc[movie_indices]

In [19]:
# Ten closest matches (the function's default count) for a sample title
recommend_movies(title='Guardians of the Galaxy', cosine_sim_matrix=cosine_sim_matrix)

79                             Iron Man 2
182                               Ant-Man
16                           The Avengers
7                 Avengers: Age of Ultron
26             Captain America: Civil War
174                   The Incredible Hulk
31                             Iron Man 3
68                               Iron Man
126                  Thor: The Dark World
169    Captain America: The First Avenger
Name: original_title, dtype: object

In [20]:
# Ten closest matches (the function's default count) for a second sample title
recommend_movies(title='The Dark Knight', cosine_sim_matrix=cosine_sim_matrix)

119                               Batman Begins
3                         The Dark Knight Rises
10                             Superman Returns
9            Batman v Superman: Dawn of Justice
72                                Suicide Squad
163                                    Watchmen
1035                                  Jonah Hex
3854    Batman: The Dark Knight Returns, Part 2
299                              Batman Forever
303                                    Catwoman
Name: original_title, dtype: object