In [1]:
import pandas as pd

# Sample movie dataset, written one movie per row and then pivoted into the
# column-oriented dict that pandas consumes.
movie_columns = ('title', 'genre', 'director', 'actors', 'ratings')
movie_rows = [
    ('The Shawshank Redemption', 'Drama', 'Frank Darabont', 'Tim Robbins, Morgan Freeman', 9.3),
    ('The Godfather', 'Crime, Drama', 'Francis Ford Coppola', 'Marlon Brando, Al Pacino', 9.2),
    ('The Dark Knight', 'Action, Crime, Drama', 'Christopher Nolan', 'Christian Bale, Heath Ledger', 9.0),
    ('Pulp Fiction', 'Crime, Drama', 'Quentin Tarantino', 'John Travolta, Uma Thurman', 8.9),
    ('The Lord of the Rings: The Return of the King', 'Adventure, Drama, Fantasy', 'Peter Jackson', 'Elijah Wood, Viggo Mortensen', 8.9),
]

# zip(*movie_rows) transposes the rows into one tuple per column.
data = {
    column: list(values)
    for column, values in zip(movie_columns, zip(*movie_rows))
}

df = pd.DataFrame(data)
print(df)


                                           title                      genre  \
0                       The Shawshank Redemption                      Drama   
1                                  The Godfather               Crime, Drama   
2                                The Dark Knight       Action, Crime, Drama   
3                                   Pulp Fiction               Crime, Drama   
4  The Lord of the Rings: The Return of the King  Adventure, Drama, Fantasy   

               director                        actors  ratings  
0        Frank Darabont   Tim Robbins, Morgan Freeman      9.3  
1  Francis Ford Coppola      Marlon Brando, Al Pacino      9.2  
2     Christopher Nolan  Christian Bale, Heath Ledger      9.0  
3     Quentin Tarantino    John Travolta, Uma Thurman      8.9  
4         Peter Jackson  Elijah Wood, Viggo Mortensen      8.9  


In [2]:
# Create a soup of features
def create_soup(x):
    """Combine a movie's genre, director and actors into one text string.

    Parameters
    ----------
    x : mapping or pandas.Series
        A row exposing the keys 'genre', 'director' and 'actors'.

    Returns
    -------
    str
        The fields joined by single spaces, in the order genre, director,
        actors. Fields that are missing (None / NaN / pd.NA) are skipped, so
        a row with no usable fields yields an empty string.
    """
    fields = (x['genre'], x['director'], x['actors'])
    # Plain `+` concatenation breaks on a missing cell (NaN and None raise
    # TypeError against a str), so filter missing values out before joining.
    return ' '.join(field for field in fields if pd.notna(field))

# Build the soup row by row (axis=1 hands each row to create_soup), then
# store it as a new column and show it next to the title.
soup_per_movie = df.apply(create_soup, axis=1)
df['soup'] = soup_per_movie
print(df[['title', 'soup']])


                                           title  \
0                       The Shawshank Redemption   
1                                  The Godfather   
2                                The Dark Knight   
3                                   Pulp Fiction   
4  The Lord of the Rings: The Return of the King   

                                                soup  
0   Drama Frank Darabont Tim Robbins, Morgan Freeman  
1  Crime, Drama Francis Ford Coppola Marlon Brand...  
2  Action, Crime, Drama Christopher Nolan Christi...  
3  Crime, Drama Quentin Tarantino John Travolta, ...  
4  Adventure, Drama, Fantasy Peter Jackson Elijah...  


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Make sure every soup is a string before vectorizing: missing values
# become empty documents.
df['soup'] = df['soup'].fillna('')

# TF-IDF vectorizer that ignores common English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the soups and encode each movie as one row
tfidf_matrix = tfidf.fit_transform(df['soup'])
print(tfidf_matrix.shape)


(5, 36)


In [4]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of movies. With a single
# argument, scikit-learn compares the rows of the matrix against themselves.
cosine_sim = cosine_similarity(tfidf_matrix)
print(cosine_sim)


[[1.         0.0328427  0.0328427  0.0352168  0.03172283]
 [0.0328427  1.         0.08801574 0.09437815 0.02857314]
 [0.0328427  0.08801574 1.         0.09437815 0.02857314]
 [0.0352168  0.09437815 0.09437815 1.         0.03063861]
 [0.03172283 0.02857314 0.02857314 0.03063861 1.        ]]


In [5]:
# Construct a reverse map from movie title to row position.
# Positions (not index labels) are stored because the similarity matrix and
# the later .iloc lookup are both positional.
indices = pd.Series(range(len(df)), index=df['title'])
# De-duplicate on the titles themselves, keeping the first occurrence.
# Series.drop_duplicates() compares the values (the row numbers), which are
# already unique, so it would leave repeated titles in place.
indices = indices[~indices.index.duplicated(keep='first')]

# Function that takes in a movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim, top_n=5):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title of the movie to find recommendations for. Must be a key of
        the module-level ``indices`` map.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the similarity of movie
        ``i`` to every movie. Defaults to the matrix that existed when this
        function was defined.
    top_n : int, optional
        Maximum number of recommendations to return (default 5).

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles from ``df['title']``, most similar first. The
        queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")
    if title not in indices.index:
        raise KeyError(f"Unknown movie title: {title!r}")

    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series;
    # fall back to its first occurrence so the row selection stays 1-D.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every other movie with its similarity to the requested one. The
    # movie itself is excluded explicitly rather than assumed to sort first:
    # an identical soup at an earlier row, or an all-zero TF-IDF row, would
    # otherwise let it appear in its own recommendations.
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Highest similarity first; sorted() is stable, so ties keep row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [position for position, _ in candidates[:top_n]]

    # Return the titles of the most similar movies
    return df['title'].iloc[movie_indices]

# Smoke-test the recommender with a title known to be in the dataset
godfather_recommendations = get_recommendations('The Godfather')
print(godfather_recommendations)


3                                     Pulp Fiction
2                                  The Dark Knight
0                         The Shawshank Redemption
4    The Lord of the Rings: The Return of the King
Name: title, dtype: object
