In [25]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from annoy import AnnoyIndex

data = pd.read_csv("movies.csv")
print(data.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [26]:
# Preprocess the data (optional)
data['title'] = data['title'].str.extract('(.+) \(\d+\)')
data['genres'] = data['genres'].str.replace('|', ' ')

  data['genres'] = data['genres'].str.replace('|', ' ')


In [27]:
# Use the desired columns
data = data[['movieId', 'title', 'genres']]

In [28]:
# Check for and handle missing values (if any)
data = data.dropna()

In [29]:
# Convert 'genres' column into a list of strings
genres_list = data['genres'].tolist()
# Create a TF-IDF matrix for movie genres
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(genres_list)

In [30]:
# Build an Annoy index for efficient similarity calculation
num_items, num_features = tfidf_matrix.shape
annoy_index = AnnoyIndex(num_features, 'angular')  # Use 'angular' for cosine similarity

# Add items to the Annoy index
for i in range(num_items):
    vector = tfidf_matrix[i].toarray().flatten()
    annoy_index.add_item(i, vector)

# Build the index
annoy_index.build(10)

True

In [31]:
# Function to get movie recommendations
def get_recommendations(movie_title):
    idx = data[data['title'] == movie_title].index[0]
    similar_item_indices = annoy_index.get_nns_by_item(idx, 11)  # Get 11 for top 10 recommendations
    similar_movies = data.loc[similar_item_indices, 'title'].tolist()
    return similar_movies[1:]

In [32]:
# Test the movie recommendation system
movie_title = 'Adventure'
recommended_movies = get_recommendations(movie_title)
print("Recommended Movies for {}: \n{}".format(movie_title, recommended_movies))

Recommended Movies for Adventure: 
["Kestrel's Eye (Falkens öga)", 'Trials of Henry Kissinger, The', 'Man of Aran', 'Venus Boyz', 'Tromeo and Juliet', 'Loggerheads', 'Kiss of Death', 'Fay Grim', 'Day of the Outlaw', 'Turtles Are Surprisingly Fast Swimmers (Turtles Swim Faster Than Expected) (Kame wa igai to hayaku oyogu)']
