In [1]:
# pip install scikit-learn


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
movies = pd.read_csv('movies2.csv')  # contains movieId, title, genres

In [4]:
df_movies = movies[['movieId', 'title', 'genres']].copy()

In [5]:
df_movies['indexcol'] = df_movies.index

In [6]:
df_movies['genres'] = df_movies['genres'].apply(lambda x: x.lower().replace('|', ' '))

In [7]:
tfidf = TfidfVectorizer(stop_words='english')
count_matrix = tfidf.fit_transform(df_movies['genres'])

In [8]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [9]:
def get_cosine_similar_movies(movie_title, top_n=10):
    # Get the index of the movie that matches the title
    idx = df_movies[df_movies['title'] == movie_title]['indexcol'].values[0]

    # Get the pairwise similarity scores for all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores (highest to lowest)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top 5 most similar movies (excluding the first, which is itself)
    sim_scores = sim_scores[1:top_n+1]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top similar movies
    return df_movies['title'].iloc[movie_indices]

In [10]:
top_similar_movies = get_cosine_similar_movies("Titanic (1997)")
print(top_similar_movies)

24                            Leaving Las Vegas (1995)
27                                   Persuasion (1995)
34                                   Carrington (1995)
45                How to Make an American Quilt (1995)
48                        When Night Is Falling (1995)
73                                 Bed of Roses (1996)
82     Once Upon a Time... When We Were Colored (1995)
84                           Angels and Insects (1995)
103              Bridges of Madison County, The (1995)
129                           Frankie Starlight (1995)
Name: title, dtype: object
