In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Loading the data and creating the pivot table
movies = pd.read_csv('movies.dat', sep='::', engine='python', header=None, names=['movie_id', 'title', 'genres'], encoding='ISO-8859-1')
users = pd.read_csv('users.dat', sep='::', engine='python', header=None, names=['user_id', 'gender', 'age', 'occupation', 'zip'], encoding='ISO-8859-1')
ratings = pd.read_csv('ratings.dat', sep='::', engine='python', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'], encoding='ISO-8859-1')

In [4]:
data = pd.merge(pd.merge(ratings, users), movies)
pivot_table = data.pivot_table(index='user_id', columns='title', values='rating').fillna(0)

In [5]:
# Calculating cosine similarity
cosine_sim = cosine_similarity(pivot_table)

In [6]:
# Creating a function to get similar movies
def get_similar_movies(movie_title, cosine_sim):
    if movie_title not in pivot_table.columns:
        return "Movie not found in database"
    else:
        idx = pivot_table.columns.get_loc(movie_title)
        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:11]
        movie_indices = [sim_score[0] for sim_score in sim_scores if sim_score[0] < len(pivot_table.columns)]
        similar_movies = pivot_table.columns[movie_indices].tolist()
        return ', '.join(similar_movies)


In [7]:
# Testing the function
print(get_similar_movies('Babe (1995)', cosine_sim))

Tom Jones (1963), Risky Business (1983), Candyman (1992), Head Above Water (1996), Nutty Professor, The (1963), Man from Laramie, The (1955)
