In [5]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import re

# Load the data from the provided CSV files
movies_df = pd.read_csv('ml-latest-small/movies.csv')
ratings_df = pd.read_csv('ml-latest-small/ratings.csv')


# Create a user-movie matrix with users as rows, movies as columns, and ratings as values
user_movie_matrix = ratings_df.pivot(index='userId', columns='movieId', values='rating')
user_movie_matrix = user_movie_matrix.fillna(0)  # Fill missing values with 0

# Calculate the number of ratings each movie has received
movie_rating_counts = ratings_df.groupby('movieId').size()

# Filter movies that have received at least a threshold number of ratings (e.g., 50)
popular_movies = movie_rating_counts[movie_rating_counts >= 50].index
user_movie_matrix_filtered = user_movie_matrix[popular_movies]

# Calculate cosine similarity on the filtered user-movie matrix
movie_similarity_filtered = cosine_similarity(user_movie_matrix_filtered.T)
movie_similarity_filtered_df = pd.DataFrame(movie_similarity_filtered, index=user_movie_matrix_filtered.columns, columns=user_movie_matrix_filtered.columns)

# Function to extract movie title without year
def clean_movie_title(title):
    # Remove the year in parentheses, if present
    return re.sub(r'\s*\(\d{4}\)$', '', title).strip().lower()

# Function to get top 10 similar movies for a given movie title without requiring the year
def recommend_movies(movie_title, movies_df, movie_similarity_df, top_n=10):
    # Clean the input movie title to ignore the year
    cleaned_title = clean_movie_title(movie_title)
    
    # Find movieId(s) for movies matching the cleaned title
    movie_ids = movies_df[movies_df['title'].str.lower().apply(clean_movie_title) == cleaned_title]['movieId'].values
    if len(movie_ids) == 0:
        return f"Movie '{movie_title}' not found in the dataset."

    # Use the first matching movieId for recommendations
    movie_id = movie_ids[0]
    if movie_id not in movie_similarity_df.index:
        return f"Movie '{movie_title}' does not have enough ratings for recommendation."
    
    # Get similarity scores for the given movie
    similar_movies = movie_similarity_df[movie_id].sort_values(ascending=False)[1:top_n+1]
    
    # Get movie titles for the recommended movies
    recommended_titles = movies_df[movies_df['movieId'].isin(similar_movies.index)]['title'].values
    return recommended_titles

# Example recommendation without using the year in the title
recommend_movies("Toy Story", movies_df, movie_similarity_filtered_df)


array(['Star Wars: Episode IV - A New Hope (1977)', 'Forrest Gump (1994)',
       'Lion King, The (1994)', 'Jurassic Park (1993)',
       'Mission: Impossible (1996)',
       'Independence Day (a.k.a. ID4) (1996)',
       'Star Wars: Episode VI - Return of the Jedi (1983)',
       'Groundhog Day (1993)', 'Back to the Future (1985)',
       'Toy Story 2 (1999)'], dtype=object)