<a href="https://colab.research.google.com/github/AmanDeep-pvt/Self-Projects/blob/main/Movie_Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install numpy<2
!pip install surprise

/bin/bash: line 1: 2: No such file or directory


In [2]:
# movie_recommender.py

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

# --- 1. Data Loading and Preprocessing ---

# Load the MovieLens dataset (using the small version)
# Ensure you have 'ratings.csv' and 'movies.csv' in the same directory
# Read both CSVs into dataframes. A missing file is reported but deliberately
# not fatal: the code that follows checks whether both `ratings` and `movies`
# were bound before it uses them.
try:
    ratings = pd.read_csv('ratings.csv')
    movies = pd.read_csv('movies.csv')
except FileNotFoundError:
    # If only 'movies.csv' is missing, `ratings` has already been bound by the
    # first read while `movies` has not.
    print("Dataset files not found. Please download 'ratings.csv' and 'movies.csv' from MovieLens.")
    # exit() # Removed exit() to allow the rest of the code to be modified if needed

# Everything below needs both dataframes, so it only runs when loading succeeded.
if 'ratings' in locals() and 'movies' in locals():
    # Merge ratings and movies dataframes
    df = pd.merge(ratings, movies, on='movieId')

    # --- 2. Content-Based Filtering ---

    # This approach recommends movies whose content resembles a given movie.
    # The only content feature used here is the movie's genre string.

    # TF-IDF vectorizer over the genre strings, with English stop words removed.
    tfidf = TfidfVectorizer(stop_words='english')

    # Replace NaN with an empty string so the vectorizer only sees text
    movies['genres'] = movies['genres'].fillna('')

    # One TF-IDF row per movie, in the same order as the rows of `movies`
    tfidf_matrix = tfidf.fit_transform(movies['genres'])

    # TfidfVectorizer L2-normalises its rows by default, so the dot product
    # computed by linear_kernel is the cosine similarity.
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

    # Reverse map: movie title -> row index in `movies`.
    # Series.drop_duplicates() would compare the *values* (the row indices,
    # which are already unique) and leave repeated titles in place, so a lookup
    # of such a title would return several rows. Deduplicate on the index
    # instead, keeping the first occurrence of each title.
    indices = pd.Series(movies.index, index=movies['title'])
    indices = indices[~indices.index.duplicated(keep='first')]

    def get_content_based_recommendations(title, cosine_sim=cosine_sim, top_n=5):
        """
        Get top N movie recommendations based on content similarity.

        Args:
            title: Movie title to look up in `indices` (exact match).
            cosine_sim: Square similarity matrix indexed by row position in
                `movies`; defaults to the module-level genre similarity.
            top_n: Maximum number of recommendations to return.

        Returns:
            A Series of movie titles ordered from most to least similar, or an
            error message string when `title` is not in the dataset.
        """
        if title not in indices:
            return f"Movie '{title}' not found in the dataset."

        # Get the index of the movie that matches the title
        idx = indices[title]
        # A title that occurs more than once yields a Series of row indices;
        # fall back to the first occurrence so `idx` is always a scalar.
        if isinstance(idx, pd.Series):
            idx = idx.iloc[0]

        # Pair every other movie with its similarity to the query movie.
        # The query movie is excluded by position rather than by dropping the
        # first sorted entry: movies with identical genres tie at the top
        # score, so the query movie is not guaranteed to sort first.
        candidates = [
            (pos, score)
            for pos, score in enumerate(cosine_sim[idx])
            if pos != idx
        ]

        # Stable sort: among equal scores the original row order is kept
        candidates.sort(key=lambda pair: pair[1], reverse=True)

        # Row positions of the top_n most similar movies
        movie_indices = [pos for pos, _ in candidates[:max(top_n, 0)]]

        # Return the top_n most similar movies
        return movies['title'].iloc[movie_indices]

    # --- 3. Collaborative Filtering ---

    # This approach learns from the user-item rating matrix.
    # We use the Surprise library with the SVD algorithm.

    # The Reader tells Surprise the range that ratings (and therefore clipped
    # predictions) live in. Take it from the data instead of hard-coding
    # (1, 5): the MovieLens 'ratings.csv' files use half-star steps starting
    # at 0.5, so a fixed lower bound of 1 would misdescribe them.
    reader = Reader(
        rating_scale=(float(ratings['rating'].min()), float(ratings['rating'].max()))
    )

    # The columns must correspond to user id, item id and ratings (in that order).
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

    # Matrix-factorisation model with Surprise's default hyper-parameters.
    algo = SVD()

    # Train the algorithm on the full dataset (no hold-out split)
    trainset = data.build_full_trainset()
    algo.fit(trainset)

    def get_collaborative_filtering_recommendations(movie_title, top_n=5):
        """
        Get top N movie recommendations using collaborative filtering (SVD).

        Movies are ranked by the cosine similarity between their learned SVD
        item-factor vector and that of `movie_title`. Predicting ratings for a
        made-up user id does not work for this purpose: the model knows nothing
        about an unseen user, so its estimates reduce to global mean plus item
        bias and would be the same list whatever movie was asked for.

        Args:
            movie_title: Movie title to look up in `movies` (exact match).
            top_n: Maximum number of recommendations to return.

        Returns:
            A Series of movie titles (indexed like `movies`) ordered from most
            to least similar, or a message string when the title is unknown or
            the movie has no ratings in the training data.
        """
        matching_ids = movies.loc[movies['title'] == movie_title, 'movieId']
        if matching_ids.empty:
            return f"Movie '{movie_title}' not found in the dataset."

        # If the title is repeated, use its first movieId
        movie_id = matching_ids.iloc[0]

        # Surprise works with its own "inner" item ids; a movie that was never
        # rated is not part of the trainset and has no learned factors.
        fitted = algo.trainset
        try:
            inner_id = fitted.to_inner_iid(movie_id)
        except ValueError:
            return (
                f"Movie '{movie_title}' has no ratings, so no collaborative "
                "filtering recommendations are available."
            )

        # Cosine similarity of every item's factor vector to the target's
        item_factors = np.asarray(algo.qi)
        target = item_factors[inner_id]
        norms = np.linalg.norm(item_factors, axis=1) * np.linalg.norm(target)
        norms[norms == 0] = 1.0  # avoid 0/0 for all-zero factor vectors
        similarity = (item_factors @ target) / norms

        # Never recommend the input movie itself
        similarity[inner_id] = -np.inf

        # Cap at the number of *other* items so the excluded movie cannot
        # slip back in when top_n exceeds the catalogue size.
        limit = min(max(top_n, 0), len(similarity) - 1)
        best_inner = np.argsort(-similarity, kind='stable')[:limit]
        top_movie_ids = [fitted.to_raw_iid(int(inner)) for inner in best_inner]

        # Map ids back to titles while keeping the ranking order; a plain
        # isin() filter would return them in dataframe order instead.
        rank = {mid: pos for pos, mid in enumerate(top_movie_ids)}
        rows = movies[movies['movieId'].isin(top_movie_ids)]
        order = rows['movieId'].map(rank).to_numpy().argsort(kind='stable')

        return rows.iloc[order]['title']

    # --- 4. Visualization ---

    def visualize_ratings_distribution():
        """
        Show a count plot of how often each rating value occurs in `ratings`.
        """
        _, axes = plt.subplots(figsize=(8, 6))
        sns.countplot(data=ratings, x='rating', ax=axes)

        # Labelling
        axes.set_title('Distribution of Movie Ratings', fontsize=15)
        axes.set_xlabel('Rating')
        axes.set_ylabel('Count')

        # Light horizontal guide lines only
        axes.grid(axis='y', linestyle='--', alpha=0.7)

        plt.show()

    def visualize_recommendations(movie_title, recommendations, method):
        """
        Visualize the recommendations for a given movie.

        Args:
            movie_title: Title of the movie the recommendations were made for.
            recommendations: Either a Series of recommended titles in ranked
                order, or a message string (which is printed instead of plotted).
            method: Name of the recommendation method, used in the plot title.

        Returns:
            None. Shows a horizontal bar chart, or prints a message when there
            is nothing to plot.
        """
        if isinstance(recommendations, str):
            print(recommendations)
            return

        count = len(recommendations)
        if count == 0:
            # Nothing to draw; avoid handing an empty data set to seaborn
            print(f"No {method} recommendations to display for \"{movie_title}\".")
            return

        plt.figure(figsize=(10, 5))
        # Placeholder scores (count, count-1, ..., 1): the bars only encode the
        # rank position, not a real similarity or predicted rating.
        scores = pd.Series(range(count, 0, -1), index=recommendations.index)

        sns.barplot(x=scores.values, y=recommendations.values, palette='viridis')
        # Report the actual number of bars instead of a hard-coded "Top 5"
        plt.title(f'Top {count} {method} Recommendations for "{movie_title}"', fontsize=14)
        plt.xlabel('Recommendation Strength (Illustrative)')
        plt.ylabel('Movie Title')
        plt.show()

    # --- 5. Main Execution ---

    if __name__ == '__main__':
        # Demo run: one fixed movie, both recommenders, then the plots.
        target_movie = 'Forrest Gump (1994)'

        print("Movie Recommendation System\n")
        print(f"Target Movie: {target_movie}\n")

        # Content-based results: a string means "not found", otherwise a Series
        print("--- Content-Based Recommendations ---")
        content_recs = get_content_based_recommendations(target_movie)
        print(
            content_recs
            if isinstance(content_recs, str)
            else content_recs.to_string(index=False)
        )
        print("\n" + "=" * 40 + "\n")

        # Collaborative-filtering results, same string-or-Series convention
        print("--- Collaborative Filtering Recommendations ---")
        collab_recs = get_collaborative_filtering_recommendations(target_movie)
        print(
            collab_recs
            if isinstance(collab_recs, str)
            else collab_recs.to_string(index=False)
        )
        print("\n" + "=" * 40 + "\n")

        # Plots: overall rating histogram first
        print("Displaying visualization for overall movie ratings distribution...")
        visualize_ratings_distribution()

        # ... then one chart per recommendation method
        print(f"Displaying visualization for Content-Based recommendations for '{target_movie}'...")
        visualize_recommendations(target_movie, content_recs, "Content-Based")

        print(f"Displaying visualization for Collaborative Filtering recommendations for '{target_movie}'...")
        visualize_recommendations(target_movie, collab_recs, "Collaborative Filtering")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.12/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.12/dist-package

ImportError: numpy.core.multiarray failed to import (auto-generated because you didn't call 'numpy.import_array()' after cimporting numpy; use '<void>numpy._import_array' to disable if you are certain you don't need it).