In [1]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357277 sha256=6f4a810ed4f1e026abba6ff87f916afe30a112bfb7fb43de417ee623c9243eaf
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Install

In [2]:
# Fetch the MovieLens "latest-small" archive; -nc (no-clobber) skips the
# download when ml-latest-small.zip is already present in the working directory.
!wget -nc http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
# Unpack into ./ml-latest-small/; -o overwrites files from an earlier
# extraction without prompting, so re-running the cell never blocks.
!unzip -o ml-latest-small.zip

--2024-09-30 22:42:53--  http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2024-09-30 22:42:55 (957 KB/s) - ‘ml-latest-small.zip’ saved [978202/978202]

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [3]:
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import difflib

# Read the MovieLens catalogue and rating tables, then attach the movie
# metadata to every rating row via an inner join on movieId.
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')
data = ratings.merge(movies, how='inner', on='movieId')

In [4]:
# Seed shared by the split and the model so that Restart & Run All reproduces
# the same train/test partition and the same learned factors.
RANDOM_SEED = 42

# prep the dataset: Surprise needs (user, item, rating) triples plus the rating scale
reader = Reader(rating_scale=(0.5, 5.0))
surprise_data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# splits the data set into training (75%) and test (25%)
trainset, testset = train_test_split(surprise_data, test_size=0.25, random_state=RANDOM_SEED)

# trains the matrix-factorisation model on the training portion
svd = SVD(random_state=RANDOM_SEED)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7868180991e0>

In [5]:
# this function rebuilds the global Surprise dataset from the current ratings table (the model itself is not retrained here)
def rebuild_surprise_data():
    """Regenerate the global ``surprise_data`` from the current ``ratings`` frame.

    Only the ``userId``, ``movieId`` and ``rating`` columns are handed to
    Surprise, using a 0.5-5.0 rating scale. The result replaces the
    module-level ``surprise_data``; nothing is returned and no model is fit.
    """
    global surprise_data

    rating_columns = ['userId', 'movieId', 'rating']
    scale_reader = Reader(rating_scale=(0.5, 5.0))
    surprise_data = Dataset.load_from_df(ratings[rating_columns], scale_reader)

In [6]:
# function to search for a movie by title and help fix typos
def search_movie(title):
    """Find movies whose title contains ``title``, with fuzzy suggestions as fallback.

    The query is matched as a literal, case-insensitive substring of the
    ``title`` column of the global ``movies`` frame. When nothing matches,
    ``difflib.get_close_matches`` proposes up to five similar titles, which
    are also printed as a numbered list.

    Args:
        title: Text typed by the user. Surrounding whitespace is ignored.

    Returns:
        A ``(matches, suggestions)`` tuple:
          * substring hits found -> (DataFrame with ``movieId``/``title``, [])
          * only fuzzy matches   -> (None, list of suggested titles)
          * nothing at all, or a blank query -> (None, [])
    """
    title = title.strip()
    if not title:
        # An empty pattern is contained in every title; treat it as "no query"
        # instead of dumping the whole catalogue on the user.
        return None, []

    # regex=False: user input such as "Batman (" or "M*A*S*H" must be taken
    # literally; as a regular expression it would raise or match the wrong rows.
    title_hits = movies['title'].str.contains(title, case=False, na=False, regex=False)
    matching_movies = movies[title_hits]

    if matching_movies.empty:
        close_matches = difflib.get_close_matches(title, movies['title'], n=5, cutoff=0.6)
        if close_matches:
            print("\nDid you mean one of these?")
            for i, match in enumerate(close_matches, 1):
                print(f"{i}. {match}")
            return None, close_matches
        return None, []

    return matching_movies[['movieId', 'title']], []

# this function keeps asking for a movie title until the user enters a valid one
def search_and_select_movie():
    """Prompt for a title until a search yields at least one movie.

    Each round reads a query with ``input`` and hands it to ``search_movie``.
    Direct hits are printed as a table and returned. When only suggestions
    come back, the user may pick one by its 1-based number, in which case the
    row(s) of ``movies`` with exactly that title are returned. Any other
    outcome prints a retry notice and starts the next round.

    Returns:
        DataFrame with ``movieId`` and ``title`` columns.
    """
    while True:
        query = input("Enter part of the movie title to search for: ")
        hits, suggestions = search_movie(query)

        # Direct substring hits: show them and finish.
        if hits is not None:
            print("\nMovies found:")
            print(hits.to_string(index=False))
            return hits

        if suggestions:
            answer = input(f"Enter the number of the movie if you meant one of them, or 'no' to try again: ")
            answer = answer.strip().lower()
            # Non-numeric replies map to -1, which can never be a valid position.
            choice = int(answer) - 1 if answer.isdigit() else -1
            if 0 <= choice < len(suggestions):
                chosen_title = suggestions[choice]
                same_title = movies['title'] == chosen_title
                return movies.loc[same_title, ['movieId', 'title']]

        print("\nNo valid movie found. Please try again.")

In [7]:
# function to collect ratings from the user
def _prompt_movie_ids(title_by_id):
    """Ask for comma-separated movie IDs until every entry is a known movieId."""
    while True:
        raw = input("Enter the movie ID(s) you want to add to your rating list (comma separated): ")
        try:
            movie_ids = [int(part.strip()) for part in raw.split(',')]
        except ValueError:
            print("Movie IDs must be whole numbers separated by commas. Please try again.")
            continue

        unknown_ids = [movie_id for movie_id in movie_ids if movie_id not in title_by_id]
        if unknown_ids:
            print(f"Unknown movie ID(s): {unknown_ids}. Please try again.")
            continue
        return movie_ids


def _prompt_ratings(movie_titles):
    """Ask for one rating per title until the reply is complete and in range."""
    while True:
        print("Please enter ratings between 0.5 and 5.0, in increments of 0.5.")
        raw = input(f"Enter your ratings for the following movies, in the same order (comma separated): {movie_titles}: ")
        try:
            values = [float(part.strip()) for part in raw.split(',')]
        except ValueError:
            print("Ratings must be numbers separated by commas. Please try again.")
            continue

        if len(values) != len(movie_titles):
            print(f"Expected {len(movie_titles)} rating(s) but got {len(values)}. Please try again.")
            continue
        # (value * 2).is_integer() enforces the half-star increments; the range
        # comparison also rejects nan and inf.
        if not all(0.5 <= value <= 5.0 and (value * 2).is_integer() for value in values):
            print("Each rating must be between 0.5 and 5.0 in steps of 0.5. Please try again.")
            continue
        return values


# function to collect ratings from the user
def collect_ratings(user_id):
    """Interactively collect movie ratings and append them to the global ``ratings``.

    Every round runs a title search (so the user can read off movie IDs),
    asks for the IDs to rate and then for one rating per selected movie.
    The new rows are appended to ``ratings`` with ``timestamp`` 0. Rounds
    repeat until the user answers "no" (or "n") to the final question.

    Invalid replies (non-numeric or unknown IDs, a wrong number of ratings,
    out-of-range ratings) are reported and asked again instead of raising.

    Args:
        user_id: Identifier stored in the ``userId`` column of the new rows.

    Returns:
        None. The module-level ``ratings`` frame is replaced by the extended one.
    """
    global ratings

    # One lookup table for validating IDs and resolving their titles.
    title_by_id = dict(zip(movies['movieId'], movies['title']))

    while True:
        # Shows the matching movies; the selection itself is made by ID below.
        search_and_select_movie()

        # These lists are rebuilt every round. Carrying them over would
        # re-submit earlier movies and break the ids/ratings length match.
        user_movie_ids = _prompt_movie_ids(title_by_id)
        user_movie_titles = [title_by_id[movie_id] for movie_id in user_movie_ids]
        new_ratings = _prompt_ratings(user_movie_titles)

        new_ratings_df = pd.DataFrame({
            'userId': [user_id] * len(user_movie_ids),
            'movieId': user_movie_ids,
            'rating': new_ratings,
            'timestamp': [0] * len(user_movie_ids)
        })
        ratings = pd.concat([ratings, new_ratings_df], ignore_index=True)

        print("\nYour ratings have been successfully added:")
        print(ratings.tail())

        more_ratings = input("Would you like to rate another movie? (yes/no): ").strip().lower()
        if more_ratings in ('no', 'n'):
            break

In [8]:
# function to recommend movies based on the genres of movies the user has already rated highly
def get_content_based_recommendations(user_id, n=5):
    """Recommend unseen movies whose genres resemble the user's favourites.

    Movies the user rated 4.0 or higher are looked up in the global
    ``movies`` frame. Each genre string is turned into a TF-IDF vector, and
    every catalogue movie is scored by its mean cosine similarity to those
    favourites. The ``n`` best-scoring movies that the user has not rated
    yet are printed as a numbered list.

    Args:
        user_id: Value to look up in the ``userId`` column of ``ratings``.
        n: Maximum number of titles to recommend.

    Returns:
        List of recommended titles, best first. The list is empty when the
        user has fewer than five ratings of 4.0 or above, or when none of
        those movies exists in ``movies``.
    """
    user_ratings = ratings[ratings['userId'] == user_id]
    user_highly_rated_movies = user_ratings[user_ratings['rating'] >= 4.0]

    if len(user_highly_rated_movies) < 5:
        print(f"User '{user_id}' needs to rate at least 5 movies to receive content-based recommendations.")
        return []
    print("\nContent-Based Filtering: These recommendations are based on the genres of movies you rated highly. We recommend movies with similar content features (genres).")

    # Boolean masks over the catalogue rows. Positions (not index labels) are
    # used below because they must line up with the rows of the TF-IDF matrix.
    liked_mask = movies['movieId'].isin(user_highly_rated_movies['movieId']).to_numpy()
    rated_mask = movies['movieId'].isin(user_ratings['movieId']).to_numpy()
    liked_positions = liked_mask.nonzero()[0]
    if len(liked_positions) == 0:
        print("None of your highly rated movies are in the catalogue; no content-based recommendations available.")
        return []

    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(movies['genres'])

    # Compare only the liked rows against the catalogue. The full N x N
    # similarity matrix is never needed and is very large for this dataset.
    similarity_scores = cosine_similarity(tfidf_matrix[liked_positions], tfidf_matrix).mean(axis=0)

    # Skip everything the user has already rated. Otherwise their own
    # favourites, which are maximally similar to themselves, fill the top slots.
    candidate_positions = [pos for pos in range(len(similarity_scores)) if not rated_mask[pos]]
    candidate_positions.sort(key=lambda pos: similarity_scores[pos], reverse=True)

    recommended_movies = movies['title'].iloc[candidate_positions[:n]].tolist()

    # Display the recommendations
    print("\nContent-based recommendations based on genres:")
    for i, movie in enumerate(recommended_movies, 1):
        print(f"{i}. {movie}")

    return recommended_movies

In [9]:
# function to get recommendations based on what other similar users liked
def get_top_n_recommendations(user_id, n=5):
    """Print the ``n`` unrated movies with the highest predicted rating for a user.

    Every movie id present in the global ``ratings`` frame that the user has
    not rated is scored with ``svd.predict(user_id, movie_id).est``. The best
    ``n`` are printed as a numbered list, highest predicted rating first.

    Args:
        user_id: Value to look up in the ``userId`` column of ``ratings``.
        n: Maximum number of titles to print.

    Returns:
        None. A message is printed, and nothing is recommended, when the
        user has no ratings or fewer than five.
    """
    user_rated_movies = ratings.loc[ratings['userId'] == user_id, 'movieId']

    if user_rated_movies.empty:
        print(f"User '{user_id}' does not exist. Please provide ratings first.")
        return

    if len(user_rated_movies) < 5:
        print(f"User '{user_id}' needs to rate at least 5 movies to receive personalized recommendations.")
        return

    print("\nCollaborative Filtering: These recommendations are based on the preferences of users similar to you, who liked movies you rated highly.")

    # A set makes the "already rated?" check constant-time per candidate.
    already_rated = set(user_rated_movies)
    predictions = [
        (movie_id, svd.predict(user_id, movie_id).est)
        for movie_id in ratings['movieId'].unique()
        if movie_id not in already_rated
    ]
    predictions.sort(key=lambda x: x[1], reverse=True)

    top_n_movie_ids = [movie_id for movie_id, _ in predictions[:n]]

    # Resolve titles one id at a time so the list keeps the ranking order.
    # An isin() filter would return rows in catalogue order instead. Ids
    # without a catalogue entry are skipped.
    title_by_id = dict(zip(movies['movieId'], movies['title']))
    recommended_movies = [title_by_id[movie_id] for movie_id in top_n_movie_ids if movie_id in title_by_id]

    print(f"\nCollaborative filtering recommendations for '{user_id}':")
    for i, movie in enumerate(recommended_movies, 1):
        print(f"{i}. {movie}")

In [10]:
# main function to run the system
def main():
    """Run one interactive session: collect ratings, retrain, print recommendations.

    Reads a user id from stdin, gathers that user's ratings through
    ``collect_ratings``, rebuilds the Surprise dataset, re-fits the global
    ``svd`` model and finally prints collaborative-filtering and
    content-based recommendations for the user.
    """
    # ask the user to enter their name as a user id
    user_id = input("Enter your user ID (name): ")

    print("\nYou must rate at least 5 movies.")
    collect_ratings(user_id)

    # Retrain on every available rating. A random train/test split here would
    # withhold a quarter of the data, possibly including the few ratings the
    # new user just entered, from a model that is never evaluated on the rest.
    rebuild_surprise_data()
    svd.fit(surprise_data.build_full_trainset())

    get_top_n_recommendations(user_id, n=5)
    get_content_based_recommendations(user_id, n=5)

# Start the interactive session; this cell blocks on input() prompts until
# the user has finished entering ratings.
main()

Enter your user ID (name): John Doe

You must rate at least 5 movies.
Enter part of the movie title to search for: batman

Movies found:
 movieId                                           title
     153                           Batman Forever (1995)
     592                                   Batman (1989)
    1377                           Batman Returns (1992)
    1562                           Batman & Robin (1997)
    3213             Batman: Mask of the Phantasm (1993)
   26152                                   Batman (1966)
   27155               Batman/Superman Movie, The (1998)
   27311       Batman Beyond: Return of the Joker (2000)
   33794                            Batman Begins (2005)
   60979                    Batman: Gotham Knight (2008)
   79274               Batman: Under the Red Hood (2010)
   90603                         Batman: Year One (2011)
   95149          Superman/Batman: Public Enemies (2009)
   98124  Batman: The Dark Knight Returns, Part 1 (2012)
   99813