In [15]:
#Import the libraries used throughout the notebook, then load the datasets
import re
import unicodedata

import ipywidgets as widgets
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#Read the movie catalogue and the user ratings, in that order, from the working directory
movies, ratings = (pd.read_csv(csv_name) for csv_name in ("movies.csv", "ratings.csv"))

In [2]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [3]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


# Search Movie by Title

In [4]:
#Cleans movie titles for consistent text matching
def clean_title(title):
    """Normalize a movie title for text matching.

    The title is lower-cased and reduced to the characters ``a-z``, ``0-9``
    and space. Accented letters are folded to their ASCII base letter first
    (``é`` -> ``e``) instead of being deleted, so a query typed without
    accents still lines up with the stored title.

    Parameters
    ----------
    title : str
        Raw title text. Any non-string value (for example the NaN pandas
        uses for a missing cell) is treated as having no text.

    Returns
    -------
    str
        The normalized title; ``""`` when ``title`` is not a string.
    """
    if not isinstance(title, str):
        return ""

    #NFKD splits an accented letter into base letter + combining mark; the
    #ASCII round-trip then drops the mark and keeps the base letter
    folded = unicodedata.normalize("NFKD", title)
    folded = folded.encode("ascii", "ignore").decode("ascii")

    return re.sub("[^a-z0-9 ]", "", folded.lower())

In [5]:
#Store a normalized copy of every title in a new "clean_title" column
movies["clean_title"] = movies["title"].map(clean_title)

In [6]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,grumpier old men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,waiting to exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,father of the bride part ii 1995
...,...,...,...,...
62418,209157,We (2018),Drama,we 2018
62419,209159,Window of the Soul (2001),Documentary,window of the soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,bad poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),a girl thing 2001


In [7]:
#TF-IDF model over single words and adjacent word pairs (ngram_range 1..2)
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

#Learn the vocabulary from the cleaned titles and encode each title as one row of the matrix
tfidf = vectorizer.fit_transform(movies.loc[:, "clean_title"])

In [8]:
#This cell searches for the closest matching movies by sorting cosine similarity scores.
#If there isn't a strong match, it returns an empty result (so the UI can show "not found").
def search(title, top_n=5, min_similarity=0.15):
    """Return the movies whose titles are most similar to ``title``.

    The query is normalized with ``clean_title``, encoded with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text title typed by the user.
    top_n : int, default 5
        Maximum number of rows to return. Values below 1 yield no rows.
    min_similarity : float, default 0.15
        Minimum cosine similarity a row needs to be returned.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies`` plus a ``similarity`` column,
        best match first. The frame has zero rows (same columns) when no
        title reaches ``min_similarity``.
    """
    query_vec = vectorizer.transform([clean_title(title)])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    #Clamp so a negative top_n cannot become an "all but the last k" slice
    limit = max(int(top_n), 0)

    #Stable sort on the negated scores: best first, ties kept in dataset order
    top_indices = np.argsort(-similarity, kind="stable")[:limit]
    top_scores = similarity[top_indices]

    #If the best match is too weak, treat it as "not found"
    if len(top_scores) == 0 or top_scores[0] < min_similarity:
        empty = movies.iloc[0:0].copy()
        empty["similarity"] = np.array([], dtype=float)
        return empty

    #Drop padding rows below the threshold; when fewer than top_n titles match,
    #the remaining slots would otherwise be filled with unrelated movies
    strong = top_scores >= min_similarity
    results = movies.iloc[top_indices[strong]].copy()
    results["similarity"] = top_scores[strong]
    return results

In [9]:
#Generates personalized movie recommendations using collaborative filtering
def find_similar_movies(movie_id, top_n=10, min_rating=4, min_share=0.10):
    """Recommend movies liked by the users who liked ``movie_id``.

    A rating counts as "liked" when it is strictly greater than
    ``min_rating``. Each candidate movie is scored as

        share of similar users who liked it / share of all users who liked it

    where "similar users" are those who liked ``movie_id`` and "all users"
    are the users who liked at least one candidate movie.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    top_n : int, default 10
        Number of recommendations to return.
    min_rating : float, default 4
        A rating must exceed this value to count as a like.
    min_share : float, default 0.10
        A candidate must be liked by more than this fraction of the
        similar users to be kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, highest score first.
        Zero rows when no user liked ``movie_id``.
    """
    #Every step only looks at liked ratings, so filter the table once
    liked = ratings[ratings["rating"] > min_rating]

    #Identify users who rated the target movie highly
    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        #Nobody liked this movie: nothing to base a recommendation on
        return pd.DataFrame(columns=["score", "title", "genres"])

    #Share of the similar users who liked each movie
    similar_user_recs = (
        liked.loc[liked["userId"].isin(similar_users), "movieId"].value_counts()
        / len(similar_users)
    )

    #Filter out weak recommendations
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    #Likes from all users for the candidate movies
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]

    #Share of those users who liked each candidate movie
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    #Line both shares up by movieId
    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    #A score above 1 means similar users like the movie more than users in general
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    #Attach title and genres for the best-scoring movies
    return rec_percentages.head(top_n).merge(
        movies, left_index=True, right_on="movieId"
    )[["score", "title", "genres"]]

In [10]:
#Creates an interactive text input to preview movie title search results
movie_input = widgets.Text(
    value="Toy Story",
    description="Movie Title:",
    disabled=False
)

#Output area to display matching movie titles
movie_list = widgets.Output()

#Named on_search_type rather than on_type because the recommendation cell
#defines its own on_type, which would silently shadow this one
def on_search_type(data):
    """Refresh the search preview whenever the text box value changes.

    ``data`` is the change dictionary passed by the widget; its ``"new"``
    entry holds the current text. Titles of 5 characters or fewer only
    clear the preview.
    """
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) <= 5:
            return

        matches = search(title)
        if matches.empty:
            #search() returns an empty frame for "no close match"; say so
            #instead of rendering a table with no rows
            print("Movie not found in dataset (or no close match). Try adding the year.")
            return

        display(matches)

#Attach the event handler to the text input
movie_input.observe(on_search_type, names='value')

#Display the search widget and results
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [11]:
#Auto-selects the best matching movie title and displays recommendations
#Output area that will hold the recommendation table
recommendation_list = widgets.Output()

#Text box for the movie title the recommendations are based on
movie_name_input = widgets.Text(
    description="Movie Title:",
    value="Toy Story",
    disabled=False
)

def pick_best_movie_id(search_results):
    """Return the ``movieId`` of the search result with the most ratings.

    ``search_results`` must have a ``movieId`` column. When none of its
    movies appears in ``ratings``, the id of the first row is returned.
    """
    ids = search_results["movieId"].astype(int).tolist()

    #Count the rating rows that belong to each candidate movie
    is_candidate = ratings["movieId"].isin(ids)
    counts = ratings.loc[is_candidate, "movieId"].value_counts()

    if not counts.empty:
        return int(counts.idxmax())

    #No candidate has any rating: fall back to the top search result
    return int(search_results.iloc[0]["movieId"])

def on_type(change):
    """Rebuild the recommendation table whenever the typed title changes.

    ``change["new"]`` is the current text. Text of one character or less
    only clears the output; otherwise the title is searched, the most-rated
    match is picked and its recommendations are displayed.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        title = change["new"]

        #Too short to search on
        if len(title) <= 1:
            return

        results = search(title)
        if not results.empty:
            display(find_similar_movies(pick_best_movie_id(results)))
        else:
            print("Movie not found in dataset (or no close match). Try adding the year.")

#Call on_type every time the text box's value changes
movie_name_input.observe(handler=on_type, names="value")

#Render the text box with its output area underneath
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()

# Search Movie by Genre 

In [12]:
#This cell recommends top movies in a given genre using average ratings. 
def top_movies_by_genre(genre, min_ratings=200, top_n=10):
    """Rank the movies of one genre by their average rating.

    Parameters
    ----------
    genre : str
        Genre name. Matched case-insensitively as a literal substring of
        the ``genres`` column, so characters such as ``(``, ``+`` or ``.``
        have no special meaning.
    min_ratings : int, default 200
        A movie needs at least this many ratings to be ranked.
    top_n : int, default 10
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``genres``, ``avg_rating`` and ``num_ratings``,
        sorted by average rating, then rating count, both descending.
        Zero rows when nothing qualifies.
    """
    output_columns = ["title", "genres", "avg_rating", "num_ratings"]
    genre = genre.strip().lower()

    #regex=False: the genre is plain text, not a pattern
    in_genre = movies["genres"].str.lower().str.contains(genre, regex=False, na=False)
    genre_movies = movies.loc[in_genre, ["movieId", "title", "genres"]]
    if genre_movies.empty:
        return pd.DataFrame(columns=output_columns)

    #Aggregate on the numeric id first, then attach the text columns to the
    #small per-movie table instead of joining them onto every rating row
    genre_ratings = ratings.loc[
        ratings["movieId"].isin(genre_movies["movieId"]), ["movieId", "rating"]
    ]
    stats = (
        genre_ratings.groupby("movieId")["rating"]
        .agg(avg_rating="mean", num_ratings="count")
        .reset_index()
    )
    summary = stats.merge(genre_movies, on="movieId", how="inner")

    #Filter for reliability and rank
    summary = summary[summary["num_ratings"] >= min_ratings]
    summary = summary.sort_values(["avg_rating", "num_ratings"], ascending=False)

    return summary.head(top_n)[output_columns]

In [13]:
#Creates a dropdown widget to browse top movies by genre
#Alphabetical list of every distinct genre in the "|"-separated genres column
all_genres = sorted(movies["genres"].dropna().str.split("|").explode().unique())

#Output area for displaying genre-based recommendations
genre_output = widgets.Output()

#Dropdown over all genres; the first one is preselected when the list is not empty
genre_dropdown = widgets.Dropdown(
    description="Genre:",
    options=all_genres,
    value=all_genres[0] if all_genres else None,
    disabled=False
)

def on_genre_change(change):
    """Show the top-rated movies for the newly selected genre.

    ``change["new"]`` is the selected genre, or ``None`` when the dropdown
    has nothing to select. The previous output is cleared before anything
    new is printed or displayed.
    """
    with genre_output:
        genre_output.clear_output()
        selected = change["new"]

        if selected is None:
            print("No genres available in the dataset.")
        else:
            ranked = top_movies_by_genre(selected, min_ratings=200, top_n=10)
            if ranked.empty:
                print("Not enough ratings to rank movies reliably for this genre.")
            else:
                display(ranked)

#Call on_genre_change every time the dropdown's value changes
genre_dropdown.observe(handler=on_genre_change, names="value")

#Render the dropdown with its output area underneath
display(genre_dropdown, genre_output)

Dropdown(description='Genre:', options=('(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children', …

Output()

# Exploring and Editing Datasets 

In [14]:
#Experiment: step-by-step recommendation logic for Toy Story (1995)
movie_id = 1

#Users who gave the target movie more than 4 stars
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] > 4), "userId"
].unique()

#Every movie those same users also rated above 4 stars
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4), "movieId"
]

#Share of the similar users who liked each movie
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

#Keep only movies liked by more than 10% of the similar users
similar_user_recs = similar_user_recs[similar_user_recs > .10]

#Ratings above 4 stars, from any user, for the remaining candidate movies
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] > 4)
]

#Share of those users who liked each candidate movie
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

#Put both shares side by side, aligned on movieId
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

#Score = similar-user share / overall share, ranked from highest to lowest
rec_percentages = rec_percentages.assign(
    score=lambda shares: shares["similar"] / shares["all"]
).sort_values("score", ascending=False)

#Show the ten best-scoring movies together with their catalogue columns
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")


Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.124728,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story 1995
3021,0.280648,0.053706,5.225654,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,toy story 2 1999
2264,0.110539,0.025091,4.405452,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,bugs life a 1998
14813,0.15296,0.035131,4.354038,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,toy story 3 2010
4780,0.235147,0.070811,3.320783,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,monsters inc 2001
580,0.216618,0.067513,3.208539,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,aladdin 1992
6258,0.228139,0.072268,3.156862,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,finding nemo 2003
587,0.1794,0.059977,2.99115,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,beauty and the beast 1991
8246,0.203504,0.068453,2.972889,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,incredibles the 2004
359,0.253411,0.085764,2.954762,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,lion king the 1994
