In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import ipywidgets as widgets
from IPython.display import display
import matplotlib.pyplot as plt


In [None]:
def clean_title(title):
    """Strip every character that is not an ASCII letter, a digit or a space.

    Args:
        title: The raw title string.

    Returns:
        ``title`` with all other characters (punctuation, brackets,
        accented letters, ...) deleted; spaces are kept as they are.
    """
    # The negated character class matches exactly the characters to drop;
    # replacing each match with the empty string removes it.
    return re.sub(pattern="[^a-zA-Z0-9 ]", repl="", string=title)

In [None]:
# Read the movie catalogue and the user ratings from the working directory.
df = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

# Keep a punctuation-free copy of each title next to the original.
df['clean_title'] = df['title'].apply(clean_title)

# Mean rating per movie, indexed by movieId.
movie_avg_ratings = ratings.groupby('movieId')['rating'].mean()

# Attach the mean to every catalogue row; a movie that has no entry in the
# ratings table maps to NaN, which is replaced by 0.
df['average_rating'] = df['movieId'].map(movie_avg_ratings).fillna(0)

In [None]:
# --- Genre features -------------------------------------------------------
# Turn the pipe-delimited genre string of each movie into a list of genres.
df['genres_list'] = df['genres'].str.split('|')

# Binary indicator matrix: one row per movie, one column per distinct genre.
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(df['genres_list'])

# Cosine similarity between the genre rows.
# NOTE(review): called with a single argument this is presumably a dense
# (n_movies x n_movies) array, which can be very large for a big catalogue;
# confirm the memory cost is acceptable.
genre_similarity = cosine_similarity(genre_matrix)

# --- Title features -------------------------------------------------------
# TF-IDF over the cleaned titles, using single words and two-word phrases.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfdif = vectorizer.fit_transform(df["clean_title"])

# Optional: Combine TF-IDF similarity and genre similarity
def combined_similarity(tfidf_similarity, genre_weight=0.5):
    """Blend a title-similarity array with the global genre similarity.

    Args:
        tfidf_similarity: Title-based similarity scores. Must be
            broadcastable against the module-level ``genre_similarity``.
            NOTE(review): ``search`` produces a flattened 1-D score vector
            while ``genre_similarity`` comes from a single-argument
            ``cosine_similarity`` call; confirm the shapes line up before
            wiring this in.
        genre_weight: Weight given to the genre part; the title part
            receives ``1 - genre_weight``.

    Returns:
        ``(1 - genre_weight) * tfidf_similarity`` plus ``genre_weight``
        times ``genre_similarity`` divided by its own maximum.
    """
    title_weight = 1 - genre_weight
    # Rescale so that the largest genre similarity becomes 1.
    scaled_genre = genre_similarity / genre_similarity.max()
    return title_weight * tfidf_similarity + genre_weight * scaled_genre

In [None]:
def search(title):
    """Return the catalogue rows whose titles best match ``title``.

    The query is cleaned with ``clean_title``, vectorised with the fitted
    ``vectorizer`` and compared against ``tfdif`` by cosine similarity.

    Args:
        title: Free-text movie title typed by the user.

    Returns:
        Up to five rows of ``df`` ordered from most to least similar, so
        ``.iloc[0]`` is the best match. An empty DataFrame with columns
        ``movieId``, ``title`` and ``genres`` is returned when the catalogue
        has no rows. A query that matches nothing still yields rows (all
        with similarity 0), exactly as before.
    """
    if len(df) == 0:
        return pd.DataFrame(columns=["movieId", "title", "genres"])  # Empty DataFrame

    query_vec = vectorizer.transform([clean_title(title)])
    similarity = cosine_similarity(query_vec, tfdif).flatten()

    # Never ask for more rows than exist; argpartition raises otherwise.
    top_k = min(5, similarity.size)
    if top_k == 0:
        return pd.DataFrame(columns=["movieId", "title", "genres"])  # Empty DataFrame

    # argpartition only guarantees that the last top_k positions hold the
    # top_k largest scores; their relative order is undefined, so they are
    # ranked explicitly (stable, highest similarity first).
    candidates = np.argpartition(similarity, -top_k)[-top_k:]
    ranking = np.argsort(-similarity[candidates], kind="stable")
    return df.iloc[candidates[ranking]]

In [None]:
def find_similar_movies(movie_id):
    """Recommend movies favoured by the users who rated ``movie_id`` highly.

    Only ratings strictly above 4 count as "liked". For every movie liked by
    more than 10% of the users who liked ``movie_id``, the share of those
    users who liked it is divided by the share of all users (among those who
    liked any of the candidate movies) who liked it.

    Args:
        movie_id: ``movieId`` of the seed movie.

    Returns:
        DataFrame with columns ``score``, ``title`` and ``genres`` holding at
        most ten movies, highest score first.
    """
    # Every "liked" rating; all later filters work on this subset.
    liked = ratings[ratings["rating"] > 4]

    # Users who liked the seed movie, and everything those users liked.
    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    fan_picks = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = fan_picks.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > .10]

    # How popular the same candidate movies are across all users.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # A score above 1 means the movie is liked more among the similar users
    # than among users in general.
    top_ranked = (
        rec_percentages
        .assign(score=rec_percentages["similar"] / rec_percentages["all"])
        .sort_values("score", ascending=False)
        .head(10)
    )
    merged = top_ranked.merge(df, left_index=True, right_on="movieId")
    return merged[["score", "title", "genres"]]

In [None]:
# Text box the user types a movie title into.
movie_name_input = widgets.Text(value='', description='Movie Title:', disabled=False)

# Output area the callback renders its chart and tables into.
recommendation_list = widgets.Output()

# Visualization Function
def visualize_recommendations(rec_percentages, num_recommendations=5):
    """Visualize the top N recommended movies as a horizontal bar chart.

    Args:
        rec_percentages: DataFrame with a ``title`` and a ``score`` column,
            already ordered best-first.
        num_recommendations: Maximum number of rows to plot.

    Returns:
        None. The chart is shown with ``plt.show()``; nothing is drawn when
        there are no rows to plot.
    """
    top_movies = rec_percentages.head(num_recommendations)  # Get top N movies
    if top_movies.empty:
        return

    # Draw on a dedicated figure rather than pyplot's implicit current axes,
    # so repeated calls never stack bars onto a figure left open earlier.
    fig, ax = plt.subplots()
    ax.barh(top_movies['title'], top_movies['score'], color='skyblue')
    ax.set_xlabel('Recommendation Score')
    ax.set_ylabel('Movie Title')
    # Report the number of bars actually drawn, which can be smaller than
    # num_recommendations.
    ax.set_title(f'Top {len(top_movies)} Recommended Movies')
    ax.invert_yaxis()  # Best match at the top
    plt.show()
    # Release the figure; this function can run on every keystroke.
    plt.close(fig)

# Widget Callback
def on_type(data):
    """Refresh the recommendation panel when the text box value changes.

    Args:
        data: Change notification from the widget; ``data["new"]`` is the
            text currently in the box.

    Everything is rendered inside ``recommendation_list``. Titles of five
    characters or fewer only clear the panel. When the search finds nothing
    a short message is printed; when the best match yields no
    recommendations, the ten highest-rated movies are shown instead.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) <= 5:
            return

        # Search for similar movies
        results = search(title)
        if results.empty:
            # search() may hand back an empty frame; indexing row 0 of it
            # would raise IndexError.
            print("No matching movie found.")
            return

        movie_id = results.iloc[0]["movieId"]
        recommendations = find_similar_movies(movie_id)

        if recommendations.empty:
            # Keep the original row label as an "Index" column.
            fallback = (
                df.nlargest(10, 'average_rating')[["title", "genres"]]
                .reset_index()
                .rename(columns={"index": "Index"})
            )
            print("No personalized recommendations found. Showing highly rated movies instead:")
            display(fallback)
            return

        # Visualize Recommendations
        visualize_recommendations(recommendations)

        # Display Recommendations
        display(recommendations)

# Call on_type whenever the 'value' trait of the text box changes.
movie_name_input.observe(on_type, names='value')

# Render the text box first and the output area right after it.
display(movie_name_input)
display(recommendation_list)