In [97]:
import pandas as pd 

# Load the movie catalogue; the preview below shows movieId, title and genres columns.
movies = pd.read_csv('movies.csv')

In [98]:
# Display the loaded catalogue (pandas truncates the middle rows of long frames).
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [99]:
# Replace missing titles with an empty string so every entry is a str.
movies['title'] = movies['title'].fillna(value='')

Filling the missing titles with empty strings fixes an error that `TfidfVectorizer().fit_transform()` raises when the data contains NaN values. The vectorizer expects every entry to be a byte or unicode string, but pandas represents missing data as NaN, which is a float rather than a string.


In [100]:
import re

def clean_title(title):
    """Return `title` with every character removed except ASCII letters, digits and spaces."""
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [101]:
# Store a punctuation-free copy of every title next to the original.
movies["clean_title"] = movies["title"].map(clean_title)

In [102]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, with English stop words dropped.
vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words='english')

# Fit on the cleaned titles, not the raw ones: search() runs every query through
# clean_title() before vectorizer.transform(), so the index has to be built from
# text normalised the same way or punctuated titles tokenize differently from
# the queries that are meant to find them.
tfidf = vectorizer.fit_transform(movies['clean_title'])

In [122]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    """Return the (up to) five catalogue rows whose titles best match `title`.

    The query is normalised with clean_title(), projected with the fitted
    `vectorizer`, and compared against every row of `tfidf` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.

    Returns
    -------
    pandas.DataFrame
        Rows of `movies`, ordered from most to least similar. Callers take
        `.iloc[0]` as the best match.
    """
    title = clean_title(title)
    query_vector = vectorizer.transform([title])
    similarity = cosine_similarity(query_vector, tfidf).flatten()

    # Never ask for more rows than are indexed (argpartition would raise).
    top_n = min(5, similarity.size)
    if top_n == 0:
        return movies.iloc[[]]

    # argpartition only guarantees *which* entries are the top_n largest, not
    # their relative order, so rank the candidates explicitly, best first.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]

    return movies.iloc[ranked]

In [104]:
import ipywidgets as widgets
from IPython.display import display

# Text box the user types a title into, plus an output area for the matches.
movie_input = widgets.Text(description='Movie Title:', value='', disabled=False)
movie_list = widgets.Output()

def on_type(data):
    """Refresh `movie_list` with search results for the newly typed text.

    `data` is the change notification passed by the widget; only its "new"
    entry (the current text) is read. Text of two characters or fewer clears
    the output without running a search.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 2:
            return
        display(search(query))
# Call on_type every time the text box's `value` changes.
movie_input.observe(handler=on_type, names='value')

# Render the text box followed by the results area.
display(movie_input, movie_list)

Text(value='', description='Movie Title:')

Output()

In [105]:
# Load the ratings table; later cells read its userId, movieId and rating columns.
ratings = pd.read_csv("ratings.csv")

In [106]:
# Target movie for the step-by-step walkthrough below (2021 is "Dune (1984)"
# in the results table further down).
movie_id = 2021
movie = movies.loc[movies["movieId"] == movie_id]

In [107]:
# Users who rated the target movie above 4.
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] > 4), "userId"
].unique()

# Every movie those same users rated above 4 (one entry per rating).
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4), "movieId"
]

In [108]:
# Fraction of the similar users who liked each movie.
similar_user_recs = similar_user_recs.value_counts().div(len(similar_users))

# Keep only movies liked by more than 10% of the similar users.
similar_user_recs = similar_user_recs.loc[similar_user_recs.gt(.10)]

In [109]:
# Ratings above 4, from any user, for the candidate movies kept above.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] > 4)
]

In [110]:
# Fraction of those users who liked each candidate movie.
all_user_recs = all_users["movieId"].value_counts().div(all_users["userId"].unique().size)

In [111]:
# Put the two per-movie fractions side by side, aligned on movieId.
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis="columns")
rec_percentages.columns = pd.Index(["similar", "all"])

In [112]:
# Display the "similar" and "all" fractions for each candidate movie.
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
2021,1.000000,0.010315
260,0.625712,0.219211
2571,0.609880,0.240743
541,0.582647,0.113450
1196,0.578214,0.185494
...,...,...
4262,0.103230,0.035957
1234,0.102597,0.043594
1262,0.101963,0.029189
5349,0.101330,0.032762


In [113]:
# Score: how much more often the similar users like a movie than users in general.
rec_percentages["score"] = rec_percentages["similar"].div(rec_percentages["all"])

In [114]:
# Highest score first.
rec_percentages = rec_percentages.sort_values(by="score", ascending=False)

In [115]:
# Attach titles and genres to the ten best-scoring movies.
pd.merge(rec_percentages.head(10), movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
1932,1.0,0.010315,96.942369,2021,Dune (1984),Adventure|Sci-Fi,Dune 1984
2015,0.181761,0.009087,20.001768,2105,Tron (1982),Action|Adventure|Sci-Fi,Tron 1982
1530,0.103863,0.006807,15.257724,1587,Conan the Barbarian (1982),Action|Adventure|Fantasy,Conan the Barbarian 1982
2103,0.149462,0.01025,14.581516,2193,Willow (1988),Action|Adventure|Fantasy,Willow 1988
2050,0.172261,0.011818,14.576188,2140,"Dark Crystal, The (1982)",Adventure|Fantasy,Dark Crystal The 1982
2780,0.10513,0.008035,13.083279,2872,Excalibur (1981),Adventure|Fantasy,Excalibur 1981
2876,0.138062,0.011021,12.52723,2968,Time Bandits (1981),Adventure|Comedy|Fantasy|Sci-Fi,Time Bandits 1981
3601,0.129829,0.011041,11.759281,3702,Mad Max (1979),Action|Adventure|Sci-Fi,Mad Max 1979
1242,0.169728,0.016509,10.281185,1275,Highlander (1986),Action|Adventure|Fantasy,Highlander 1986
2071,0.131096,0.013582,9.65227,2161,"NeverEnding Story, The (1984)",Adventure|Children|Fantasy,NeverEnding Story The 1984


In [116]:
def find_similar_movies(movie_id, min_rating=4, min_share=.10, top_n=20):
    """Recommend movies that fans of `movie_id` like unusually often.

    "Fans" are users who rated `movie_id` above `min_rating`. A candidate's
    score is the fraction of fans who also rated it above `min_rating`,
    divided by the fraction of all users (among those who liked any
    candidate) who rated it above `min_rating`.

    Parameters
    ----------
    movie_id : int
        `movieId` of the movie to find recommendations for.
    min_rating : float, default 4
        A rating counts as a "like" only if it is strictly greater than this.
    min_share : float, default 0.10
        A movie is a candidate only if strictly more than this fraction of
        the fans liked it.
    top_n : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns "score", "title" and "genres", highest score first.
    """
    # Every step only looks at "liked" ratings, so filter once up front.
    liked = ratings[ratings["rating"] > min_rating]

    # Users who liked the target movie.
    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()

    # Fraction of those users who liked each other movie, thresholded.
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Fraction of all users (who liked any candidate) who liked each candidate.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # A high ratio means the movie is specifically popular among the fans,
    # not just popular with everyone.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [117]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the movie title, plus an output area for the recommendations.
movie_name_input = widgets.Text(description='Movie Title:', value='', disabled=False)

recommendation_list = widgets.Output()


def on_type(data):
    """Refresh `recommendation_list` for the newly typed title.

    Reads the current text from data["new"]. For text longer than two
    characters, the first row returned by search() is taken as the match and
    its recommendations are displayed. Returns `recommendation_list`.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        if len(query) > 2:
            best_match = search(query).iloc[0]
            display(find_similar_movies(best_match["movieId"]))
        return recommendation_list
# Call on_type every time the text box's `value` changes.
movie_name_input.observe(handler=on_type, names='value')

# Render the text box followed by the recommendations area.
display(movie_name_input, recommendation_list)

Text(value='', description='Movie Title:')

Output()

In [124]:
import customtkinter as ctk
import tkinter as tk
from tkinter import messagebox
from PIL import Image, ImageTk

def find_movies(event=None):
    """Key-release handler: list recommendations for the title in `movie_box`.

    Clears `recommendation_box`, then, for text longer than two characters,
    takes the first search() result as the match and writes one line per
    recommended movie. Any exception is shown in an error dialog.
    `event` is accepted for use as a Tk binding and is not read.
    """
    query = movie_box.get()
    recommendation_box.delete('1.0', ctk.END)
    try:
        if len(query) <= 2:
            return

        best_match_id = search(query).iloc[0]["movieId"]

        for _, row in find_similar_movies(best_match_id).iterrows():
            genre_text = ', '.join(row['genres'].split('|'))
            recommendation_box.insert(
                ctk.END,
                f"Title: {row['title']} | Rating: {row['score']} | Genres: {genre_text}\n\n")
    except Exception as ex:
        messagebox.showerror("Error", str(ex))


# Global look of the customtkinter widgets.
ctk.set_appearance_mode("Dark")
ctk.set_default_color_theme("blue")

# Main window.
app = ctk.CTk()
app.geometry("1920x1080")
app.title("Movie Recommendation System")

# Background image stretched over the whole window. bg_image stays bound to a
# module-level name so the image is not garbage-collected while displayed.
bg_image = tk.PhotoImage(file="dune_sardaukar.png")

bg_label = tk.Label(app, image=bg_image)
bg_label.place(x=0, y=0, relwidth=1, relheight=1)

title = ctk.CTkLabel(app, text="Enter a Movie Title", font=("Arial", 32), width=100, height=50)
title.pack(padx=10, pady=10)

# Entry for the movie title; recommendations refresh on every key release.
movie_box = ctk.CTkEntry(app, width=920, height=40, corner_radius=30, border_width=1, font=("Arial", 16))
movie_box.pack(padx=10, pady=10)
movie_box.bind('<KeyRelease>', find_movies)

# Text area that find_movies() fills with the recommendations.
recommendation_box = ctk.CTkTextbox(app, width=920, height= 480, corner_radius=30, border_width=1)
recommendation_box.pack(padx=10, pady=10)

# Maximise the window. The 'zoomed' state is only accepted on Windows and
# macOS; on X11 Tk raises TclError and the equivalent is the -zoomed attribute.
try:
    app.state('zoomed')
except tk.TclError:
    app.attributes('-zoomed', True)
app.mainloop()



In [119]:
# dune_pic = ctk.CTkImage(light_image=Image.open("dune_sardaukar.png"), dark_image=Image.open("dune_sardaukar.png"), size=(920, 480))

# dune_label = ctk.CTkLabel(app, text="", image=dune_pic)
# dune_label.pack(padx=10, pady=10)