# Worksheet for Movie recommendation using KNN Algorithm

In [22]:
# Import libraries
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

# Load MovieLens dataset
# Replace the path with the path to your downloaded data
ratings = pd.read_csv('interview/ml-latest-small/ratings.csv')
movies = pd.read_csv('interview/ml-latest-small/movies.csv')

# Display the first few rows of the ratings dataset
print(ratings.head())
print(movies.head())


   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


## Visualising the movie dataset

In [23]:
# Calculate the average rating and the number of ratings for each movie
rating_stats = ratings.groupby('movieId').agg({'rating': ['mean', 'count']}).reset_index()
rating_stats.columns = ['movieId', 'average_rating', 'num_ratings']

# Merge with the movies DataFrame to get movie titles and genres
movie_data = pd.merge(rating_stats, movies, on='movieId')

# Prepare the data for visualization
movie_data['genres'] = movie_data['genres'].str.split('|')  # Split genres into lists
movie_data_exploded = movie_data.explode('genres')  # Create a separate row for each genre

# Create a scatter plot
fig = px.scatter(movie_data_exploded, 
                 x='average_rating', 
                 y='num_ratings', 
                 color='genres', 
                 hover_name='title',
                 title='Movie Ratings by Genre',
                 labels={'average_rating': 'Average Rating', 'num_ratings': 'Number of Ratings'},
                 opacity=0.6)

# Update layout for better visuals
fig.update_layout(
    xaxis_title='Average Rating',
    yaxis_title='Number of Ratings',
    yaxis=dict(title='Number of Ratings'),
)

# Show the plot
fig.show()

## User-movie Matrix

In [24]:
# Create a user-movie matrix
user_movie_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)
print(user_movie_matrix.head())



movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           4.0     0.0     4.0     0.0     0.0     4.0     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5           4.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

movieId  9       10      ...  193565  193567  193571  193573  193579  193581  \
userId                   ...                                                   
1           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0

## Function for recommending the movies using KNN algorithm

In [25]:
def recommend_movies(user_id, k=5, metric='cosine'):
    # Fit the KNN model
    knn = NearestNeighbors(n_neighbors=k, metric=metric)
    knn.fit(user_movie_matrix)

    # Find the index of the user
    user_index = user_id - 1  # Assuming user IDs are continuous and start from 1

    # Find the nearest neighbors
    distances, indices = knn.kneighbors(user_movie_matrix.iloc[user_index, :].values.reshape(1, -1))

    # Get movie recommendations
    recommended_movies = []
    for i in range(1, len(distances.flatten())):  # skip the first one as it's the user themselves
        neighbor_index = indices.flatten()[i]
        neighbor_ratings = user_movie_matrix.iloc[neighbor_index].values
        for movie_id, rating in enumerate(neighbor_ratings):
            if rating > 0 and movie_id not in user_movie_matrix.iloc[user_index].values:  # Recommend unseen movies
                recommended_movies.append((movie_id + 1, rating))  # movie_id + 1 to match movieId

    # Sort recommendations by rating
    recommended_movies = sorted(recommended_movies, key=lambda x: x[1], reverse=True)[:10]
    
    # Map movie IDs to titles, include check for valid IDs
    recommended_titles = []
    for movie in recommended_movies:
        movie_id = movie[0]
        title_series = movies[movies['movieId'] == movie_id]['title']
        if not title_series.empty:
            title = title_series.values[0]
            recommended_titles.append((title, movie[1]))
        else:
            print(f"Movie ID {movie_id} not found in the dataset.")
    
    return recommended_titles


## Function for creating an user interface

In [26]:
import tkinter as tk
from tkinter import ttk, messagebox


def on_recommend():
    try:
        user_id = int(user_id_entry.get())
        k = int(k_entry.get())
        metric = metric_var.get()
        
        if metric not in ['cosine', 'euclidean']:
            raise ValueError("Invalid metric selected.")
        
        recommendations = recommend_movies(user_id, k, metric)
        
        results_text.delete(1.0, tk.END)  # Clear previous results
        results_text.insert(tk.END, "Recommended Movies:\n")
        
        for index, (title, rating) in enumerate(recommendations, start=1):
            results_text.insert(tk.END, f"{index}. {title} (Predicted Rating: {rating:.2f})\n")
    except ValueError as e:
        messagebox.showerror("Input Error", str(e))

# Create the main window
root = tk.Tk()
root.title("Movie Recommendation System")

# User ID input
ttk.Label(root, text="Enter your User ID (1 to 610):").grid(column=0, row=0, padx=10, pady=10)
user_id_entry = ttk.Entry(root)
user_id_entry.grid(column=1, row=0, padx=10, pady=10)

# K value input
ttk.Label(root, text="Enter the number of neighbors (K):").grid(column=0, row=1, padx=10, pady=10)
k_entry = ttk.Entry(root)
k_entry.grid(column=1, row=1, padx=10, pady=10)

# Distance metric selection
ttk.Label(root, text="Select Distance Metric:").grid(column=0, row=2, padx=10, pady=10)
metric_var = tk.StringVar(value='cosine')
ttk.Radiobutton(root, text='Cosine', variable=metric_var, value='cosine').grid(column=0, row=3, padx=10, pady=5)
ttk.Radiobutton(root, text='Euclidean', variable=metric_var, value='euclidean').grid(column=1, row=3, padx=10, pady=5)

# Recommend button
recommend_button = ttk.Button(root, text="Get Recommendations", command=on_recommend)
recommend_button.grid(column=0, row=4, columnspan=2, padx=10, pady=10)

# Results area
results_text = tk.Text(root, width=100, height=10)
results_text.grid(column=0, row=5, columnspan=2, padx=10, pady=10)

# Run the application
root.mainloop()


Movie ID 98 not found in the dataset.
