In [17]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [18]:
#Load data from dataset
# Read the movie metadata CSV; the path is relative to the notebook's working directory.
movies = pd.read_csv(filepath_or_buffer="../dataset/movies.csv")

In [19]:
#Select Columns
# Keep only the text columns that feed the recommender. The explicit .copy()
# makes the result an independent frame, so adding columns to it later does
# not raise pandas' SettingWithCopyWarning.
movies = movies[["Series_Title","Overview","Genre", "Director","Stars"]].copy()

In [20]:
#Combine columns
# Build one free-text field per movie: genre, overview, director and stars
# joined with single spaces. assign() returns a new frame instead of writing
# into a frame derived from a column selection, which avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
movies = movies.assign(
    genre_labels=(
        movies["Genre"] + " " + movies["Overview"] + " "
        + movies["Director"] + " " + movies["Stars"]
    )
)

In [21]:
#Create new table
# Two-column view used downstream: the title plus the combined text field.
movies_genre_parts = movies.loc[:, ["Series_Title", "genre_labels"]]

In [22]:
#Transform to matrix vector

from sklearn.feature_extraction.text import CountVectorizer
import pickle

# Bag-of-words term counts: English stop words are dropped and the vocabulary
# is capped at 1000 terms.
cv = CountVectorizer(stop_words='english', max_features=1000)
cv_matrix = cv.fit_transform(raw_documents=movies_genre_parts["genre_labels"])

# Dense copy of the sparse count matrix (one row per movie).
genre_vectors = cv_matrix.toarray()

In [23]:
# Train a KNN model
# The next cell builds NearestNeighbors() with no arguments, so it uses the default n_neighbors=5.

In [24]:
# Train a KNN model on the vectorized data
from sklearn.neighbors import NearestNeighbors

# Default-configured neighbour index over the raw count vectors.
# NOTE: `knn` is rebound to a cosine/brute-force model in a later cell before
# it is ever queried, so this fit only serves as a first look.
knn = NearestNeighbors()
knn.fit(X=genre_vectors)

In [25]:
#Pairwise cosine similarity between the movie count vectors

from sklearn.metrics.pairwise import cosine_similarity

# similarity[i, j] is the cosine similarity between movie i and movie j;
# it is computed from the count vectors directly, not from the KNN model.
similarity = cosine_similarity(X=genre_vectors)

In [32]:
#Recommend movies and evaluate performance of the KNN model

import time
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

# Project the similarity matrix with PCA, then index the projected rows for
# cosine-distance neighbour search.
# PCA rejects n_components larger than min(n_samples, n_features), so the
# requested 1000 components are capped to the size of the similarity matrix.
# Without the cap the cell fails on any dataset with fewer than 1000 movies.
n_components = min(1000, *similarity.shape)
pca = PCA(n_components=n_components)
similarity_reduced = pca.fit_transform(similarity)

# Brute-force search with cosine distance; n_neighbors=20 is only the default
# and can be overridden per query via kneighbors(n_neighbors=...).
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20)
knn.fit(similarity_reduced)

def get_true_relevant_movies(movie_title):
    """
    Return every title that contains ``movie_title`` as a substring.

    The match is case-insensitive and literal: the text is NOT interpreted as
    a regular expression, so titles containing characters such as ``(``,
    ``)``, ``.``, ``?`` or ``+`` are matched as written instead of being
    mis-parsed or raising ``re.error``.

    Parameters
    ----------
    movie_title : str
        Text to look for inside ``Series_Title``.

    Returns
    -------
    list of str
        Matching titles in dataset order; empty when nothing matches.
        Rows with a missing title never match (``na=False``).
    """
    title_hits = movies_genre_parts["Series_Title"].str.contains(
        movie_title, case=False, na=False, regex=False
    )
    return movies_genre_parts.loc[title_hits, "Series_Title"].tolist()

def recommend_and_evaluate(movie, k=10):
    """
    Print the k nearest-neighbour recommendations for a movie, followed by
    Precision@k, Recall@k, F1 and the elapsed time.

    Parameters
    ----------
    movie : str
        Title typed by the user. It is stripped and matched case-insensitively
        against ``Series_Title``. If there is no exact match, the first title
        returned by ``get_true_relevant_movies`` is used instead.
    k : int, default 10
        Number of neighbours to retrieve and the denominator of Precision@k.

    Returns
    -------
    None
        All results are reported with ``print``. The function returns early,
        after printing a message, when no matching title can be found.

    Notes
    -----
    The "relevant" set is whatever ``get_true_relevant_movies`` returns for
    the matched title, and the query movie itself is not removed from the
    neighbour list, so the metrics are only a rough sanity check.
    """
    # perf_counter is a monotonic clock intended for measuring intervals.
    start_time = time.perf_counter()

    # Normalize user input
    movie = movie.strip().lower()
    if not movie:
        # An empty query would substring-match every title and silently
        # fall back to the first row of the dataset.
        print(f"No close match found for '{movie}'. Please check the spelling.")
        return

    titles_lower = movies_genre_parts["Series_Title"].str.lower()

    if (titles_lower == movie).any():
        closest_match = movie
    else:
        # If no exact match is found, use partial matching
        partial_matches = get_true_relevant_movies(movie)
        if not partial_matches:
            print(f"No close match found for '{movie}'. Please check the spelling.")
            return
        closest_match = partial_matches[0]  # Just pick the first match

    # A partial match keeps the dataset's original casing, so lower-case it
    # before comparing. Positions (not index labels) are used because they
    # index rows of the NumPy similarity matrix.
    positions = np.flatnonzero((titles_lower == closest_match.lower()).to_numpy())
    if positions.size == 0:
        print(f"Movie '{closest_match}' not found in the dataset.")
        return
    movie_index = positions[0]

    # Project the query row with the same PCA that was fitted on the matrix
    movie_reduced = pca.transform(similarity[movie_index].reshape(1, -1))

    # Perform the k-nearest neighbors search (distances are not needed)
    _, indices = knn.kneighbors(movie_reduced, n_neighbors=k)
    indices = indices.flatten()  # 1-D positions for .iloc

    recommended_movies = movies_genre_parts.iloc[indices]["Series_Title"].values[:k]

    # Display the recommended movies
    print(f"Top {k} recommended movies for '{closest_match}':")
    for i, title in enumerate(recommended_movies):
        print(f"{i+1}: {title}")

    # Get true relevant movies based on the closest match
    true_relevant_movies = get_true_relevant_movies(closest_match)
    hits = len(set(recommended_movies) & set(true_relevant_movies))

    # Calculate Precision@k
    precision_at_k = hits / k
    print(f"Precision@{k}: {precision_at_k}")

    # Calculate Recall@k; an empty relevant set yields 0 instead of a
    # ZeroDivisionError
    recall_at_k = hits / len(true_relevant_movies) if true_relevant_movies else 0.0
    print(f"Recall@{k}: {recall_at_k}")

    # Calculate F1 Score
    if precision_at_k + recall_at_k > 0:
        f1_score = 2 * (precision_at_k * recall_at_k) / (precision_at_k + recall_at_k)
    else:
        f1_score = 0
    print(f"F1 Score: {f1_score}")

    # End the timer
    end_time = time.perf_counter()
    print(f"Time taken to recommend: {end_time - start_time:.4f} seconds")

# Ask for a title interactively
user_movie = input("Please enter the movie name: ")

# Show the top five recommendations and their metrics for that title
recommend_and_evaluate(movie=user_movie, k=5)


Moana
Top 5 recommended movies for 'moana':
1: Moana
2: Aladdin
3: Wreck-It Ralph
4: Toy Story
5: Monsters, Inc.
Precision@5: 0.2
Recall@5: 1.0
F1 Score: 0.33333333333333337
Time taken to recommend: 0.0197 seconds


In [27]:
# Display the pairwise cosine-similarity matrix (NumPy abbreviates the repr with "...")
similarity

array([[1.        , 0.06933752, 0.07692308, ..., 0.08006408, 0.14322297,
        0.        ],
       [0.06933752, 1.        , 0.20801257, ..., 0.07216878, 0.06454972,
        0.08838835],
       [0.07692308, 0.20801257, 1.        , ..., 0.08006408, 0.07161149,
        0.04902903],
       ...,
       [0.08006408, 0.07216878, 0.08006408, ..., 1.        , 0.2236068 ,
        0.        ],
       [0.14322297, 0.06454972, 0.07161149, ..., 0.2236068 , 1.        ,
        0.09128709],
       [0.        , 0.08838835, 0.04902903, ..., 0.        , 0.09128709,
        1.        ]])

In [28]:
import pickle

# Persist the similarity matrix. The context managers close both file handles
# deterministically; the bare open(...) calls never closed them.
with open('../pickle/knn.pkl', 'wb') as pkl_out:
    pickle.dump(similarity, pkl_out)

# Read the file back as a round-trip check.
# NOTE: pickle.load can execute arbitrary code, so only unpickle files that
# this notebook (or another trusted source) produced.
with open('../pickle/knn.pkl', 'rb') as pkl_in:
    similarity_roundtrip = pickle.load(pkl_in)

similarity_roundtrip

array([[1.        , 0.06933752, 0.07692308, ..., 0.08006408, 0.14322297,
        0.        ],
       [0.06933752, 1.        , 0.20801257, ..., 0.07216878, 0.06454972,
        0.08838835],
       [0.07692308, 0.20801257, 1.        , ..., 0.08006408, 0.07161149,
        0.04902903],
       ...,
       [0.08006408, 0.07216878, 0.08006408, ..., 1.        , 0.2236068 ,
        0.        ],
       [0.14322297, 0.06454972, 0.07161149, ..., 0.2236068 , 1.        ,
        0.09128709],
       [0.        , 0.08838835, 0.04902903, ..., 0.        , 0.09128709,
        1.        ]])

In [33]:
# Preview the first five rows of the title / combined-text table
movies_genre_parts.head()

Unnamed: 0,Series_Title,genre_labels
0,The Shawshank Redemption,drama imprison men bond number year find solac...
1,The Godfather,crime drama organ crime dynasti age patriarch ...
2,The Dark Knight,action crime drama menac known joker wreak hav...
3,The Godfather: Part II,crime drama earli life career vito corleon s n...
4,12 Angry Men,crime drama juri holdout attempt prevent misca...
