# Get the processed data

In [53]:
import pandas as pd

In [54]:
# get the processed data
movies = pd.read_csv(filepath_or_buffer='../data/processed/inputs.csv', sep=',')

In [55]:
movies.head()

Unnamed: 0,title,movie_id,tags
0,102 Dalmatians,10481,"Get ready for a howling good time as an all new assortment of irresistible animal heroes are unleashed in this great family tail! In an unlikely alliance, the outrageous Waddlesworth... a parrot who thinks he's a Rottweiler... teams up with Oddball... an un-marked Dalmation puppy eager to earn her spots! Together they embark on a laugh-packed quest to outwit the ever-scheming Cruella De Vil Comedy Family londonengland prison releasefromprison women'sprison societyforthepreventionofcrueltytoanimals puppy pelz dog dalmatian GlennClose IoanGruffudd AliceEvans KevinLima"
1,10 Cloverfield Lane,333371,"After a car accident, Michelle awakens to find herself in a mysterious bunker with two men named Howard and Emmett. Howard offers her a pair of crutches to help her remain mobile with her leg injury sustained from the car crash and tells her to ""get good on those"" before leaving the bunker. She has been given the information that there has been an alien attack and the outside world is poisoned. However, Howard and Emmett's intentions soon become questionable and Michelle is faced with a question: Is it better in here or out there? Thriller ScienceFiction Drama kidnapping bunker paranoia basement survivalist apocalypse caraccident captive MaryElizabethWinstead JohnGoodman JohnGallagherJr."
2,10 Days in a Madhouse,345003,"Nellie Bly, a 23 year-old reporter for Joseph Pulitzer, goes undercover in the notorious Blackwell's Island women's insane asylum in order to expose corruption, abuse and murder. Drama undercover insaneasylum reporter CarolineBarry ChristopherLambert KellyLeBrock"
3,10 Things I Hate About You,4951,"Bianca, a tenth grader, has never gone on a date, but she isn't allowed to go out with boys until her older sister Kat gets a boyfriend. The problem is, Kat rubs nearly everyone the wrong way. But Bianca and the guy she has her eye on, Joey, are eager, so Joey fixes Kat up with Patrick, a new kid in town just bitter enough for Kat. Comedy Romance Drama shakespeare sister highschool cannabis deception teenmovie shrew archery feel-goodending oppositesattract duringcreditsstinger teenageromance playadaptation overprotectivefather HeathLedger JuliaStiles JosephGordon-Levitt"
4,10th & Wolf,13197,"A former street tough returns to his Philadelphia home after a stint in the military. Back on his home turf, he once again finds himself tangling with the mob boss who was instrumental in his going off to be a soldier. Action Crime Drama Mystery Thriller undercover mafia mobster crimefamily JamesMarsden BrianDennehy LeoRossi RobertMoresco"


# Vectorize the data

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer

vec_model = TfidfVectorizer(stop_words = "english")
data_vec = vec_model.fit_transform(movies['tags'])

# Build the KNN

In [57]:
from sklearn.metrics.pairwise import cosine_similarity

# construct the similarities matrix
similarities = cosine_similarity(data_vec)

In [58]:
def recommend(movie_title: str) -> list[str]:
    """
    This function takes a movie title as input and print the top 5 recommended movies based on cosine similarity.

    Args:
        movie_title (str): The title of the movie to find recommendations for.

    Returns:
        None
    """
    # find the index of the movie in the dataframe of all the movies
    movie_index = movies[movies['title'] == movie_title].index[0]

    # now get the distances from that move to the others
    distances = similarities[movie_index]

    # get the first 5 movies
    movie_list = sorted(list(enumerate(distances)), reverse = True , key = lambda x: x[1])[1:6]

    print("Recommended movies:\n")
    for i in movie_list:
        print(f'    {movies.iloc[i[0]].title}')

## Test the constructed KNN

In [59]:
# recommend a movie t
base_movie = 'How to Train Your Dragon'

recommend(base_movie)

Recommended movies:

    How to Train Your Dragon 2
    Dragon Nest: Warriors' Dawn
    Pete's Dragon
    George and the Dragon
    Dragon Hunters


We can see that the KNN model recommended related movies, so we can conclude it's working.

# Now create a KNN with Sklearn

In [60]:
from sklearn.neighbors import NearestNeighbors

model = NearestNeighbors(n_neighbors = 5, algorithm = "brute", metric = "cosine")
model.fit(data_vec)

Create the recommender function

In [61]:
def recommend_with_sklearn_knn(movie_title: str) -> list[str]:
    """
    This function takes a movie title as input and returns the top 5 recommended movies based on a Sklearn KNN using cosine distance.

    Args:
        movie_title (str): The title of the movie to find recommendations for.

    Returns:
        l;ist[str: The top 5 recommended movies based on cosine similarity.
    """

    # find the index of the movie in the dataframe of all the movies
    movie_index = movies[movies['title'] == movie_title].index[0]

    # get the n nearist neighbors to the movie
    distances, indices = model.kneighbors(data_vec[movie_index])

    # get the first 5
    similar_movies = [(movies["title"][i], distances[0][j]) for j, i in enumerate(indices[0])]

    return similar_movies[1:]

## Test the recommender

In [62]:
# recommend a movie to Batman
base_movie = 'How to Train Your Dragon'

recommended_movies = recommend_with_sklearn_knn(base_movie)

print("Recommended movies - cosine distance:\n")
for recommended_movie in recommended_movies:
    print(f'    {recommended_movie[0]} - {recommended_movie[1]}')


Recommended movies - cosine distance:

    How to Train Your Dragon 2 - 0.6819820376164203
    Dragon Nest: Warriors' Dawn - 0.7985962865090701
    Pete's Dragon - 0.8336301087483822
    George and the Dragon - 0.8435393205632739


We can see that we got similar movies, so we can conclude that both models are working as expected.