In [None]:
!pip install -r requirements.txt

In [None]:
import boto3
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import ipywidgets as widgets
from IPython.display import display

In [None]:
# Create a boto3 client for S3 (no explicit credentials are passed here).
s3 = boto3.client('s3')

# Bucket and object keys of the two CSV files used by the notebook.
bucket_name = 'final-year-project-bucket'
movies_file_path = 'movies.csv'
ratings_file_path = 'ratings.csv'


def _read_csv_from_s3(key):
    """Download ``key`` from ``bucket_name`` via the S3 client and parse it as CSV.

    The object body is streamed straight into ``pd.read_csv`` so the reads go
    through the ``s3`` client created above instead of relying on pandas
    resolving an ``s3://`` URL through an optional filesystem backend.

    Parameters
    ----------
    key : str
        Object key inside ``bucket_name``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.
    """
    body = s3.get_object(Bucket=bucket_name, Key=key)['Body']
    try:
        return pd.read_csv(body)
    finally:
        # Release the underlying HTTP connection even if parsing fails.
        body.close()


# Use the S3 client to download each file and read it as a pandas dataframe.
movies = _read_csv_from_s3(movies_file_path)
ratings = _read_csv_from_s3(ratings_file_path)

In [None]:
def clean_title(title):
    """Strip every character that is not an ASCII letter, a digit or a space.

    Parameters
    ----------
    title : str
        Raw movie title.

    Returns
    -------
    str
        ``title`` with all other characters removed (nothing is inserted in
        their place, and letter case is left untouched).
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

# Add a normalised copy of each title; this column is what the TF-IDF
# vectorizer is fitted on.
movies["clean_title"] = movies["title"].map(clean_title)

In [None]:
# TF-IDF features over the cleaned titles, using single words and word pairs
# (ngram_range=(1,2)).
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Learn the vocabulary from the cleaned titles and vectorise them in one step;
# `tfidf` is what the nearest-neighbour model is fitted on.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [None]:
# Nearest-neighbour index over the TF-IDF title vectors: cosine distance,
# five neighbours per query. `fit` returns the estimator itself, so it can be
# chained onto the constructor.
knn_model = NearestNeighbors(metric='cosine', n_neighbors=5).fit(tfidf)
knn_model

In [None]:
def search(title):
    """Return the rows of ``movies`` whose titles best match ``title``.

    The query is cleaned with ``clean_title``, vectorised with the fitted
    ``vectorizer`` and looked up in ``knn_model``.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.

    Returns
    -------
    pandas.DataFrame
        Matching rows of ``movies``, closest match first (one row per
        neighbour the model is configured to return).
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    # kneighbors already orders neighbours by increasing distance, so the
    # closest title is at position 0. Reversing here would hand callers that
    # read `.iloc[0]` the *least* similar of the returned titles.
    indices = knn_model.kneighbors(query_vec, return_distance=False)[0]
    return movies.iloc[indices]

In [None]:
def find_similar_movies(movie_id, rating_threshold=4, min_share=0.10, top_n=10):
    """Recommend movies favoured by users who rated ``movie_id`` highly.

    Users who gave ``movie_id`` a rating strictly above ``rating_threshold``
    are treated as "similar users". For every movie those users also rated
    above the threshold, the share of similar users who did so is compared
    with the share among all users who rated any of the candidate movies above
    the threshold. The ratio of the two shares is the score.

    Parameters
    ----------
    movie_id
        Value to look up in ``ratings["movieId"]``.
    rating_threshold : float, default 4
        A rating counts as a "like" only when strictly greater than this.
    min_share : float, default 0.10
        Candidates must be liked by strictly more than this fraction of the
        similar users.
    top_n : int, default 10
        Number of highest-scoring candidates to keep before joining with
        ``movies``.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, highest score first.
    """
    # Every step below only looks at "liked" ratings, so filter once.
    liked = ratings[ratings["rating"] > rating_threshold]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()

    # Share of similar users who liked each movie.
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Share of all users (among those who liked any candidate) who liked each movie.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # A high score means the movie is disproportionately popular with similar users.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [None]:
# Text box the user types a movie title into.
movie_name_input = widgets.Text(
    description='Movie Title:',
    placeholder='Enter a movie name',
    value='',
    disabled=False,
)

# Output area that the recommendation table is rendered into.
recommendation_list = widgets.Output()

In [None]:
def on_type(data):
    """Refresh the recommendation output when the text box value changes.

    Parameters
    ----------
    data : dict-like
        Change notification; only ``data["new"]`` (the text now in the box)
        is read.

    The output area is always cleared first. Recommendations are shown only
    once the text is longer than five characters, and are based on the first
    row returned by ``search``.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        typed_title = data["new"]
        if len(typed_title) <= 5:
            # Too short to search on; leave the output area empty.
            return
        matches = search(typed_title)
        seed_movie_id = matches.iloc[0]["movieId"]
        display(find_similar_movies(seed_movie_id))

# Call on_type whenever the text box's `value` trait changes.
movie_name_input.observe(on_type, names='value')

# Show the input box followed by the output area that on_type writes into.
display(movie_name_input, recommendation_list)