# Content-based recommendations
Emanuel de Jong (495804) - Erik Markvoort (519894)

## Parameters
These global variables hold the watch list (the indices of the movies on which the recommendations will be based) and the number of recommendations that should be given.

In [1]:
# Integer movie indices the recommendations are based on; set_watch_list
# appends the values typed into the text widgets.
WATCH_LIST = []
# Number of recommendations kept by get_similar_movies.
MOVIE_COUNT = 5

## Imports
These modules will be used to complete the assignment.

In [2]:
# Std
import re

# Local
from movie_display import movie_display

# Third party
import numpy as np
import pandas as pd
from IPython.core.display import HTML
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from ipywidgets import interact_manual
import ipywidgets as widgets

## Read the imdbdata JSON

In [None]:
# Load the complete dataset into a dataframe.
# For quicker experiments, keep only the first 50 rows instead:
# imdb = pd.read_json("dataset/imdbdata.json")[:50]
imdb = pd.read_json("dataset/imdbdata.json")

## Global vars

In [4]:
# Maps a feature name ('Actors', 'Writer', 'Director', 'Title', 'Plot') to the
# matrix produced by that feature's vectorizer; filled by the set_* functions.
feature_vectors = {}

## Helper functions
The function `clean_persons` captures a step that is repeated for several features: a list of names is transformed into a single string to which a bag of words can be applied.

In [5]:
def clean_persons(persons):
    """Turn a comma-separated list of names into space-separated one-word tokens.

    Parenthesised annotations and "N/A" markers are dropped, the spaces inside
    each name are removed so a full name becomes a single token, and the commas
    between names become spaces. Values that are not strings are returned
    unchanged.
    """
    if not isinstance(persons, str):
        return persons

    without_notes = re.sub(r'\(.*?\)', '', persons).strip()
    without_missing = without_notes.replace("N/A", "")
    glued_names = without_missing.replace(" ", "")
    return glued_names.replace(",", " ").strip()

## Actors BoW
For features that mostly consist of names, a bag of words is applied; this includes actors, writers and directors. Stemming is not applied and stopwords are not removed, as names should remain unchanged. A bag of words is used for names rather than a TF-IDF, since a name is not expected to appear more than once per movie.

In [6]:
def set_actors_bow():
    """Build the bag-of-words matrix for the 'Actors' column.

    Every entry of imdb['Actors'] is normalised with clean_persons and the
    result of a CountVectorizer fit is stored in feature_vectors['Actors'].
    """
    cleaned_actors = imdb['Actors'].apply(clean_persons)
    feature_vectors['Actors'] = CountVectorizer().fit_transform(cleaned_actors)

## Writers BoW

In [7]:
def set_writers_bow():
    """Build the bag-of-words matrix for the 'Writer' column.

    Every entry of imdb['Writer'] is normalised with clean_persons and the
    result of a CountVectorizer fit is stored in feature_vectors['Writer'].
    """
    # The former stray `writerRows[0]` expression was removed: it had no effect
    # on the result and raised when the frame had no row labelled 0.
    writer_rows = imdb['Writer'].apply(clean_persons)

    count_vectorizer = CountVectorizer()
    feature_vectors['Writer'] = count_vectorizer.fit_transform(writer_rows)

## Director BoW

In [8]:
def set_director_bow():
    """Build the bag-of-words matrix for the 'Director' column.

    Every entry of imdb['Director'] is normalised with clean_persons and the
    result of a CountVectorizer fit is stored in feature_vectors['Director'].
    """
    # Work on a cleaned copy instead of writing it back into imdb: cleaning is
    # not idempotent (a second pass removes the spaces that separate the names),
    # so overwriting the column corrupted the data when this ran more than once.
    director_rows = imdb['Director'].apply(clean_persons)

    count_vectorizer = CountVectorizer()
    feature_vectors['Director'] = count_vectorizer.fit_transform(director_rows)

## Title BoW
Similar to names, titles are not expected to contain repeated words, which motivates the choice of a bag of words. One difference, however, is the use of a stopword filter, as titles often contain stopwords such as 'the'.

In [9]:
def set_title_bow():
    """Build the bag-of-words matrix for the 'Title' column.

    Each title is split into words, English stop words are dropped
    (case-insensitively), the remaining words are re-joined into one string,
    and the CountVectorizer result is stored in feature_vectors['Title'].
    """
    english_stopwords = set(stopwords.words("english"))

    documents = []
    for raw_title in imdb["Title"]:
        tokens = re.findall(r"\b\w+(?:\.\w+)*(?:'\w+)?", raw_title + " ")
        kept = [token for token in tokens if token.lower() not in english_stopwords]
        # Every kept word is followed by a single space, as before.
        documents.append("".join(token + " " for token in kept))

    feature_vectors['Title'] = CountVectorizer().fit_transform(documents)

## Plot TF-IDF
The data for the plot is handled differently, as words are expected to appear more than once, which suggests the use of a TF-IDF. Stemming and stopword removal are also applied, since plots consist of one or more full sentences.

In [10]:
def set_plot_tf_idf():
    """Build the TF-IDF matrix for the 'Plot' column.

    Each plot is split into words, English stop words are removed, the
    remaining words are stemmed with the Snowball stemmer, and the stems are
    re-joined into one document per movie. The TfidfVectorizer result is
    stored in feature_vectors['Plot'].
    """
    sw = set(stopwords.words("english"))
    stemmer = SnowballStemmer("english")
    word_pattern = re.compile(r"\b\w+(?:\.\w+)*(?:'\w+)?")

    plot_documents = []
    for plot in imdb["Plot"]:
        stems = []
        for word in word_pattern.findall(plot):
            # Test the raw word first: stemming changes some stop words
            # (e.g. "because" -> "becaus"), so filtering only after stemming
            # let them through.
            if word.lower() in sw:
                continue
            stem = stemmer.stem(word)
            # Keep the original post-stemming check as well, so nothing that
            # used to be filtered out is reintroduced.
            if stem not in sw:
                stems.append(stem)
        plot_documents.append(" ".join(stems))

    tf_idf_vectorizer = TfidfVectorizer()
    feature_vectors['Plot'] = tf_idf_vectorizer.fit_transform(plot_documents)

## Getting the similar movies
For every feature, a similarity score per movie is computed as the mean of the cosine similarities to all movies in the watch list. Then, for this feature, a ranking is made with the highest similarity first and the lowest last. With a ranking of movies for each feature, the features are merged by adding up each movie's positions in the rankings, which results in a list of rank totals. The top N movies (those with the lowest totals) are taken from this combined ranking and put into a recommendation list. Movies that are in the watch list are removed from the recommendations.

In [11]:
def calculate_mean(reccomendations):
    """Return the element-wise mean of a sequence of equally shaped score vectors.

    The inputs are left untouched. The previous implementation accumulated
    into the first vector with ``+=`` / ``/=``, which overwrote the caller's
    data and gave wrong results when the same array was passed more than once.

    Args:
        reccomendations: non-empty sequence of numeric vectors (or scalars).

    Returns:
        The mean along the first axis, as a NumPy array (or NumPy scalar for
        scalar inputs).

    Raises:
        ValueError: if ``reccomendations`` is empty.
    """
    if len(reccomendations) == 0:
        raise ValueError("calculate_mean() needs at least one vector")
    return np.mean(reccomendations, axis=0)

# Indices of the recommended movies, best first; rewritten by get_similar_movies.
similar_movies = []


def get_similar_movies():
    """Rank all movies against the watch list and store the top MOVIE_COUNT.

    For every feature in feature_vectors the pairwise cosine similarities are
    computed and min-max scaled. The rows belonging to the WATCH_LIST movies
    are averaged, and the movies are ranked on that average (most similar
    first). Each movie's positions in the per-feature rankings are summed; the
    movies with the lowest totals that are not in the watch list are written
    to the module-level ``similar_movies`` list.

    With an empty watch list or no feature vectors there is nothing to compare
    against, so ``similar_movies`` is set to an empty list.
    """
    global similar_movies

    if not WATCH_LIST or not feature_vectors:
        similar_movies = []
        return

    cosine_similarities = {}
    for feature, vectors in feature_vectors.items():
        cosine_sim = cosine_similarity(vectors)
        # MinMaxScaler rescales every column of the matrix independently.
        cosine_sim = MinMaxScaler().fit_transform(cosine_sim)

        if feature == 'Plot':
            # NOTE(review): multiplying by a positive constant does not change
            # the order within this feature, and features are merged by rank
            # position below, so this weight currently has no effect on the
            # outcome. Confirm whether a weighted rank was intended.
            cosine_sim *= 1.25

        cosine_similarities[feature] = cosine_sim

    movie_total = next(iter(cosine_similarities.values())).shape[0]
    rank_totals = [0] * movie_total

    for cosine_sim in cosine_similarities.values():
        # Copies keep the similarity matrix intact even if the averaging
        # helper works in place.
        watched_rows = [cosine_sim[movie].copy() for movie in WATCH_LIST]
        means = calculate_mean(watched_rows)

        # Position 0 is the most similar movie for this feature; ties keep
        # index order because the sort is stable.
        by_similarity = sorted(
            range(movie_total), key=lambda index: means[index], reverse=True
        )
        for position, movie in enumerate(by_similarity):
            rank_totals[movie] += position

    ranked = sorted(range(movie_total), key=lambda index: rank_totals[index])

    # Drop the watch-list movies by index. The old filter compared
    # [index, rank] pairs with the integers in WATCH_LIST, so it never matched
    # and watched movies could end up among the recommendations.
    watched = set(WATCH_LIST)
    similar_movies = [movie for movie in ranked if movie not in watched][:MOVIE_COUNT]


# showing movie information

The recommended movies are displayed using the provided HTML movie display.

In [12]:
# Load movies into a dataframe
# (a separate read of the dataset file; display_movies looks up rows in it).
df = pd.read_json('./dataset/imdbdata.json', orient='columns')
# Rows of df selected for display; filled by display_movies and read by html_display.
movies_for_display = []
def display_movies():
    """Collect the dataframe rows of the recommended movies for display.

    Replaces the contents of the module-level ``movies_for_display`` list with
    ``df.iloc[index]`` for every index in ``similar_movies``, in order.
    """
    # Assign in place so the list is reset on every call; appending to the
    # global list made earlier recommendations pile up across submits.
    movies_for_display[:] = [df.iloc[index] for index in similar_movies]

def html_display():
    """Render the rows in movies_for_display as HTML in the notebook output."""
    markup = movie_display.show(movies_for_display)
    display(HTML(markup))


In the Jupyter widget, you can fill in up to 10 movies in the text boxes to make a watch list on which the recommendations are based.

In [None]:
def set_watch_list(widgets, count):
    """Store the requested recommendation count and rebuild the watch list.

    Args:
        widgets: iterable of objects with a string ``value`` attribute; every
            non-blank value is converted to an int movie index.
        count: number of recommendations to produce; stored in MOVIE_COUNT.

    Raises:
        ValueError: if a non-blank value is not a valid integer.
    """
    # Without the global declaration the assignment only created a local
    # variable, so the chosen count was never applied.
    global MOVIE_COUNT
    MOVIE_COUNT = count

    # Start from an empty list so entries from an earlier submit do not linger.
    WATCH_LIST.clear()
    for widget in widgets:
        text = widget.value.strip()
        if text:
            WATCH_LIST.append(int(text))

# One free-text box per watch-list slot (10 in total).
text_widgets = [
    widgets.Text(description=f'movie {slot+1}:')
    for slot in range(10)
]

# Slider that selects how many recommendations to produce.
recommendation_slider = widgets.IntSlider(
    description="Amount of recommendations:",
    value=10,
    min=1,
    max=20,
    step=1,
)

# Button to submit the selections
submit_button = widgets.Button(description="Submit")

# Function to trigger when button is clicked
def on_submit_clicked(b):
    """Run the whole recommendation pipeline for the current widget values.

    Reads the watch list and recommendation count from the widgets, rebuilds
    every feature matrix, computes the similar movies and renders them.

    Args:
        b: the argument supplied by the button's click event; not used.
    """
    set_watch_list(widgets=text_widgets, count=recommendation_slider.value)

    # Feature builders, called in the same order as before.
    feature_builders = (
        set_actors_bow,
        set_director_bow,
        set_title_bow,
        set_writers_bow,
        set_plot_tf_idf,
    )
    for build_feature in feature_builders:
        build_feature()

    get_similar_movies()
    display_movies()
    html_display()

# Attach the button to the callback function
submit_button.on_click(on_submit_clicked)

# Display the widgets: the text boxes first, then the slider and the button.
display(*text_widgets, recommendation_slider, submit_button)