# Content-based recommendations
432601@student.saxion.nl
Fabian Mijsters

## Importing useful libraries and loading the CSV data into pandas DataFrames

In [39]:
from IPython.core.display import HTML
from movie_display import movie_display
from IPython.display import display
from IPython.display import clear_output
from ipywidgets import Output

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import re
import numpy as np
import ipywidgets as widgets


# Paths of the input files, relative to the notebook's working directory.
movies_filename = 'dataset/movies.csv'
ratings_filename = 'dataset/ratings.csv'
links_filename = 'dataset/links.csv'
tags_filename = 'dataset/tags.csv'
imdbdata_filename = 'dataset/imdbdata.json'

# The four CSV files are read in the order movies, ratings, links, tags.
movies_dataset, ratings_dataset, links_dataset, tags_dataset = (
    pd.read_csv(csv_path)
    for csv_path in (movies_filename, ratings_filename, links_filename, tags_filename)
)
imdb_dataset = pd.read_json(imdbdata_filename)

# Preview of the IMDb data without the Poster, imdbVotes and Year columns.
movie_profile = imdb_dataset.drop(columns=['Poster', 'imdbVotes', 'Year'])
movie_profile.head(10)

Unnamed: 0,Actors,Awards,Country,Director,Genre,Language,Plot,Production,Rated,Released,Runtime,Title,Writer,imdbId,imdbRating
0,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney",Nominated for 3 Oscars. Another 23 wins & 18 n...,USA,John Lasseter,"Animation, Adventure, Comedy",English,A cowboy doll is profoundly threatened and jea...,Buena Vista,G,22 Nov 1995,81 min,Toy Story,"John Lasseter (original story by), Pete Docter...",114709,8.3
1,"Robin Williams, Jonathan Hyde, Kirsten Dunst, ...",4 wins & 9 nominations.,USA,Joe Johnston,"Action, Adventure, Family","English, French",When two kids find and play a magical board ga...,Sony Pictures Home Entertainment,PG,15 Dec 1995,104 min,Jumanji,"Jonathan Hensleigh (screenplay), Greg Taylor (...",113497,6.9
2,"Walter Matthau, Jack Lemmon, Sophia Loren, Ann...",2 wins & 2 nominations.,USA,Howard Deutch,"Comedy, Romance",English,John and Max resolve to save their beloved bai...,Warner Home Video,PG-13,22 Dec 1995,101 min,Grumpier Old Men,"Mark Steven Johnson (characters), Mark Steven ...",113228,6.6
3,"Whitney Houston, Angela Bassett, Loretta Devin...",8 wins & 8 nominations.,USA,Forest Whitaker,"Comedy, Drama, Romance",English,"Based on Terry McMillan's novel, this film fol...",Twentieth Century Fox Home Entertainment,R,22 Dec 1995,124 min,Waiting to Exhale,"Terry McMillan (novel), Terry McMillan (screen...",114885,5.7
4,"Steve Martin, Diane Keaton, Martin Short, Kimb...",Nominated for 1 Golden Globe. Another 1 win & ...,USA,Charles Shyer,"Comedy, Family, Romance",English,George Banks must deal not only with the pregn...,Disney,PG,08 Dec 1995,106 min,Father of the Bride Part II,"Albert Hackett (screenplay), Frances Goodrich ...",113041,5.9
5,"Al Pacino, Robert De Niro, Val Kilmer, Jon Voight",12 nominations.,USA,Michael Mann,"Action, Crime, Drama","English, Spanish",A group of professional bank robbers start to ...,Warner Bros.,R,15 Dec 1995,170 min,Heat,Michael Mann,113277,8.2
6,"Harrison Ford, Julia Ormond, Greg Kinnear, Nan...",Nominated for 2 Oscars. Another 2 wins & 4 nom...,"Germany, USA",Sydney Pollack,"Comedy, Drama","English, French",An ugly duckling having undergone a remarkable...,Paramount,PG,15 Dec 1995,127 min,Sabrina,"Samuel A. Taylor (play), Billy Wilder (earlier...",114319,6.3
7,"Jonathan Taylor Thomas, Brad Renfro, Eric Schw...",1 win & 5 nominations.,USA,Peter Hewitt,"Adventure, Comedy, Drama",English,Tom and Huck witness Injun Joe's killing of Do...,Buena Vista,PG,22 Dec 1995,97 min,Tom and Huck,"Mark Twain (novel), Stephen Sommers (screenpla...",112302,5.6
8,"Jean-Claude Van Damme, Powers Boothe, Raymond ...",,USA,Peter Hyams,"Action, Crime, Thriller",English,A former fireman takes on a group of terrorist...,MCA Universal Home Video,R,22 Dec 1995,111 min,Sudden Death,"Karen Elise Baldwin (story), Gene Quintano (sc...",114576,5.7
9,"Pierce Brosnan, Sean Bean, Izabella Scorupco, ...",Nominated for 2 BAFTA Film Awards. Another 2 w...,"UK, USA",Martin Campbell,"Action, Adventure, Thriller","English, Russian, Spanish",James Bond teams up with the lone survivor of ...,MGM/UA,PG-13,17 Nov 1995,130 min,GoldenEye,"Ian Fleming (characters), Michael France (stor...",113189,7.2


## Stemming and removing stopwords

Stop words are words that occur very frequently in almost any sentence, e.g. "the", "a", "an", "in". Removing these words leaves a sentence with a higher descriptive value per word.

Stemming reduces words to their stem, e.g. walked -> walk. After stemming, sentences that use different inflections of the same word become more similar to each other.

In [40]:
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import nltk


def clean_words(sentence_list):
    """Remove English stop words from every sentence and stem the remaining words.

    Each sentence is split on single spaces. A word is dropped when its
    lower-cased form, with non-word characters stripped, is an English stop
    word. Every remaining word is lower-cased and stemmed with the Snowball
    stemmer.

    Parameters
    ----------
    sentence_list : iterable of str
        The sentences to clean. Entries that are not strings (for example
        None or NaN from a missing cell) are treated as empty sentences.

    Returns
    -------
    list of str
        One cleaned sentence per input entry. Every kept word is followed by
        a single space, so a non-empty result ends with a trailing space.
    """
    # A set gives constant-time membership tests for the per-word lookup.
    sw = set(stopwords.words("english"))
    stemmer = SnowballStemmer("english")

    stripped_and_stemmed_sentences_list = list()
    for sentence in sentence_list:
        if not isinstance(sentence, str):
            # Missing values have no words; keep the entry so the output
            # stays aligned with the input.
            stripped_and_stemmed_sentences_list.append('')
            continue
        kept_words = []
        for word in sentence.split(' '):
            # Lower-case BEFORE the stop-word test: the stop-word list is
            # lower-case, so "The" or "A" would otherwise slip through.
            word = word.lower()
            if re.sub(r'\W+', '', word) in sw:
                continue
            kept_words.append(stemmer.stem(word))
        stripped_and_stemmed_sentences_list.append(
            ''.join(kept_word + ' ' for kept_word in kept_words))
    return stripped_and_stemmed_sentences_list

## Cosine similarity 

Cosine similarity is a way of measuring the similarity between two sentences: it measures the angle between two vectors in a multidimensional space. An advantage of cosine similarity over Euclidean distance is that cosine similarity does not take the magnitude of the vectors into account, whereas Euclidean distance does.

In [41]:
def get_cosine_similarity(tfidf,index_movie):
    """Return the cosine similarity of one row of a feature matrix to all rows.

    Parameters
    ----------
    tfidf : matrix
        Feature matrix with one row per movie.
    index_movie : int
        Row of the movie to compare against every row of ``tfidf``.

    Returns
    -------
    The result of ``cosine_similarity`` for that row, flattened to one
    dimension.
    """
    movie_vector = tfidf[index_movie]
    similarity_matrix = cosine_similarity(movie_vector, tfidf)
    return similarity_matrix.flatten()

## Named entity recognition

Named entity recognition is used to extract the names from a string. Part-of-speech tagging labels the individual words in a sentence; the words that are tagged as part of a named entity are collected and joined together. The list of names is returned.

In [42]:
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

def get_names_from_text(text):
    """Extract the distinct named entities found in ``text``.

    The text is tokenized, part-of-speech tagged and chunked with NLTK's
    ``ne_chunk``. Consecutive named-entity subtrees are joined with a space
    into one name; any other token ends the current name.

    Parameters
    ----------
    text : str
        The text to search for names.

    Returns
    -------
    list of str
        The names in order of first appearance, without duplicates.
    """
    chunked_text = ne_chunk(pos_tag(word_tokenize(text)))
    entire_chunk = []
    chunk = []

    def flush_chunk():
        # Store the pending name once, then ALWAYS start a new one: leaving a
        # duplicate in `chunk` would glue it onto the next entity.
        named_entity = " ".join(chunk)
        if named_entity not in entire_chunk:
            entire_chunk.append(named_entity)
        del chunk[:]

    for node in chunked_text:
        if isinstance(node, Tree):
            # A named-entity subtree: rebuild its surface text from the leaves.
            chunk.append(" ".join(token for token, _part_of_speech in node.leaves()))
        elif chunk:
            flush_chunk()

    # The text may end on a named entity that no later token has flushed.
    if chunk:
        flush_chunk()

    return entire_chunk

A simple helper function that passes each string in a list of possible names to the get_names_from_text function and adds the names it finds to a list. If no names are found, 'Unknown' is added to the list instead.

In [43]:
def get_names_from_text_string(list_of_possible_names):
    """Turn each string of names into one string of space-free name tokens.

    Periods are removed from every input string before it is handed to
    ``get_names_from_text``. The spaces inside each returned name are removed
    and the names are concatenated, each preceded by a single space.

    Parameters
    ----------
    list_of_possible_names : iterable of str
        Strings that may contain one or more names.

    Returns
    -------
    list of str
        One entry per input string: the concatenated names, or 'Unknown'
        when no name was found in that string.
    """
    name_list = []
    for raw_names in list_of_possible_names:
        found_names = get_names_from_text(raw_names.replace('.', ''))
        if not found_names:
            name_list.append('Unknown')
            continue
        # Collapsing the inner spaces turns every full name into one token.
        name_list.append(
            ''.join(" " + found_name.replace(' ', '') for found_name in found_names))
    return name_list


Clean all the used features with their respective techniques: stop-word removal and stemming for sentences, and named entity recognition for the names of the directors, actors and writers.

In [44]:
# Text columns: stop-word removal and stemming via clean_words.
# Note that first_actor_list is built from the complete Actors column.
(rated_list,
 language_list,
 plot_list,
 title_list,
 genre_list,
 first_actor_list) = (
    clean_words(imdb_dataset[column_name].values)
    for column_name in ('Rated', 'Language', 'Plot', 'Title', 'Genre', 'Actors')
)

# Name columns: named entity recognition via get_names_from_text_string.
(actors_list,
 director_list,
 writer_list) = (
    get_names_from_text_string(imdb_dataset[column_name].values)
    for column_name in ('Actors', 'Director', 'Writer')
)


## Bag of words and TF-IDF

Bag of words and TF-IDF are ways of converting text into a vector. The bag-of-words model simply counts the occurrences of each word. This is very useful for names, since it does not weight a word based on how often it occurs across all the names. The TF-IDF model is very useful for ordinary sentences: it counts the occurrences of a word and scales that count by how common the word is across all documents. This means that a word like "dinosaur" gets a higher score than the word "man", since "dinosaur" occurs far less often than "man". Using this method, each word of a sentence is scored based on its descriptive value for the sentence (movie).

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer

# A single CountVectorizer and a single TfidfVectorizer are reused for every
# feature below. Each fit_transform call refits the shared instance on that
# feature's documents and returns that feature's own matrix, so the statement
# order decides which vocabulary the two instances hold afterwards.
count_vect = CountVectorizer()
vectorizer = TfidfVectorizer()

tfidf_rated = vectorizer.fit_transform(rated_list)
# Writer names are vectorized both ways so TF-IDF and bag of words can be compared.
tfidf_writer = vectorizer.fit_transform(writer_list)
bow_writer = count_vect.fit_transform(writer_list)
tfidf_language = vectorizer.fit_transform(language_list)
tfidf_plot = vectorizer.fit_transform(plot_list)
tfidf_title = vectorizer.fit_transform(title_list)
tfidf_genre = vectorizer.fit_transform(genre_list)
# Director names: bag of words and TF-IDF variants.
bow_director = count_vect.fit_transform(director_list)
tfidf_director = vectorizer.fit_transform(director_list)
# Actor names: the bag-of-words matrix uses the extracted names (actors_list),
# while the TF-IDF matrix is built from first_actor_list.
bow_actors = count_vect.fit_transform(actors_list)
tfidf_actors = vectorizer.fit_transform(first_actor_list)


The get_movie_recommendations_for_movies function multiplies each individual similarity score by a certain weight and adds the weighted scores together. The weights are based on A/B testing of the recommended movies. A weight expresses how much a certain feature says about a movie. The plot is weighted more heavily than the director, since the plot of a movie is generally more important to a user than its director. The genre gets the highest weight, since movies can have similar plots but target very different audiences: Toy Story and Chucky are both about toys coming to life, but a young user who liked Toy Story probably does not want to see the horror movie Chucky.

When the TF-IDF matrices are used for the name features instead of the bag-of-words matrices (the alternatives are noted in the function below), the difference between the TF-IDF model and the BOW model on names becomes visible. With TF-IDF, a movie whose actor shares only part of a name is more likely to be recommended, e.g. Robert De Niro has a high similarity with Robert Duvall because the first names are the same. This is not good for the recommendation system, since sharing a first name does not make two people the same actor.

In [53]:
def get_movie_recommendations_for_movies(movie_indices_input,
                                         plot_weight=2,
                                         title_weight=1,
                                         language_weight=0.3,
                                         writer_weight=1,
                                         genre_weight=3,
                                         director_weight=0.9,
                                         actors_weight=0.9,
                                         rated_weight=0.9):
    """Score every movie by its weighted similarity to the given movies.

    For each input movie, the cosine similarity to all movies is computed per
    feature, multiplied by that feature's weight and summed. The per-movie
    sums are then added up over all input movies.

    Parameters
    ----------
    movie_indices_input : iterable of int
        Row positions of the movies to find recommendations for.
    plot_weight, title_weight, language_weight, writer_weight, genre_weight,
    director_weight, actors_weight, rated_weight : float
        Multiplier applied to the similarity score of the matching feature.

    Returns
    -------
    list of float
        One total similarity score per movie, in the row order of the feature
        matrices (``len(title_list)`` entries). All scores are zero when
        ``movie_indices_input`` is empty.
    """
    # Feature matrices paired with their weights, in summation order.
    # The name features use the bag-of-words matrices; swap in tfidf_writer,
    # tfidf_director or tfidf_actors to compare against TF-IDF on names.
    weighted_features = (
        (tfidf_plot, plot_weight),
        (tfidf_title, title_weight),
        (tfidf_language, language_weight),
        (bow_writer, writer_weight),
        (tfidf_genre, genre_weight),
        (bow_director, director_weight),
        (bow_actors, actors_weight),
        (tfidf_rated, rated_weight),
    )

    n_movies = len(title_list)
    total_similarity = np.zeros(n_movies)
    for movie_index in movie_indices_input:
        # Whole-array arithmetic replaces the per-element Python loop.
        movie_similarity = np.zeros(n_movies)
        for feature_matrix, weight in weighted_features:
            movie_similarity += get_cosine_similarity(feature_matrix, movie_index) * weight
        total_similarity += movie_similarity

    return total_similarity.tolist()

The prep_movies function returns the imdb_dataset rows for the recommended movie indices, leaving out the movies the user selected. If the sort parameter is passed, the movies are sorted by IMDb rating.

In [47]:
def prep_movies(similar_movie_indices,sort,movie_indices):
    """Collect the dataset rows of the recommended movies.

    Parameters
    ----------
    similar_movie_indices : sequence of int
        Row positions in ``imdb_dataset`` of the candidate movies, most
        similar first.
    sort : sequence of int or None
        When not None, positions into ``similar_movie_indices`` in ascending
        rating order (e.g. the ``argsort`` of the candidates' ratings). The
        result is then ordered from highest to lowest rating. When None, the
        order of ``similar_movie_indices`` is kept.
    movie_indices : sequence of int
        Row positions of the movies the user selected; these are never
        included in the result.

    Returns
    -------
    list
        The ``imdb_dataset`` rows (one pandas Series each) of the candidates
        that are not in ``movie_indices``.
    """
    if sort is None:
        ordered_indices = similar_movie_indices
    else:
        # `sort` is ascending, so walk it backwards for best-rated first and
        # map each position back onto the candidate it refers to.
        n_candidates = len(similar_movie_indices)
        ordered_indices = [similar_movie_indices[position]
                           for position in np.asarray(sort)[::-1]
                           if position < n_candidates]

    return [imdb_dataset.iloc[index]
            for index in ordered_indices
            if index not in movie_indices]

Instantiate a variable with all the Title values sorted in alphabetical order

In [48]:
# Dropdown listing every movie title in sorted order; show_dashboard reads the selection from it.
dropdown_title = widgets.Dropdown(options=np.sort(imdb_dataset['Title']))


The show_dashboard function shows a dashboard where the user can select a movie title and the number of recommendations they want, and can optionally change the weights that are used in the movie recommendations function. The dashboard is built using ipywidgets. It keeps track of the movies the user added to their watched list using the Add button. When the user presses the Find Similarities button, the N requested recommendations are shown.

In [49]:
import ipywidgets as ipy

def show_dashboard():
    """Display the interactive recommendation dashboard.

    The dashboard shows the title dropdown, the number of recommendations, a
    "Sort on rating" checkbox, one weight slider per feature and three
    buttons:

    * Add               -- add the selected title to the list of chosen movies.
    * Find Similarities -- show recommendations for the chosen movies, using
                           the current slider values as feature weights.
    * Clear             -- forget the chosen movies, reset every slider to its
                           default and clear both output areas.

    Nothing is returned; all results are rendered into the notebook output.
    """
    # (slider label, keyword of get_movie_recommendations_for_movies, default).
    # One table drives slider creation, the recommender call and the reset,
    # so every slider is guaranteed to reach the recommender.
    weight_specs = (
        ("Genre", "genre_weight", 3),
        ("Plot", "plot_weight", 2),
        ("Title", "title_weight", 1),
        ("Writer", "writer_weight", 1),
        ("Director", "director_weight", 0.9),
        ("Actor", "actors_weight", 0.9),
        ("Language", "language_weight", 0.3),
        ("Rated", "rated_weight", 0.9),
    )

    # Row positions (not index labels) of the movies the user has added.
    movie_indices = list()
    out = Output()    # recommendations
    out2 = Output()   # currently selected movies

    sliders = {
        keyword: widgets.FloatSlider(description=description,
                                     min=0,
                                     max=10.0,
                                     step=0.1,
                                     value=default,
                                     orientation='vertical',
                                     continuous_update=False)
        for description, keyword, default in weight_specs
    }

    input_widget2 = widgets.FloatText(
        value=3,
        description='N similar:',
        disabled=False
    )
    checkbox = widgets.Checkbox(
               description='Sort on rating',)

    def btn_eventhandler(obj):
        """Show the recommendations for the movies added so far."""
        with out:
            clear_output()
        if not movie_indices:
            # Without a selection every score is zero and the ranking is arbitrary.
            with out:
                print("Add at least one movie first")
            return

        # A negative request is treated as zero recommendations.
        n_requested = max(int(input_widget2.value), 0)
        # The selected movies themselves are expected among the top hits and
        # are filtered out later, so fetch that many extra candidates.
        n_similarities = n_requested + len(movie_indices) + 1
        weights = {keyword: slider.value for keyword, slider in sliders.items()}
        total_similarity_list = get_movie_recommendations_for_movies(movie_indices, **weights)

        similar_movie_indices = np.asarray(total_similarity_list).argsort()[:-n_similarities:-1]
        sort = None
        if checkbox.value:
            # argsort yields row positions, hence iloc.
            rating_values = imdb_dataset.iloc[similar_movie_indices].imdbRating.astype(float)
            sort = np.argsort(rating_values.values)

        with out:
            print("These are your recommendations")
            display(HTML(movie_display.show(prep_movies(similar_movie_indices,sort,movie_indices))))

    def btn_eventhandler2(obj):
        """Add the title selected in the dropdown to the chosen movies."""
        with out2:
            clear_output()
        # Row positions of all movies carrying the selected title; the first one is used.
        matches = np.flatnonzero((imdb_dataset['Title'] == dropdown_title.value).values)
        if len(matches) == 0:
            with out2:
                print("Could not find the selected movie")
            return
        movie_indices.append(int(matches[0]))
        with out2:
            print("You selected these movie's")
            selected_movies = [imdb_dataset.iloc[i] for i in movie_indices]
            display(HTML(movie_display.show(selected_movies)))

    def btn_eventhandler3(obj):
        """Reset the selection, the sliders and both output areas."""
        movie_indices.clear()
        for _description, keyword, default in weight_specs:
            sliders[keyword].value = default

        try:
            with out2:
                clear_output()
            with out:
                clear_output()
        except Exception as e:
            print(e)

    btn = widgets.Button(description='Find Similarities')
    btn2 = widgets.Button(description='Add')
    btn3 = widgets.Button(description='Clear')

    btn.on_click(btn_eventhandler)
    btn2.on_click(btn_eventhandler2)
    btn3.on_click(btn_eventhandler3)

    print("Select movie title")
    widget_list = ipy.HBox(children=tuple(sliders.values()), layout = ipy.Layout())

    display(dropdown_title)
    display(input_widget2)
    display(checkbox)
    display(widget_list)
    display(btn2)
    display(btn)
    display(btn3)
    display(out2)
    display(out)



In [50]:
show_dashboard()

Select movie title


Dropdown(options=('#DUPE#', '#DUPE#', '#DUPE#', '$9.99', "'Hellboy': The Seeds of Creation", "'I Know Where I'…

FloatText(value=3.0, description='N similar:')

Checkbox(value=False, description='Sort on rating')

HBox(children=(FloatSlider(value=5.0, continuous_update=False, description='Genre', max=10.0, orientation='ver…

Button(description='Add', style=ButtonStyle())

Button(description='Find Similarities', style=ButtonStyle())

Button(description='Clear', style=ButtonStyle())

Output()

Output()