# Import and init

In [43]:
import os 
import json

import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk 
from nltk.corpus import stopwords

import config

# Read data

In [44]:
# Load the raw movie metadata from the CSV path declared in the project config
# and preview the first rows.
df0 = pd.read_csv(filepath_or_buffer = config.INPUT_DATA_RELPATH)
df0.head(n = 5)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


# Data preprocessing

## Prepared data (`df1`)

In [45]:
# Keep only the movies that have an overview: rows whose `overview` is null
# are dropped (not filled), and the index is renumbered from 0 so that row
# labels and row positions coincide.
df1 = (
    df0
    .dropna(subset = ['overview'])
    .reset_index(drop = True)
)

df1

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.312950,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4795,220000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",,9367,"[{""id"": 5616, ""name"": ""united states\u2013mexi...",es,El Mariachi,El Mariachi just wants to play his guitar and ...,14.269792,"[{""name"": ""Columbia Pictures"", ""id"": 5}]","[{""iso_3166_1"": ""MX"", ""name"": ""Mexico""}, {""iso...",1992-09-04,2040920,81.0,"[{""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}]",Released,"He didn't come looking for trouble, but troubl...",El Mariachi,6.6,238
4796,9000,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 10749, ""...",,72766,[],en,Newlyweds,A newlywed couple's honeymoon is upended by th...,0.642552,[],[],2011-12-26,0,85.0,[],Released,A newlywed couple's honeymoon is upended by th...,Newlyweds,5.9,5
4797,0,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...",http://www.hallmarkchannel.com/signedsealeddel...,231617,"[{""id"": 248, ""name"": ""date""}, {""id"": 699, ""nam...",en,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...",1.444476,"[{""name"": ""Front Street Pictures"", ""id"": 3958}...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2013-10-13,0,120.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,"Signed, Sealed, Delivered",7.0,6
4798,0,[],http://shanghaicalling.com/,126186,[],en,Shanghai Calling,When ambitious New York attorney Sam is sent t...,0.857008,[],"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-05-03,0,98.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,A New Yorker in Shanghai,Shanghai Calling,5.7,7


## Ingest to model (`df2`)

In [46]:
# Corpus handed to the vectoriser: the overview texts as a plain Python list,
# in the same order as the rows of `df1`.
df2 = df1['overview'].tolist()
print(len(df2))
df2[:5]

4800


['In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.',
 'Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of the Earth with Will Turner and Elizabeth Swann. But nothing is quite as it seems.',
 'A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. While M battles political forces to keep the secret service alive, Bond peels back the layers of deceit to reveal the terrible truth behind SPECTRE.',
 "Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's crimes to protect the late attorney's reputation and is subsequently hunted by the Gotham City Police Department. Eight years later, Batman encounters the mysterious Selina Kyle and the villainous Bane, a new terrorist leader who overwhelms Gotham's finest. The Dark Knight resurfaces to protect a

# Text vectorisation (TF-IDF)

In [47]:
# TF-IDF representation of the overviews:
#   - text is lowercased and accents are stripped,
#   - NLTK's English stop words are removed,
#   - both single words and two-word phrases become features,
#   - terms found in fewer than 2 documents are ignored.
vectorizer = TfidfVectorizer(
    lowercase = True,
    strip_accents = 'unicode',
    stop_words = stopwords.words('english'),
    ngram_range = (1, 2),
    min_df = 2
)
tfidf_matrix = vectorizer.fit_transform(df2)
vectorizer.get_feature_names_out()

array(['00', '00 agent', '000', ..., 'zookeeper', 'zoologists', 'zorro'],
      shape=(18898,), dtype=object)

In [48]:
# Convert the sparse TF-IDF matrix into a dense NumPy array; the recommendation
# code further down reads `tfidf_matrix_dense` row by row.
# NOTE(review): a dense array holds n_documents x n_features values
# (4800 x 18898 according to the outputs above) - confirm this still fits in
# memory before using a larger corpus.
tfidf_matrix_dense = tfidf_matrix.toarray()
tfidf_matrix_dense[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(5, 18898))

# Model recommendation

In [49]:
def get_index_of_movie_title(df: pd.DataFrame, 
                             title: str,
                             column: str = 'title'
                             ) -> int:
    """
    Return the index label of the first row of `df` whose `column` equals `title`.

    The comparison is exact (case-sensitive). If several rows match, the first
    one in row order is returned.

    Args:
        df: corpus dataframe.
        title: movie title to look for.
        column: name of the column that holds the titles.

    Returns:
        The index label of the first matching row, converted to `int`. This is
        also the row position only when `df` has a default 0..n-1 index.

    Raises:
        IndexError: if no row of `column` equals `title`.

    Example usage:
    ```py
    get_index_of_movie_title(df1, 'Avatar')
    >>> 0
    ```
    """
    is_match = (df[column] == title).to_numpy()
    matching_labels = df.index[is_match]
    if len(matching_labels) == 0:
        # Same exception type as the bare `.index[0]` lookup would raise, but
        # with a message that names the title that was not found.
        raise IndexError(
            f"No row with {column} == {title!r} was found in the dataframe."
        )
    return int(matching_labels[0])



In [50]:
import textdistance as td

def get_df_norm_simil(search_string, values_list, metric, metric_name: str):
    """
    Score every entry of `values_list` against `search_string` with `metric`.

    Helper function mainly used in `search_for_the_title`.

    Args:
        search_string: the query, passed as first argument to `metric`.
        values_list: candidates, each passed as second argument to `metric`.
        metric: callable `metric(search_string, candidate)` returning a score.
        metric_name: label written to the 'Type similarity' column.

    Returns:
        A dataframe with the columns 'Index' (position of the candidate in
        `values_list`), 'Normalized_similarity' (value returned by `metric`)
        and 'Type similarity' (`metric_name`), sorted by descending score and
        re-indexed from 0.
    """
    similarities = [
        metric(search_string, candidate)
        for candidate in values_list
    ]
    scored = pd.DataFrame({
        'Index': list(range(len(similarities))),
        'Normalized_similarity': similarities,
        'Type similarity': metric_name
    })
    ranked = scored.sort_values(by = 'Normalized_similarity', ascending = False)
    return ranked.reset_index(drop = True)

def search_for_the_title(search_string: str, 
                         title_list: list, 
                         verbose = False
                         ):
    """
    Fuzzy-search `search_string` among `title_list` and return the position
    of the best-matching title.

    Three case-insensitive normalized similarities are computed for every
    title and averaged:
      - edit-based (Damerau-Levenshtein),
      - sequence-based (Smith-Waterman),
      - token-based (Jaccard on the space-separated words).

    Args:
        search_string: the title (or fragment of it) to look for.
        title_list: candidate titles; the returned integer is a position in
            this list.
        verbose: when True, print either the exact match or the five most
            similar titles as a markdown table.

    Returns:
        int: position in `title_list` of the title with the highest mean
        similarity. Ties are resolved in favour of the lowest position.

    Raises:
        ValueError: if `title_list` is empty.
    """
    if len(title_list) == 0:
        raise ValueError("title_list is empty: there is nothing to search in.")

    # Lowercase once and reuse for all three metrics
    query = search_string.lower()
    titles_lower = [title.lower() for title in title_list]

    similarity_frames = [
        # Edit-based similarity
        get_df_norm_simil(
            query,
            titles_lower,
            td.damerau_levenshtein.normalized_similarity,
            'Edit'
        ),
        # Sequence-based similarity
        get_df_norm_simil(
            query,
            titles_lower,
            td.smith_waterman.normalized_similarity,
            'Sequence'
        ),
        # Token-based similarity (compares lists of words, not characters)
        get_df_norm_simil(
            query.split(' '),
            [title.split(' ') for title in titles_lower],
            td.jaccard.normalized_similarity,
            'Token'
        ),
    ]

    # Average the three scores per title and keep the five best.
    # `groupby` returns the titles ordered by position, and the stable sort
    # keeps that order among equal scores, so ties go to the lowest position.
    top_matches = (
        pd.concat(similarity_frames, axis = 0, ignore_index = True)
        .groupby('Index')
        .agg({
            'Normalized_similarity': 'mean'
        })
        .reset_index()
        .sort_values(by = 'Normalized_similarity', ascending = False, kind = 'stable')
        .head(5)
        .rename(columns = {'Normalized_similarity': 'Mean normalized similarity'})
    )
    top_matches['Title'] = [title_list[int(position)] for position in top_matches['Index']]
    top_matches = top_matches[['Index', 'Title', 'Mean normalized similarity']]

    # The frame is already sorted best-first
    highest_match = int(top_matches['Index'].iloc[0])

    # Verbose section
    if verbose:
        print(f'Search string: "{search_string}"')
        is_exact = top_matches['Title'].str.lower() == query
        if is_exact.any():
            print("We found the exact match to your title!")
            print(top_matches[is_exact].to_markdown(index = False))
        else:
            print("We didn't find an exact match to your title.")
            print("Here are the most similar titles:")
            print(top_matches.to_markdown(index = False))
        print('\n\n')
    return highest_match

# Try the fuzzy search on exact, differently-cased, partial and multi-word queries
for sample_query in (
    'The Avengers',
    'Avengers',
    'The avengers',
    'avengers',
    'spider',
    'avengers winter soldier',
):
    search_for_the_title(sample_query, df1['title'].tolist(), verbose = True)


Search string: "The Avengers"
We found the exact match to your title!
|   Index | Title        |   Mean normalized similarity |
|--------:|:-------------|-----------------------------:|
|      16 | The Avengers |                            1 |



Search string: "Avengers"
We didn't find an exact match to your title.
Here are the most similar titles:
|   Index | Title             |   Mean normalized similarity |
|--------:|:------------------|-----------------------------:|
|      16 | The Avengers      |                     0.722222 |
|    4668 | Swingers          |                     0.416667 |
|    4560 | The Toxic Avenger |                     0.367647 |
|    1321 | Texas Rangers     |                     0.362179 |
|    2838 | Rounders          |                     0.333333 |



Search string: "The avengers"
We found the exact match to your title!
|   Index | Title        |   Mean normalized similarity |
|--------:|:-------------|-----------------------------:|
|      16 | The Av

In [51]:
def get_top_n_similar_indices(title: str,
                              df: pd.DataFrame,
                              n: int = 5
                              ) -> tuple:
    """
    Rank the movies of the corpus by the cosine similarity of their TF-IDF
    vector to the one of `title`, and return the `n` best.

    Reads the module-level `tfidf_matrix_dense`; row `k` of that matrix is
    compared against the row whose number is returned by
    `get_index_of_movie_title(df, title)`.

    Note that the queried movie itself takes part in the ranking, so it is
    normally returned first (similarity 1 with itself).

    Args:
        title: movie title; it has to be present in `df`.
        df: dataframe used to look the title up.
        n: number of results to return (default 5). Capped at the number of
            rows of `tfidf_matrix_dense`.

    Returns:
        (indices, cosine_similarity_scores): two lists of equal length, ordered
        by descending similarity. `indices` holds row numbers of
        `tfidf_matrix_dense`, `cosine_similarity_scores` the matching scores.
    """
    # Get index of the movie title
    query_position = get_index_of_movie_title(df, title)

    # One vectorised call instead of one call per row: result has shape
    # (n_movies, 1) and is flattened to a 1-D array of scores.
    scores = cosine_similarity(
        tfidf_matrix_dense,
        tfidf_matrix_dense[query_position].reshape(1, -1)
    ).ravel()

    # A partition only separates the top entries without ordering them, so a
    # full sort is used to guarantee a best-first ranking. The stable sort keeps
    # the lowest row number first among equal scores.
    how_many = min(max(int(n), 0), len(scores))
    ranked_positions = np.argsort(-scores, kind = 'stable')[:how_many]

    indices = [int(position) for position in ranked_positions]
    cosine_similarity_scores = [float(scores[position]) for position in ranked_positions]

    return indices, cosine_similarity_scores

def format_recommendations(title: str,
                           training_data: pd.DataFrame,
                           indices_in_training_data: list,
                           cosine_similarity_scores: list,
                           format_print: str = 'list'
                           ) -> None:
    """
    Print the recommended movies for `title`.

    Args:
        title: title the recommendations were computed for (used in the
            header of the 'list' format).
        training_data: dataframe with at least the columns 'title' and
            'tagline'; rows are addressed by position.
        indices_in_training_data: row positions of the recommended movies.
        cosine_similarity_scores: similarity score of each recommended movie,
            in the same order as `indices_in_training_data`.
        format_print: can be 'list', 'table'

    Raises:
        ValueError: if `format_print` is not one of the supported formats, or
            if the two sequences differ in length.
    """
    positions = list(indices_in_training_data)
    scores = list(cosine_similarity_scores)

    if format_print not in ('list', 'table'):
        raise ValueError(
            f"Unknown format_print {format_print!r}: expected 'list' or 'table'."
        )
    if len(positions) != len(scores):
        raise ValueError(
            f"Got {len(positions)} indices but {len(scores)} similarity scores."
        )

    # Look the recommended rows up once, for both output formats
    recommended = training_data.iloc[positions]
    titles = recommended['title'].tolist()
    taglines = recommended['tagline'].tolist()

    if format_print == 'list':
        # Print the recommendations
        print(f"Top {len(positions)} titles similar to '{title}':\n")
        for position, score, recommended_title, tagline in zip(positions, scores, titles, taglines):
            print(f"- index: {position}")
            print(f"- cosine similarity score: {score}")
            print(f" - recommended title: '{recommended_title}'")
            print(f" - tagline: {tagline}")
            print('\n')
    else:
        df_markdown = pd.DataFrame({
            'index': positions, 
            'cosine': scores,
            'title': titles, 
            'tagline': taglines
        })
        print(df_markdown.to_markdown(index = False))
    return None




def get_n_recommendations(title_search: str,
                          df: pd.DataFrame,
                          print_format: str = 'list'
                          ) -> None:
    """
    Print movie recommendations for a free-text title.

    The title is first resolved with a fuzzy search over `df['title']`
    (the search details are printed), then the most similar movies to the
    resolved title are looked up and printed.

    Args:
        title_search: title typed by the user; it does not have to match a
            title of `df` exactly.
        df: dataframe holding the 'title' column that is searched.
        print_format: output format forwarded to `format_recommendations`.

    NOTE(review): the position found by the fuzzy search is converted back to
    a title and looked up again by title, so for duplicated titles the first
    occurrence is the one used - confirm this is intended.
    """
    all_titles = df['title'].tolist()

    # Resolve the free-text input to the closest title of the corpus
    matched_position = search_for_the_title(title_search, all_titles, verbose = True)
    title_found = all_titles[matched_position]

    ind, cosine_similarity_scores = get_top_n_similar_indices(title_found, df)
    print(f'Recommendations for the movie "{title_found}":\n')
    format_recommendations(title_found, df, ind, cosine_similarity_scores, print_format)
    print('\n')

# Example: recommendations for an exact title, rendered as a markdown table
get_n_recommendations('The Avengers', df1, print_format = 'table')

Search string: "The Avengers"
We found the exact match to your title!
|   Index | Title        |   Mean normalized similarity |
|--------:|:-------------|-----------------------------:|
|      16 | The Avengers |                            1 |



Recommendations for the movie "The Avengers":

|   index |   cosine | title                           | tagline                                      |
|--------:|---------:|:--------------------------------|:---------------------------------------------|
|      16 | 1        | The Avengers                    | Some assembly required.                      |
|      91 | 0.153163 | Independence Day: Resurgence    | We had twenty years to prepare. So did they. |
|     256 | 0.12732  | Allegiant                       | Break the boundaries of your world           |
|       7 | 0.115689 | Avengers: Age of Ultron         | A New Age Has Come.                          |
|     588 | 0.109449 | Wall Street: Money Never Sleeps | Gordon never gives up.   

In [52]:
# A few more queries, including one ('Avengers') that has no exact title match
for movie_query in ['Zombieland', '28 Days Later', 'Avengers']:
    get_n_recommendations(movie_query, df1, print_format = 'table')


Search string: "Zombieland"
We found the exact match to your title!
|   Index | Title      |   Mean normalized similarity |
|--------:|:-----------|-----------------------------:|
|    1988 | Zombieland |                            1 |



Recommendations for the movie "Zombieland":

|   index |   cosine | title            | tagline                                             |
|--------:|---------:|:-----------------|:----------------------------------------------------|
|    1988 | 1        | Zombieland       | This place is so dead                               |
|    2308 | 0.187876 | Land of the Dead | The dead shall inherit the Earth.                   |
|    1740 | 0.137952 | Kick-Ass 2       | You Can't Fight Your Destiny.                       |
|    2387 | 0.113387 | Day of the Dead  | The darkest day of horror the world has ever known. |
|    3559 | 0.109904 | Ouija            | Keep telling yourself it's just a game              |


Search string: "28 Days Later"
We found th