## Install/Import Libraries


In [None]:
!pip install scikit-learn



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Term Frequency-Inverse Document Frequency (TF-IDF) is a feature-extraction technique used in
# Natural Language Processing (NLP) to evaluate how important a word is to a document within the corpus.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Read the hosted movie CSV into a pandas DataFrame
movies = pd.read_csv(filepath_or_buffer='https://data.neo4j.com/intro/movies/movies.csv')

In [None]:
# Preview the first five rows of the DataFrame
movies.head(n=5)

Unnamed: 0,title,released,tagline
0,The Matrix,1999,Welcome to the Real World
1,Something's Gotta Give,1975,
2,Ninja Assassin,2009,Prepare to enter a secret world of assassins
3,The Matrix Reloaded,2003,Free your mind
4,Stand By Me,1995,"For some, it's the last real taste of innocenc..."


In [None]:
# DataFrame.size is the total number of cells (rows x columns), not the number of movies
print(movies.size)

# shape is (rows, columns); the first entry is the number of movies
movies.shape

114


In [None]:
# Replace missing titles/taglines with empty strings and make both columns plain str
movies['title'] = movies['title'].fillna("").astype('str')
movies['tagline'] = movies['tagline'].fillna("").astype('str')

# Series of movie titles, in the same row order as the DataFrame
titles = movies['title']

# Kept as the last expression of the cell so the cleaned frame is actually rendered
movies.head()

<class 'pandas.core.series.Series'>


# Recommend Movie Based on Title

In [None]:
# Turn each title into TF-IDF features over its unigrams and bigrams (ngram_range=(1, 2)),
# excluding English stop words, for example 'a', 'and', 'the', etc.
# min_df=1 keeps every term that occurs at all; an integer min_df of 0 means the same thing
# but is rejected by the parameter validation in recent scikit-learn releases.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')

tfidf_matrix = tf.fit_transform(titles)

# Print out the title of the first movie
#print(tfidf_matrix)
print(titles.iloc[0])

# (number of titles, number of n-gram features); last expression so it is displayed
tfidf_matrix.shape

The Matrix


In [None]:
# Display the cosine-similarity illustration from the Neo4j Graph Data Science docs.
# Markdown image syntax (![alt](url)) only works in a markdown cell: in a code cell the
# leading "!" is a shell escape, so the line is handed to bash and fails with a syntax error.
from IPython.display import HTML, Image

Image(url="https://neo4j.com/docs/graph-data-science/current/_images/cosine-similarity.png")

/bin/bash: -c: line 0: syntax error near unexpected token `('
/bin/bash: -c: line 0: `[Cosine Similarity]("https://neo4j.com/docs/graph-data-science/current/_images/cosine-similarity.png")'


In [None]:
# Pairwise cosine similarity between all title vectors; with a single argument
# scikit-learn compares the matrix against itself, giving a square score matrix.
cosine_similarity_mx = cosine_similarity(tfidf_matrix)

# Similarity of the first movie to every movie (itself included)
print(cosine_similarity_mx[0, :])

[1.         0.         0.         0.50408245 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.50408245 0.         0.         0.         0.         0.
 0.         0.        ]


In [None]:
# One-dimensional lookup Series: movie title (index) -> row label in the movies DataFrame (value)
indices = pd.Series(data=movies.index, index=titles)

# Function that gets movie recommendations based on the cosine similarity score of movie titles
def recommend_movie_by_title(title, number_of_movie):
    """Return the titles most similar to ``title``, best match first.

    Scores are read from the module-level ``cosine_similarity_mx``; ``indices``
    maps a title to its row and ``titles`` maps a row back to its title.

    Args:
        title: Exact title of the movie to find recommendations for.
        number_of_movie: How many recommendations to return.

    Returns:
        pandas Series with up to ``number_of_movie`` titles, ordered by
        descending similarity, never containing the queried movie itself.

    Raises:
        KeyError: If ``title`` is not a label of ``indices``.
    """
    idx = indices[title]

    # Pair every *other* movie's row with its score, e.g.
    # [(1, 0.0), (2, 0.0), (3, 0.5040824532793767), (30, 0.5040824532793767), ...]
    # The queried movie is dropped by row number rather than by assuming it sorts
    # first: another title with a score of 1.0 could otherwise take its place.
    similarity_scores = [
        (row, score)
        for row, score in enumerate(cosine_similarity_mx[idx])
        if row != idx
    ]

    # Sort by the score (ss[1]) in descending order; the sort is stable, so ties
    # keep their original row order:
    # [(3, 0.5040824532793767), (30, 0.5040824532793767), ...]
    similarity_scores.sort(key=lambda ss: ss[1], reverse=True)

    # Keep exactly number_of_movie entries (the old [1:number_of_movie] slice
    # returned one fewer); a negative request is treated as zero.
    similarity_scores = similarity_scores[:max(number_of_movie, 0)]

    # Get the movie indices (ss[0]) from the similarity scores
    movie_indices = [ss[0] for ss in similarity_scores]
    return titles.iloc[movie_indices]

In [None]:
# Show at most five recommendations for 'The Matrix'
recommend_movie_by_title(title='The Matrix', number_of_movie=5).head(n=5)

3        The Matrix Reloaded
30    The Matrix Revolutions
1     Something's Gotta Give
2             Ninja Assassin
Name: title, dtype: object