In [1]:
# ! pip3 install pandas
# ! pip3 install scikit-learn
# ! pip3 install nltk
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')

In [2]:
import pandas as pd
import numpy as np
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load Data

In [3]:
df_movies_metadata = pd.read_csv('movies_metadata.csv')
print(df_movies_metadata.shape)
df_movies_metadata.head(5)

(45466, 24)


  df_movies_metadata = pd.read_csv('movies_metadata.csv')


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
df_id_overview = df_movies_metadata[['id','overview']]
print(df_id_overview.shape)
df_id_overview.head(5)

(45466, 2)


Unnamed: 0,id,overview
0,862,"Led by Woody, Andy's toys live happily in his ..."
1,8844,When siblings Judy and Peter discover an encha...
2,15602,A family wedding reignites the ancient feud be...
3,31357,"Cheated on, mistreated and stepped on, the wom..."
4,11862,Just when George Banks has recovered from his ...


In [5]:
np.sum(df_id_overview.isna())

id            0
overview    954
dtype: int64

In [6]:
# define parameters
id_col = 'id'
text_col = 'overview'

# Data Preprocessing

In [7]:
# fill na
df_id_overview = df_id_overview.fillna("")
df_id_overview.head(2)

Unnamed: 0,id,overview
0,862,"Led by Woody, Andy's toys live happily in his ..."
1,8844,When siblings Judy and Peter discover an encha...


In [8]:
# to lowercase
df_id_overview[text_col] = df_id_overview[text_col].str.lower()
df_id_overview.head(2)

Unnamed: 0,id,overview
0,862,"led by woody, andy's toys live happily in his ..."
1,8844,when siblings judy and peter discover an encha...


In [9]:
# remove numbers
df_id_overview[text_col] = df_id_overview[text_col].astype(str).str.replace(r"\d+", " ")
df_id_overview.head(2)

Unnamed: 0,id,overview
0,862,"led by woody, andy's toys live happily in his ..."
1,8844,when siblings judy and peter discover an encha...


In [10]:
# remove punctuation
df_id_overview[text_col] = df_id_overview[text_col].str.replace(
                    "[{}]".format(string.punctuation), " "
                )
df_id_overview.head(2)

Unnamed: 0,id,overview
0,862,"led by woody, andy's toys live happily in his ..."
1,8844,when siblings judy and peter discover an encha...


In [11]:
# remove whitespaces
df_id_overview[text_col] = df_id_overview[text_col].str.strip()
df_id_overview.head(2)

Unnamed: 0,id,overview
0,862,"led by woody, andy's toys live happily in his ..."
1,8844,when siblings judy and peter discover an encha...


In [12]:
# remove non_ascii characters
df_id_overview[text_col] = df_id_overview[text_col].apply(
                    lambda x: x.encode("ascii", errors="ignore").decode()
                )
df_id_overview.head(2)

Unnamed: 0,id,overview
0,862,"led by woody, andy's toys live happily in his ..."
1,8844,when siblings judy and peter discover an encha...


In [13]:
# remove html characters
df_id_overview[text_col] = df_id_overview[text_col].str.replace(r"<[^<>]*>", "", regex=True)
df_id_overview.head(2)

Unnamed: 0,id,overview
0,862,"led by woody, andy's toys live happily in his ..."
1,8844,when siblings judy and peter discover an encha...


In [14]:
# tokenization
df_id_overview[text_col] = df_id_overview[text_col].apply(word_tokenize)
df_id_overview.head(2)

Unnamed: 0,id,overview
0,862,"[led, by, woody, ,, andy, 's, toys, live, happ..."
1,8844,"[when, siblings, judy, and, peter, discover, a..."


In [15]:
# remove stop words
stop_words = set(stopwords.words("english"))
df_id_overview[text_col] = df_id_overview[text_col].apply(
                    lambda tokens: [
                        token for token in tokens if token not in stop_words
                    ]
                )
df_id_overview.head(2)

Unnamed: 0,id,overview
0,862,"[led, woody, ,, andy, 's, toys, live, happily,..."
1,8844,"[siblings, judy, peter, discover, enchanted, b..."


In [16]:
# stemming
stemmer = PorterStemmer()
df_id_overview[text_col] = df_id_overview[text_col].apply(
                    lambda tokens: [stemmer.stem(token) for token in tokens]
                )
df_id_overview.head(2)

Unnamed: 0,id,overview
0,862,"[led, woodi, ,, andi, 's, toy, live, happili, ..."
1,8844,"[sibl, judi, peter, discov, enchant, board, ga..."


In [17]:
# rejoin
df_id_overview[text_col] = df_id_overview[text_col].str.join(" ")
df_id_overview.head(2)

Unnamed: 0,id,overview
0,862,"led woodi , andi 's toy live happili room andi..."
1,8844,sibl judi peter discov enchant board game open...


# Feature Engineering

In [18]:
vectorizer = TfidfVectorizer(max_features=50).fit(df_id_overview[text_col])
vectors = vectorizer.transform(df_id_overview[text_col])

In [19]:
vectors.toarray().shape

(45466, 50)

In [20]:
vectors.toarray()[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [21]:
type(vectors)

scipy.sparse._csr.csr_matrix

# Calculate Similarity

In [22]:
id_list = df_id_overview[id_col]
id_list

0           862
1          8844
2         15602
3         31357
4         11862
          ...  
45461    439050
45462    111109
45463     67758
45464    227506
45465    461257
Name: id, Length: 45466, dtype: object

In [23]:
temp = pd.DataFrame(vectors.toarray())
print(temp.shape)
temp.head(2)

(45466, 50)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.522867,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.45136,0.419636,0.0


In [24]:
temp=temp[:100]
temp.shape

(100, 50)

In [25]:
id_list[:100]

0       862
1      8844
2     15602
3     31357
4     11862
      ...  
95      406
96    45549
97    63076
98    11062
99    13685
Name: id, Length: 100, dtype: object

In [26]:
similarity_score = pd.DataFrame(cosine_similarity(temp, temp),columns = id_list[:100],index=id_list[:100])

In [27]:
similarity_score

id,862,8844,15602,31357,11862,949,11860,45325,9091,710,...,9095,12158,9283,9208,40154,406,45549,63076,11062,13685
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
862,1.000000,0.426612,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.000000,0.510688,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000
8844,0.426612,1.000000,0.000000,0.181669,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.000000,0.217866,0.0,0.000000,0.000000,0.0,0.0,0.148617,0.0,0.000000
15602,0.000000,0.000000,1.000000,0.000000,0.0,0.000000,0.0,0.192268,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.296966,0.0,0.000000
31357,0.000000,0.181669,0.000000,1.000000,0.0,0.476905,0.0,0.138913,0.000000,0.0,...,0.000000,0.350967,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.499075
11862,0.000000,0.000000,0.000000,0.000000,1.0,0.000000,0.0,0.000000,0.391583,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
406,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,1.0,0.0,0.000000,0.0,0.000000
45549,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.168500,0.000000,0.0,...,0.357993,0.000000,0.0,0.000000,0.000000,0.0,1.0,0.000000,0.0,0.000000
63076,0.000000,0.148617,0.296966,0.000000,0.0,0.000000,0.0,0.086432,0.000000,0.0,...,0.000000,0.000000,0.0,0.134378,0.167268,0.0,0.0,1.000000,0.0,0.000000
11062,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,1.0,0.000000


# Make Recommendation

In [28]:
item = ['862']
k=5

# filter the target item
target_item_df = similarity_score[item]
target_item_df

id,862
id,Unnamed: 1_level_1
862,1.000000
8844,0.426612
15602,0.000000
31357,0.000000
11862,0.000000
...,...
406,0.000000
45549,0.000000
63076,0.000000
11062,0.000000


In [29]:
# remove target item
target_item_df = target_item_df[target_item_df.index != item[0]]
target_item_df

id,862
id,Unnamed: 1_level_1
8844,0.426612
15602,0.000000
31357,0.000000
11862,0.000000
949,0.000000
...,...
406,0.000000
45549,0.000000
63076,0.000000
11062,0.000000


In [30]:
# get top k recommendations
result = target_item_df.sort_values(item, ascending=False)[:k]
result

id,862
id,Unnamed: 1_level_1
4482,1.0
124626,0.768841
10607,0.717893
9536,0.51293
12158,0.510688


In [31]:
# validation 
print(df_movies_metadata[df_movies_metadata['id']=='862'][['original_title','overview']])

for i in result.index:
    print(df_movies_metadata[df_movies_metadata['id']==i][['original_title','overview']])

  original_title                                           overview
0      Toy Story  Led by Woody, Andy's toys live happily in his ...
   original_title                                           overview
67   Gazon maudit  After learning of her husband's infidelities, ...
   original_title                                           overview
76      Nico Icon  A look into the many lives of Christa Päffgen,...
                                       original_title  \
62  Don't Be a Menace to South Central While Drink...   

                                             overview  
62  When Ashtray (Shawn Wayans) moves to South Cen...  
   original_title                                           overview
64       Bio-Dome  Bud and Doyle are two losers who are doing not...
         original_title                                           overview
91  Vampire in Brooklyn  Maximillian, the lone survivor of a race of va...


# Iteration

In [32]:
# improve preprocessing

In [33]:
# improve feature engineering

In [34]:
# improve modeling