# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from warnings import filterwarnings
filterwarnings('ignore')

from sklearn.model_selection import train_test_split

# Load Data

In [2]:
# Read the movie metadata table from the project's data folder, then preview
# its first five rows.
csv_path = "data/content_based_movie.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,title,genres,cast,keywords,director,overview,metadata
0,toy story,animation comedy family,tom_hanks tim_allen don_rickles,jealousy toy boy,john_lasseter,"Led by Woody, Andy's toys live happily in his ...","Toy Story Led by Woody, Andy's toys live happi..."
1,jumanji,adventure fantasy family,robin_williams jonathan_hyde kirsten_dunst,board_game disappearance based_on_children's_book,joe_johnston,When siblings Judy and Peter discover an encha...,Jumanji When siblings Judy and Peter discover ...
2,grumpier old men,romance comedy,walter_matthau jack_lemmon ann-margret,fishing best_friend duringcreditsstinger,howard_deutch,A family wedding reignites the ancient feud be...,Grumpier Old Men A family wedding reignites th...
3,waiting to exhale,comedy drama romance,whitney_houston angela_bassett loretta_devine,based_on_novel interracial_relationship single...,forest_whitaker,"Cheated on, mistreated and stepped on, the wom...","Waiting to Exhale Cheated on, mistreated and s..."
4,father of the bride part ii,comedy,steve_martin diane_keaton martin_short,baby midlife_crisis confidence,charles_shyer,Just when George Banks has recovered from his ...,Father of the Bride Part II Just when George B...


In [3]:
# Dimensions of the loaded frame, as a (rows, columns) tuple.
df.shape

(41362, 7)

In [4]:
# Storage dtype of every column in the loaded frame.
df.dtypes

title       object
genres      object
cast        object
keywords    object
director    object
overview    object
metadata    object
dtype: object

In [5]:
# Number of missing values in each column.
df.isna().sum()

title           0
genres       2031
cast         2194
keywords    12731
director      797
overview        0
metadata        0
dtype: int64

In [6]:
# Show the full `metadata` string stored under index label 0.
df['metadata'][0]

"Toy Story Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences. animation comedy family tom_hanks tim_allen don_rickles jealousy toy boy john_lasseter"

# Cleaning

In [7]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [8]:
def cleansing(text):
    """Normalise a text string into lower-case, purely alphabetic words.

    The text is lower-cased and split with NLTK's ``word_tokenize``. A token
    is kept only if it is longer than one character and consists entirely of
    alphabetic characters, so numbers, punctuation and tokens containing an
    underscore or hyphen (e.g. ``tom_hanks``) are discarded.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())
    kept = (tok for tok in tokens if tok.isalpha() and len(tok) > 1)
    return ' '.join(kept)

In [9]:
# Replace each row's `metadata` text with its cleaned form from `cleansing`.
df['metadata'] = df['metadata'].apply(cleansing)

In [10]:
# Show the `metadata` string stored under index label 0 to check the result
# of the cleaning step.
df['metadata'][0]

'toy story led by woody andy toys live happily in his room until andy birthday brings buzz lightyear onto the scene afraid of losing his place in andy heart woody plots against buzz but when circumstances separate buzz and woody from their owner the duo eventually learns to put aside their differences animation comedy family jealousy toy boy'

In [11]:
# Preview the first five rows of the frame.
df.head()

Unnamed: 0,title,genres,cast,keywords,director,overview,metadata
0,toy story,animation comedy family,tom_hanks tim_allen don_rickles,jealousy toy boy,john_lasseter,"Led by Woody, Andy's toys live happily in his ...",toy story led by woody andy toys live happily ...
1,jumanji,adventure fantasy family,robin_williams jonathan_hyde kirsten_dunst,board_game disappearance based_on_children's_book,joe_johnston,When siblings Judy and Peter discover an encha...,jumanji when siblings judy and peter discover ...
2,grumpier old men,romance comedy,walter_matthau jack_lemmon ann-margret,fishing best_friend duringcreditsstinger,howard_deutch,A family wedding reignites the ancient feud be...,grumpier old men family wedding reignites the ...
3,waiting to exhale,comedy drama romance,whitney_houston angela_bassett loretta_devine,based_on_novel interracial_relationship single...,forest_whitaker,"Cheated on, mistreated and stepped on, the wom...",waiting to exhale cheated on mistreated and st...
4,father of the bride part ii,comedy,steve_martin diane_keaton martin_short,baby midlife_crisis confidence,charles_shyer,Just when George Banks has recovered from his ...,father of the bride part ii just when george b...


# Create Stopwords

In [12]:
# Bag-of-words vectoriser created with scikit-learn's default settings.
bow = CountVectorizer()

In [13]:
# Learn the vocabulary from the `metadata` column and build the
# document-term count matrix in one pass.
bow_matrix = bow.fit_transform(df['metadata'])

In [14]:
# Total corpus frequency of every vocabulary term learned by `bow`.
#
# `CountVectorizer.get_feature_names()` was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer `get_feature_names_out()` and fall back to the old
# method only on releases that predate it.
if hasattr(bow, 'get_feature_names_out'):
    terms = bow.get_feature_names_out()
else:
    terms = bow.get_feature_names()

# Summing over axis 0 collapses the documents, leaving one total per term.
sums = bow_matrix.sum(axis=0)

# Build the table in one shot rather than appending tuples in a loop; the
# previous loop also rebound `term` from the vocabulary to a single string.
# `np.asarray(...).ravel()` flattens the 1 x n_terms result to a plain vector.
ranking = pd.DataFrame({
    'term': terms,
    'frekuensi': np.asarray(sums).ravel(),
})
ranking.head()

Unnamed: 0,term,frekuensi
0,aa,5
1,aaa,1
2,aaaaaaaah,1
3,aaaron,1
4,aachan,4


In [15]:
# Keep the 500 highest-frequency terms as the corpus-specific stopword table.
# NOTE(review): this rebinds `stopwords`, hiding the `nltk.corpus.stopwords`
# import made earlier in the notebook; consider a distinct name if the NLTK
# corpus is needed later.
by_frequency = ranking.sort_values(by='frekuensi', ascending=False)
stopwords = by_frequency.iloc[:500]
stopwords

Unnamed: 0,term,frekuensi
67487,the,141970
2260,and,71709
47627,of,69564
68176,to,69315
31812,in,46443
...,...,...
29511,hero,522
63562,sport,520
69388,trouble,517
26626,gold,516


In [16]:
# Write the stopword table to an Excel workbook, omitting the index column.
stopwords.to_excel('data/stopwords.xlsx', index=False)