# Recommender System with Content Based Filtering

## Import Library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from warnings import filterwarnings
filterwarnings('ignore')

from jcopml.utils import save_model, load_model

## Load Data

In [2]:
# Load the movie table from CSV (default integer index) and preview the first rows.
df = pd.read_csv("data/content_based_movie.csv")
df.head()

Unnamed: 0,title,genres,cast,keywords,director,overview,metadata
0,toy story,animation comedy family,tom_hanks tim_allen don_rickles,jealousy toy boy,john_lasseter,"Led by Woody, Andy's toys live happily in his ...","Toy Story Led by Woody, Andy's toys live happi..."
1,jumanji,adventure fantasy family,robin_williams jonathan_hyde kirsten_dunst,board_game disappearance based_on_children's_book,joe_johnston,When siblings Judy and Peter discover an encha...,Jumanji When siblings Judy and Peter discover ...
2,grumpier old men,romance comedy,walter_matthau jack_lemmon ann-margret,fishing best_friend duringcreditsstinger,howard_deutch,A family wedding reignites the ancient feud be...,Grumpier Old Men A family wedding reignites th...
3,waiting to exhale,comedy drama romance,whitney_houston angela_bassett loretta_devine,based_on_novel interracial_relationship single...,forest_whitaker,"Cheated on, mistreated and stepped on, the wom...","Waiting to Exhale Cheated on, mistreated and s..."
4,father of the bride part ii,comedy,steve_martin diane_keaton martin_short,baby midlife_crisis confidence,charles_shyer,Just when George Banks has recovered from his ...,Father of the Bride Part II Just when George B...


In [3]:
# (rows, columns) of the loaded table.
df.shape

(41362, 7)

In [4]:
# Data type of each column.
df.dtypes

title       object
genres      object
cast        object
keywords    object
director    object
overview    object
metadata    object
dtype: object

In [5]:
# Count missing values per column.
df.isna().sum()

title           0
genres       2031
cast         2194
keywords    12731
director      797
overview        0
metadata        0
dtype: int64

## Feature Extraction

In [6]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Build the stop-word list from a local text file: every line of the file
# becomes one entry (its newline stripped).
# NOTE: the name `stopwords` rebinds the `nltk.corpus.stopwords` import made
# earlier in this cell; the name is kept so existing references keep working.
# The `with` block guarantees the file handle is closed, even if reading fails.
with open('data/stopwords.txt', 'r') as f:
    stopwords = [line.replace('\n', '') for line in f]

# Punctuation marks are appended so they are filtered out together with the
# stop words when this list is passed to the vectorizer.
sw_indo = stopwords + list(punctuation)

In [7]:
# Configure TF-IDF: tokens come from NLTK's word_tokenize, entries listed in
# sw_indo are dropped, and min_df=2 ignores terms that occur in fewer than two
# documents.
tfidf = TfidfVectorizer(stop_words=sw_indo, tokenizer=word_tokenize, min_df=2)
# Fit on the `metadata` text and build the document-term matrix (one row per df row).
features = tfidf.fit_transform(df.metadata)

# Save Tfidf and Features

In [8]:
# Persist the fitted vectorizer and the feature matrix so they can be reloaded
# later without refitting.
save_model(tfidf, "tfidf.pkl")
save_model(features, "features.pkl")

Model is pickled as model/tfidf.pkl
Model is pickled as model/features.pkl


# Step 1: Encode what the user watched

In [9]:
# Title of the movie the user watched; it is lower-cased before the lookup.
title = 'Toy Story'

In [10]:
# Row labels of every movie whose title equals the lower-cased query.
idx = list(df[df.title == title.lower()].index)

# Use the metadata text of the first match as the content to encode.
content = df.loc[idx[0], 'metadata']
content

"Toy Story Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences. animation comedy family tom_hanks tim_allen don_rickles jealousy toy boy john_lasseter"

In [11]:
# Encode the watched movie's text with the already-fitted vectorizer (transform only, no refit).
code = tfidf.transform([content])

# Step 2 : Document Similarity

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
# Cosine similarity between the query vector and every row of the feature
# matrix; the result has a single row with one score per movie.
dist = cosine_similarity(code, features)
dist

array([[1.        , 0.03071134, 0.01113642, ..., 0.        , 0.01399582,
        0.        ]])

In [14]:
# Negating the scores makes argsort rank from most to least similar. Keep
# ranks 1-10 and skip rank 0, which here is the query movie itself
# (similarity 1.0).
rec_idx = (-dist).argsort()[0, 1:11]
rec_idx

array([ 2945, 14706,  9984, 22414, 23024, 33726, 39614, 35241, 24068,
       38993], dtype=int64)

# Step 3 : Recommend 

In [15]:
# Show the recommended rows; the positions from argsort double as labels
# because df uses the default integer index from read_csv.
df.loc[rec_idx]

Unnamed: 0,title,genres,cast,keywords,director,overview,metadata
2945,toy story 2,animation comedy family,tom_hanks tim_allen joan_cusack,museum prosecution identity_crisis,john_lasseter,"Andy heads off to Cowboy Camp, leaving his toy...","Toy Story 2 Andy heads off to Cowboy Camp, lea..."
14706,toy story 3,animation family comedy,tom_hanks tim_allen ned_beatty,hostage college toy,lee_unkrich,"Woody, Buzz, and the rest of Andy's toys haven...","Toy Story 3 Woody, Buzz, and the rest of Andy'..."
9984,the 40 year old virgin,comedy romance,steve_carell catherine_keener paul_rudd,first_time virgin,judd_apatow,Andy Stitzer has a pleasant life with a nice a...,The 40 Year Old Virgin Andy Stitzer has a plea...
22414,andy hardy's blonde trouble,comedy family romance,azura_skye annie_heller carla_gallo,independent_film,ken_kwapis,Andy is going to Wainwright College as did his...,Andy Hardy's Blonde Trouble Andy is going to W...
23024,small fry,animation family,,soul fight trap,shane_acker,A fast food restaurant mini variant of Buzz fo...,Small Fry A fast food restaurant mini variant ...
33726,toy reanimator,science_fiction fantasy,saeed_jaffrey kareem_samar zohra_sehgal,,ismail_merchant,Set in a toy store specialized in antique toys...,Toy Reanimator Set in a toy store specialized ...
39614,andy kaufman plays carnegie hall,,nanni_moretti laura_morante roberto_vezzosi,germany political_satire neo-nazis,nanni_moretti,Andy Kaufman's legendary sold-out Carnegie Hal...,Andy Kaufman Plays Carnegie Hall Andy Kaufman'...
35241,superstar: the life and times of andy warhol,documentary,qiang_chen xin-gang_wang mei_xiang,,xie_jin,Documentary portrait of Andy Warhol.,Superstar: The Life and Times of Andy Warhol D...
24068,"silent night, deadly night 5: the toy maker",horror science_fiction,josé_elías_moreno cesáreo_quezadas josé_luis_a...,mexico santa_claus children,rené_cardona,A young boy sees his father killed by a toy th...,"Silent Night, Deadly Night 5: The Toy Maker A ..."
38993,andy peters: exclamation mark question point,comedy,tōru_furuya hirotaka_suzuoki kōichi_hashimoto,middle_east drug_dealer family_relationships,shigeyasu_yamauchi,Exclamation Mark Question Point is the debut s...,Andy Peters: Exclamation Mark Question Point E...


# Wrap the modeling code: ML Engineering

In [16]:
import pandas as pd
from warnings import filterwarnings
filterwarnings('ignore')
from jcopml.utils import load_model
from sklearn.metrics.pairwise import cosine_similarity

# Load the feature matrix and the fitted TF-IDF vectorizer saved earlier in this notebook.
# NOTE(review): save_model reported these artifacts as pickled; only load
# pickle files that come from a trusted source.
features = load_model('model/features.pkl')
tfidf = load_model('model/tfidf.pkl')

class RecommenderSystem:
    """Content-based recommender that ranks rows by cosine similarity.

    Parameters
    ----------
    data : str or path-like
        CSV file passed to ``pd.read_csv``. It must contain a ``title`` column,
        the column named by ``content_col``, and the ``title``..``genres``
        column range returned by :meth:`recommend`.
    encoder : object
        Fitted encoder exposing ``transform(list_of_text)``.
    features : array-like or sparse matrix
        Pre-computed encodings, one row per row of the CSV, in the same order.
    content_col : str
        Name of the column whose text is encoded for the query item.
    """

    def __init__(self, data, encoder, features, content_col):
        self.df = pd.read_csv(data)
        self.content_col = content_col
        self.encoder = encoder
        self.features = features

    def recommend(self, title, topk=10):
        """Return the ``topk`` rows most similar to the item named ``title``.

        Parameters
        ----------
        title : str
            Title to look up; it is lower-cased before being compared with the
            ``title`` column. If several rows match, the first one is used.
        topk : int, default 10
            Number of recommendations to return. Values <= 0 give an empty
            result.

        Returns
        -------
        pandas.DataFrame
            The ``title`` through ``genres`` columns of the recommended rows,
            ordered from most to least similar.

        Raises
        ------
        IndexError
            If no row has the requested title.
        """
        idx = list(self.df[self.df.title == title.lower()].index)
        if not idx:
            # Same exception type as the bare idx[0] lookup used to raise,
            # but with a message that names the actual problem.
            raise IndexError(f"title {title!r} not found in the dataset")

        content = self.df.loc[idx[0], self.content_col]
        code = self.encoder.transform([content])

        dist = cosine_similarity(code, self.features)
        # Rank 0 is skipped: it is expected to be the queried item itself,
        # which matches its own encoding with the highest similarity.
        # The slice honours `topk` instead of a fixed 10.
        rec_idx = (-dist).argsort()[0, 1:1 + max(topk, 0)]
        return self.df.loc[rec_idx, 'title':'genres']

In [17]:
recsys = RecommenderSystem('data/content_based_movie.csv', tfidf, features, 'metadata')

In [18]:
recsys.recommend(title='aladin')

Unnamed: 0,title,genres
29345,aladdin and the death lamp,tv_movie adventure fantasy
4782,jimmy neutron: boy genius,action adventure animation
20182,blue jasmine,comedy drama
35006,aladdin and his magic lamp,adventure family romance
35451,platinum the dance movie,romance music comedy
14316,lupin the third: sweet lost night,action animation comedy
1470,wedding bell blues,comedy romance
1959,the return of jafar,family adventure animation
29943,16 wishes,fantasy drama family
12894,comet in moominland,animation comedy fantasy
