In [46]:
# Import kagglehub, which is used to download the dataset from Kaggle
import kagglehub

# Download the latest version of "The Movies Dataset" from Kaggle;
# the returned value is kept in `path` for later use
path = kagglehub.dataset_download("rounakbanik/the-movies-dataset")

# Print the local path where the dataset files were stored
print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/rounakbanik/the-movies-dataset/versions/7


## Data Understanding


In [47]:
# Import os for path handling, pandas for data processing, and the
# scikit-learn pieces used later (TF-IDF vectorizer and cosine similarity)
import os

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read 'movies_metadata.csv' from the directory returned by kagglehub (`path`)
# instead of a hard-coded absolute cache path, which stops working as soon as
# the dataset version, user, or machine changes.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids the mixed-dtype warning this file
# otherwise triggers on load.
df = pd.read_csv(os.path.join(path, 'movies_metadata.csv'), low_memory=False)

# Show the first 5 rows of the dataset
df.head()

  df = pd.read_csv('/root/.cache/kagglehub/datasets/rounakbanik/the-movies-dataset/versions/7/movies_metadata.csv')


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [48]:
# Show dataset info, including the number of columns and the dtype of each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [49]:
# Show descriptive statistics of the dataset with additional percentiles;
# the result is transposed so that each described column becomes a row
df.describe([0, 0.05, 0.50, 0.95, 0.99, 1]).T

Unnamed: 0,count,mean,std,min,0%,5%,50%,95%,99%,100%,max
revenue,45460.0,11209350.0,64332250.0,0.0,0.0,0.0,0.0,47808920.0,273087600.0,2787965000.0,2787965000.0
runtime,45203.0,94.1282,38.40781,0.0,0.0,11.0,95.0,138.0,185.0,1256.0,1256.0
vote_average,45460.0,5.618207,1.924216,0.0,0.0,0.0,6.0,7.8,8.7,10.0,10.0
vote_count,45460.0,109.8973,491.3104,0.0,0.0,0.0,10.0,434.0,2183.82,14075.0,14075.0


In [50]:
# Count the missing values in every column of the dataset
df.isnull().sum()

Unnamed: 0,0
adult,0
belongs_to_collection,40972
budget,0
genres,0
homepage,37684
id,0
imdb_id,17
original_language,11
original_title,0
overview,954


## Data Preparation

In [51]:
# Fill missing values in the 'overview' column with an empty string
# (the TF-IDF step below is fitted on this column)
df['overview'] = df['overview'].fillna('')

# Verify that no missing values remain in the 'overview' column
df['overview'].isnull().sum()

0

In [52]:
# Create the TF-IDF vectorizer, removing English stop words
tfidf = TfidfVectorizer(stop_words="english")

# Turn the 'overview' column into numeric vectors using TF-IDF
tfidf_matrix = tfidf.fit_transform(df['overview'])

# Show the dimensions of the resulting TF-IDF matrix (rows x columns)
tfidf_matrix.shape

(45466, 75827)

In [53]:
# Check the size of the title column so it can be compared with the
# number of rows of the TF-IDF matrix
df['title'].shape

(45466,)

In [54]:
# Peek at the contents of the TF-IDF matrix (optional, for inspection only).
# Only the first few rows are converted to a dense array: densifying the whole
# sparse matrix would allocate one float per (film, term) pair, which is
# billions of values at this matrix size, just to display a truncated preview.
tfidf_matrix[:5].toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [55]:
# Compute the cosine similarity between every pair of TF-IDF vectors to
# measure how similar the films are to each other.
# NOTE(review): the result has one row and one column per film, so memory
# grows quadratically with the number of films — confirm the runtime has
# enough RAM before re-running on the full dataset.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Show the dimensions of the resulting cosine-similarity matrix
cosine_sim.shape

(45466, 45466)

In [56]:
# Show the similarity of the film at index 1 to every other film
cosine_sim[1]

array([0.01504121, 1.        , 0.04681953, ..., 0.        , 0.02198641,
       0.00929411])

In [57]:
# Build a Series whose index is the film title and whose values are the
# film's row index in `df`
indices = pd.Series(df.index, index=df['title'])

# Count how often each title occurs (a count above 1 means a duplicated title)
indices.index.value_counts()

Unnamed: 0_level_0,count
title,Unnamed: 1_level_1
Cinderella,11
Alice in Wonderland,9
Hamlet,9
Les Misérables,8
Beauty and the Beast,8
...,...
Babies,1
The Green Room,1
Captain Conan,1
Like It Is,1


In [58]:
# Show the indices of the films titled "Cinderella" (before removing duplicates)
indices["Cinderella"]

Unnamed: 0_level_0,0
title,Unnamed: 1_level_1
Cinderella,993
Cinderella,13076
Cinderella,23507
Cinderella,23518
Cinderella,28391
Cinderella,28664
Cinderella,34254
Cinderella,35593
Cinderella,35595
Cinderella,41074


In [59]:
# Remove duplicated titles, keeping only the last occurrence of each one
indices = indices[~indices.index.duplicated(keep='last')]

# Show the index of "Cinderella" after the duplicates were removed
indices["Cinderella"]

45406

In [60]:
# Look up the index of the film "Cinderella"
movie_index = indices["Cinderella"]

# Show the similarity scores of "Cinderella" against every film
cosine_sim[movie_index]

array([0.        , 0.        , 0.        , ..., 0.        , 0.00792129,
       0.        ])

## Modeling and Result

In [61]:
# Build a DataFrame holding the similarity scores between "Cinderella" and every film
similarity_scores = pd.DataFrame(cosine_sim[movie_index], columns=["score"])

# Take the 10 films most similar to "Cinderella", excluding "Cinderella" itself.
# The film's own row is dropped explicitly by its index instead of skipping the
# first sorted row: the film is not guaranteed to sort first (its self-similarity
# is 0 when its overview is empty, and other films can tie with it at 1.0).
movie_indices = (
    similarity_scores
    .drop(index=movie_index)
    .sort_values("score", ascending=False)
    .head(10)
    .index
)

# Show the titles of the films similar to "Cinderella"
df['title'].iloc[movie_indices]

Unnamed: 0,title
33895,Freetown
34689,Come Tomorrow...
14606,Piter FM
15749,Comrade X
43813,Two Days
9555,Kamikaze Girls
24588,Chatterbox
41859,Free Floating
10604,Cuban Rafters
43069,Leave to Remain


In [62]:
# Membuat fungsi untuk merekomendasikan film berdasarkan konten (overview)
def content_based_recommender(title, cosine_sim, dataframe, top_n=10):
    """Recommend the films most similar to ``title`` based on a similarity matrix.

    Parameters
    ----------
    title : str
        Title of the film to find recommendations for. If the title occurs
        more than once in ``dataframe['title']``, the last occurrence is used.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row/column ``i`` is expected to correspond
        to the ``i``-th row (by position) of ``dataframe``.
    dataframe : pandas.DataFrame
        Frame containing a ``'title'`` column.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` most similar films, ordered from most to least
        similar. The queried film itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``dataframe['title']``.
    """
    titles = dataframe['title']

    # Map every title to its row POSITION rather than its index label: the
    # position is what is needed to address both `cosine_sim` and `.iloc`,
    # and it stays correct even if the frame was filtered or re-indexed.
    positions = pd.Series(range(len(dataframe)), index=titles)
    # For duplicated titles keep only the last occurrence
    positions = positions[~positions.index.duplicated(keep='last')]

    if title not in positions.index:
        raise KeyError(f"Title {title!r} not found in dataframe['title']")
    movie_pos = positions[title]

    # Similarity of the requested film to every film, labelled by position
    scores = pd.Series(cosine_sim[movie_pos])

    # Exclude the film itself explicitly. Skipping the first sorted entry is
    # not reliable: the film does not necessarily sort first (self-similarity
    # is 0 for an all-zero vector, and other films may tie with it at 1.0).
    scores = scores.drop(movie_pos)

    # Stable sort so that ties are resolved deterministically (by position)
    top_positions = (
        scores.sort_values(ascending=False, kind="mergesort").head(top_n).index
    )
    return titles.iloc[top_positions]

In [63]:
# Try the recommender function on the film "Minions"
content_based_recommender("Minions", cosine_sim, df)

Unnamed: 0,title
21161,Despicable Me 2
10558,A Story of Floating Weeds
18440,"What's Up, Scarlet?"
31251,The Mother Of Invention
16176,Madam Satan
32252,The Invisible Boy
308,Stuart Saves His Family
45136,Banana
5693,Soul Assassin
8046,Sherlock Holmes and the Secret Weapon


In [64]:
# Try the recommender function on the film "Family"
content_based_recommender("Family", cosine_sim, df)

Unnamed: 0,title
36203,Albela
27531,En rachâchant
4408,Rocket Gibraltar
27717,Finisterrae
27567,Snowballs
2966,All About My Mother
16962,Arthur
44549,You'll Never Be Alone
40964,What a Wonderful Family!
30509,Walking the Camino: Six Ways to Santiago


## Conclusion

Notebook ini mengimplementasikan sistem rekomendasi film berbasis konten dengan vektorisasi TF-IDF pada kolom "overview" film. Dengan menghitung cosine similarity antarvektor TF-IDF, sistem dapat menemukan 10 film yang paling mirip dengan sebuah film berdasarkan teks "overview"-nya. Judul yang duplikat ditangani dengan hanya mempertahankan entri terakhir untuk setiap judul, sehingga pencarian berdasarkan judul selalu merujuk ke tepat satu film. Sistem ini dapat digunakan untuk memberikan rekomendasi bagi judul film apa pun yang ada di dalam dataset. Pendekatan ini efektif untuk rekomendasi berbasis konten, namun bisa ditingkatkan lebih lanjut dengan menambahkan fitur lain seperti genre atau rating pengguna.