In [35]:
import sys
import os 
import pandas as pd 
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer
import unicodedata
import json

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from recommender.text_cleaner import clean_text, title_to_index

In [36]:
movie_data = pd.read_csv("../data/raw/movie_metadata.csv")

## Inspecting and cleaning data 

In [37]:
movie_data.head(20)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0
5,Color,Andrew Stanton,462.0,132.0,475.0,530.0,Samantha Morton,640.0,73058679.0,Action|Adventure|Sci-Fi,...,738.0,English,USA,PG-13,263700000.0,2012.0,632.0,6.6,2.35,24000
6,Color,Sam Raimi,392.0,156.0,0.0,4000.0,James Franco,24000.0,336530303.0,Action|Adventure|Romance,...,1902.0,English,USA,PG-13,258000000.0,2007.0,11000.0,6.2,2.35,0
7,Color,Nathan Greno,324.0,100.0,15.0,284.0,Donna Murphy,799.0,200807262.0,Adventure|Animation|Comedy|Family|Fantasy|Musi...,...,387.0,English,USA,PG,260000000.0,2010.0,553.0,7.8,1.85,29000
8,Color,Joss Whedon,635.0,141.0,0.0,19000.0,Robert Downey Jr.,26000.0,458991599.0,Action|Adventure|Sci-Fi,...,1117.0,English,USA,PG-13,250000000.0,2015.0,21000.0,7.5,2.35,118000
9,Color,David Yates,375.0,153.0,282.0,10000.0,Daniel Radcliffe,25000.0,301956980.0,Adventure|Family|Fantasy|Mystery,...,973.0,English,UK,PG,250000000.0,2009.0,11000.0,7.5,2.35,10000


In [38]:
movie_data.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [39]:
movie_data.describe()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
count,4993.0,5028.0,4939.0,5020.0,5036.0,4159.0,5043.0,5043.0,5030.0,5022.0,4551.0,4935.0,5030.0,5043.0,4714.0,5043.0
mean,140.194272,107.201074,686.509212,645.009761,6560.047061,48468410.0,83668.16,9699.063851,1.371173,272.770808,39752620.0,2002.470517,1651.754473,6.442138,2.220403,7525.964505
std,121.601675,25.197441,2813.328607,1665.041728,15020.75912,68452990.0,138485.3,18163.799124,2.013576,377.982886,206114900.0,12.474599,4042.438863,1.125116,1.385113,19320.44511
min,1.0,7.0,0.0,0.0,0.0,162.0,5.0,0.0,0.0,1.0,218.0,1916.0,0.0,1.6,1.18,0.0
25%,50.0,93.0,7.0,133.0,614.0,5340988.0,8593.5,1411.0,0.0,65.0,6000000.0,1999.0,281.0,5.8,1.85,0.0
50%,110.0,103.0,49.0,371.5,988.0,25517500.0,34359.0,3090.0,1.0,156.0,20000000.0,2005.0,595.0,6.6,2.35,166.0
75%,195.0,118.0,194.5,636.0,11000.0,62309440.0,96309.0,13756.5,2.0,326.0,45000000.0,2011.0,918.0,7.2,2.35,3000.0
max,813.0,511.0,23000.0,23000.0,640000.0,760505800.0,1689764.0,656730.0,43.0,5060.0,12215500000.0,2016.0,137000.0,9.5,16.0,349000.0


In [40]:
data = movie_data[['movie_title', 'genres', 'plot_keywords', 'imdb_score', 'duration', 'gross', 'budget']]

In [41]:
data = data.drop_duplicates(subset=['movie_title'], keep='first')

In [42]:
data.head()

Unnamed: 0,movie_title,genres,plot_keywords,imdb_score,duration,gross,budget
0,Avatar,Action|Adventure|Fantasy|Sci-Fi,avatar|future|marine|native|paraplegic,7.9,178.0,760505847.0,237000000.0
1,Pirates of the Caribbean: At World's End,Action|Adventure|Fantasy,goddess|marriage ceremony|marriage proposal|pi...,7.1,169.0,309404152.0,300000000.0
2,Spectre,Action|Adventure|Thriller,bomb|espionage|sequel|spy|terrorist,6.8,148.0,200074175.0,245000000.0
3,The Dark Knight Rises,Action|Thriller,deception|imprisonment|lawlessness|police offi...,8.5,164.0,448130642.0,250000000.0
4,Star Wars: Episode VII - The Force Awakens ...,Documentary,,7.1,,,


In [43]:
data.isna().sum()

movie_title        0
genres             0
plot_keywords    152
imdb_score         0
duration          15
gross            863
budget           484
dtype: int64

In [44]:
data = data.dropna(axis=0, subset=["plot_keywords"])

## Cleaning Numerical data 

In [45]:
scaler = StandardScaler()

In [46]:
numeric_cols = ['imdb_score', 'duration', 'gross', 'budget']

In [47]:
numeric_data = data[numeric_cols].fillna(0)

In [48]:
data[['imdb_score', 'duration', 'gross', 'duration']].isna().sum()

imdb_score      0
duration        8
gross         751
duration        8
dtype: int64

In [49]:
data[['imdb_score', 'duration']].head

<bound method NDFrame.head of       imdb_score  duration
0            7.9     178.0
1            7.1     169.0
2            6.8     148.0
3            8.5     164.0
5            6.6     132.0
...          ...       ...
5036         7.8      84.0
5037         6.4      95.0
5038         7.7      87.0
5039         7.5      43.0
5042         6.6      90.0

[4765 rows x 2 columns]>

In [50]:
numeric_scaled = scaler.fit_transform(numeric_data)

In [51]:
data.columns

Index(['movie_title', 'genres', 'plot_keywords', 'imdb_score', 'duration',
       'gross', 'budget'],
      dtype='object')

In [52]:
numeric_data.head()

Unnamed: 0,imdb_score,duration,gross,budget
0,7.9,178.0,760505847.0,237000000.0
1,7.1,169.0,309404152.0,300000000.0
2,6.8,148.0,200074175.0,245000000.0
3,8.5,164.0,448130642.0,250000000.0
5,6.6,132.0,73058679.0,263700000.0


In [53]:
data.head()

Unnamed: 0,movie_title,genres,plot_keywords,imdb_score,duration,gross,budget
0,Avatar,Action|Adventure|Fantasy|Sci-Fi,avatar|future|marine|native|paraplegic,7.9,178.0,760505847.0,237000000.0
1,Pirates of the Caribbean: At World's End,Action|Adventure|Fantasy,goddess|marriage ceremony|marriage proposal|pi...,7.1,169.0,309404152.0,300000000.0
2,Spectre,Action|Adventure|Thriller,bomb|espionage|sequel|spy|terrorist,6.8,148.0,200074175.0,245000000.0
3,The Dark Knight Rises,Action|Thriller,deception|imprisonment|lawlessness|police offi...,8.5,164.0,448130642.0,250000000.0
5,John Carter,Action|Adventure|Sci-Fi,alien|american civil war|male nipple|mars|prin...,6.6,132.0,73058679.0,263700000.0


In [54]:
data.columns

Index(['movie_title', 'genres', 'plot_keywords', 'imdb_score', 'duration',
       'gross', 'budget'],
      dtype='object')

In [55]:
print(data['movie_title'].head(10))

0                                       Avatar 
1     Pirates of the Caribbean: At World's End 
2                                      Spectre 
3                        The Dark Knight Rises 
5                                  John Carter 
6                                 Spider-Man 3 
7                                      Tangled 
8                      Avengers: Age of Ultron 
9       Harry Potter and the Half-Blood Prince 
10          Batman v Superman: Dawn of Justice 
Name: movie_title, dtype: object


## Faiss embedding data

In [56]:
data['clean_genres'] = data['genres'].apply(clean_text)

data['clean_keywords'] = data['plot_keywords'].apply(clean_text)

In [57]:

data["embedding_data"] = (
    data['movie_title'].str.lower() + " " +
    data['clean_keywords'] + " genre: " +
    data["clean_genres"]
)

## Encoding genre data

In [58]:
data["genre_list"] = data["genres"].apply(lambda x: x.split("|"))

In [59]:
mlb = MultiLabelBinarizer()

In [60]:
encoded_data = mlb.fit_transform(data['genre_list'])

In [61]:
encoded_data = pd.DataFrame(encoded_data, columns=mlb.classes_)

In [62]:
encoded_data

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4760,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4761,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4762,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4763,0,0,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,1,0,0


## Indexed title

In [63]:
indexed_title = title_to_index(data['movie_title'])

print(indexed_title)

{'Avatar ': 0, "Pirates of the Caribbean: At World's End ": 1, 'Spectre ': 2, 'The Dark Knight Rises ': 3, 'John Carter ': 4, 'Spider-Man 3 ': 5, 'Tangled ': 6, 'Avengers: Age of Ultron ': 7, 'Harry Potter and the Half-Blood Prince ': 8, 'Batman v Superman: Dawn of Justice ': 9, 'Superman Returns ': 10, 'Quantum of Solace ': 11, "Pirates of the Caribbean: Dead Man's Chest ": 12, 'The Lone Ranger ': 13, 'Man of Steel ': 14, 'The Chronicles of Narnia: Prince Caspian ': 15, 'The Avengers ': 16, 'Pirates of the Caribbean: On Stranger Tides ': 17, 'Men in Black 3 ': 18, 'The Hobbit: The Battle of the Five Armies ': 19, 'The Amazing Spider-Man ': 20, 'Robin Hood ': 21, 'The Hobbit: The Desolation of Smaug ': 22, 'The Golden Compass ': 23, 'King Kong ': 24, 'Titanic ': 25, 'Captain America: Civil War ': 26, 'Battleship ': 27, 'Jurassic World ': 28, 'Skyfall ': 29, 'Spider-Man 2 ': 30, 'Iron Man 3 ': 31, 'Alice in Wonderland ': 32, 'X-Men: The Last Stand ': 33, 'Monsters University ': 34, 'Tra

In [64]:
cleaned_indexed_title = {
    unicodedata.normalize("NFKD", title): index
    for title, index in indexed_title.items()
}

In [65]:
print(cleaned_indexed_title)

{'Avatar ': 0, "Pirates of the Caribbean: At World's End ": 1, 'Spectre ': 2, 'The Dark Knight Rises ': 3, 'John Carter ': 4, 'Spider-Man 3 ': 5, 'Tangled ': 6, 'Avengers: Age of Ultron ': 7, 'Harry Potter and the Half-Blood Prince ': 8, 'Batman v Superman: Dawn of Justice ': 9, 'Superman Returns ': 10, 'Quantum of Solace ': 11, "Pirates of the Caribbean: Dead Man's Chest ": 12, 'The Lone Ranger ': 13, 'Man of Steel ': 14, 'The Chronicles of Narnia: Prince Caspian ': 15, 'The Avengers ': 16, 'Pirates of the Caribbean: On Stranger Tides ': 17, 'Men in Black 3 ': 18, 'The Hobbit: The Battle of the Five Armies ': 19, 'The Amazing Spider-Man ': 20, 'Robin Hood ': 21, 'The Hobbit: The Desolation of Smaug ': 22, 'The Golden Compass ': 23, 'King Kong ': 24, 'Titanic ': 25, 'Captain America: Civil War ': 26, 'Battleship ': 27, 'Jurassic World ': 28, 'Skyfall ': 29, 'Spider-Man 2 ': 30, 'Iron Man 3 ': 31, 'Alice in Wonderland ': 32, 'X-Men: The Last Stand ': 33, 'Monsters University ': 34, 'Tra

In [66]:
os.makedirs("../data", exist_ok=True)

with open("../data/index/title_to_index.json", "w", encoding="utf-8") as f:
    json.dump(cleaned_indexed_title, f, indent=4, ensure_ascii=False)

## Artifacts

In [67]:
data.to_csv("../data/raw/cleaned_data.csv", index=False)
encoded_data.to_csv("../data/processed/encoded_data.csv", index=False)
np.save("../data/processed/numeric_scaled.npy", numeric_scaled)