In [1]:
import pandas as pd
import re

In [2]:
# Genre slugs: each one names a crawled CSV file and, later, a one-hot column.
CATEGORIES = (
    "action adventure animation biography comedy crime documentary drama family "
    "fantasy film_noir history horror music musical mystery romance sci_fi "
    "short sport superhero thriller war western"
).split()

In [3]:
# Read one crawled CSV per genre, then stack them into a single frame.
dataframes = [pd.read_csv(f"./data/{genre}_crawled_data.csv") for genre in CATEGORIES]

df = pd.concat(dataframes)
df.head()

Unnamed: 0,genre,title,poster_link,description,labels
0,action,House of the Dragon,https://m.media-amazon.com/images/M/MV5BZjBiOG...,An internal succession war within House Targar...,Action Adventure Drama
1,action,The Lord of the Rings: The Rings of Power,https://m.media-amazon.com/images/M/MV5BNTg3Nj...,Epic drama set thousands of years before the e...,Action Adventure Drama
2,action,Black Adam,https://m.media-amazon.com/images/M/MV5BYzZkOG...,"Nearly 5,000 years after he was bestowed with ...",Action Adventure Fantasy
3,action,Andor,https://m.media-amazon.com/images/M/MV5BNDgxNT...,Prequel series to Star Wars' 'Rogue One'. In a...,Action Adventure Drama
4,action,The School for Good and Evil,https://m.media-amazon.com/images/M/MV5BNzM1OD...,Best friends Sophie and Agatha find themselves...,Action Comedy Drama


In [4]:
# Total number of rows after concatenating all per-genre crawls.
len(df)

7200

### Convert the labels string to a one-hot encoding

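Add the crawl genre to each title's labels. This matters for `superhero`: IMDb never lists it in a title's own labels, so it is only known from the genre the title was crawled under.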

In [5]:
# Append the crawl genre to every row's label string, separated by a space.
# NOTE: re-running this cell appends the genre again, since it overwrites `labels`.
labels, genres = df["labels"].values, df["genre"].values

df["labels"] = labels + " " + genres

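Squash repeated films: a title crawled under several genres is merged into a single row, with its label strings concatenated.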

In [6]:
# One row per title: keep the first genre and description seen, and join all
# label strings of the duplicates with spaces.
squash_rules = {"genre": "first", "description": "first", "labels": " ".join}
df_agg = df.groupby("title", as_index=False).agg(squash_rules)
df_agg.head()

Unnamed: 0,title,genre,description,labels
0,'71,war,"In 1971, a young and disoriented British soldi...",Action Crime Drama war
1,'83,sport,"On June 25, 1983, the Lord's Cricket Ground wi...",Biography Drama History sport
2,'Allo 'Allo!,history,"In France during World War II, René Artois run...",Comedy History War history Comedy History War war
3,10 Cloverfield Lane,sci_fi,A young woman is held in an underground bunker...,Drama Horror Mystery sci_fi
4,10 Things I Hate About You,comedy,"A pretty, popular teenager can't go out on a d...",Comedy Drama Romance comedy Comedy Drama Roman...


Convert labels to one-hot encoding

In [7]:
# Gather every distinct whitespace-separated token that occurs in any label string.
set_of_all_labels = {token for label_string in df.labels for token in label_string.split()}

print(len(set_of_all_labels))
set_of_all_labels

51


{'Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'Game-Show',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Short',
 'Sport',
 'Talk-Show',
 'Thriller',
 'War',
 'Western',
 'action',
 'adventure',
 'animation',
 'biography',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'family',
 'fantasy',
 'film_noir',
 'history',
 'horror',
 'music',
 'musical',
 'mystery',
 'romance',
 'sci_fi',
 'short',
 'sport',
 'superhero',
 'thriller',
 'war',
 'western'}

In [8]:
def convert_string_to_genre(string):
    """Normalise a label string to genre-slug form.

    The text is lower-cased and every hyphen becomes an underscore,
    e.g. ``"Sci-Fi"`` -> ``"sci_fi"``. Other characters (including spaces)
    are left as they are.
    """
    lowered = string.lower()
    return "_".join(lowered.split("-"))

In [9]:
# Normalised label tokens followed by the crawl genres; duplicates are kept.
all_labels = list(map(convert_string_to_genre, set_of_all_labels))
all_labels += list(df.genre.unique())
all_labels

['horror',
 'sci_fi',
 'western',
 'sci_fi',
 'biography',
 'history',
 'reality_tv',
 'news',
 'superhero',
 'crime',
 'animation',
 'sport',
 'short',
 'war',
 'music',
 'horror',
 'film_noir',
 'drama',
 'mystery',
 'music',
 'comedy',
 'documentary',
 'drama',
 'fantasy',
 'documentary',
 'romance',
 'action',
 'romance',
 'action',
 'family',
 'musical',
 'thriller',
 'war',
 'game_show',
 'comedy',
 'adventure',
 'film_noir',
 'adventure',
 'crime',
 'sport',
 'biography',
 'mystery',
 'western',
 'animation',
 'thriller',
 'family',
 'fantasy',
 'musical',
 'talk_show',
 'short',
 'history',
 'action',
 'adventure',
 'animation',
 'biography',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'family',
 'fantasy',
 'film_noir',
 'history',
 'horror',
 'music',
 'musical',
 'mystery',
 'romance',
 'sci_fi',
 'short',
 'sport',
 'superhero',
 'thriller',
 'war',
 'western']

In [10]:
# Categories that actually occur among the labels, in alphabetical order.
existed_labels = sorted(set(CATEGORIES) & set(all_labels))
print(len(existed_labels))
existed_labels

24


['action',
 'adventure',
 'animation',
 'biography',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'family',
 'fantasy',
 'film_noir',
 'history',
 'horror',
 'music',
 'musical',
 'mystery',
 'romance',
 'sci_fi',
 'short',
 'sport',
 'superhero',
 'thriller',
 'war',
 'western']

In [11]:
# Build one 0/1 row per title, with entries ordered like `existed_labels`.
# Labels are matched as whole whitespace-separated tokens. A plain substring
# test on the label string would wrongly set "music" for every title tagged
# only "Musical", because "music" is a substring of "musical".
one_hots = []

for item in df_agg.labels:
    present_genres = set(convert_string_to_genre(item).split())
    one_hot = [1 if genre in present_genres else 0 for genre in existed_labels]
    one_hots.append(one_hot)

In [12]:
# From here on `df` names the per-title frame. This only rebinds the name (no
# copy is made), so later in-place edits to `df` also change `df_agg`.
df = df_agg

In [13]:
# Add one column per genre in `existed_labels`, filled from the rows of `one_hots`.
df[existed_labels] = one_hots

In [14]:
# Preview the new one-hot genre columns next to the raw `labels` string.
df.head()

Unnamed: 0,title,genre,description,labels,action,adventure,animation,biography,comedy,crime,...,musical,mystery,romance,sci_fi,short,sport,superhero,thriller,war,western
0,'71,war,"In 1971, a young and disoriented British soldi...",Action Crime Drama war,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,'83,sport,"On June 25, 1983, the Lord's Cricket Ground wi...",Biography Drama History sport,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2,'Allo 'Allo!,history,"In France during World War II, René Artois run...",Comedy History War history Comedy History War war,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,10 Cloverfield Lane,sci_fi,A young woman is held in an underground bunker...,Drama Horror Mystery sci_fi,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
4,10 Things I Hate About You,comedy,"A pretty, popular teenager can't go out on a d...",Comedy Drama Romance comedy Comedy Drama Roman...,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0


In [15]:
# The raw label string is no longer needed once it is one-hot encoded; remove it in place.
del df["labels"]

In [16]:
# Preview the frame after removing the `labels` column.
df.head()

Unnamed: 0,title,genre,description,action,adventure,animation,biography,comedy,crime,documentary,...,musical,mystery,romance,sci_fi,short,sport,superhero,thriller,war,western
0,'71,war,"In 1971, a young and disoriented British soldi...",1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,'83,sport,"On June 25, 1983, the Lord's Cricket Ground wi...",0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,'Allo 'Allo!,history,"In France during World War II, René Artois run...",0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,10 Cloverfield Lane,sci_fi,A young woman is held in an underground bunker...,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
4,10 Things I Hate About You,comedy,"A pretty, popular teenager can't go out on a d...",0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0


In [17]:
# Number of rows left after grouping by title (one row per distinct title).
len(df)

4188

### Compute a column with the path to each poster

In [18]:
def from_title_to_poster_path(genre, title):
    """Return the poster file path for a title within a genre folder.

    The title is case-folded and every non-word character is replaced with
    an underscore to form the file name, e.g. ``("war", "'71")`` gives
    ``"./data/posters/war/_71.jpg"``.
    """
    file_name = re.sub("\\W", "_", title.casefold()) + ".jpg"
    return "/".join(("./data/posters", genre, file_name))

In [19]:
# One poster path per row, derived from that row's genre and title.
poster_pathes = [
    from_title_to_poster_path(genre, title)
    for genre, title in zip(df["genre"], df["title"])
]

In [20]:
# Store the computed paths as a new column; the list is in the same row order as `df`.
df["poster_path"] = poster_pathes

In [21]:
# Preview the frame with the new `poster_path` column.
df.head()

Unnamed: 0,title,genre,description,action,adventure,animation,biography,comedy,crime,documentary,...,mystery,romance,sci_fi,short,sport,superhero,thriller,war,western,poster_path
0,'71,war,"In 1971, a young and disoriented British soldi...",1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,./data/posters/war/_71.jpg
1,'83,sport,"On June 25, 1983, the Lord's Cricket Ground wi...",0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,./data/posters/sport/_83.jpg
2,'Allo 'Allo!,history,"In France during World War II, René Artois run...",0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,./data/posters/history/_allo__allo_.jpg
3,10 Cloverfield Lane,sci_fi,A young woman is held in an underground bunker...,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,./data/posters/sci_fi/10_cloverfield_lane.jpg
4,10 Things I Hate About You,comedy,"A pretty, popular teenager can't go out on a d...",0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,./data/posters/comedy/10_things_i_hate_about_y...


In [22]:
# `genre` was only needed to build the poster path; remove the column in place.
df.drop(labels="genre", axis="columns", inplace=True)

Check that the dataframe contains no NaN values

In [23]:
# Count missing values per column.
df.isnull().sum()

title           0
description    15
action          0
adventure       0
animation       0
biography       0
comedy          0
crime           0
documentary     0
drama           0
family          0
fantasy         0
film_noir       0
history         0
horror          0
music           0
musical         0
mystery         0
romance         0
sci_fi          0
short           0
sport           0
superhero       0
thriller        0
war             0
western         0
poster_path     0
dtype: int64

In [26]:
# Drop every row that has a missing value in any column.
df = df.dropna(axis=0, how="any")

In [27]:
# Re-count missing values per column after dropping incomplete rows.
df.isna().sum()

title          0
description    0
action         0
adventure      0
animation      0
biography      0
comedy         0
crime          0
documentary    0
drama          0
family         0
fantasy        0
film_noir      0
history        0
horror         0
music          0
musical        0
mystery        0
romance        0
sci_fi         0
short          0
sport          0
superhero      0
thriller       0
war            0
western        0
poster_path    0
dtype: int64

In [38]:
# Placeholder descriptions that carry no plot information. Matching is exact,
# so variants with and without the trailing period are listed separately.
unknown_descriptions = [
    "The plot is unknown at this time.",
    "Plot under wraps.",
    "Plot unknown.",
    "Plot under wraps",
    "Plot kept under wraps.",
]

In [39]:
# Keep only rows whose description is not one of the known placeholders.
is_placeholder = df["description"].isin(unknown_descriptions)
df = df.loc[~is_placeholder]

In [40]:
# Display the filtered frame as the cell output.
df

Unnamed: 0,title,description,action,adventure,animation,biography,comedy,crime,documentary,drama,...,mystery,romance,sci_fi,short,sport,superhero,thriller,war,western,poster_path
0,'71,"In 1971, a young and disoriented British soldi...",1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,1,0,./data/posters/war/_71.jpg
1,'83,"On June 25, 1983, the Lord's Cricket Ground wi...",0,0,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,./data/posters/sport/_83.jpg
2,'Allo 'Allo!,"In France during World War II, René Artois run...",0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,./data/posters/history/_allo__allo_.jpg
3,10 Cloverfield Lane,A young woman is held in an underground bunker...,0,0,0,0,0,0,0,1,...,1,0,1,0,0,0,0,0,0,./data/posters/sci_fi/10_cloverfield_lane.jpg
4,10 Things I Hate About You,"A pretty, popular teenager can't go out on a d...",0,0,0,0,1,0,0,1,...,0,1,0,0,0,0,0,0,0,./data/posters/comedy/10_things_i_hate_about_y...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4183,Zulu,Outnumbered British soldiers do battle with Zu...,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,./data/posters/history/zulu.jpg
4184,iCarly,"Carly hosts her own home-grown web show, iCarl...",0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,./data/posters/family/icarly.jpg
4185,"tick, tick... BOOM!","On the cusp of his 30th birthday, a promising ...",0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,./data/posters/biography/tick__tick____boom_.jpg
4186,¡García!,A cryogenically frozen super spy from the '60s...,1,1,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,./data/posters/sci_fi/_garcía_.jpg


In [41]:
# Write the preprocessed frame to disk; index=False leaves the row index out of the CSV.
df.to_csv("./data/preprocessed.csv", index=False)