In [21]:
import pandas as pd
import re

In [22]:
CATEGORIES = ["action", "adventure", "animation", "biography", "comedy", "crime", "documentary", "drama", "family",
              "fantasy", "film_noir", "history", "horror", "music", "musical", "mystery", "romance", "sci_fi",
              "short", "sport", "superhero", "thriller", "war", "western"]

In [23]:
dataframes = []

for genre in CATEGORIES:
    df = pd.read_csv(f"./data/{genre}_crawled_data.csv")
    dataframes.append(df)

df = pd.concat(dataframes)
df.head()

Unnamed: 0,genre,title,poster_link,description,labels
0,action,House of the Dragon,https://m.media-amazon.com/images/M/MV5BZjBiOG...,An internal succession war within House Targar...,Action Adventure Drama
1,action,The Lord of the Rings: The Rings of Power,https://m.media-amazon.com/images/M/MV5BNTg3Nj...,Epic drama set thousands of years before the e...,Action Adventure Drama
2,action,Black Adam,https://m.media-amazon.com/images/M/MV5BYzZkOG...,"Nearly 5,000 years after he was bestowed with ...",Action Adventure Fantasy
3,action,Andor,https://m.media-amazon.com/images/M/MV5BNDgxNT...,Prequel series to Star Wars' 'Rogue One'. In a...,Action Adventure Drama
4,action,The School for Good and Evil,https://m.media-amazon.com/images/M/MV5BNzM1OD...,Best friends Sophie and Agatha find themselves...,Action Comedy Drama


In [24]:
len(df)

7200

### Convert labels string to one hot encoding

Add superhero label to labels (because IMDB guys to crazy)

In [25]:
labels = df.labels.values
genres = df.genre.values

df.labels = labels + " " + genres

Convert labels to one got encoding

In [27]:
set_of_all_labels = set()

for item in df.labels.apply(lambda x: list(x.split())):
    set_of_all_labels.update(item)

print(len(set_of_all_labels))
set_of_all_labels

51


{'Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'Game-Show',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Short',
 'Sport',
 'Talk-Show',
 'Thriller',
 'War',
 'Western',
 'action',
 'adventure',
 'animation',
 'biography',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'family',
 'fantasy',
 'film_noir',
 'history',
 'horror',
 'music',
 'musical',
 'mystery',
 'romance',
 'sci_fi',
 'short',
 'sport',
 'superhero',
 'thriller',
 'war',
 'western'}

In [28]:
def convert_string_to_genre(string):
    return string.lower().replace("-", "_")

In [29]:
all_labels = [convert_string_to_genre(item) for item in set_of_all_labels]
all_labels.extend(df.genre.unique())
all_labels

['news',
 'drama',
 'comedy',
 'music',
 'romance',
 'comedy',
 'thriller',
 'music',
 'fantasy',
 'western',
 'film_noir',
 'thriller',
 'mystery',
 'documentary',
 'horror',
 'history',
 'family',
 'horror',
 'game_show',
 'family',
 'animation',
 'sci_fi',
 'adventure',
 'biography',
 'short',
 'short',
 'crime',
 'sci_fi',
 'sport',
 'reality_tv',
 'war',
 'action',
 'musical',
 'documentary',
 'western',
 'drama',
 'mystery',
 'musical',
 'war',
 'animation',
 'adventure',
 'fantasy',
 'romance',
 'crime',
 'biography',
 'history',
 'sport',
 'action',
 'superhero',
 'film_noir',
 'talk_show',
 'action',
 'adventure',
 'animation',
 'biography',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'family',
 'fantasy',
 'film_noir',
 'history',
 'horror',
 'music',
 'musical',
 'mystery',
 'romance',
 'sci_fi',
 'short',
 'sport',
 'superhero',
 'thriller',
 'war',
 'western']

In [30]:
existed_labels = list(set(CATEGORIES).intersection(all_labels))
existed_labels.sort()
print(len(existed_labels))
existed_labels

24


['action',
 'adventure',
 'animation',
 'biography',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'family',
 'fantasy',
 'film_noir',
 'history',
 'horror',
 'music',
 'musical',
 'mystery',
 'romance',
 'sci_fi',
 'short',
 'sport',
 'superhero',
 'thriller',
 'war',
 'western']

In [31]:
one_hots = []

for item in df.labels:
    one_hot = [1 if genre in convert_string_to_genre(item) else 0 for genre in existed_labels]
    one_hots.append(one_hot)

In [32]:
df[existed_labels] = one_hots

In [33]:
df.head()

Unnamed: 0,genre,title,poster_link,description,labels,action,adventure,animation,biography,comedy,...,musical,mystery,romance,sci_fi,short,sport,superhero,thriller,war,western
0,action,House of the Dragon,https://m.media-amazon.com/images/M/MV5BZjBiOG...,An internal succession war within House Targar...,Action Adventure Drama action,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,action,The Lord of the Rings: The Rings of Power,https://m.media-amazon.com/images/M/MV5BNTg3Nj...,Epic drama set thousands of years before the e...,Action Adventure Drama action,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,action,Black Adam,https://m.media-amazon.com/images/M/MV5BYzZkOG...,"Nearly 5,000 years after he was bestowed with ...",Action Adventure Fantasy action,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,action,Andor,https://m.media-amazon.com/images/M/MV5BNDgxNT...,Prequel series to Star Wars' 'Rogue One'. In a...,Action Adventure Drama action,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,action,The School for Good and Evil,https://m.media-amazon.com/images/M/MV5BNzM1OD...,Best friends Sophie and Agatha find themselves...,Action Comedy Drama action,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [34]:
df.drop(columns=["labels"], inplace=True)

In [35]:
df.head()

Unnamed: 0,genre,title,poster_link,description,action,adventure,animation,biography,comedy,crime,...,musical,mystery,romance,sci_fi,short,sport,superhero,thriller,war,western
0,action,House of the Dragon,https://m.media-amazon.com/images/M/MV5BZjBiOG...,An internal succession war within House Targar...,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,action,The Lord of the Rings: The Rings of Power,https://m.media-amazon.com/images/M/MV5BNTg3Nj...,Epic drama set thousands of years before the e...,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,action,Black Adam,https://m.media-amazon.com/images/M/MV5BYzZkOG...,"Nearly 5,000 years after he was bestowed with ...",1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,action,Andor,https://m.media-amazon.com/images/M/MV5BNDgxNT...,Prequel series to Star Wars' 'Rogue One'. In a...,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,action,The School for Good and Evil,https://m.media-amazon.com/images/M/MV5BNzM1OD...,Best friends Sophie and Agatha find themselves...,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


### Let's compute column with path to poster

In [36]:
def from_title_to_poster_path(genre, title):
    transformed_title = re.sub("\\W", "_", title.casefold()) + ".jpg"
    return "./data/" + genre + "/" + transformed_title

In [37]:
poster_pathes = []

for genre, title in list(zip(df["genre"], df["title"])):
    poster_pathes.append(from_title_to_poster_path(genre, title))

In [38]:
df["poster_path"] = poster_pathes

In [39]:
df.head()

Unnamed: 0,genre,title,poster_link,description,action,adventure,animation,biography,comedy,crime,...,mystery,romance,sci_fi,short,sport,superhero,thriller,war,western,poster_path
0,action,House of the Dragon,https://m.media-amazon.com/images/M/MV5BZjBiOG...,An internal succession war within House Targar...,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,./data/action/house_of_the_dragon.jpg
1,action,The Lord of the Rings: The Rings of Power,https://m.media-amazon.com/images/M/MV5BNTg3Nj...,Epic drama set thousands of years before the e...,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,./data/action/the_lord_of_the_rings__the_rings...
2,action,Black Adam,https://m.media-amazon.com/images/M/MV5BYzZkOG...,"Nearly 5,000 years after he was bestowed with ...",1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,./data/action/black_adam.jpg
3,action,Andor,https://m.media-amazon.com/images/M/MV5BNDgxNT...,Prequel series to Star Wars' 'Rogue One'. In a...,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,./data/action/andor.jpg
4,action,The School for Good and Evil,https://m.media-amazon.com/images/M/MV5BNzM1OD...,Best friends Sophie and Agatha find themselves...,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,./data/action/the_school_for_good_and_evil.jpg


In [41]:
df.drop(columns=["genre", "poster_link"], inplace=True)

In [42]:
df.to_csv("./data/preprocessed.csv", index=False)