In [5]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm

In [6]:
# Genre vocabulary: handed to the zero-shot classifier as the candidate labels
# and reused as the suffix of every generated score column.
CATEGORIES = [
    "action",
    "adventure",
    "animation",
    "biography",
    "comedy",
    "crime",
    "documentary",
    "drama",
    "family",
    "fantasy",
    "film_noir",
    "history",
    "horror",
    "music",
    "musical",
    "mystery",
    "romance",
    "sci_fi",
    "short",
    "sport",
    "superhero",
    "thriller",
    "war",
    "western",
]

In [7]:
# Load the preprocessed movie table (title, description, 0/1 genre indicator
# columns and a poster path) from the working directory.
df = pd.read_csv(filepath_or_buffer="preprocessed.csv")
# Preview the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,title,description,action,adventure,animation,biography,comedy,crime,documentary,drama,...,mystery,romance,sci_fi,short,sport,superhero,thriller,war,western,poster_path
0,'71,"In 1971, a young and disoriented British soldi...",1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,1,0,./data/posters/war/_71.jpg
1,'83,"On June 25, 1983, the Lord's Cricket Ground wi...",0,0,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,./data/posters/sport/_83.jpg
2,'Allo 'Allo!,"In France during World War II, René Artois run...",0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,./data/posters/history/_allo__allo_.jpg
3,10 Cloverfield Lane,A young woman is held in an underground bunker...,0,0,0,0,0,0,0,1,...,1,0,1,0,0,0,0,0,0,./data/posters/sci_fi/10_cloverfield_lane.jpg
4,10 Things I Hate About You,"A pretty, popular teenager can't go out on a d...",0,0,0,0,1,0,0,1,...,0,1,0,0,0,0,0,0,0,./data/posters/comedy/10_things_i_hate_about_y...


In [8]:
# Per-column missing-value check: a True entry would mean that column holds
# at least one missing value.
df.isna().any(axis=0)

title          False
description    False
action         False
adventure      False
animation      False
biography      False
comedy         False
crime          False
documentary    False
drama          False
family         False
fantasy        False
film_noir      False
history        False
horror         False
music          False
musical        False
mystery        False
romance        False
sci_fi         False
short          False
sport          False
superhero      False
thriller       False
war            False
western        False
poster_path    False
dtype: bool

In [11]:
# Zero-shot classification pipeline built on the facebook/bart-large-mnli
# checkpoint; it is called once per title and once per description below.
# NOTE(review): device=0 presumably pins the model to the first GPU — confirm
# that one is available before running on a new machine.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=0,
)

In [None]:
# Score every movie title against every genre with the zero-shot classifier.
# title_score maps genre -> list of scores; each list grows by one entry per
# title, in the row order of df.
title_score = {genre: [] for genre in CATEGORIES}

for title in tqdm(df.title.values):
    result = classifier(title, CATEGORIES, multi_label=True)
    # File each score under the label it was returned with, so list positions
    # stay aligned with df rows regardless of the order labels come back in.
    for label, score in zip(result["labels"], result["scores"]):
        title_score[label].append(score)

# Show only the first few scores per genre: echoing the whole dict would print
# one float for every (row, genre) pair and bury the rest of the notebook.
{genre: scores[:3] for genre, scores in title_score.items()}

In [None]:
# Score every movie description against every genre with the zero-shot
# classifier. description_score maps genre -> list of scores; each list grows
# by one entry per description, in the row order of df.
description_score = {genre: [] for genre in CATEGORIES}

for description in tqdm(df.description.values):
    result = classifier(description, CATEGORIES, multi_label=True)
    # File each score under the label it was returned with, so list positions
    # stay aligned with df rows regardless of the order labels come back in.
    for label, score in zip(result["labels"], result["scores"]):
        description_score[label].append(score)

# Show only the first few scores per genre: echoing the whole dict would print
# one float for every (row, genre) pair and bury the rest of the notebook.
{genre: scores[:3] for genre, scores in description_score.items()}

In [14]:
# Attach the title-derived scores to df, one "label_<genre>" column per genre.
for genre in title_score:
    df[f"label_{genre}"] = title_score[genre]

In [15]:
# Attach the description-derived scores to df, one "description_<genre>"
# column per genre.
for genre in description_score:
    df[f"description_{genre}"] = description_score[genre]

In [16]:
# Preview the first five rows to confirm the new score columns were added.
df.head(n=5)

Unnamed: 0,title,description,action,adventure,animation,biography,comedy,crime,documentary,drama,...,description_musical,description_mystery,description_romance,description_sci_fi,description_short,description_sport,description_superhero,description_thriller,description_war,description_western
0,'71,"In 1971, a young and disoriented British soldi...",1,0,0,0,0,1,0,1,...,0.000729,0.016814,0.00021,0.004747,0.346715,0.000524,0.000896,0.510935,0.619829,0.060402
1,'83,"On June 25, 1983, the Lord's Cricket Ground wi...",0,0,0,1,0,0,0,1,...,0.006129,0.000587,0.001922,0.003678,0.368192,0.99548,0.151028,0.803039,0.000593,0.058039
2,'Allo 'Allo!,"In France during World War II, René Artois run...",0,0,0,0,1,0,0,0,...,0.001905,0.925831,0.000304,0.000407,0.350758,0.000607,0.002153,0.588855,0.988317,0.001535
3,10 Cloverfield Lane,A young woman is held in an underground bunker...,0,0,0,0,0,0,0,1,...,0.000644,0.020221,0.001567,0.834033,0.278637,0.000504,0.002298,0.895916,0.083377,0.591715
4,10 Things I Hate About You,"A pretty, popular teenager can't go out on a d...",0,0,0,0,1,0,0,1,...,0.001886,0.002582,0.577846,0.011673,0.225869,0.00127,0.001472,0.337887,0.000121,0.520456


In [17]:
# Dimensions as (rows, columns) after the per-genre score columns were added.
df.shape

(4166, 75)

In [18]:
# Drop the raw text columns now that their per-genre scores live in df.
# Rebinding df (instead of inplace=True) together with errors="ignore" lets
# this cell be re-run without raising KeyError once the columns are gone.
df = df.drop(columns=["title", "description"], errors="ignore")

In [20]:
# Persist the resulting table next to the notebook; index=False keeps the
# row index out of the written file.
df.to_csv(
    path_or_buf="preprocessed_text_extracted.csv",
    index=False,
)