In [472]:
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [473]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

In [474]:
MODEL_NAME = "distilbert-base-uncased"

In [475]:
CATEGORIES = ["action", "adventure", "animation", "biography", "comedy", "crime", "documentary", "drama", "family",
              "fantasy", "film_noir", "history", "horror", "music", "musical", "mystery", "romance", "sci_fi",
              "short", "sport", "superhero", "thriller", "war", "western"]

In [476]:
train_df = pd.read_csv("./preprocessed_train.csv")
train_df.head()

Unnamed: 0,title,description,action,adventure,animation,biography,comedy,crime,documentary,drama,...,mystery,romance,sci_fi,short,sport,superhero,thriller,war,western,poster_path
0,The Expanse,"In the 24th century, a group of humans untangl...",0,0,0,0,0,0,0,1,...,1,0,1,0,0,0,0,0,0,./data/posters/drama/the_expanse.jpg
1,Night Stalker: The Hunt for a Serial Killer,This limited docu-series tells the true story ...,0,0,0,0,0,1,1,0,...,1,0,0,0,0,0,0,0,0,./data/posters/documentary/night_stalker__the_...
2,Hannah Montana: The Movie,As Hannah Montana's popularity begins to take ...,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,./data/posters/music/hannah_montana__the_movie...
3,London Kills,Drama series following the detectives of an el...,0,0,0,0,0,1,0,1,...,1,0,0,0,0,0,0,0,0,./data/posters/crime/london_kills.jpg
4,Women's Prison,A sadistic prison warden takes out her sexual ...,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,./data/posters/film_noir/women_s_prison.jpg


In [477]:
test_df = pd.read_csv("./preprocessed_test.csv")
test_df.head()

Unnamed: 0,title,description,action,adventure,animation,biography,comedy,crime,documentary,drama,...,mystery,romance,sci_fi,short,sport,superhero,thriller,war,western,poster_path
0,Marie Antoinette,The retelling of France's iconic but ill-fated...,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,./data/posters/biography/marie_antoinette.jpg
1,An American Crime,The true story of suburban housewife Gertrude ...,0,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,./data/posters/biography/an_american_crime.jpg
2,Crime Wave,Reformed parolee Steve Lacey is caught in the ...,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,./data/posters/film_noir/crime_wave.jpg
3,Bad News Bears,A grizzled little league coach tries to turn h...,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,./data/posters/sport/bad_news_bears.jpg
4,The Texas Chain Saw Massacre,Five friends head out to rural Texas to visit ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,./data/posters/horror/the_texas_chain_saw_mass...


In [478]:
labels = CATEGORIES
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['action',
 'adventure',
 'animation',
 'biography',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'family',
 'fantasy',
 'film_noir',
 'history',
 'horror',
 'music',
 'musical',
 'mystery',
 'romance',
 'sci_fi',
 'short',
 'sport',
 'superhero',
 'thriller',
 'war',
 'western']

In [479]:
id2label

{0: 'action',
 1: 'adventure',
 2: 'animation',
 3: 'biography',
 4: 'comedy',
 5: 'crime',
 6: 'documentary',
 7: 'drama',
 8: 'family',
 9: 'fantasy',
 10: 'film_noir',
 11: 'history',
 12: 'horror',
 13: 'music',
 14: 'musical',
 15: 'mystery',
 16: 'romance',
 17: 'sci_fi',
 18: 'short',
 19: 'sport',
 20: 'superhero',
 21: 'thriller',
 22: 'war',
 23: 'western'}

# Build Dataset objects

In [480]:
dataset_dict = load_dataset("csv", data_files={"train": "./preprocessed_train.csv", "test": "./preprocessed_test.csv"}, split=None)



  0%|          | 0/2 [00:00<?, ?it/s]

In [481]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['title', 'description', 'action', 'adventure', 'animation', 'biography', 'comedy', 'crime', 'documentary', 'drama', 'family', 'fantasy', 'film_noir', 'history', 'horror', 'music', 'musical', 'mystery', 'romance', 'sci_fi', 'short', 'sport', 'superhero', 'thriller', 'war', 'western', 'poster_path'],
        num_rows: 3332
    })
    test: Dataset({
        features: ['title', 'description', 'action', 'adventure', 'animation', 'biography', 'comedy', 'crime', 'documentary', 'drama', 'family', 'fantasy', 'film_noir', 'history', 'horror', 'music', 'musical', 'mystery', 'romance', 'sci_fi', 'short', 'sport', 'superhero', 'thriller', 'war', 'western', 'poster_path'],
        num_rows: 834
    })
})

# Preprocess_data

In [482]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.24.0",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapsh

In [483]:
tokenizer

PreTrainedTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [484]:
def preprocess_description(examples):
  # take a batch of texts
  text = examples["description"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=512)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

def preprocess_title(examples):
  # take a batch of texts
  text = examples["title"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=512)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

### Process DESCRIPTION

In [485]:
encoded_descr_dataset_dict = dataset_dict.map(preprocess_description, batched=True, 
                                              remove_columns=dataset_dict['train'].column_names)



In [486]:
encoded_descr_dataset_dict

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3332
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 834
    })
})

In [487]:
print(encoded_descr_dataset_dict['train'][0])

{'input_ids': [101, 1999, 1996, 13386, 2301, 1010, 1037, 2177, 1997, 4286, 4895, 23395, 1037, 6565, 5436, 2029, 17016, 1996, 5943, 2291, 1005, 1055, 13072, 2110, 1997, 20010, 15781, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [488]:
example = encoded_descr_dataset_dict['train'][0]
print(example.keys())

dict_keys(['input_ids', 'attention_mask', 'labels'])


In [489]:
tokenizer.decode(example['input_ids'])

"[CLS] in the 24th century, a group of humans untangle a vast plot which threatens the solar system's fragile state of detente. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [P

In [490]:
print(example['labels'])

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [491]:
print([id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0])

['drama', 'mystery', 'sci_fi']


In [492]:
# encoded_descr_dataset_dict.set_format("torch")

### And process TITLE

In [493]:
encoded_title_dataset_dict = dataset_dict.map(preprocess_title, batched=True, 
                                              remove_columns=dataset_dict['train'].column_names)



In [494]:
encoded_title_dataset_dict

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3332
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 834
    })
})

In [495]:
print(encoded_title_dataset_dict['train'][0])

{'input_ids': [101, 1996, 22944, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [496]:
example = encoded_title_dataset_dict['train'][0]
print(example.keys())

dict_keys(['input_ids', 'attention_mask', 'labels'])


In [497]:
tokenizer.decode(example['input_ids'])

'[CLS] the expanse [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PA

In [498]:
print(example['labels'])

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [499]:
print([id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0])

['drama', 'mystery', 'sci_fi']


In [500]:
# encoded_title_dataset_dict.set_format("torch")

# Define models

In [501]:
descr_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "action",
    "1": "adventure",
    "2": "animation",
    "3": "biography",
    "4": "comedy",
    "5": "crime",
    "6": "documentary",
    "7": "drama",
    "8": "family",
    "9": "fantasy",
    "10": "film_noir",
    "11": "history",
    "12": "horror",
    "13": "music",
    "14": "musical",
    "15": "mystery",
    "16": "romance",
    "17": "sci_fi",
    "18": "short",
    "19": "sport",
    "20": "superhero",
    "21": "thriller",
    "22": "war",
    "23": "western"
  },
  "initializer_range": 0.02,
  "label2id": {
    "action": 0,
    "adventur

In [536]:
title_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "action",
    "1": "adventure",
    "2": "animation",
    "3": "biography",
    "4": "comedy",
    "5": "crime",
    "6": "documentary",
    "7": "drama",
    "8": "family",
    "9": "fantasy",
    "10": "film_noir",
    "11": "history",
    "12": "horror",
    "13": "music",
    "14": "musical",
    "15": "mystery",
    "16": "romance",
    "17": "sci_fi",
    "18": "short",
    "19": "sport",
    "20": "superhero",
    "21": "thriller",
    "22": "war",
    "23": "western"
  },
  "initializer_range": 0.02,
  "label2id": {
    "action": 0,
    "adventur

In [503]:
batch_size = 8
metric_name = "f1"

In [504]:
descr_args = TrainingArguments(
    f"fintuned_descr_bert",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=7e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # greater_is_better=True
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [533]:
title_args = TrainingArguments(
    f"fintuned_title_bert",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # greater_is_better=True
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [506]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

### Train model for descr

In [508]:
descr_trainer = Trainer(
    descr_model,
    descr_args,
    train_dataset=encoded_descr_dataset_dict["train"],
    eval_dataset=encoded_descr_dataset_dict["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [509]:
descr_trainer.train()

***** Running training *****
  Num examples = 3332
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2085
  Number of trainable parameters = 66971928
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.253035,0.476536,0.666868,0.040767
2,0.303900,0.241754,0.543092,0.711716,0.088729
3,0.212500,0.232526,0.586146,0.738888,0.115108
4,0.159100,0.233208,0.596338,0.744984,0.135492
5,0.124000,0.234837,0.596918,0.745983,0.136691


***** Running Evaluation *****
  Num examples = 834
  Batch size = 8
Saving model checkpoint to fintuned_descr_bert/checkpoint-417
Configuration saved in fintuned_descr_bert/checkpoint-417/config.json
Model weights saved in fintuned_descr_bert/checkpoint-417/pytorch_model.bin
tokenizer config file saved in fintuned_descr_bert/checkpoint-417/tokenizer_config.json
Special tokens file saved in fintuned_descr_bert/checkpoint-417/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 834
  Batch size = 8
Saving model checkpoint to fintuned_descr_bert/checkpoint-834
Configuration saved in fintuned_descr_bert/checkpoint-834/config.json
Model weights saved in fintuned_descr_bert/checkpoint-834/pytorch_model.bin
tokenizer config file saved in fintuned_descr_bert/checkpoint-834/tokenizer_config.json
Special tokens file saved in fintuned_descr_bert/checkpoint-834/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 834
  Batch size = 8
Saving model checkpoint t

TrainOutput(global_step=2085, training_loss=0.1964410018006103, metrics={'train_runtime': 882.7114, 'train_samples_per_second': 18.874, 'train_steps_per_second': 2.362, 'total_flos': 2207772716728320.0, 'train_loss': 0.1964410018006103, 'epoch': 5.0})

In [511]:
descr_trainer.evaluate()

***** Running Evaluation *****
  Num examples = 834
  Batch size = 8


{'eval_loss': 0.23483683168888092,
 'eval_f1': 0.5969177338832212,
 'eval_roc_auc': 0.745983356994117,
 'eval_accuracy': 0.1366906474820144,
 'eval_runtime': 14.1348,
 'eval_samples_per_second': 59.003,
 'eval_steps_per_second': 7.428,
 'epoch': 5.0}

In [537]:
title_trainer = Trainer(
    title_model,
    title_args,
    train_dataset=encoded_title_dataset_dict["train"],
    eval_dataset=encoded_title_dataset_dict["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [538]:
title_trainer.train()

***** Running training *****
  Num examples = 3332
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2085
  Number of trainable parameters = 66971928


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.332494,0.288309,0.585367,0.004796
2,0.358900,0.312118,0.3532,0.61215,0.016787
3,0.296400,0.312794,0.39135,0.629666,0.035971
4,0.242500,0.318543,0.403319,0.637597,0.039568
5,0.204600,0.322182,0.412381,0.642062,0.043165


***** Running Evaluation *****
  Num examples = 834
  Batch size = 8
Saving model checkpoint to fintuned_title_bert/checkpoint-417
Configuration saved in fintuned_title_bert/checkpoint-417/config.json
Model weights saved in fintuned_title_bert/checkpoint-417/pytorch_model.bin
tokenizer config file saved in fintuned_title_bert/checkpoint-417/tokenizer_config.json
Special tokens file saved in fintuned_title_bert/checkpoint-417/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 834
  Batch size = 8
Saving model checkpoint to fintuned_title_bert/checkpoint-834
Configuration saved in fintuned_title_bert/checkpoint-834/config.json
Model weights saved in fintuned_title_bert/checkpoint-834/pytorch_model.bin
tokenizer config file saved in fintuned_title_bert/checkpoint-834/tokenizer_config.json
Special tokens file saved in fintuned_title_bert/checkpoint-834/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 834
  Batch size = 8
Saving model checkpoint t

TrainOutput(global_step=2085, training_loss=0.27229165276177497, metrics={'train_runtime': 872.7832, 'train_samples_per_second': 19.088, 'train_steps_per_second': 2.389, 'total_flos': 2207772716728320.0, 'train_loss': 0.27229165276177497, 'epoch': 5.0})

In [539]:
title_trainer.evaluate()

***** Running Evaluation *****
  Num examples = 834
  Batch size = 8


{'eval_loss': 0.32218214869499207,
 'eval_f1': 0.4123814278582127,
 'eval_roc_auc': 0.6420618335148914,
 'eval_accuracy': 0.04316546762589928,
 'eval_runtime': 14.1346,
 'eval_samples_per_second': 59.004,
 'eval_steps_per_second': 7.429,
 'epoch': 5.0}

### Compute Final Metric

In [542]:
encoded_descr_dataset_dict.set_format("torch")

In [562]:
train_descr_preds = []
train_descr_data = encoded_descr_dataset_dict["train"]
for i in tqdm(range(len(train_descr_data["input_ids"]))):
  output = descr_model(input_ids=train_descr_data["input_ids"][i].unsqueeze(0).cuda(), labels=train_descr_data["labels"][i].unsqueeze(0).cuda())
  train_descr_preds.append(output["logits"].squeeze().cpu().detach().numpy())

100%|██████████| 3332/3332 [01:32<00:00, 36.10it/s]


In [546]:
encoded_title_dataset_dict.set_format("torch")

In [568]:
train_title_preds = []
train_title_data = encoded_title_dataset_dict["train"]
for i in tqdm(range(len(train_title_data["input_ids"]))):
  output = title_model(input_ids=train_title_data["input_ids"][i].unsqueeze(0).cuda(), labels=train_title_data["labels"][i].unsqueeze(0).cuda())
  train_title_preds.append(output["logits"].squeeze().cpu().detach().numpy())

100%|██████████| 3332/3332 [01:57<00:00, 28.45it/s]


In [569]:
test_descr_preds = []
test_descr_data = encoded_descr_dataset_dict["test"]
for i in tqdm(range(len(test_descr_data["input_ids"]))):
  output = descr_model(input_ids=test_descr_data["input_ids"][i].unsqueeze(0).cuda(), labels=test_descr_data["labels"][i].unsqueeze(0).cuda())
  test_descr_preds.append(output["logits"].squeeze().cpu().detach().numpy())

100%|██████████| 834/834 [00:19<00:00, 43.76it/s]


In [570]:
test_title_preds = []
test_title_data = encoded_title_dataset_dict["test"]
for i in tqdm(range(len(test_title_data["input_ids"]))):
  output = title_model(input_ids=test_title_data["input_ids"][i].unsqueeze(0).cuda(), labels=test_title_data["labels"][i].unsqueeze(0).cuda())
  test_title_preds.append(output["logits"].squeeze().cpu().detach().numpy())

100%|██████████| 834/834 [00:16<00:00, 49.28it/s]


## Add new columns to datasets

In [552]:
LABEL_SCORE_COLUMNS = [f"label_{item}" for item in CATEGORIES]
DESCR_SCORE_COLUMNS = [f"descr_{item}" for item in CATEGORIES]

In [551]:
train_df = pd.read_csv("./preprocessed_train.csv")
train_df.head()

Unnamed: 0,title,description,action,adventure,animation,biography,comedy,crime,documentary,drama,...,mystery,romance,sci_fi,short,sport,superhero,thriller,war,western,poster_path
0,The Expanse,"In the 24th century, a group of humans untangl...",0,0,0,0,0,0,0,1,...,1,0,1,0,0,0,0,0,0,./data/posters/drama/the_expanse.jpg
1,Night Stalker: The Hunt for a Serial Killer,This limited docu-series tells the true story ...,0,0,0,0,0,1,1,0,...,1,0,0,0,0,0,0,0,0,./data/posters/documentary/night_stalker__the_...
2,Hannah Montana: The Movie,As Hannah Montana's popularity begins to take ...,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,./data/posters/music/hannah_montana__the_movie...
3,London Kills,Drama series following the detectives of an el...,0,0,0,0,0,1,0,1,...,1,0,0,0,0,0,0,0,0,./data/posters/crime/london_kills.jpg
4,Women's Prison,A sadistic prison warden takes out her sexual ...,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,./data/posters/film_noir/women_s_prison.jpg


In [565]:
train_descr_preds[0]

array([-3.5010092 , -3.4851062 , -3.239642  , -3.6733189 , -0.38212624,
       -3.596246  , -2.744984  ,  0.7243991 , -2.8762822 , -1.3265235 ,
       -5.371626  , -4.5731096 , -2.3954637 , -4.250292  , -5.290495  ,
       -1.6082507 , -1.385105  , -2.6581416 , -2.2993338 , -2.7595036 ,
       -5.324584  , -3.183506  , -5.288881  , -4.95381   ], dtype=float32)

In [566]:
train_df[DESCR_SCORE_COLUMNS] = pd.DataFrame(train_descr_preds, columns=DESCR_SCORE_COLUMNS)

In [567]:
train_df.columns

Index(['title', 'description', 'action', 'adventure', 'animation', 'biography',
       'comedy', 'crime', 'documentary', 'drama', 'family', 'fantasy',
       'film_noir', 'history', 'horror', 'music', 'musical', 'mystery',
       'romance', 'sci_fi', 'short', 'sport', 'superhero', 'thriller', 'war',
       'western', 'poster_path', 'descr_action', 'descr_adventure',
       'descr_animation', 'descr_biography', 'descr_comedy', 'descr_crime',
       'descr_documentary', 'descr_drama', 'descr_family', 'descr_fantasy',
       'descr_film_noir', 'descr_history', 'descr_horror', 'descr_music',
       'descr_musical', 'descr_mystery', 'descr_romance', 'descr_sci_fi',
       'descr_short', 'descr_sport', 'descr_superhero', 'descr_thriller',
       'descr_war', 'descr_western'],
      dtype='object')

In [571]:
train_df[LABEL_SCORE_COLUMNS] = pd.DataFrame(train_title_preds, columns=LABEL_SCORE_COLUMNS)

In [572]:
train_df.columns

Index(['title', 'description', 'action', 'adventure', 'animation', 'biography',
       'comedy', 'crime', 'documentary', 'drama', 'family', 'fantasy',
       'film_noir', 'history', 'horror', 'music', 'musical', 'mystery',
       'romance', 'sci_fi', 'short', 'sport', 'superhero', 'thriller', 'war',
       'western', 'poster_path', 'descr_action', 'descr_adventure',
       'descr_animation', 'descr_biography', 'descr_comedy', 'descr_crime',
       'descr_documentary', 'descr_drama', 'descr_family', 'descr_fantasy',
       'descr_film_noir', 'descr_history', 'descr_horror', 'descr_music',
       'descr_musical', 'descr_mystery', 'descr_romance', 'descr_sci_fi',
       'descr_short', 'descr_sport', 'descr_superhero', 'descr_thriller',
       'descr_war', 'descr_western', 'label_action', 'label_adventure',
       'label_animation', 'label_biography', 'label_comedy', 'label_crime',
       'label_documentary', 'label_drama', 'label_family', 'label_fantasy',
       'label_film_noir', 'label_

In [573]:
train_df.to_csv("./preprocessed_train_with_text.csv", index=False)

#### And for test data

In [574]:
test_df = pd.read_csv("./preprocessed_test.csv")
test_df.head()

Unnamed: 0,title,description,action,adventure,animation,biography,comedy,crime,documentary,drama,...,mystery,romance,sci_fi,short,sport,superhero,thriller,war,western,poster_path
0,Marie Antoinette,The retelling of France's iconic but ill-fated...,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,./data/posters/biography/marie_antoinette.jpg
1,An American Crime,The true story of suburban housewife Gertrude ...,0,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,./data/posters/biography/an_american_crime.jpg
2,Crime Wave,Reformed parolee Steve Lacey is caught in the ...,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,./data/posters/film_noir/crime_wave.jpg
3,Bad News Bears,A grizzled little league coach tries to turn h...,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,./data/posters/sport/bad_news_bears.jpg
4,The Texas Chain Saw Massacre,Five friends head out to rural Texas to visit ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,./data/posters/horror/the_texas_chain_saw_mass...


In [575]:
test_df[DESCR_SCORE_COLUMNS] = pd.DataFrame(test_descr_preds, columns=DESCR_SCORE_COLUMNS)

In [576]:
test_df[LABEL_SCORE_COLUMNS] = pd.DataFrame(test_title_preds, columns=LABEL_SCORE_COLUMNS)

In [577]:
test_df.columns

Index(['title', 'description', 'action', 'adventure', 'animation', 'biography',
       'comedy', 'crime', 'documentary', 'drama', 'family', 'fantasy',
       'film_noir', 'history', 'horror', 'music', 'musical', 'mystery',
       'romance', 'sci_fi', 'short', 'sport', 'superhero', 'thriller', 'war',
       'western', 'poster_path', 'descr_action', 'descr_adventure',
       'descr_animation', 'descr_biography', 'descr_comedy', 'descr_crime',
       'descr_documentary', 'descr_drama', 'descr_family', 'descr_fantasy',
       'descr_film_noir', 'descr_history', 'descr_horror', 'descr_music',
       'descr_musical', 'descr_mystery', 'descr_romance', 'descr_sci_fi',
       'descr_short', 'descr_sport', 'descr_superhero', 'descr_thriller',
       'descr_war', 'descr_western', 'label_action', 'label_adventure',
       'label_animation', 'label_biography', 'label_comedy', 'label_crime',
       'label_documentary', 'label_drama', 'label_family', 'label_fantasy',
       'label_film_noir', 'label_

In [578]:
test_df.to_csv("./preprocessed_test_with_text.csv", index=False)

# Trash

In [None]:
# DON'T RUN CELLS BELOW! It's previous try :()

In [113]:
title_score = { genre : [] for genre in CATEGORIES}

for title in tqdm(df.title.values):
    result = classifier(title, CATEGORIES, multi_label=True)
    result_genres = result["labels"]
    result_score = result["scores"]
    for idx, label in enumerate(result_genres):
        title_score[label].append(result_score[idx])

title_score

  0%|          | 0/3332 [00:00<?, ?it/s]


NameError: ignored

In [None]:
description_score = { genre : [] for genre in CATEGORIES}

for description in tqdm(df.description.values):
    result = classifier(description, CATEGORIES, multi_label=True)
    result_genres = result["labels"]
    result_score = result["scores"]
    for idx, label in enumerate(result_genres):
        description_score[label].append(result_score[idx])

description_score

100%|██████████| 4188/4188 [51:07<00:00,  1.37it/s]


{'action': [0.5894603133201599,
  0.9396949410438538,
  0.7664942741394043,
  0.5340380668640137,
  0.9090719819068909,
  0.988200306892395,
  0.07439590245485306,
  0.01952914521098137,
  0.37685737013816833,
  0.9640647172927856,
  0.6199625730514526,
  0.9448952674865723,
  0.8994881510734558,
  0.8685171008110046,
  0.9265152812004089,
  0.6569570302963257,
  0.8016047477722168,
  0.9542781114578247,
  0.050145234912633896,
  0.9863902926445007,
  0.7247440218925476,
  0.9781248569488525,
  0.9713640809059143,
  0.03301435336470604,
  0.700685977935791,
  0.9602020978927612,
  0.9126410484313965,
  0.9348435401916504,
  0.9025669693946838,
  0.4624020457267761,
  0.9751262664794922,
  0.6948854327201843,
  0.8606117367744446,
  0.7643541097640991,
  0.9850932955741882,
  0.9824346899986267,
  0.6030057668685913,
  0.5266296863555908,
  0.46701475977897644,
  0.1056886613368988,
  0.43111005425453186,
  0.9471961855888367,
  0.9260693788528442,
  0.8159141540527344,
  0.945979356765

In [None]:
for label, value_list in title_score.items():
    df[f"label_{label}"] = value_list

In [None]:
for label, value_list in description_score.items():
    df[f"description_{label}"] = value_list

In [None]:
df.head()

Unnamed: 0,title,description,action,adventure,animation,biography,comedy,crime,documentary,drama,...,description_musical,description_mystery,description_romance,description_sci_fi,description_short,description_sport,description_superhero,description_thriller,description_war,description_western
0,'71,"In 1971, a young and disoriented British soldi...",1,0,0,0,0,1,0,1,...,0.000729,0.016814,0.00021,0.004747,0.346715,0.000524,0.000896,0.510935,0.619829,0.060402
1,'83,"On June 25, 1983, the Lord's Cricket Ground wi...",0,0,0,1,0,0,0,1,...,0.006129,0.000587,0.001922,0.003678,0.368192,0.99548,0.151028,0.803039,0.000593,0.058039
2,'Allo 'Allo!,"In France during World War II, René Artois run...",0,0,0,0,1,0,0,0,...,0.001905,0.925831,0.000304,0.000407,0.350758,0.000607,0.002153,0.588855,0.988317,0.001535
3,10 Cloverfield Lane,A young woman is held in an underground bunker...,0,0,0,0,0,0,0,1,...,0.000644,0.020221,0.001567,0.834033,0.278637,0.000504,0.002298,0.895916,0.083377,0.591715
4,10 Things I Hate About You,"A pretty, popular teenager can't go out on a d...",0,0,0,0,1,0,0,1,...,0.001886,0.002582,0.577846,0.011673,0.225869,0.00127,0.001472,0.337887,0.000121,0.520456


In [None]:
df.shape

(4188, 75)

In [None]:
df.drop(columns=["title", "description"], inplace=True)

In [None]:
df.to_csv("./preprocessed_text_extracted.csv", index=False)