In [1]:
import numpy as np
import torch
from tqdm import tqdm
from transformers import CamembertTokenizer, CamembertForSequenceClassification

from utils import *
# Kept after the star import so that `softmax` always refers to SciPy's implementation,
# even if `utils` happens to export a name called `softmax`.
from scipy.special import softmax

# Default to the CPU and switch to the GPU only when CUDA is actually usable.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [2]:
# Load the newspaper articles through the project helper `load_newspaper`.
# NOTE(review): the helper is presumably provided by `from utils import *`, and the `df_`
# prefix suggests a pandas DataFrame — confirm both against utils.
df_articles = load_newspaper()

In [None]:
# Split the articles into a train and a test set; only `test_dataset` is used by the
# cells visible in this notebook.
# NOTE(review): if the split is random, it must be seeded identically to the split used
# when the checkpoints were trained, otherwise test articles may have been seen in
# training — confirm in `extract_train_test_dataset`.
train_dataset, test_dataset = extract_train_test_dataset(df_articles)

In [6]:
# Set global parameters and tokenizer
# NOTE(review): MAX_LEN_BODY, MAX_LEN_TITLE and batch_size are not passed to any call
# visible in this notebook — confirm whether `dataset_to_dataloader` relies on its own
# values for these settings.
MAX_LEN_BODY = 512
MAX_LEN_TITLE = 64
batch_size = 16
# NOTE(review): `do_lower_case=True` — CamemBERT is distributed as a cased model; confirm
# that this flag has the intended effect and matches the tokenizer used during training.
tokenizer = CamembertTokenizer.from_pretrained('camembert-base',do_lower_case=True)

In [None]:
# Dataloader over the test articles at title level.
test_dataloader_title = dataset_to_dataloader(test_dataset, tokenizer, level='title')
# Dataloader for the article bodies (no `level` is passed — presumably the helper's
# default is the body level; confirm in utils). With `details=True` the helper also
# returns `body_id` and `input_ids_body`, which the regrouping cell further down indexes
# per body chunk to find the owning article and the chunk length.
test_dataloader_body, body_id, input_ids_body = dataset_to_dataloader(test_dataset, tokenizer, details=True)

In [None]:
def _load_classifier(weights_path, num_labels=6):
    """Build a CamemBERT sequence classifier and load previously trained weights into it.

    Args:
        weights_path: Path to a state dict saved with ``torch.save``.
        num_labels: Number of output classes of the classification head.

    Returns:
        The model, placed on ``device`` and switched to evaluation mode.
    """
    model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=num_labels).to(device)
    # map_location remaps the stored tensors onto the current device, so a checkpoint
    # saved from a GPU can still be opened on a CPU-only machine.
    state_dict = torch.load(weights_path, map_location=device)
    model.load_state_dict(state_dict)
    # This notebook only runs inference: make the evaluation mode (dropout off) explicit.
    return model.eval()


# Load the trained title model
model_title = _load_classifier("../models/camembert_title.pt")

# Load the trained body model
model_body = _load_classifier("../models/camembert_body.pt")

In [13]:
def _predict_logits(model, dataloader):
    """Run `model` over every batch of `dataloader` and collect its raw outputs.

    Each batch is indexed positionally: ``batch[0]`` is fed to the model as its first
    argument (the token ids) and ``batch[1]`` as ``attention_mask``.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        dataloader: Iterable of batches as described above.

    Returns:
        A list with one 1-D NumPy array of logits per input sequence, in dataloader order.
    """
    # Guard against a model left in training mode: dropout must be off for inference.
    model.eval()
    rows = []
    with torch.no_grad():  # no gradients are needed for prediction
        for batch in tqdm(dataloader):
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            logits = model(input_ids, attention_mask=attention_mask).logits
            # Iterating a 2-D array yields its rows, so this appends one entry per sequence.
            rows.extend(logits.cpu().numpy())
    return rows


# Predictions with both models.
# Despite the `proba_*` names these lists hold raw logits; they are only turned into
# probabilities by the softmax applied at the end of the notebook.
proba_titles = _predict_logits(model_title, test_dataloader_title)
proba_body = _predict_logits(model_body, test_dataloader_body)

100%|██████████| 344/344 [00:23<00:00, 14.94it/s]
100%|██████████| 443/443 [05:16<00:00,  1.40it/s]


In [14]:
# Merge the body predictions back to one vector per article. Long bodies were split into
# several chunks before prediction (the model does not accept inputs longer than 512), so
# the chunks of an article are averaged, each one weighted proportionally to its length.
# NOTE(review): the single forward-moving cursor assumes `body_id` is sorted in
# non-decreasing article order — confirm in `dataset_to_dataloader`.
concat_proba_body = []
chunk_idx = 0
n_chunks = len(body_id)
for article_idx in tqdm(range(len(proba_titles))):
    weighted_sum = 0
    total_len = 0
    # Consume every chunk that belongs to the current article.
    while chunk_idx < n_chunks and body_id[chunk_idx] <= article_idx:
        chunk_len = len(input_ids_body[chunk_idx])
        weighted_sum += chunk_len * proba_body[chunk_idx]
        total_len += chunk_len
        chunk_idx += 1
    # An article without any body chunk falls back to an all-zero vector of 6 entries.
    concat_proba_body.append(
        weighted_sum / total_len if total_len > 0 else np.zeros(6, dtype=int)
    )


100%|██████████| 5500/5500 [00:00<00:00, 166708.86it/s]


In [15]:
# Combine both models: stack the per-article outputs into 2-D arrays and take the
# element-wise mean of the title and body predictions (equal weight for each model).
final_proba_titles = np.array(proba_titles)
final_proba_body = np.array(concat_proba_body)

final_proba = 0.5 * (final_proba_titles + final_proba_body)

In [16]:
# Turn the averaged model outputs into class probabilities.
# axis=1 normalises across the columns of each row, so every row of `prob` sums to 1.
# (`softmax` comes from scipy.special and is imported in the notebook's import cell.)
prob = softmax(final_proba, axis=1)