In [40]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import CamembertConfig, CamembertModel, AutoTokenizer, CamembertTokenizer, CamembertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import functools
from tqdm import tqdm
import gc

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
#Load 20 minutes articles
df_articles = pd.read_json('../newspaper_2.jsonl', lines=True)

In [3]:
def split_text_in_parts(txt):
    n = len(txt)
    prev_cursor = 0
    cursor = min(499, n-1)
    parts = []
    while prev_cursor < n-1:
        while '.' not in txt[cursor] and cursor > prev_cursor:
            cursor -= 1
        if cursor == prev_cursor:
            parts.append(txt[prev_cursor:min(prev_cursor+500, n)])
            prev_cursor = min(prev_cursor+500, n)
            cursor = min(prev_cursor+499, n-1)
        else:
            parts.append(txt[prev_cursor:cursor+1])
            prev_cursor = cursor+1
            cursor = prev_cursor+499
            if cursor >= n-1 and prev_cursor < n-1:
                parts.append(txt[prev_cursor:])
                break
    return parts

In [4]:
# Create test and train dataframes
dict_labels = {'planete': 0, 'sport': 1, 'economie': 2, 'sciences': 3, 'high-tech': 4, 'politique': 5}
p_train, p_test = train_test_split(df_articles[(df_articles.category_id == 'planete')], test_size=1000, train_size=3000)
s_train, s_test = train_test_split(df_articles[(df_articles.category_id == 'sport')], test_size=1000, train_size=3000)
e_train, e_test = train_test_split(df_articles[(df_articles.category_id == 'economie')], test_size=1000, train_size=3000)
sc_train, sc_test = train_test_split(df_articles[(df_articles.category_id == 'sciences')], test_size=500, train_size=2000)
h_train, h_test = train_test_split(df_articles[(df_articles.category_id == 'high-tech')], test_size=1000, train_size=3000)
po_train, po_test = train_test_split(df_articles[(df_articles.category_id == 'politique')], test_size=1000, train_size=3000)
train_dataset = pd.concat([p_train, s_train, e_train, sc_train, h_train, po_train])[['title', 'category_id', 'body']]
train_dataset['label'] = train_dataset.apply(lambda x: dict_labels[x['category_id']], axis=1)
test_dataset = pd.concat([p_test, s_test, e_test, sc_test, h_test, po_test])[['title', 'category_id', 'body']]
test_dataset['label'] = test_dataset.apply(lambda x: dict_labels[x['category_id']], axis=1)

In [5]:
# Set gloabal parameters and tokenizer
MAX_LEN_BODY = 512
MAX_LEN_TITLE = 64
batch_size = 16
tokenizer = CamembertTokenizer.from_pretrained('camembert-base',do_lower_case=True)

In [6]:
### Creation of the test dataset ###
# Creates list of texts and labels
text = test_dataset['body'].to_list()
titles = test_dataset['title'].to_list()
labels = test_dataset['label'].to_list()
body_text = []
body_labels = []
body_id = []
for i in range(len(text)):
    parts = split_text_in_parts(text[i].split())
    for part in parts:
        if part != []:
            body_text.append(' '.join(part))
            body_labels.append(labels[i])
            body_id.append(i)
    

#user tokenizer to convert sentences into tokenizer
input_ids_titles  = tokenizer(titles, max_length=MAX_LEN_TITLE, padding='longest', truncation=True).input_ids
input_ids_body  = tokenizer(body_text, max_length=MAX_LEN_BODY, padding='longest', truncation=True).input_ids

# Create attention masks
attention_masks_title = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids_titles:
    seq_mask = [float(i!=1) for i in seq]  
    attention_masks_title.append(seq_mask)

attention_masks_body = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids_body:
    seq_mask = [float(i!=1) for i in seq]  
    attention_masks_body.append(seq_mask)

In [15]:
# transfrom to tensor format
test_inputs_titles = torch.tensor(input_ids_titles)
test_labels_titles = torch.tensor(labels)
test_masks_titles = torch.tensor(attention_masks_title)

# create dataloader
test_data_title = TensorDataset(test_inputs_titles, test_masks_titles, test_labels_titles)
# test_sampler = RandomSampler(test_data_title)
# print(test_sampler)
test_dataloader_title = DataLoader(test_data_title, batch_size=batch_size)

In [16]:
# transfrom to tensor format
test_inputs_body = torch.tensor(input_ids_body)
test_labels_body = torch.tensor(body_labels)
test_masks_body = torch.tensor(attention_masks_body)

# create dataloader
test_data_body = TensorDataset(test_inputs_body, test_masks_body, test_labels_body)
# test_sampler = RandomSampler(test_data_body)
# print(test_sampler)
test_dataloader_body = DataLoader(test_data_body, batch_size=batch_size)

In [13]:
# Load pretained model
model_title = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=6).to(device)
model_title.load_state_dict(torch.load("../models/camembert_title.pt"))

# Load pretained model
model_body = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=6).to(device)
model_body.load_state_dict(torch.load("../models/camembert_body.pt"))

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.bias

<All keys matched successfully>

In [17]:
# predictions with both models
with torch.no_grad():
        proba_titles = []
        for batch in tqdm(test_dataloader_title):
            t_data = batch[0].to(device)
            t_mask = batch[1].to(device)
            y = model_title(t_data,attention_mask=t_mask).logits
            proba_titles.append(y.cpu().detach().numpy())

        proba_body = []
        for batch in tqdm(test_dataloader_body):
            t_data = batch[0].to(device)
            t_mask = batch[1].to(device)
            y = model_body(t_data,attention_mask=t_mask).logits
            proba_body.append(y.cpu().detach().numpy())

100%|██████████| 344/344 [00:22<00:00, 15.25it/s]
100%|██████████| 446/446 [05:18<00:00,  1.40it/s]


In [22]:
squeeze_proba_body = []
for el in proba_body:
    for proba in el:
        squeeze_proba_body.append(proba)

squeeze_proba_titles = []
for el in proba_titles:
    for proba in el:
        squeeze_proba_titles.append(proba)

In [34]:
concat_proba_body[0]

array([ 4.5513406, -2.9012597,  1.3596718, -0.9744821, -1.1075795,
       -1.0569706], dtype=float32)

In [36]:
# regroup predictions for articles with long bodies
concat_proba_body = []
c = 0
for i in tqdm(range(len(squeeze_proba_titles))):
    proba = 0
    n = 0
    while c < len(body_id) and body_id[c] <= i:
        ni = len(input_ids_body[c])
        proba += ni * squeeze_proba_body[c]
        n += ni
        c += 1
    if n > 0:
        concat_proba_body.append(proba / n)
    else:
        concat_proba_body.append(np.array([0]*6))


100%|██████████| 5500/5500 [00:00<00:00, 289534.63it/s]


In [37]:
final_proba_body = np.array(concat_proba_body)
final_proba_titles = np.array(squeeze_proba_titles)

final_proba = (final_proba_body + final_proba_titles) / 2

In [38]:
y_pred = np.argmax(final_proba, axis=1)
y_pred.shape

(5500,)

In [39]:
np.sum(y_pred == labels) / len(y_pred)

0.9296363636363636

In [42]:
confusion_matrix(labels, y_pred)

array([[953,   1,  12,  19,   4,  11],
       [ 10, 976,   4,   1,   6,   3],
       [ 43,  11, 835,   6,  42,  63],
       [ 22,   1,   0, 470,   6,   1],
       [ 12,   5,  19,  30, 922,  12],
       [ 12,   2,  22,   0,   7, 957]], dtype=int64)