In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import CamembertConfig, CamembertModel, AutoTokenizer, CamembertTokenizer, CamembertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
import functools
from tqdm import tqdm
import gc

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
#Load 20 minutes articles
df_articles = pd.read_json('../newspaper_2.jsonl', lines=True)

In [3]:
def split_text_in_parts(txt):
    """
    Split articles in part of length 500 max to be compatible with camembert model.
    """
    n = len(txt)
    prev_cursor = 0
    cursor = min(499, n-1)
    parts = []
    while prev_cursor < n-1:
        while '.' not in txt[cursor] and cursor > prev_cursor:
            cursor -= 1
        if cursor == prev_cursor:
            parts.append(txt[prev_cursor:min(prev_cursor+500, n)])
            prev_cursor = min(prev_cursor+500, n)
            cursor = min(prev_cursor+499, n-1)
        else:
            parts.append(txt[prev_cursor:cursor+1])
            prev_cursor = cursor+1
            cursor = prev_cursor+499
            if cursor >= n-1 and prev_cursor < n-1:
                parts.append(txt[prev_cursor:])
                break
    return parts

In [4]:
# Create test and train dataframes
dict_labels = {'planete': 0, 'sport': 1, 'economie': 2, 'sciences': 3, 'high-tech': 4, 'politique': 5}
p_train, p_test = train_test_split(df_articles[(df_articles.category_id == 'planete')], test_size=1000, train_size=3000, random_state=42)
s_train, s_test = train_test_split(df_articles[(df_articles.category_id == 'sport')], test_size=1000, train_size=3000, random_state=42)
e_train, e_test = train_test_split(df_articles[(df_articles.category_id == 'economie')], test_size=1000, train_size=3000, random_state=42)
sc_train, sc_test = train_test_split(df_articles[(df_articles.category_id == 'sciences')], test_size=500, train_size=2000, random_state=42)
h_train, h_test = train_test_split(df_articles[(df_articles.category_id == 'high-tech')], test_size=1000, train_size=3000, random_state=42)
po_train, po_test = train_test_split(df_articles[(df_articles.category_id == 'politique')], test_size=1000, train_size=3000, random_state=42)
train_dataset = pd.concat([p_train, s_train, e_train, sc_train, h_train, po_train])[['title', 'category_id', 'body']]
train_dataset['label'] = train_dataset.apply(lambda x: dict_labels[x['category_id']], axis=1)
test_dataset = pd.concat([p_test, s_test, e_test, sc_test, h_test, po_test])[['title', 'category_id', 'body']]
test_dataset['label'] = test_dataset.apply(lambda x: dict_labels[x['category_id']], axis=1)

In [5]:
# Set gloabal parameters and tokenizer
MAX_LEN = 512
batch_size = 16
tokenizer = CamembertTokenizer.from_pretrained('camembert-base',do_lower_case=True)

In [6]:
### Creation of the test dataset ###
# Creates list of texts and labels
text = test_dataset['body'].to_list()
labels = test_dataset['label'].to_list()
body_text = []
body_labels = []
for i in range(len(text)):
    parts = split_text_in_parts(text[i].split())
    for part in parts:
        if part != []:
            body_text.append(' '.join(part))
            body_labels.append(labels[i])
    

#user tokenizer to convert sentences into tokenizer
input_ids  = tokenizer(body_text, max_length=MAX_LEN, padding='longest', truncation=True).input_ids

# Create attention masks
attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i!=1) for i in seq]  
    attention_masks.append(seq_mask)

In [7]:
# transfrom to tensor format
test_inputs = torch.tensor(input_ids)
test_labels = torch.tensor(body_labels)
test_masks = torch.tensor(attention_masks)

# create dataloader
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [8]:
# Creates list of texts and labels
text = train_dataset['title'].to_list()
labels = train_dataset['label'].to_list()
body_text = []
body_labels = []
for i in range(len(text)):
    parts = split_text_in_parts(text[i].split())
    for part in parts:
        if part != []:
            body_text.append(' '.join(part))
            body_labels.append(labels[i])
    

#user tokenizer to convert sentences into tokenizer
input_ids  = tokenizer(body_text, max_length=MAX_LEN, padding='longest', truncation=True).input_ids

# Create attention masks
attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i!=1) for i in seq]  
    attention_masks.append(seq_mask)

In [9]:
# transfrom to tensor format
train_inputs = torch.tensor(input_ids)
train_labels = torch.tensor(body_labels)
train_masks = torch.tensor(attention_masks)

# create dataloader
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

In [10]:
# Load pretained model
model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=6).to(device)

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.bias

In [11]:
# function to compute accuracy

def compute_accuracy(test_dataloader, model):
    total_true = 0
    total_size = 0
    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            t_data = batch[0].to(device)
            t_mask = batch[1].to(device)
            y = model(t_data,attention_mask=t_mask).logits
            result = torch.argmax(y, dim=-1).cpu().detach().numpy()
            nb_true, size = np.sum(result == np.array(batch[2])), len(result)
            total_true += nb_true
            total_size += size
            t_data.cpu()
            t_mask.cpu()
            del batch
            del y
            del t_data
            del t_mask
            gc.collect()
        
    accuracy = total_true / total_size
    return accuracy

In [12]:
# extract parameters to optimize
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

opt = torch.optim.Adam(optimizer_grouped_parameters, lr=2e-5, eps=10e-8)


def train(classifier, data, epochs=20, log_interval=50):
    best_acc = 0
    losses = []
    for epoch in range(epochs):
        total_loss = 0
        for idx, batch in enumerate(data):
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)
            opt.zero_grad()
            y_pred = classifier(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = y_pred[0]
            loss.backward()
            opt.step()
            total_loss += loss.item()
            ## del GPU variables to keep free memory
            loss.detach().cpu()
            b_input_ids.cpu()
            b_input_mask.cpu()
            b_labels.cpu()
            del(y_pred)
            del(loss)
            del(batch)
            del(b_input_ids, b_input_mask, b_labels)
            gc.collect()
            if idx % log_interval == 0 and idx > 0:
                cur_loss = total_loss / log_interval
                print(
                    "| epoch {:3d} | {:5d}/{:5d} steps | "
                    "loss {:5.5f}".format(
                        epoch, idx, len(data), cur_loss,
                    )
                )
                losses.append(cur_loss)
                total_loss = 0
        accuracy = compute_accuracy(test_dataloader, model)
        print("Test accuracy : {:1.3f}".format(accuracy))
        # Save model if better
        if accuracy > best_acc:
            torch.save(model.state_dict(), '../models/camembert_body.pt')
            best_acc = accuracy
    return classifier

In [13]:
model = train(model, train_dataloader, 3)

| epoch   0 |    50/ 1098 steps | loss 1.68970
| epoch   0 |   100/ 1098 steps | loss 1.15655
| epoch   0 |   150/ 1098 steps | loss 0.84601
| epoch   0 |   200/ 1098 steps | loss 0.70602
| epoch   0 |   250/ 1098 steps | loss 0.59924
| epoch   0 |   300/ 1098 steps | loss 0.53821
| epoch   0 |   350/ 1098 steps | loss 0.55994
| epoch   0 |   400/ 1098 steps | loss 0.57116
| epoch   0 |   450/ 1098 steps | loss 0.50614
| epoch   0 |   500/ 1098 steps | loss 0.41025
| epoch   0 |   550/ 1098 steps | loss 0.48176
| epoch   0 |   600/ 1098 steps | loss 0.48946
| epoch   0 |   650/ 1098 steps | loss 0.47954
| epoch   0 |   700/ 1098 steps | loss 0.44145
| epoch   0 |   750/ 1098 steps | loss 0.45988
| epoch   0 |   800/ 1098 steps | loss 0.40361
| epoch   0 |   850/ 1098 steps | loss 0.39754
| epoch   0 |   900/ 1098 steps | loss 0.44728
| epoch   0 |   950/ 1098 steps | loss 0.39776
| epoch   0 |  1000/ 1098 steps | loss 0.47430
| epoch   0 |  1050/ 1098 steps | loss 0.41064


100%|██████████| 442/442 [05:59<00:00,  1.23it/s]


Test accuracy : 0.877
| epoch   1 |    50/ 1098 steps | loss 0.26882
| epoch   1 |   100/ 1098 steps | loss 0.22644
| epoch   1 |   150/ 1098 steps | loss 0.28461
| epoch   1 |   200/ 1098 steps | loss 0.28751
| epoch   1 |   250/ 1098 steps | loss 0.22755
| epoch   1 |   300/ 1098 steps | loss 0.27200
| epoch   1 |   350/ 1098 steps | loss 0.27090
| epoch   1 |   400/ 1098 steps | loss 0.26907
| epoch   1 |   450/ 1098 steps | loss 0.24045
| epoch   1 |   500/ 1098 steps | loss 0.29988
| epoch   1 |   550/ 1098 steps | loss 0.25600
| epoch   1 |   600/ 1098 steps | loss 0.27258
| epoch   1 |   650/ 1098 steps | loss 0.24662
| epoch   1 |   700/ 1098 steps | loss 0.25836
| epoch   1 |   750/ 1098 steps | loss 0.24618
| epoch   1 |   800/ 1098 steps | loss 0.30069
| epoch   1 |   850/ 1098 steps | loss 0.25951
| epoch   1 |   900/ 1098 steps | loss 0.29822
| epoch   1 |   950/ 1098 steps | loss 0.29950
| epoch   1 |  1000/ 1098 steps | loss 0.25269
| epoch   1 |  1050/ 1098 steps | loss

100%|██████████| 442/442 [05:58<00:00,  1.23it/s]


Test accuracy : 0.869
| epoch   2 |    50/ 1098 steps | loss 0.13905
| epoch   2 |   100/ 1098 steps | loss 0.14859
| epoch   2 |   150/ 1098 steps | loss 0.10529
| epoch   2 |   200/ 1098 steps | loss 0.10571
| epoch   2 |   250/ 1098 steps | loss 0.11824
| epoch   2 |   300/ 1098 steps | loss 0.15860
| epoch   2 |   350/ 1098 steps | loss 0.13644
| epoch   2 |   400/ 1098 steps | loss 0.11747
| epoch   2 |   450/ 1098 steps | loss 0.13399
| epoch   2 |   500/ 1098 steps | loss 0.19055
| epoch   2 |   550/ 1098 steps | loss 0.10289
| epoch   2 |   600/ 1098 steps | loss 0.14413
| epoch   2 |   650/ 1098 steps | loss 0.13801
| epoch   2 |   700/ 1098 steps | loss 0.13358
| epoch   2 |   750/ 1098 steps | loss 0.13630
| epoch   2 |   800/ 1098 steps | loss 0.18985
| epoch   2 |   850/ 1098 steps | loss 0.13210
| epoch   2 |   900/ 1098 steps | loss 0.11770
| epoch   2 |   950/ 1098 steps | loss 0.15944
| epoch   2 |  1000/ 1098 steps | loss 0.10543
| epoch   2 |  1050/ 1098 steps | loss

100%|██████████| 442/442 [05:59<00:00,  1.23it/s]

Test accuracy : 0.863





In [43]:
batch = next(iter(test_dataloader))
torch.argmax(model(batch[0].to(device),attention_mask=batch[1].to(device)).logits, dim=-1), batch[2]

(tensor([1, 2, 4, 2, 0, 1, 0, 5, 5, 2, 2, 2, 5, 0, 2, 1], device='cuda:0'),
 tensor([1, 2, 3, 2, 3, 4, 0, 2, 5, 2, 5, 2, 5, 0, 2, 1]))