In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import CamembertConfig, CamembertModel, AutoTokenizer, CamembertTokenizer, CamembertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
import functools
from tqdm import tqdm

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
#Load 20 minutes articles
df_articles = pd.read_json('../newspaper_2.jsonl', lines=True)
df_articles.category_id.unique()

array(['monde', 'france', 'politique', ...,
       '920103-20120420-maintien-portee-bras',
       '920099-20120420-campagne-passionne',
       '920105-20120420-integration-pleine-promesses-mevlut-erding'],
      dtype=object)

In [23]:
df_articles = df_articles[(df_articles.title.str.len() > 0)]

In [24]:
# Create test and train dataframes
dict_labels = {'planete': 0, 'sport': 1, 'economie': 2, 'sciences': 3, 'high-tech': 4, 'politique': 5}
p_train, p_test = train_test_split(df_articles[(df_articles.category_id == 'planete')], test_size=1000, train_size=3000)
s_train, s_test = train_test_split(df_articles[(df_articles.category_id == 'sport')], test_size=1000, train_size=3000)
e_train, e_test = train_test_split(df_articles[(df_articles.category_id == 'economie')], test_size=1000, train_size=3000)
sc_train, sc_test = train_test_split(df_articles[(df_articles.category_id == 'sciences')], test_size=500, train_size=2000)
h_train, h_test = train_test_split(df_articles[(df_articles.category_id == 'high-tech')], test_size=1000, train_size=3000)
po_train, po_test = train_test_split(df_articles[(df_articles.category_id == 'politique')], test_size=1000, train_size=3000)
train_dataset = pd.concat([p_train, s_train, e_train, sc_train, h_train, po_train])[['title', 'category_id']]
train_dataset['label'] = train_dataset.apply(lambda x: dict_labels[x['category_id']], axis=1)
test_dataset = pd.concat([p_test, s_test, e_test, sc_test, h_test, po_test])[['title', 'category_id']]
test_dataset['label'] = test_dataset.apply(lambda x: dict_labels[x['category_id']], axis=1)

In [25]:
# Set gloabal parameters and tokenizer
MAX_LEN = 64
batch_size = 16
tokenizer = CamembertTokenizer.from_pretrained('camembert-base',do_lower_case=True)

In [26]:
### Creation of the test dataset ###
# Creates list of texts and labels
text = test_dataset['title'].to_list()
labels = test_dataset['label'].to_list()

#user tokenizer to convert sentences into tokenizer
input_ids  = tokenizer(text, max_length=MAX_LEN, padding='longest', truncation=True).input_ids

# Create attention masks
attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i!=1) for i in seq]  
    attention_masks.append(seq_mask)

In [27]:
# transfrom to tensor format
test_inputs = torch.tensor(input_ids)
test_labels = torch.tensor(labels)
test_masks = torch.tensor(attention_masks)

# create dataloader
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [28]:
# Creates list of texts and labels
text = train_dataset['title'].to_list()
labels = train_dataset['label'].to_list()

#user tokenizer to convert sentences into tokenizer
input_ids  = tokenizer(text, max_length=MAX_LEN, padding='longest', truncation=True).input_ids

# Create attention masks
attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i!=1) for i in seq]  
    attention_masks.append(seq_mask)

In [29]:
# transfrom to tensor format
train_inputs = torch.tensor(input_ids)
train_labels = torch.tensor(labels)
train_masks = torch.tensor(attention_masks)

# create dataloader
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

In [30]:
# Load pretained model
model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=6)

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.weight'

In [6]:
# deprecated
class Classifier(nn.Module):
    def __init__(self, input_dim=768, out_dim=3):
        super(Classifier, self).__init__()
        self.dense = nn.Linear(input_dim,out_dim, bias=True)
        self.activation = nn.Softmax(dim=-1)
    def forward(self, x):
        x = torch.mean(x, dim=-2)
        x = self.dense(x)
        return self.activation(x)

classif_layer = Classifier()

In [31]:
# function to compute accuracy

def compute_accuracy(test_dataloader, model):
    total_true = 0
    total_size = 0
    for batch in tqdm(test_dataloader):
        y = model(batch[0].to(device),attention_mask=batch[1].to(device)).logits
        result = torch.argmax(y, dim=-1).cpu().detach().numpy()
        nb_true, size = np.sum(result == np.array(batch[2])), len(result)
        total_true += nb_true
        total_size += size
    accuracy = total_true / total_size
    return accuracy

In [32]:
import gc

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

opt = torch.optim.Adam(optimizer_grouped_parameters, lr=2e-5, eps=10e-8)


def train(classifier, data, epochs=20, log_interval=50):
    classifier.to(device)
    
    classifier.train()
    criterion = nn.CrossEntropyLoss()
    losses = []
    for epoch in range(epochs):
        total_loss = 0
        for idx, batch in enumerate(data):
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels = batch
            opt.zero_grad()
            y_pred = classifier(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = y_pred[0]
            loss.backward()
            opt.step()
            total_loss += loss.item()
            del(y_pred)
            del(loss)
            del(batch)
            del(b_input_ids, b_input_mask, b_labels)
            gc.collect()
            if idx % log_interval == 0 and idx > 0:
                cur_loss = total_loss / log_interval
                print(
                    "| epoch {:3d} | {:5d}/{:5d} steps | "
                    "loss {:5.5f}".format(
                        epoch, idx, len(data), cur_loss,
                    )
                )
                losses.append(cur_loss)
                total_loss = 0
        accuracy = compute_accuracy(test_dataloader, model)
        print("Test accuracy : {:1.3f}".format(accuracy))
    return classifier

In [33]:
model = train(model, train_dataloader, 4)

| epoch   0 |    50/ 1063 steps | loss 1.74396
| epoch   0 |   100/ 1063 steps | loss 1.33060
| epoch   0 |   150/ 1063 steps | loss 1.02518
| epoch   0 |   200/ 1063 steps | loss 0.82199
| epoch   0 |   250/ 1063 steps | loss 0.62989
| epoch   0 |   300/ 1063 steps | loss 0.62657
| epoch   0 |   350/ 1063 steps | loss 0.57206
| epoch   0 |   400/ 1063 steps | loss 0.58551
| epoch   0 |   450/ 1063 steps | loss 0.52673
| epoch   0 |   500/ 1063 steps | loss 0.55081
| epoch   0 |   550/ 1063 steps | loss 0.46289
| epoch   0 |   600/ 1063 steps | loss 0.52210
| epoch   0 |   650/ 1063 steps | loss 0.45767
| epoch   0 |   700/ 1063 steps | loss 0.39977
| epoch   0 |   750/ 1063 steps | loss 0.47930
| epoch   0 |   800/ 1063 steps | loss 0.36456
| epoch   0 |   850/ 1063 steps | loss 0.49081
| epoch   0 |   900/ 1063 steps | loss 0.40859
| epoch   0 |   950/ 1063 steps | loss 0.44712
| epoch   0 |  1000/ 1063 steps | loss 0.46418
| epoch   0 |  1050/ 1063 steps | loss 0.50876


100%|██████████| 344/344 [00:20<00:00, 16.50it/s]

Test accuracy : 0.858





In [36]:
model = train(model, train_dataloader, 4)

| epoch   0 |    50/ 1063 steps | loss 0.32294
| epoch   0 |   100/ 1063 steps | loss 0.35047
| epoch   0 |   150/ 1063 steps | loss 0.28466
| epoch   0 |   200/ 1063 steps | loss 0.36647
| epoch   0 |   250/ 1063 steps | loss 0.35946
| epoch   0 |   300/ 1063 steps | loss 0.31606
| epoch   0 |   350/ 1063 steps | loss 0.31042
| epoch   0 |   400/ 1063 steps | loss 0.31099
| epoch   0 |   450/ 1063 steps | loss 0.30681
| epoch   0 |   500/ 1063 steps | loss 0.27846
| epoch   0 |   550/ 1063 steps | loss 0.35523
| epoch   0 |   600/ 1063 steps | loss 0.34284
| epoch   0 |   650/ 1063 steps | loss 0.33361
| epoch   0 |   700/ 1063 steps | loss 0.29429
| epoch   0 |   750/ 1063 steps | loss 0.32030
| epoch   0 |   800/ 1063 steps | loss 0.40919
| epoch   0 |   850/ 1063 steps | loss 0.31396
| epoch   0 |   900/ 1063 steps | loss 0.28274
| epoch   0 |   950/ 1063 steps | loss 0.39852
| epoch   0 |  1000/ 1063 steps | loss 0.30360
| epoch   0 |  1050/ 1063 steps | loss 0.31597


100%|██████████| 344/344 [00:21<00:00, 15.67it/s]


Test accuracy : 0.873
| epoch   1 |    50/ 1063 steps | loss 0.20977
| epoch   1 |   100/ 1063 steps | loss 0.20234
| epoch   1 |   150/ 1063 steps | loss 0.19773
| epoch   1 |   200/ 1063 steps | loss 0.20737
| epoch   1 |   250/ 1063 steps | loss 0.24493
| epoch   1 |   300/ 1063 steps | loss 0.22667
| epoch   1 |   350/ 1063 steps | loss 0.26726
| epoch   1 |   400/ 1063 steps | loss 0.19093
| epoch   1 |   450/ 1063 steps | loss 0.21684
| epoch   1 |   500/ 1063 steps | loss 0.24264
| epoch   1 |   550/ 1063 steps | loss 0.22874
| epoch   1 |   600/ 1063 steps | loss 0.24548
| epoch   1 |   650/ 1063 steps | loss 0.22909
| epoch   1 |   700/ 1063 steps | loss 0.25723
| epoch   1 |   750/ 1063 steps | loss 0.19324
| epoch   1 |   800/ 1063 steps | loss 0.22111
| epoch   1 |   850/ 1063 steps | loss 0.19182
| epoch   1 |   900/ 1063 steps | loss 0.21769
| epoch   1 |   950/ 1063 steps | loss 0.19969
| epoch   1 |  1000/ 1063 steps | loss 0.21325
| epoch   1 |  1050/ 1063 steps | loss

100%|██████████| 344/344 [00:20<00:00, 17.11it/s]


Test accuracy : 0.871
| epoch   2 |    50/ 1063 steps | loss 0.15167
| epoch   2 |   100/ 1063 steps | loss 0.15312
| epoch   2 |   150/ 1063 steps | loss 0.14411
| epoch   2 |   200/ 1063 steps | loss 0.14820
| epoch   2 |   250/ 1063 steps | loss 0.17382
| epoch   2 |   300/ 1063 steps | loss 0.15113
| epoch   2 |   350/ 1063 steps | loss 0.18655
| epoch   2 |   400/ 1063 steps | loss 0.15068
| epoch   2 |   450/ 1063 steps | loss 0.11083
| epoch   2 |   500/ 1063 steps | loss 0.16563
| epoch   2 |   550/ 1063 steps | loss 0.15788
| epoch   2 |   600/ 1063 steps | loss 0.13256
| epoch   2 |   650/ 1063 steps | loss 0.17209
| epoch   2 |   700/ 1063 steps | loss 0.17870
| epoch   2 |   750/ 1063 steps | loss 0.19622
| epoch   2 |   800/ 1063 steps | loss 0.13628
| epoch   2 |   850/ 1063 steps | loss 0.12482
| epoch   2 |   900/ 1063 steps | loss 0.14070
| epoch   2 |   950/ 1063 steps | loss 0.13486
| epoch   2 |  1000/ 1063 steps | loss 0.17198
| epoch   2 |  1050/ 1063 steps | loss

100%|██████████| 344/344 [00:20<00:00, 16.83it/s]


Test accuracy : 0.872
| epoch   3 |    50/ 1063 steps | loss 0.13312
| epoch   3 |   100/ 1063 steps | loss 0.11788
| epoch   3 |   150/ 1063 steps | loss 0.10343
| epoch   3 |   200/ 1063 steps | loss 0.07253
| epoch   3 |   250/ 1063 steps | loss 0.08438
| epoch   3 |   300/ 1063 steps | loss 0.08621
| epoch   3 |   350/ 1063 steps | loss 0.07246
| epoch   3 |   400/ 1063 steps | loss 0.12501
| epoch   3 |   450/ 1063 steps | loss 0.10974
| epoch   3 |   500/ 1063 steps | loss 0.08955
| epoch   3 |   550/ 1063 steps | loss 0.13055
| epoch   3 |   600/ 1063 steps | loss 0.07620
| epoch   3 |   650/ 1063 steps | loss 0.10776
| epoch   3 |   700/ 1063 steps | loss 0.10448
| epoch   3 |   750/ 1063 steps | loss 0.11913
| epoch   3 |   800/ 1063 steps | loss 0.10788
| epoch   3 |   850/ 1063 steps | loss 0.12941
| epoch   3 |   900/ 1063 steps | loss 0.12349
| epoch   3 |   950/ 1063 steps | loss 0.12325
| epoch   3 |  1000/ 1063 steps | loss 0.17177
| epoch   3 |  1050/ 1063 steps | loss

100%|██████████| 344/344 [00:20<00:00, 17.00it/s]

Test accuracy : 0.869





In [43]:
batch = next(iter(test_dataloader))
torch.argmax(model(batch[0].to(device),attention_mask=batch[1].to(device)).logits, dim=-1), batch[2]

(tensor([1, 2, 4, 2, 0, 1, 0, 5, 5, 2, 2, 2, 5, 0, 2, 1], device='cuda:0'),
 tensor([1, 2, 3, 2, 3, 4, 0, 2, 5, 2, 5, 2, 5, 0, 2, 1]))