In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import CamembertTokenizer, CamembertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import gc

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
#Load 20 minutes articles
df_articles1 = pd.read_parquet('../newspaper_part1.parquet')
df_articles2 = pd.read_parquet('../newspaper_part2.parquet')
df_articles = pd.concat([df_articles1, df_articles2])

In [4]:
df_articles = df_articles.sort_values("article_date")
df_autres = df_articles[(df_articles.category_id != 'planete') & (df_articles.category_id != 'sport') & (df_articles.category_id != 'economie') & (df_articles.category_id != 'arts-stars') & (df_articles.category_id != 'high-tech') & (df_articles.category_id != 'politique')]
df_autres = df_autres[(df_autres.body != '') & (df_autres.title != '')]
df_autres["category_id"] = "autres"
df_articles = df_articles[(df_articles.category_id == 'planete') | (df_articles.category_id == 'sport')| (df_articles.category_id == 'economie')| (df_articles.category_id == 'arts-stars')| (df_articles.category_id == 'high-tech')| (df_articles.category_id == 'politique')]
df_articles = df_articles[(df_articles.body != '') & (df_articles.title != '')]
df_articles.groupby("category_id").count()[['title']]

Unnamed: 0_level_0,title
category_id,Unnamed: 1_level_1
arts-stars,29765
economie,91356
high-tech,23072
planete,24854
politique,41204
sport,129275


In [5]:
def split_text_in_parts(txt):
    """
    Split articles in part of length 500 max to be compatible with camembert model.
    """
    n = len(txt)
    prev_cursor = 0
    cursor = min(499, n-1)
    parts = []
    while prev_cursor < n-1:
        while '.' not in txt[cursor] and cursor > prev_cursor:
            cursor -= 1
        if cursor == prev_cursor:
            parts.append(txt[prev_cursor:min(prev_cursor+500, n)])
            prev_cursor = min(prev_cursor+500, n)
            cursor = min(prev_cursor+499, n-1)
        else:
            parts.append(txt[prev_cursor:cursor+1])
            prev_cursor = cursor+1
            cursor = prev_cursor+499
            if cursor >= n-1 and prev_cursor < n-1:
                parts.append(txt[prev_cursor:])
                break
    return parts

In [6]:
dict_labels = {'planete': 0, 'sport': 1, 'economie': 2, 'arts-stars': 3, 'high-tech': 4, 'politique': 5, 'autres': 6}
p_train, p_test = df_articles[(df_articles.category_id == 'planete')].iloc[:10000],  df_articles[(df_articles.category_id == 'planete')].iloc[10000:11000]
s_train, s_test = df_articles[(df_articles.category_id == 'sport')].iloc[:10000],  df_articles[(df_articles.category_id == 'sport')].iloc[10000:11000]
e_train, e_test = df_articles[(df_articles.category_id == 'economie')].iloc[:10000],  df_articles[(df_articles.category_id == 'economie')].iloc[10000:11000]
sc_train, sc_test = df_articles[(df_articles.category_id == 'arts-stars')].iloc[:10000],  df_articles[(df_articles.category_id == 'arts-stars')].iloc[10000:11000]
h_train, h_test = df_articles[(df_articles.category_id == 'high-tech')].iloc[:10000],  df_articles[(df_articles.category_id == 'high-tech')].iloc[10000:11000]
po_train, po_test = df_articles[(df_articles.category_id == 'politique')].iloc[:10000],  df_articles[(df_articles.category_id == 'politique')].iloc[10000:11000]
a_train, a_test = train_test_split(df_autres, test_size=2000, train_size=20000, random_state=42)
train_dataset = pd.concat([p_train, s_train, e_train, sc_train, h_train, po_train, a_train])[['body', 'category_id']]
train_dataset['label'] = train_dataset.apply(lambda x: dict_labels[x['category_id']], axis=1)
test_dataset = pd.concat([p_test, s_test, e_test, sc_test, h_test, po_test, a_test])[['body', 'category_id']]
test_dataset['label'] = test_dataset.apply(lambda x: dict_labels[x['category_id']], axis=1)

In [7]:
# Set gloabal parameters and tokenizer
MAX_LEN = 512
batch_size = 16
tokenizer = CamembertTokenizer.from_pretrained('camembert-base',do_lower_case=True)

In [8]:
### Creation of the test dataset ###
# Creates list of texts and labels
text = test_dataset['body'].to_list()
labels = test_dataset['label'].to_list()
body_text = []
body_labels = []
for i in range(len(text)):
    parts = split_text_in_parts(text[i].split())
    for part in parts:
        if part != []:
            body_text.append(' '.join(part))
            body_labels.append(labels[i])
    

#user tokenizer to convert sentences into tokenizer
input_ids  = tokenizer(body_text, max_length=MAX_LEN, padding='longest', truncation=True).input_ids

# Create attention masks
attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i!=1) for i in seq]  
    attention_masks.append(seq_mask)

In [12]:
len(input_ids[0])

512

In [8]:
# transfrom to tensor format
test_inputs = torch.tensor(input_ids)
test_labels = torch.tensor(body_labels)
test_masks = torch.tensor(attention_masks)

# create dataloader
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [9]:
# Creates list of texts and labels
text = train_dataset['body'].to_list()
labels = train_dataset['label'].to_list()
body_text = []
body_labels = []
for i in tqdm(range(len(text))):
    parts = split_text_in_parts(text[i].split())
    for part in parts:
        if part != []:
            body_text.append(' '.join(part))
            body_labels.append(labels[i])
    

#user tokenizer to convert sentences into tokenizer
input_ids  = tokenizer(body_text, max_length=MAX_LEN, padding='longest', truncation=True).input_ids

# Create attention masks
attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in tqdm(input_ids):
    seq_mask = [float(i!=1) for i in seq]  
    attention_masks.append(seq_mask)

100%|██████████| 80000/80000 [00:01<00:00, 45356.09it/s]
100%|██████████| 102744/102744 [00:03<00:00, 30549.79it/s]


In [10]:
# transfrom to tensor format
train_inputs = torch.tensor(input_ids)
train_labels = torch.tensor(body_labels)
train_masks = torch.tensor(attention_masks)

# create dataloader
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

In [11]:
# Load pretained model
model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=7).to(device)

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.weig

In [12]:
# function to compute accuracy

def compute_accuracy(test_dataloader, model):
    total_true = 0
    total_size = 0
    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            t_data = batch[0].to(device)
            t_mask = batch[1].to(device)
            y = model(t_data,attention_mask=t_mask).logits
            result = torch.argmax(y, dim=-1).cpu().detach().numpy()
            nb_true, size = np.sum(result == np.array(batch[2])), len(result)
            total_true += nb_true
            total_size += size
            t_data.cpu()
            t_mask.cpu()
            del batch
            del y
            del t_data
            del t_mask
            gc.collect()
        
    accuracy = total_true / total_size
    return accuracy

In [13]:
# extract parameters to optimize
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

opt = torch.optim.Adam(optimizer_grouped_parameters, lr=2e-5, eps=10e-8)


def train(classifier, data, epochs=20, log_interval=50):
    best_acc = 0
    losses = []
    for epoch in range(epochs):
        total_loss = 0
        for idx, batch in enumerate(data):
            print(idx)
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)
            opt.zero_grad()
            loss = classifier(b_input_ids, attention_mask=b_input_mask, labels=b_labels)[0]
            loss.backward()
            opt.step()
            total_loss += loss.item()
            ## del GPU variables to keep free memory
            # loss.detach().cpu()
            # b_input_ids.cpu()
            # b_input_mask.cpu()
            # b_labels.cpu()
            del(loss)
            del(batch)
            del(b_input_ids, b_input_mask, b_labels)
            torch.cuda.empty_cache()
            gc.collect()
            if idx % log_interval == 0 and idx > 0:
                cur_loss = total_loss / log_interval
                print(
                    "| epoch {:3d} | {:5d}/{:5d} steps | "
                    "loss {:5.5f}".format(
                        epoch, idx, len(data), cur_loss,
                    )
                )
                losses.append(cur_loss)
                total_loss = 0
        accuracy = compute_accuracy(test_dataloader, model)
        print("Test accuracy : {:1.3f}".format(accuracy))
        # Save model if better
        if accuracy > best_acc:
            torch.save(model.state_dict(), '../models/camembert_body_full.pt')
            best_acc = accuracy
    return classifier

In [None]:
model = train(model, train_dataloader, 2)

In [16]:
torch.save(model.state_dict(), '../models/camembert_body_2f.pt')