In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import CamembertConfig, CamembertModel, AutoTokenizer, CamembertTokenizer, CamembertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import functools
from tqdm import tqdm
import gc

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
#Load 20 minutes articles
df_articles = pd.read_parquet('../newspaper_2.parquet')

In [4]:
df_articles = df_articles.sort_values("article_date")
df_autres = df_articles[(df_articles.category_id != 'planete') & (df_articles.category_id != 'sport') & (df_articles.category_id != 'economie') & (df_articles.category_id != 'arts-stars') & (df_articles.category_id != 'high-tech') & (df_articles.category_id != 'politique')]
df_autres = df_autres[(df_autres.body != '') & (df_autres.title != '')]
df_autres["category_id"] = "autres"
df_articles = df_articles[(df_articles.category_id == 'planete') | (df_articles.category_id == 'sport')| (df_articles.category_id == 'economie')| (df_articles.category_id == 'arts-stars')| (df_articles.category_id == 'high-tech')| (df_articles.category_id == 'politique')]
df_articles = df_articles[(df_articles.body != '') & (df_articles.title != '')]
df_articles.groupby("category_id").count()[['title']]

Unnamed: 0_level_0,title
category_id,Unnamed: 1_level_1
arts-stars,28294
economie,41992
high-tech,13565
planete,11055
politique,16601
sport,71847


In [3]:
def split_text_in_parts(txt):
    """
    Split articles in part of length 500 max to be compatible with camembert model.
    """
    n = len(txt)
    prev_cursor = 0
    cursor = min(499, n-1)
    parts = []
    while prev_cursor < n-1:
        while '.' not in txt[cursor] and cursor > prev_cursor:
            cursor -= 1
        if cursor == prev_cursor:
            parts.append(txt[prev_cursor:min(prev_cursor+500, n)])
            prev_cursor = min(prev_cursor+500, n)
            cursor = min(prev_cursor+499, n-1)
        else:
            parts.append(txt[prev_cursor:cursor+1])
            prev_cursor = cursor+1
            cursor = prev_cursor+499
            if cursor >= n-1 and prev_cursor < n-1:
                parts.append(txt[prev_cursor:])
                break
    return parts

In [5]:
dict_labels = {'planete': 0, 'sport': 1, 'economie': 2, 'arts-stars': 3, 'high-tech': 4, 'politique': 5, 'autres': 6}
p_train, p_test = df_articles[(df_articles.category_id == 'planete')].iloc[:4000],  df_articles[(df_articles.category_id == 'planete')].iloc[7000:8000]
s_train, s_test = df_articles[(df_articles.category_id == 'sport')].iloc[:4000],  df_articles[(df_articles.category_id == 'sport')].iloc[7000:8000]
e_train, e_test = df_articles[(df_articles.category_id == 'economie')].iloc[:4000],  df_articles[(df_articles.category_id == 'economie')].iloc[7000:8000]
sc_train, sc_test = df_articles[(df_articles.category_id == 'arts-stars')].iloc[:4000],  df_articles[(df_articles.category_id == 'arts-stars')].iloc[7000:8000]
h_train, h_test = df_articles[(df_articles.category_id == 'high-tech')].iloc[:4000],  df_articles[(df_articles.category_id == 'high-tech')].iloc[7000:8000]
po_train, po_test = df_articles[(df_articles.category_id == 'politique')].iloc[:4000],  df_articles[(df_articles.category_id == 'politique')].iloc[7000:8000]
a_train, a_test = train_test_split(df_autres, test_size=10000, train_size=4000, random_state=42)
train_dataset = pd.concat([p_train, s_train, e_train, sc_train, h_train, po_train, a_train])[['body', 'category_id']]
train_dataset['label'] = train_dataset.apply(lambda x: dict_labels[x['category_id']], axis=1)
test_dataset = pd.concat([p_test, s_test, e_test, sc_test, h_test, po_test, a_test])[['body', 'category_id']]
test_dataset['label'] = test_dataset.apply(lambda x: dict_labels[x['category_id']], axis=1)

In [6]:
# Set gloabal parameters and tokenizer
MAX_LEN_BODY = 512
# MAX_LEN_TITLE = 64
batch_size = 16
tokenizer = CamembertTokenizer.from_pretrained('camembert-base',do_lower_case=True)

In [8]:
### Creation of the test datasets ###
# Creates list of texts and labels
text = test_dataset['body'].to_list()
# titles = test_dataset['title'].to_list()
labels = test_dataset['label'].to_list()
body_text = []
body_labels = []
body_id = []
for i in range(len(text)):
    parts = split_text_in_parts(text[i].split())
    for part in parts:
        if part != []:
            body_text.append(' '.join(part))
            body_labels.append(labels[i])
            body_id.append(i)
    

#user tokenizer to convert sentences into tokenizer
# input_ids_titles  = tokenizer(titles, max_length=MAX_LEN_TITLE, padding='longest', truncation=True).input_ids
input_ids_body  = tokenizer(body_text, max_length=MAX_LEN_BODY, padding='longest', truncation=True).input_ids

# # Create attention masks for titles dataset
# attention_masks_title = []
# # Create a mask of 1s for each token followed by 0s for padding
# for seq in input_ids_titles:
#     seq_mask = [float(i!=1) for i in seq]  
#     attention_masks_title.append(seq_mask)

# Create attention masks for body dataset
attention_masks_body = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids_body:
    seq_mask = [float(i!=1) for i in seq]  
    attention_masks_body.append(seq_mask)

In [10]:
# # transfrom to tensor format
# test_inputs_titles = torch.tensor(input_ids_titles)
# test_labels_titles = torch.tensor(labels)
# test_masks_titles = torch.tensor(attention_masks_title)

# # create dataloader for body articles
# test_data_title = TensorDataset(test_inputs_titles, test_masks_titles, test_labels_titles)
# test_dataloader_title = DataLoader(test_data_title, batch_size=batch_size)

In [None]:
# transfrom to tensor format
test_inputs_body = torch.tensor(input_ids_body)
test_labels_body = torch.tensor(body_labels)
test_masks_body = torch.tensor(attention_masks_body)

# create dataloader for body articles
test_data_body = TensorDataset(test_inputs_body, test_masks_body, test_labels_body)
test_dataloader_body = DataLoader(test_data_body, batch_size=batch_size)

In [12]:
# Load pretained titles_model
# model_title = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=6).to(device)
# model_title.load_state_dict(torch.load("../models/camembert_title_2.pt"))

# Load pretained body_model
model_body = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=6).to(device)
model_body.load_state_dict(torch.load("../models/camembert_body_14K.pt"))

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.weight'

<All keys matched successfully>

In [13]:
# predictions with both models
with torch.no_grad():
        proba_titles = []
        for batch in tqdm(test_dataloader_title):
            t_data = batch[0].to(device)
            t_mask = batch[1].to(device)
            y = model_title(t_data,attention_mask=t_mask).logits.cpu().detach().numpy()
            for proba in y:
                proba_titles.append(proba)

        proba_body = []
        for batch in tqdm(test_dataloader_body):
            t_data = batch[0].to(device)
            t_mask = batch[1].to(device)
            y = model_body(t_data,attention_mask=t_mask).logits.cpu().detach().numpy()
            for proba in y:
                proba_body.append(proba)

100%|██████████| 344/344 [00:23<00:00, 14.94it/s]
100%|██████████| 443/443 [05:16<00:00,  1.40it/s]


In [14]:
# regroup predictions for articles with long bodies (as long articles have been split before 
# the predictions because the model doesn't accept input with length above 512).
# The importance of each prediction is proportionnal to the length of each part of the article.
concat_proba_body = []
c = 0
for i in tqdm(range(len(proba_titles))):
    proba = 0
    n = 0
    while c < len(body_id) and body_id[c] <= i:
        ni = len(input_ids_body[c])
        proba += ni * proba_body[c]
        n += ni
        c += 1
    if n > 0:
        concat_proba_body.append(proba / n)
    else:
        concat_proba_body.append(np.array([0]*6))


100%|██████████| 5500/5500 [00:00<00:00, 166708.86it/s]


In [15]:
# Compute the final probability
final_proba_body = np.array(concat_proba_body)
final_proba_titles = np.array(proba_titles)

final_proba = (final_proba_body + final_proba_titles) / 2

In [16]:
from scipy.special import softmax

prob = softmax(final_proba, axis=1)

In [17]:
prob[1357]

array([0.18976273, 0.38701603, 0.1062028 , 0.10523246, 0.16362542,
       0.04816056])

In [18]:
# Extract predicted class from proba
y_pred = np.argmax(final_proba, axis=1)
y_pred.shape

(5500,)

In [19]:
predictions = [np.argwhere(prob[i] > 0.2).flatten() for i in range(len(prob))]

In [20]:
si = []
for i in range(len(predictions)):
    si.append(len(predictions[i]))
sum(si) / len(predictions), np.max(si)

(1.116, 3)

In [21]:
c = 0
for i in range(len(predictions)):
    if labels[i] in predictions[i]:
        c += 1
c / len(predictions)

0.9543636363636364

In [None]:
np.sum(y_pred == labels) / len(y_pred)

0.9190909090909091

In [None]:
confusion_matrix(labels, y_pred)

array([[925,   3,  12,  19,  14,  27],
       [  4, 982,   7,   0,   6,   1],
       [ 35,  13, 838,   4,  49,  61],
       [ 44,   2,   2, 432,  16,   4],
       [ 10,   7,  20,  32, 917,  14],
       [  8,   8,  16,   0,   7, 961]], dtype=int64)

In [22]:
dupli_text = []
dupli_labels = []
true_lab = []
dict_labels = {'planete': 0, 'sport': 1, 'economie': 2, 'sciences': 3, 'high-tech': 4, 'politique': 5}
inv_dic = {v:k for (k, v) in dict_labels.items()}
for i in range(len(predictions)):
    if len(predictions[i]) > 1:
        dupli_labels.append([inv_dic[k] for k in predictions[i]])
        dupli_text.append(text[i])
        true_lab.append(inv_dic[labels[i]])

In [31]:
l = 450
dupli_labels[l], true_lab[l], dupli_text[l]

(['economie', 'high-tech'],
 'high-tech',
 'La scène remonte à juin 2015. Tranquillement installés devant leurs écrans d’ordinateur, deux chercheurs en cybersécurité, Charlie Miller et Chris Valasek, avaient réussi à prendre le contrôle d’une Jeep Cherokee conduite par un journaliste via une simple adresse IP. Le monde entier découvrait alors la vulnérabilité des voitures connectées face aux risques de cyberattaques. Depuis, d’autres modèles et d’autres marques de véhicules ont également subi ce même type d’attaque-test.Alors que l’avènement des voitures autonomes approche, la cybersécurité devient donc désormais un enjeu central des mobilités connectées. C’est sur ce thème que Lennig Pedron, présidente de l’ONG suisse Icon, interviendra jeudi à Rennes lors d’une conférence organisée dans le cadre de l’événement InOut.Les véhicules sont de plus en plus connectés et donc de plus en plus vulnérables aux cyberattaques. La menace est donc réelle ?Le danger existe bien sûr. A partir du mome