In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import CamembertConfig, CamembertModel, AutoTokenizer, CamembertTokenizer, CamembertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
import functools
from tqdm import tqdm
import gc
from utils import *

# Register tqdm's pandas integration (adds the `progress_*` methods).
tqdm.pandas()

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [17]:
# Load the articles to classify. `load_newspaper` is not defined in this
# notebook, so it presumably comes from the star import of `utils`.
# NOTE(review): later cells read `.body` / `.title` and call `.to_parquet`, so
# the result is presumably a pandas DataFrame — TODO confirm against utils.
df_articles = load_newspaper()

In [None]:
# Keep only the articles that have both a non-empty body and a non-empty title.
has_body = df_articles.body != ''
has_title = df_articles.title != ''
df_articles = df_articles[has_body & has_title]

In [8]:
# Set global parameters and tokenizer
# NOTE(review): neither MAX_LEN nor batch_size is passed explicitly to
# `dataset_to_dataloader` in the visible cells — confirm how that helper picks
# them up (or whether it uses its own defaults).
MAX_LEN = 512
batch_size = 16
# CamemBERT tokenizer matching the `camembert-base` checkpoint used for the model.
tokenizer = CamembertTokenizer.from_pretrained('camembert-base',do_lower_case=True)

In [9]:
# Tokenize the article bodies and wrap them in a DataLoader for inference
# (no labels requested).
# NOTE(review): `dataset_to_dataloader` is not visible here. From how the
# results are consumed further down, `body_id` appears to give, for each chunk,
# the index of the article it came from, and `input_ids` the token ids of each
# chunk — TODO confirm against utils.
test_dataloader, body_id, input_ids = dataset_to_dataloader(df_articles, tokenizer, level="body", details=True, labels=False)

In [11]:
# Load pretrained model
# Build the CamemBERT classifier with an 11-way head, then overwrite its weights
# with the fine-tuned checkpoint.
model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=11).to(device)
# map_location remaps the checkpoint tensors onto the selected device, so a
# checkpoint saved from a GPU run can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load("../models/camembert_11.pt", map_location=device))

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.weight'

<All keys matched successfully>

In [12]:
# Run the classifier over every chunk and collect the raw logits in `predic`,
# one row per chunk and one column per class.
model.eval()  # inference only: make sure dropout is disabled

# One row per chunk. `body_id` has one entry per chunk (the aggregation cell
# indexes the probabilities with the same positions), so it gives the row count.
num_chunks = len(body_id)
predic = np.empty((num_chunks, model.config.num_labels))

# Track the write position explicitly instead of assuming every batch holds
# exactly `batch_size` rows.
offset = 0
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        t_data = batch[0].to(device)
        t_mask = batch[1].to(device)
        y = model(t_data,attention_mask=t_mask).logits
        result = y.cpu().numpy()
        predic[offset:offset+len(result)] = result
        offset += len(result)

# Fail loudly rather than leave uninitialised rows in `predic`.
assert offset == num_chunks, f"expected {num_chunks} rows of logits, got {offset}"

100%|██████████| 1012/1012 [12:04<00:00,  1.40it/s]


In [13]:
from scipy.special import softmax

# Turn the per-chunk logits into per-chunk class probabilities.
prob = softmax(predic, axis=1)

# Collapse chunk-level probabilities into one vector per article: the average of
# the article's chunk probabilities, weighted by each chunk's `input_ids` length.
# A single cursor walks the chunks once, so this relies on `body_id` being in
# non-decreasing order.
concat_proba_body = []
cursor = 0
n_chunks = len(body_id)
for article in tqdm(range(max(body_id)+1)):
    weighted_sum = 0
    total_len = 0
    # Consume every chunk that belongs to the current article.
    while cursor < n_chunks and body_id[cursor] <= article:
        chunk_len = len(input_ids[cursor])
        weighted_sum += chunk_len * prob[cursor]
        total_len += chunk_len
        cursor += 1
    if total_len > 0:
        article_proba = weighted_sum / total_len
    else:
        # No chunk for this article: fall back to an all-zero vector.
        article_proba = np.zeros(11, dtype=int)
    concat_proba_body.append(article_proba)

# From here on `prob` holds one row per article instead of one row per chunk.
prob = np.array(concat_proba_body)

100%|██████████| 8183/8183 [00:00<00:00, 97920.70it/s]


In [14]:
# Category names in the same order as the columns of `prob`.
label_columns = [
    "planete",
    "sport",
    "economie",
    "arts-stars",
    "high-tech",
    "politique",
    "monde",
    "societe",
    "faits_divers",
    "sante",
    "justice",
]

# Attach one probability column per category to the articles frame.
for column_index, label in enumerate(label_columns):
    df_articles[label] = prob[:, column_index]

In [15]:
# Persist the articles together with their per-category probability columns.
df_articles.to_parquet("../data/predictions_proba.parquet")