In [1]:
from lbl2vec import Lbl2Vec
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import TaggedDocument
from gensim.models.doc2vec import Doc2Vec
from tqdm import tqdm
import numpy as np
import torch
from transformers import CamembertForSequenceClassification, CamembertTokenizer

In [2]:
df_articles = pd.read_json('../newspaper_2.jsonl', lines=True)

In [14]:
txt = df_articles.iloc[3].body.split()
def split_text_in_parts(txt):
    n = len(txt)
    prev_cursor = 0
    cursor = min(499, n-1)
    parts = []
    while prev_cursor < n-1:
        while '.' not in txt[cursor] and cursor > prev_cursor:
            cursor -= 1
        if cursor == prev_cursor:
            parts.append(txt[prev_cursor:min(prev_cursor+500, n)])
            prev_cursor = min(prev_cursor+500, n)
            cursor = min(prev_cursor+499, n-1)
        else:
            parts.append(txt[prev_cursor:cursor+1])
            prev_cursor = cursor+1
            cursor = prev_cursor+499
            if cursor >= n-1 and prev_cursor < n-1:
                parts.append(txt[prev_cursor:])
                break
    return parts
# len(parts[0] + parts[1])

In [3]:
dict_labels = {'planete': 0, 'sport': 1, 'economie': 2, 'sciences': 3, 'high-tech': 4, 'politique': 5}

p_train, p_test = train_test_split(df_articles[(df_articles.category_id == 'planete')], test_size=1000, train_size=3000)
s_train, s_test = train_test_split(df_articles[(df_articles.category_id == 'sport')], test_size=1000, train_size=3000)
e_train, e_test = train_test_split(df_articles[(df_articles.category_id == 'economie')], test_size=1000, train_size=3000)
sc_train, sc_test = train_test_split(df_articles[(df_articles.category_id == 'sciences')], test_size=500, train_size=2000)
h_train, h_test = train_test_split(df_articles[(df_articles.category_id == 'high-tech')], test_size=1000, train_size=3000)
po_train, po_test = train_test_split(df_articles[(df_articles.category_id == 'politique')], test_size=1000, train_size=3000)

train_dataset = pd.concat([p_train, s_train, e_train, sc_train, h_train, po_train])[['title', 'category_id', 'body']]
train_dataset['label'] = train_dataset.apply(lambda x: dict_labels[x['category_id']], axis=1)
test_dataset = pd.concat([p_test, s_test, e_test, sc_test, h_test, po_test])[['title', 'category_id', 'body']]
test_dataset['label'] = test_dataset.apply(lambda x: dict_labels[x['category_id']], axis=1)

In [5]:
tagged_docs = []
for index, row in tqdm(train_dataset.iterrows()):
    text = row['body'].lower().split()
    tag = row['label']
    tagged_docs.append(TaggedDocument(text, [tag]))

17000it [00:01, 13471.97it/s]


In [18]:
d2v = Doc2Vec(documents=tagged_docs, epochs=40)
d2v.build_vocab(tagged_docs)
d2v.train(tagged_docs, total_examples=d2v.corpus_count, epochs=d2v.epochs)

In [4]:
model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=6)
tokenizer = CamembertTokenizer.from_pretrained('camembert-base',do_lower_case=True)

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.bias

In [85]:
model

CamembertForSequenceClassification(
  (roberta): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0): CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
     

In [6]:
text = train_dataset.iloc[0]['body']
tok_text = torch.tensor([tokenizer(text)['input_ids']])
mask = torch.tensor([tokenizer(text)['attention_mask']])
res = model.roberta.embeddings(tok_text)

In [10]:
torch.mean(res, dim=1)[0].detach().numpy()

array([-6.82392064e-03,  2.21281033e-02, -2.82669961e-02,  1.35766715e-03,
       -1.68653838e-02,  4.03238274e-03,  2.38996297e-02,  1.26908151e-02,
        4.37366311e-03, -8.19994882e-03,  1.00634778e-02,  1.69952456e-02,
        2.23984793e-02,  9.81648546e-03,  2.67052557e-02, -8.17224309e-02,
       -3.25506292e-02,  2.00448325e-03,  3.50588337e-02, -1.91840820e-03,
       -2.34094653e-02,  1.48780467e-02,  3.16780061e-02,  2.85475682e-02,
       -3.82937863e-03, -3.12437639e-02,  1.61092672e-02, -3.28582991e-03,
       -1.96662191e-02,  2.23489106e-02,  1.71244983e-02, -2.17177197e-02,
        5.05020935e-03,  2.10455209e-02, -7.72289280e-03,  2.44424362e-02,
        3.15786749e-02,  1.71307810e-02,  1.64334029e-02, -2.36023646e-02,
       -2.24398393e-02,  2.04713549e-02, -9.90282558e-03, -1.21279322e-02,
       -1.15092127e-02, -3.07072457e-02, -1.99443679e-02, -1.00789396e-02,
        3.46288383e-02, -9.46150394e-04,  1.18784085e-02,  6.87222369e-03,
       -1.75819714e-02, -

In [19]:
gmm = GaussianMixture(n_components=6, max_iter=200)

In [16]:
train_vectors = []
y_train = []
for index, row in tqdm(train_dataset.iterrows()):
    text = row['body'].lower()
    parts = split_text_in_parts(text)
    n = 0
    rep = 0
    for part in parts:
        tok_text = torch.tensor([tokenizer(' '.join(part))['input_ids']])
        res = model.roberta.embeddings(tok_text)
        n += len(part)
        rep += len(part)*torch.mean(res, dim=1)[0].detach().numpy()
    tag = str(index)
    if n > 0:
        y_train.append(row['label'])
        train_vectors.append(rep / n)

17000it [05:23, 52.63it/s]


In [17]:
test_vectors = []
y_test = []
for index, row in tqdm(test_dataset.iterrows()):
    text = row['body'].lower()
    parts = split_text_in_parts(text)
    n = 0
    rep = 0
    for part in parts:
        tok_text = torch.tensor([tokenizer(' '.join(part))['input_ids']])
        res = model.roberta.embeddings(tok_text)
        n += len(part)
        rep += len(part)*torch.mean(res, dim=1)[0].detach().numpy()
    tag = str(index)
    if n > 0:
        y_test.append(row['label'])
        test_vectors.append(rep / n)

5500it [01:35, 57.36it/s]


In [None]:
train_vectors = []
y_train = []
for index, row in tqdm(test_dataset.iterrows()):
    text = row['body'].lower().split()
    tag = str(index)
    y_train.append(row['label'])
    train_vectors.append(d2v.infer_vector(text))

In [21]:
vectors = []
y_true = []
for index, row in tqdm(test_dataset.iterrows()):
    text = row['body'].lower().split()
    tag = str(index)
    y_true.append(row['label'])
    vectors.append(d2v.infer_vector(text))

5500it [00:57, 95.78it/s] 


In [20]:
gmm.fit(train_vectors, y_train)

In [22]:
y_pred = gmm.predict(test_vectors)

In [28]:
n = 5000
y_pred[n:n+10]

array([3, 2, 2, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [50]:
di = {4: 0, 2: 1, 5: 2, 3: 4, 0:4, 1: 5}
y_pred_c = [di[x] for x in y_pred]
acc = np.sum(np.array(y_pred_c) == np.array(y_true)) / len(y_pred)
acc

0.49672727272727274

In [51]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(np.array(y_true), np.array(y_pred_c))
cm

array([[570,  17, 287,   0,  24, 102],
       [  9, 878,  19,   0,  63,  31],
       [ 32,  45, 571,   0, 152, 200],
       [418,   5,   9,   0,  26,  42],
       [ 55,  27,  64,   0, 669, 185],
       [  9, 256, 669,   0,  22,  44]], dtype=int64)