In [79]:
from transformers import CamembertForSequenceClassification, CamembertTokenizer, pipeline, CamembertModel
import torch

In [14]:
model_name = "camembert-base"

In [80]:
tokenizer = CamembertTokenizer.from_pretrained(model_name)
cam = CamembertModel.from_pretrained(model_name)

In [19]:
tweets = ["J'ai passé une super journée", "Grosses galères aujourd'hui"]

In [20]:
tweet = tweets[0]

In [21]:
tokenizer.tokenize(tweet)
tokenizer.encode(tweet)
tokenizer.decode(tokenizer.encode(tweet))

['▁J', "'", 'ai', '▁passé', '▁une', '▁super', '▁journée']

In [74]:
classifier = pipeline('sentiment-analysis', model=model_name, tokenizer=tokenizer)

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.weight'

In [78]:
classifier("Il fait très beau !")

[{'label': 'LABEL_0', 'score': 0.5105077624320984}]

In [35]:
camembert_fill_mask  = pipeline("fill-mask", model=model_name, tokenizer=tokenizer)
camembert_fill_mask("Le camembert est <mask> :)")

Some weights of CamembertForMaskedLM were not initialized from the model checkpoint at camembert-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[{'sequence': '<s> Le camembert est délicieux :)</s>',
  'score': 0.49091020226478577,
  'token': 7200,
  'token_str': '▁délicieux'},
 {'sequence': '<s> Le camembert est excellent :)</s>',
  'score': 0.10556937754154205,
  'token': 2183,
  'token_str': '▁excellent'},
 {'sequence': '<s> Le camembert est succulent :)</s>',
  'score': 0.034533143043518066,
  'token': 26202,
  'token_str': '▁succulent'},
 {'sequence': '<s> Le camembert est meilleur :)</s>',
  'score': 0.03303135931491852,
  'token': 528,
  'token_str': '▁meilleur'},
 {'sequence': '<s> Le camembert est parfait :)</s>',
  'score': 0.030076511204242706,
  'token': 1654,
  'token_str': '▁parfait'}]

In [73]:
camembert_fill_mask("J'aime les <mask> bleus!")

[{'sequence': "<s> J'aime les yeux bleus!</s>",
  'score': 0.0990937277674675,
  'token': 605,
  'token_str': '▁yeux'},
 {'sequence': "<s> J'aime les cheveux bleus!</s>",
  'score': 0.04541785269975662,
  'token': 1277,
  'token_str': '▁cheveux'},
 {'sequence': "<s> J'aime les chats bleus!</s>",
  'score': 0.03782883286476135,
  'token': 6289,
  'token_str': '▁chats'},
 {'sequence': "<s> J'aime les poissons bleus!</s>",
  'score': 0.035177621990442276,
  'token': 4831,
  'token_str': '▁poissons'},
 {'sequence': "<s> J'aime les oiseaux bleus!</s>",
  'score': 0.026914037764072418,
  'token': 5709,
  'token_str': '▁oiseaux'}]

In [81]:
encoding = tokenizer(tweets, return_tensors='pt', padding=True, truncation=True)
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

In [98]:
outputs = camembert(input_ids, attention_mask=attention_mask, labels=labels)

In [99]:
outputs

SequenceClassifierOutput(loss=tensor(0.7065, grad_fn=<NllLossBackward>), logits=tensor([[ 0.0557, -0.1002],
        [ 0.0349, -0.0768]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)

In [100]:
import pandas

In [102]:
df = pandas.read_csv('french_tweets_short.csv')

In [154]:
from transformers import AdamW

In [152]:
labels = list(df.label)
texts = list(df.text)

encoding = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']
labels = torch.tensor(labels).unsqueeze(0)

In [155]:
camembert = CamembertForSequenceClassification.from_pretrained(model_name, num_labels=2)
optimizer = AdamW(camembert.parameters(), lr=1e-5)
classifier = pipeline('sentiment-analysis', model=camembert, tokenizer=tokenizer)

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.weight'

In [166]:
epochs = 10

for _ in range(epochs):
    outputs = camembert(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    print(loss)
    loss.backward()
    optimizer.step()

tensor(0.4683, grad_fn=<NllLossBackward>)
tensor(0.4551, grad_fn=<NllLossBackward>)
tensor(0.4425, grad_fn=<NllLossBackward>)
tensor(0.4303, grad_fn=<NllLossBackward>)
tensor(0.4185, grad_fn=<NllLossBackward>)
tensor(0.4071, grad_fn=<NllLossBackward>)
tensor(0.3960, grad_fn=<NllLossBackward>)
tensor(0.3850, grad_fn=<NllLossBackward>)
tensor(0.3742, grad_fn=<NllLossBackward>)
tensor(0.3633, grad_fn=<NllLossBackward>)


In [167]:
classifier(texts)

[{'label': 'LABEL_0', 'score': 0.6743533611297607},
 {'label': 'LABEL_0', 'score': 0.6857601404190063},
 {'label': 'LABEL_0', 'score': 0.6726309061050415},
 {'label': 'LABEL_0', 'score': 0.6749323606491089},
 {'label': 'LABEL_0', 'score': 0.6603058576583862},
 {'label': 'LABEL_0', 'score': 0.6522192358970642},
 {'label': 'LABEL_0', 'score': 0.6360929012298584},
 {'label': 'LABEL_1', 'score': 0.7538371086120605},
 {'label': 'LABEL_1', 'score': 0.7436507940292358},
 {'label': 'LABEL_1', 'score': 0.7174726724624634},
 {'label': 'LABEL_1', 'score': 0.7531613707542419},
 {'label': 'LABEL_1', 'score': 0.674791157245636},
 {'label': 'LABEL_1', 'score': 0.6971939206123352},
 {'label': 'LABEL_1', 'score': 0.7637169361114502},
 {'label': 'LABEL_1', 'score': 0.750148355960846},
 {'label': 'LABEL_1', 'score': 0.7579358816146851}]