## Importing the libraries
---

In [None]:
! pip install accelerate>=0.21.0
! pip install -U transformers
! pip install sentencepiece
! pip install rouge_score
! pip install wandb
! pip install torch
! pip install numpy
! pip install nltk
! pip install huggingface_hub
! pip install datasets


Collecting transformers
  Downloading transformers-4.38.2-py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.38.1
    Uninstalling transformers-4.38.1:
      Successfully uninstalled transformers-4.38.1
Successfully installed transformers-4.38.2
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=fe362f0b2260090b54fb526efe219910fc0ff2b7f1bb07b5c7ed0355a6b7f4f5
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge

In [None]:
import transformers
import accelerate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, GenerationConfig
import numpy as np
import nltk
from huggingface_hub import login
from datasets import Dataset
from nltk.tokenize import word_tokenize
import re
nltk.download('punkt')
accelerate.__version__

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


'0.27.2'

## Choosing the language

---

Pick a language. This notebook supports two choices, English and French; the choice determines which pretrained checkpoint is fine-tuned below.

In [None]:
language = "english"

### Corpus reading

In [None]:
import ast

def lire_corpus_de_fichier(chemin_corpus, encoding='utf-8'):
    """
    Build the corpus from a text file holding one Python literal per line.

    Every non-blank line is parsed with ``ast.literal_eval`` and the parsed
    value is appended to the result unchanged. Blank lines (for example a
    trailing newline at the end of the file) are skipped instead of raising.

    Args:
        chemin_corpus: Path of the corpus file.
        encoding: Text encoding used to read the file. Set explicitly so the
            result does not depend on the platform's default encoding.

    Returns:
        list: The parsed values, in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when a
            non-blank line is not a valid Python literal.
    """
    corpus = []
    with open(chemin_corpus, 'r', encoding=encoding) as fichier:
        # Iterate lazily over the file rather than loading every line first.
        for ligne in fichier:
            ligne = ligne.strip()
            if not ligne:
                continue
            corpus.append(ast.literal_eval(ligne))
    return corpus


In [None]:
corpus = lire_corpus_de_fichier('/content/sample_data/TransCasm_corpus.en.txt')

In [None]:
def calculer_tokens(corpus, language='french'):
    """
    Compute token statistics over a corpus of sentence pairs.

    For each item, elements ``[0]`` and ``[1]`` are tokenized with
    ``word_tokenize`` and their token counts are added together, so every
    figure returned is measured per pair, not per individual sentence.

    Args:
        corpus: Sequence of items whose first two elements are strings.
        language: Language name passed to ``word_tokenize``. Defaults to
            ``'french'``, the value that was previously hard-coded.

    Returns:
        tuple: ``(total_tokens, moyenne_tokens, max_length)`` where
        ``total_tokens`` is the token count over the whole corpus,
        ``moyenne_tokens`` the mean token count per pair and ``max_length``
        the largest token count of a single pair. An empty corpus yields
        ``(0, 0.0, 0)`` instead of raising ``ZeroDivisionError``.
    """
    total_tokens = 0
    max_length = 0  # largest combined token count seen so far
    for phrase_pair in corpus:
        phrase = phrase_pair[0]
        traduction = phrase_pair[1]

        tokens_phrase = (
            len(word_tokenize(phrase, language=language))
            + len(word_tokenize(traduction, language=language))
        )
        total_tokens += tokens_phrase
        max_length = max(max_length, tokens_phrase)

    nombre_phrases = len(corpus)
    # Guard the mean against an empty corpus.
    moyenne_tokens = total_tokens / nombre_phrases if nombre_phrases else 0.0

    return total_tokens, moyenne_tokens, max_length

# Analyse de corpus

In [None]:
total_tokens, moyenne_tokens, max_length = calculer_tokens(corpus)

print(f"Total de tokens dans le corpus : {total_tokens}")
print(f"Moyenne de tokens par phrase : {moyenne_tokens}")
print(f"Longueur maximale des phrases : {max_length}")

Total de tokens dans le corpus : 48566
Moyenne de tokens par phrase : 26.524303659202623
Longueur maximale des phrases : 66


# Transformation de corpus --> Dico --> Hugging Face dataset

In [None]:
# Split the corpus pairs into two parallel columns: element 0 of each pair is
# the satirical sentence and element 1 its neutral counterpart.
original_list = corpus
liste_satirique = [item[0] for item in original_list]
liste_neutre = [item[1] for item in original_list]

# Column-oriented dictionary, the layout expected by `Dataset.from_dict`.
corpus_dict = {
    'satirique': liste_satirique,
    'neutre': liste_neutre,
}


In [None]:
corpus_dict['neutre'][:20]

[' worst day of my life',
 ' i had a horrible day',
 ' i have never had a day worse than this in my life',
 ' conspiracy theory election announcement on the same day as the logies to make us all really confused after we already voted',
 " conspiracy theory election announcement on the same day as the logies to make us all really confused come on we haven't voted",
 ' hate how a lot of ppl quit clash for agar io',
 ' i don t appreciate that many people quit clash for agar',
 ' worst day of my life!',
 ' i have never had a worse day than this',
 ' with lyft and uber gone i guess everyone will have to go back to the transportation of last year the times of enjoyment has ended',
 " with lyft and uber gone i guess everyone will have to go back to yesteryears ' transportation life begins again",
 ' did bartolo colon hit a hr tonight ? it was mentioned everywhere on twitter',
 ' everybody is talking on twitter that bartolo colon hit a hr tonight',
 ' shoutout to everybody in lafayette and hou

In [None]:
# Convert the column dictionary into a Hugging Face `Dataset`.

dataset = Dataset.from_dict(corpus_dict)
dataset

Dataset({
    features: ['satirique', 'neutre'],
    num_rows: 1831
})

# Partie tokénisation

In [None]:
# Choose the pretrained checkpoint: the English model when `language` is
# "english", the French one for any other value.
model_name = (
    "sshleifer/distilbart-xsum-12-3"
    if language == "english"
    else "moussaKam/barthez-orangesum-abstract"
)

# Tokenizer matching the chosen checkpoint, plus the token limits applied to
# source and target sequences during preprocessing.
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_input_length = max_target_length = 100

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    """
    Tokenize a batch of examples for seq2seq fine-tuning.

    The neutral sentences are the model inputs and the satirical sentences
    are the targets. Sequences are truncated but deliberately NOT padded
    here: padding at this stage would put pad token ids into ``labels``,
    where they would count towards the loss. Padding is left to the data
    collator, which pads each batch dynamically.

    Args:
        examples: Batch mapping with the keys ``"neutre"`` and
            ``"satirique"``, each holding a list of strings.

    Returns:
        The tokenizer output for the inputs, extended with a ``"labels"``
        entry holding the token ids of the targets.
    """
    inputs = examples["neutre"]
    targets = examples["satirique"]

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



In [None]:
dataset_tokenized = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/1831 [00:00<?, ? examples/s]



In [None]:
# Exemple du texte tokenisé
dataset_tokenized['input_ids'][0]

[0,
 2373,
 183,
 9,
 127,
 301,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [None]:
# Exemple de ce texte décodé
tokenizer.decode(dataset_tokenized['input_ids'][0])

'<s> worst day of my life</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [None]:
len(dataset_tokenized)

1831

In [None]:
# Split the tokenized dataset into train (80%) and test (20%) subsets.
# `dataset_tokenized` already holds the tokenized examples, so no second
# `map` pass is needed; the fixed seed makes the split reproducible.
fr_augmented_data_tokenized = dataset_tokenized.train_test_split(
    train_size=0.8, test_size=0.2, seed=42
)

fr_augmented_data_tokenized


Map:   0%|          | 0/1831 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['satirique', 'neutre', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1464
    })
    test: Dataset({
        features: ['satirique', 'neutre', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 367
    })
})

# Model config

In [None]:
# Hugging Face login.
# The access token is read from the HF_TOKEN environment variable, falling
# back to an interactive prompt, so it is never stored in the notebook.
import os
from getpass import getpass

token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Load the pretrained seq2seq checkpoint selected by `model_name`.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

#del model.config.max_length
#del model.config.early_stopping

# Turn off early stopping in the model's generation config.
# NOTE(review): the training log still lists 'early_stopping': True among the
# non-default generation parameters — confirm this override takes effect.
model.generation_config.early_stopping = False

#model.config.num_beams = 2
#model.config.max_new_tokens = 30

#generation_config = GenerationConfig(
#    max_new_tokens=30, num_beams = 2, do_sample=True, early_stopping=False, top_k=50, eos_token_id=model.config.eos_token_id
#)

pytorch_model.bin:   0%|          | 0.00/716M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [None]:
import os
# Disable Weights & Biases logging for this run.
# NOTE(review): transformers flags `WANDB_DISABLED` as deprecated (to be
# removed in v5) in favour of the `report_to` training argument.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
#model.config

In [None]:
# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

args = Seq2SeqTrainingArguments(
    'sarcasm_BART_v2',  # output directory
    evaluation_strategy='epoch',
    learning_rate=2.5e-5,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    weight_decay=0.0,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,  # evaluate/predict with `generate()`
    hub_token=token,
    # `push_to_hub` is a boolean; "all_checkpoints" is a value of
    # `hub_strategy`. Passing the string to `push_to_hub` only acted as a
    # truthy flag and left the strategy at its default.
    push_to_hub=True,
    hub_strategy="all_checkpoints",
    # The string "none" disables every logging integration, whereas
    # `report_to=None` falls back to all installed integrations.
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=fr_augmented_data_tokenized['train'],
    eval_dataset=fr_augmented_data_tokenized['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

#generation_config.save_pretrained("fekpghojezpoh/sarcasm_BARThez", push_to_hub=True)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


# Training

In [None]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,No log,0.949604
2,1.559000,0.902968
3,0.813100,0.873518
4,0.628600,0.871517
5,0.538200,0.887724


Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


TrainOutput(global_step=2440, training_loss=0.807269706100714, metrics={'train_runtime': 18865.424, 'train_samples_per_second': 0.388, 'train_steps_per_second': 0.129, 'total_flos': 361271386828800.0, 'train_loss': 0.807269706100714, 'epoch': 5.0})

In [None]:
trainer.evaluate()

{'eval_loss': 0.8877236247062683,
 'eval_runtime': 182.3742,
 'eval_samples_per_second': 2.012,
 'eval_steps_per_second': 0.674,
 'epoch': 5.0}

In [None]:
# `Trainer.push_to_hub` takes a commit message, not a repository id, as its
# first argument; the target repository comes from the training arguments.
trainer.push_to_hub(commit_message="End of training")



Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


model.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

events.out.tfevents.1709558962.ae719a84a26c.219.0:   0%|          | 0.00/8.20k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1709578010.ae719a84a26c.219.1:   0%|          | 0.00/359 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/fekpghojezpoh/sarcasm_BART_v2/commit/c80e6ec6bae9e8f7615e5b7d16e8a8a7334a620d', commit_message='fekpghojezpoh/sarcasm_BART_v2', commit_description='', oid='c80e6ec6bae9e8f7615e5b7d16e8a8a7334a620d', pr_url=None, pr_revision=None, pr_num=None)

## Test

In [None]:
def clean_and_format_text(text):
    """
    Turn decoded model output into a displayable sentence.

    Removes every ``<...>`` tag (``<s>``, ``</s>``, ``<pad>``, ...), trims
    surrounding whitespace, upper-cases the first character and appends a
    full stop when the text does not already end with ``.``, ``!`` or ``?``.

    Only the first character is changed: ``str.capitalize()`` would also
    lower-case the rest of the sentence, damaging proper nouns and the
    pronoun "I".

    Args:
        text: Decoded text, possibly containing special-token tags.

    Returns:
        str: The cleaned sentence, or an empty string when nothing is left
        after removing the tags.
    """
    # The pattern already covers `<pad>`, so no separate replace is needed.
    cleaned_text = re.sub(r'<[^>]+>', '', text).strip()
    if not cleaned_text:
        return cleaned_text
    cleaned_text = cleaned_text[0].upper() + cleaned_text[1:]
    if not cleaned_text.endswith(('.', '!', '?')):
        cleaned_text += '.'
    return cleaned_text


In [None]:
def model_inference(input):
    """
    Generate the model's output for one sentence and print it.

    The sentence is tokenized (truncated to 1000 tokens), passed through
    ``trainer.predict`` and the first prediction is decoded, cleaned with
    ``clean_and_format_text`` and printed.

    The input is no longer padded to 1000 tokens: the trainer's data
    collator pads the batch as needed, so a short sentence is not processed
    as a 1000-token sequence.

    Args:
        input: Sentence to feed to the model. The name shadows the built-in
            ``input`` but is kept so existing keyword calls still work.

    Returns:
        None. The formatted prediction is printed.
    """
    model_inputs = tokenizer(input, max_length=1000, truncation=True)
    raw_pred, _, _ = trainer.predict([model_inputs])
    generated = tokenizer.decode(raw_pred[0], skip_special_tokens=True)
    print(clean_and_format_text(generated))

In [None]:
texte_neutre = "I believe a significant number of people wouldn't love the idea of spending their tax dollars on others."


model_inference(texte_neutre)

A significant number of people wouldn't like the idea of spending their tax dollars on others.


In [None]:
texte_neutre = "This situation is so heartbreaking."


model_inference(texte_neutre)

This is the moment that will change the face of one of the world's most famous people.


In [None]:
texte_neutre = "i'm so bored lol"

model_inference(texte_neutre)

Can't wait for the rest of the year.


In [None]:
texte_neutre = "I would really enjoy seeing him at Lollapalooza"

model_inference(texte_neutre)

I would really enjoy seeing him at lollapalooza.


In [None]:
texte_neutre = "As much as I dislike the liberals, I have to admit they did the right thing by intervening here."

model_inference(texte_neutre)

As much as me dislike the liberals, i have to admit they did the right thing by intervening here.


## Эксперименты


In [None]:
from datasets import load_dataset

# Load the "daniel2588/sarcasm" dataset.
# NOTE(review): this rebinds `dataset`, which earlier in the notebook held the
# corpus pairs built with `Dataset.from_dict`; re-running earlier cells after
# this one would operate on a different object.
dataset = load_dataset("daniel2588/sarcasm")

dataset

Downloading readme:   0%|          | 0.00/1.73k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/255M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'comment', 'author', 'subreddit', 'score', 'ups', 'downs', 'date', 'created_utc', 'parent_comment'],
        num_rows: 1010826
    })
})

In [None]:
# Print the first 100 comments labelled as sarcastic (label == 1), numbered
# from 1. The loop stops as soon as the quota is reached instead of scanning
# the rest of the training split.
count = 1
for example in dataset['train']:
    if example['label'] == 1:
        print(count, example['comment'])
        count += 1
        if count > 100:
            break
