Script to fine-tune a BART model
This notebook generates a fine-tuned BART model from the fine-tuning data.

In [1]:
import sys
from BART_utilities import *
from transformers import Trainer, TrainingArguments
import pytorch_lightning as pl
from lightning.pytorch.loggers import TensorBoardLogger
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the training set from a JSON file (path is relative to the working directory).
# NOTE(review): `pd` is not imported explicitly in this notebook; presumably it comes
# from the star import of BART_utilities - confirm.
airbus = pd.read_json('../../../dataset/legal_summarization/airbus_helicopters_train_set.json')

In [64]:
# Load the test set from a JSON file in the current working directory.
airbus_test =  pd.read_json('test_set.json')

In [68]:
# Load the 'all_v1' dataset from a JSON file in the current working directory.
all_V1 = pd.read_json('all_v1.json')

In [3]:
# Swap rows and columns of the loaded frame.
# Presumably the JSON is keyed by record, so this yields one row per record - TODO confirm.
airbus = airbus.T


In [65]:
# Swap rows and columns of the loaded test frame.
# Presumably the JSON is keyed by record, so this yields one row per record - TODO confirm.
airbus_test = airbus_test.T


In [69]:
# Swap rows and columns of the loaded frame.
# Presumably the JSON is keyed by record, so this yields one row per record - TODO confirm.
all_V1 = all_V1.T

In [4]:
# Move the current index into a regular column and renumber the rows from 0.
# A later cell calls set_index('index') on a frame derived from this one, so the
# new column is expected to be named 'index'.
airbus = airbus.reset_index()

In [66]:
# Move the current index into a regular column and renumber the rows from 0.
# A later cell calls airbus_test.set_index('index'), so the new column is expected
# to be named 'index'.
airbus_test = airbus_test.reset_index()


In [70]:
# Move the current index into a regular column and renumber the rows from 0.
all_V1 = all_V1.reset_index()

In [None]:
# Preview the first rows of the training frame.
airbus.head()

In [None]:
# Number of non-null values per column of the training frame.
airbus.count()

In [None]:
# Number of non-null values per column of the all_V1 frame.
all_V1.count()

In [9]:
# Keep only the text/summary pair that is handed to the data module.
airbus_train = airbus.loc[:, ['original_text', 'reference_summary']]

In [42]:
# Text-only view of the test set (just the 'original_text' column).
# NOTE(review): this variable is not referenced by any other visible cell.
airbus_test_test = airbus_test.loc[:, ['original_text']]

In [72]:
# Text/summary pair of the all_V1 frame.
# Careful: `all_v1` (lower-case v) is a different variable from `all_V1`.
all_v1 = all_V1.loc[:, ['original_text', 'reference_summary']]

In [5]:
# NOTE(review): AdamW and BartConfig are not used by any visible cell; AdamW may be
# unavailable from `transformers` in recent releases - confirm before upgrading.
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, BartConfig

# Tokenizer for the pretrained 'facebook/bart-large' checkpoint.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large', add_prefix_space=True)

# Pretrained seq2seq model that is fine-tuned below (same checkpoint as the tokenizer).
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")

In [None]:
# Optional step: register extra marker tokens with the tokenizer.

new_tokens = ['<F>', '<RLC>', '<A>', '<S>', '<P>', '<R>', '<RPC>']

special_tokens_dict = dict(additional_special_tokens=new_tokens)
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
# The embedding matrix must be resized to the new vocabulary size.
bart_model.resize_token_embeddings(len(tokenizer))

In [10]:
# Build the data module over the Airbus text/summary pairs (one example per batch)
# and wrap the BART model in the project's Lightning module.
# SummaryDataModule and LitModel are not defined in this notebook; presumably they
# come from BART_utilities - confirm.
summary_data = SummaryDataModule(tokenizer, airbus_train, batch_size = 1)
model = LitModel(learning_rate = 2e-5, tokenizer = tokenizer, model = bart_model)

In [None]:
# Run the data module's prepare_data step (implementation not shown here).
# A later cell reads `summary_data.test`, so presumably this creates the data
# splits - TODO confirm.
summary_data.prepare_data()

In [12]:
# Keep a handle on the data module's `test` attribute; later cells use it as a
# DataFrame (to_csv, column selection).
test_data = summary_data.test

In [81]:
## Save test_data to a CSV file
# NOTE(review): the file name mentions 'allV1' while the data module above is built
# from airbus_train - confirm which dataset this export belongs to.
test_data.to_csv('test_data_allV1.csv')

In [14]:
# Run the data module's setup step with no specific stage selected.
summary_data.setup(stage=None)


In [15]:
# Fetch the training and validation loaders from the data module (train first, then val).
train_dataloader, val_dataloader = (
    summary_data.train_dataloader(),
    summary_data.val_dataloader(),
)


In [16]:
# TensorBoard logger rooted at 'logs/all_v2', run name "lightning_logs", version 1.
# NOTE(review): the logger is imported from `lightning.pytorch` while the Trainer
# comes from `pytorch_lightning`; mixing the two packages may be rejected by some
# versions - confirm.
logger = TensorBoardLogger(save_dir='logs/all_v2', version=1, name="lightning_logs")


In [None]:
# Trainer configuration: at most 4 epochs in bf16 mixed precision, two sanity-check
# validation batches before training, metrics logged every 50 steps.
# No checkpoint path is supplied, so training starts from the model built above.
trainer = pl.Trainer(
    logger=logger,
    log_every_n_steps=50,
    min_epochs=0,
    max_epochs=4,
    num_sanity_val_steps=2,
    precision='bf16-mixed',
)

In [21]:
# Fine-tune the model on the training loader, validating on the validation loader.
trainer.fit(model, train_dataloader, val_dataloader)



  | Name  | Type                         | Params
-------------------------------------------------------
0 | model | BartForConditionalGeneration | 406 M 
-------------------------------------------------------
406 M     Trainable params
0         Non-trainable params
406 M     Total params
1,625.194 Total estimated model params size (MB)


Epoch 0:   0%|          | 0/330 [00:19<?, ?it/s]s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 0:  49%|████▉     | 162/330 [1:12:33<1:15:14,  0.04it/s, v_num=1]    

In [None]:
# Save the trainer's current state to a checkpoint file in the working directory.
trainer.save_checkpoint("outputtrainedallV1.ckpt")

In [63]:
# Restore the Lightning module from a checkpoint file, passing the same constructor
# arguments that were used when the module was built.
# NOTE(review): this path differs from both the logger's save_dir ('logs/all_v2')
# and the checkpoint saved as "outputtrainedallV1.ckpt" - confirm it is the intended file.
loaded_model = LitModel.load_from_checkpoint(
    "./logs/lightning_logs/version_1/checkpoints/epoch=0-step=100.ckpt",
    learning_rate=2e-5,
    tokenizer=tokenizer,
    model=bart_model,
)


In [None]:
# Generate one summary per test row and store it in a 'generated_summary' column.
for i, row in airbus_test.iterrows():
    # Inputs are truncated to 512 tokens.
    inputs = tokenizer(
        row['original_text'],
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    generated_text = loaded_model.generate_text(inputs, eval_beams=5)
    best_summary = generated_text[0]
    print(best_summary)
    airbus_test.at[i, 'generated_summary'] = best_summary

In [27]:
# Keep only the generated summaries from test_data.
# NOTE(review): no visible cell adds a 'generated_summary' column to test_data (the
# generation loop writes to airbus_test), so this relies on state produced outside
# the cells shown here - confirm.
test_data_generated = test_data[['generated_summary']]

In [64]:
# Attach the generated summaries to the Airbus frame by row position, drop every
# row that has a missing value, then re-key by the 'index' column and flip to one
# column per record.
merge_test = (
    airbus
    .merge(test_data_generated, left_index=True, right_index=True, how='left')
    .dropna()
    .set_index('index')
    .transpose()
)

In [68]:
# Re-key the test frame by its 'index' column and flip it to one column per record.
# Not idempotent: a second run fails because the 'index' column no longer exists.
airbus_test = airbus_test.set_index('index').transpose()

In [None]:
# Export the test frame as two JSON files, both keyed by record (one key per column
# of airbus_test): one with the source text and reference summary, one with the
# generated summary.
data_dict = airbus_test.to_dict()

# Reference-side payload.
# Fix: 'reference_summary' used to be filled with the original text, so the
# reference file never contained the actual reference summary. The original text
# is kept only as a fallback for records that have no 'reference_summary' field.
json_data_original = {
    key: {
        'uid': value['uid'],
        'original_text': value['original_text'],
        'reference_summary': value.get('reference_summary', value['original_text']),
    }
    for key, value in data_dict.items()
}

# Generated-side payload.
json_data_generated = {
    key: {
        'uid': value['uid'],
        'generated_summary': value['generated_summary'],
    }
    for key, value in data_dict.items()
}

# Write the reference payload to a JSON file
with open('jsonoriginal.json', 'w') as json_file:
    json.dump(json_data_original, json_file)

# Write the generated payload to a JSON file
with open('jsongenerated.json', 'w') as json_file:
    json.dump(json_data_generated, json_file)

## python evaluate.py -r 'jsonoriginal.json' -g 'jsongenerated.json'

In [77]:
## Custom scoring metrics
def scores(test):
    """Compute ROUGE, BERTScore and embedding cosine similarity for every row of ``test``.

    Each row's ``'target'`` text is scored against its ``'source'`` text. The
    per-row results are written back into ``test`` as new columns (the frame is
    modified in place) and the column means are returned.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame with the text columns ``'target'`` and ``'source'``.

    Returns
    -------
    dict
        Means over all rows of ``test``: ``'rouge1'``, ``'rouge2'``, ``'rougel'``
        (the ``'p'`` component of each ROUGE variant), ``'bertP'``, ``'bertR'``,
        ``'bertF1'`` and ``'cosinscore'``.
    """
    from rouge import Rouge
    from sentence_transformers import SentenceTransformer, util
    from bert_score import score


    model = SentenceTransformer("all-MiniLM-L6-v2")

    rouge = Rouge()

    # Per-row scorers; each one receives a DataFrame row.
    compute_rouge_scores = lambda row: rouge.get_scores(row['target'], row['source'], avg=True)
    embeggings_target = lambda row: model.encode(row['target'], convert_to_tensor=True)
    embeggings_source = lambda row: model.encode(row['source'], convert_to_tensor=True)
    cosin_score = lambda row: util.cos_sim(row['embeggings_target'], row['embeggings_source'])
    # NOTE(review): lang='fr' is hard-coded - confirm it matches the language of the texts.
    compute_bert_score = lambda row: score([row['target']], [row['source']], lang='fr')


    # Apply every scorer row by row and store the raw results in new columns.
    test['scores'] = test.apply(compute_rouge_scores, axis=1)
    test['embeggings_target'] = test.apply(embeggings_target, axis=1)
    test['embeggings_source'] = test.apply(embeggings_source, axis=1)
    test['cosin_score'] = test.apply(cosin_score, axis=1)
    test['bert_score'] = test.apply(compute_bert_score, axis=1)
    # Unpack the three BERTScore components into scalar columns.
    test['P'] = test['bert_score'].apply(lambda x: x[0].item())
    test['R'] = test['bert_score'].apply(lambda x: x[1].item())
    test['F1'] = test['bert_score'].apply(lambda x: x[2].item())

    # Flatten the nested ROUGE dict into (r, f, p) triples for ROUGE-1, ROUGE-2 and ROUGE-L.
    extract_rouge_scores = lambda row: (
        row['rouge-1']['r'], row['rouge-1']['f'], row['rouge-1']['p'],
        row['rouge-2']['r'], row['rouge-2']['f'], row['rouge-2']['p'],
        row['rouge-l']['r'], row['rouge-l']['f'], row['rouge-l']['p']
    )

    # One column per flattened value; the un-suffixed columns hold the 'p' component.
    test[['rouge-1_r', 'rouge-1_f', 'rouge-1', 'rouge-2_r', 'rouge-2_f', 'rouge-2', 'rouge-l_r', 'rouge-l_f', 'rouge-l']] = test['scores'].apply(extract_rouge_scores).apply(pd.Series)

    extract_item = lambda x: x.item()
    test['cosin_score_item'] = test['cosin_score'].apply(extract_item)

    # Fix: aggregate over the frame that was passed in. This used to read the
    # global `test_data`, which raised NameError or reported another frame's metrics.
    score_result = {
        'rouge1': test['rouge-1'].mean(),
        'rouge2': test['rouge-2'].mean(),
        'rougel': test['rouge-l'].mean(),
        'bertP': test['P'].mean(),
        'bertR': test['R'].mean(),
        'bertF1': test['F1'].mean(),
        'cosinscore': test['cosin_score_item'].mean(),
    }

    return score_result

In [None]:
Customer’s orders are confirmed by the Seller in writing. The delivery schedule shall become effective upon receipt by the Customer of the Confirmation.

Customer’s Orders shall be confirmed by the Seller in writing. The Contract shall become binding upon receipt by the Customer of the Seller’s Order Confirmation and the delivery schedule shall become effective upon receipt by the Seller of the down-payment when relevant (as mentioned under article 6.2) and subject to compliance by the Customer to article 4.1.

In [None]:
Copying and/or reproducing, transmission to a third party of the Products or Services or technical information or Technical Data or training manuals without the Seller’s written express approval is strictly forbidden.

Customer’s Orders shall be confirmed by the Seller in writing. The Contract shall become binding upon receipt by the Customer of the Seller’s Order Confirmation and the delivery schedule shall become effective upon receipt by the Seller of the down-payment when relevant (as mentioned under article 6.2) and subject to compliance by the Customer to article 4.1.