In [1]:
!nvidia-smi
!lscpu

Fri Jun 14 15:45:28 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 536.23                 Driver Version: 536.23       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce GTX 1080 Ti   WDDM  | 00000000:0F:00.0  On |                  N/A |
| 23%   36C    P5              26W / 250W |   2142MiB / 11264MiB |     37%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

'lscpu' n'est pas reconnu en tant que commande interne
ou externe, un programme ex‚cutable ou un fichier de commandes.


In [2]:
%%capture
!pip install -q -U torch=='2.0.0'
!pip install -q -U accelerate=='0.25.0' peft=='0.7.1' bitsandbytes=='0.41.3.post2' transformers=='4.36.1' trl=='0.7.4'
!pip install --upgrade pandas

In [3]:
!pip install wandb --quiet

In [4]:
!pip install scikit-learn



In [5]:
import os
import warnings
import pandas as pd

import wandb
import torch
import transformers
import bitsandbytes
from datasets import Dataset
from huggingface_hub import login
from peft import LoraConfig, PeftConfig, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
from transformers import (AutoModelForCausalLM,
                         AutoTokenizer,
                         BitsAndBytesConfig,
                         TrainingArguments,
                         pipeline,
                         logging,
                         TrainerCallback)

from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score,
                            confusion_matrix,
                            classification_report)

In [6]:
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
os.environ['TOKENIZERS_PARALLELISM'] = "false"

In [7]:
warnings.filterwarnings("ignore")

In [8]:
from datasets import load_dataset
dataset = load_dataset("pszemraj/booksum-short")

train_data = dataset['train']
chapters = train_data['chapter']
summaries = train_data['summary_text']

df = pd.DataFrame({'Context': chapters, 'Response': summaries})

In [9]:
df.head()

Unnamed: 0,Context,Response
0,"\n ""Before these fields were shorn and tilled...",In another part of the forest by the river a f...
1,"\n ""Well, go thy way: thou shalt not from thi...",When the mounted party from Fort Howard approa...
2,"\n ""In such a night\n Di...","The pursuit of Magua is unsuccessful, but Hawk..."
3,"\n ""Those strains that once did sweet in Zion...",Heyward and the girls are uneasy and Gamut is ...
4,"\n ""Be gay securely...","In the stillness that follows, Heyward finds i..."


In [10]:
df.shape

(5912, 2)

In [11]:
def generate_prompt(datapoint):
    return f"""
            The following data comes from a dataset made of chapters and their summaries
            [INST] Your job is to summarize very long texts. Your task is to generate an appropriate summary based on the text given in square brackets.
            [{datapoint['Context']}][/INST]

            {datapoint['Response']}""".strip()

def generate_test_prompt(datapoint):
    return f"""
            The following data comes from a dataset made of chapters and their summaries
            [INST] Your job is to summarize very long texts. Your task is to generate an appropriate summary based on the text given in square brackets.
            [{datapoint['Context']}][/INST]""".strip()

In [12]:
X_train, X_eval = train_test_split(df, test_size=0.2, random_state=42)
X_train.shape, X_eval.shape

((4729, 2), (1183, 2))

In [13]:
X_train = pd.DataFrame(X_train.apply(generate_prompt, axis=1), columns=['text'])
X_eval = pd.DataFrame(X_eval.apply(generate_test_prompt, axis=1),columns=['text'])
X_train.head()

Unnamed: 0,text
4252,The following data comes from a dataset made o...
2592,The following data comes from a dataset made o...
4778,The following data comes from a dataset made o...
2775,The following data comes from a dataset made o...
1328,The following data comes from a dataset made o...


In [14]:
train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)

In [15]:
train_data[0]

{'text': 'The following data comes from a dataset made of chapters and their summaries\n            [INST] Your job is to summarize very long texts. Your task is to generate an appropriate summary based on the text given in square brackets.\n            [\nThe youth fell back in the procession until the tattered soldier was\nnot in sight.  Then he started to walk on with the others.\n\nBut he was amid wounds.  The mob of men was bleeding.  Because of the\ntattered soldier\'s question he now felt that his shame could be viewed.\nHe was continually casting sidelong glances to see if the men were\ncontemplating the letters of guilt he felt burned into his brow.\n\nAt times he regarded the wounded soldiers in an envious way.  He\nconceived persons with torn bodies to be peculiarly happy.  He wished\nthat he, too, had a wound, a red badge of courage.\n\nThe spectral soldier was at his side like a stalking reproach.  The\nman\'s eyes were still fixed in a stare into the unknown.  His gray,\n

In [16]:
#from huggingface_hub import create_repo
#create_repo("clementdevarieux/local_PA4A_training")

In [17]:
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
model_hub_id = "clementdevarieux/local_PA4A_training"

from dotenv import load_dotenv
load_dotenv()
secret_label = "hf_token"
secret_value = os.getenv(secret_label)

compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = False,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = compute_dtype
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    quantization_config=bnb_config,
    token=secret_value
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          trust_remote_code=True,
                                          add_eos_token=True,
                                          token=secret_value
                                         )
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [18]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [19]:
class PeftSavingCallback(TrainerCallback):
    def on_save(self, args, state, control, **kwargs):
        checkpoint_path = os.path.join(args.output_dir, f"checkpoint-{state.global_step}")
        kwargs["model"].save_pretrained(checkpoint_path)

        if "pytorch_model.bin" in os.listdir(checkpoint_path):
            os.remove(os.path.join(checkpoint_path, "pytorch_model.bin"))

In [20]:
callbacks = [PeftSavingCallback]

In [21]:
run_name = "mistral-7b-instruct-PA4A"
output_dir = "./" + run_name

In [22]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"]
)

training_arguments = TrainingArguments(
    output_dir=output_dir,
    logging_dir = "logs",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_strategy='epoch',
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio = 0.05,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="wandb",
    evaluation_strategy="epoch",
    do_eval=True,
    run_name = run_name,
    push_to_hub = True,
    hub_model_id = model_hub_id,
    hub_token=secret_value,
    hub_strategy="checkpoint",
    disable_tqdm=False
)

In [23]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    callbacks=callbacks,
    packing=False,
    max_seq_length=512,
    )

Map:   0%|          | 0/4729 [00:00<?, ? examples/s]

Map:   0%|          | 0/1183 [00:00<?, ? examples/s]

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-666c49c0-35ea0fbb7b9b7a9f2c226100;c72775b8-c101-429d-b91f-4eb9e42831f9)

Invalid username or password.