In [27]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [30]:
!pip install -q datasets


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [32]:
from datasets import load_dataset
import re
from transformers import pipeline

# Load the dataset
dataset = load_dataset('neelblabla/enron_labeled_email-llama2-7b_finetuning')

# Shuffle the dataset and slice it
dataset = dataset['train'].shuffle(seed=42).select(range(1000))

# Define a function to transform the data
def transform_conversation(example):
    conversation_text = example.get('prompts', '')  # Get conversation text, default to empty string if 'prompts' key is missing
    
    # Extract conversation text and category
    inst_match = re.search(r'<s>\[INST\](.*?)</INST>', conversation_text, re.DOTALL)
    inst_text = inst_match.group(1).strip() if inst_match else ''
    
    category_match = re.search(r'Category:(.*?)$', conversation_text, re.DOTALL)
    category_text = category_match.group(1).strip() if category_match else ''

    # Apply the new template
    reformatted_text = f'<s>[INST] {inst_text} [/INST] {category_text} </s>'

    return {'text': reformatted_text}

# Apply the transformation
transformed_dataset = dataset.map(transform_conversation)

# Define the prompt
prompt = "I am sharing an email body with you. Based on the text in the body, you need to classify the email in one of the following eight categories: 'Company Business, Strategy, etc.'; 'Purely Personal'; 'Personal but in professional context (e.g., it was good working with you)'; 'Logistic Arrangements (meeting scheduling, technical support, etc)'; 'Employment arrangements (job seeking, hiring, recommendations, etc)'; 'Document editing/checking (collaboration)'; 'Empty message (due to missing attachment)'; 'Empty message'."

# Define the text
text = "On this same subject of Chairman Wood I am told that he met with Rep. Doug Ose (energy subcommittee chairman of the Government Reform Committee) and Ose's energy advisory board on Wednesday of this week and said similar things. He made a big deal out of a Big Mac analogy -- saying that whether you ordered one in Portland Oregon or Portland Maine the product is the same. With RTOs he supposedly said this meant if the rules are standardized the number of RTOs becomes less important. Those in the room including EPSA staff came away with the impression that Wood is backing away from only 4 RTOs based on this and other comments. Ditto on a 12/15/01 deadline. On the positive side he did say that he had been consulted on the Administration's comments on the Bingaman bill and agreed that legislation without RTO language or bundled/unbundled is OK. He understands the high risk that Congress would go the wrong way on these issues."

# Combine prompt and text
combined_text = f"{prompt} {text}"

# Load the text classification pipeline
classifier = pipeline("text-classification", model="distilbert-base-uncased", tokenizer="distilbert-base-uncased")

# Classify the text
classification_result = classifier(combined_text)

# Print the predicted category
print("Predicted category:", classification_result[0]['label'])



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Predicted category: LABEL_1


In [33]:
transformed_dataset.push_to_hub("MSA-llama7b")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/310 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Adignite/MSA-llama7b/commit/3f065953c57b8a2dbf071dcf0152861ffd1b0d17', commit_message='Upload dataset', commit_description='', oid='3f065953c57b8a2dbf071dcf0152861ffd1b0d17', pr_url=None, pr_revision=None, pr_num=None)

# # Install All the Required Packages

In [34]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.33.1 trl==0.4.7

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# # Import All the Required Libraries

In [36]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# # Load a llama-2-7b-chat-hf model and Train it

In [37]:
# The model that you want to train from the Hugging Face hub
model_name = "NousResearch/Llama-2-7b-chat-hf"

# The instruction dataset to use
dataset_name = "Adignite/MSA-llama7b"

# Fine-tuned model name
new_model = "Llama-2-7b-chat-MailSense-finetune"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}


# # Load everything and start the fine-tuning process

In [39]:
# Load dataset
dataset = load_dataset(dataset_name, split="train")

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



{'loss': 4.3922, 'learning_rate': 0.0001975746552556772, 'epoch': 0.1}
{'loss': 1.375, 'learning_rate': 0.00018550053929480202, 'epoch': 0.2}
{'loss': 0.1729, 'learning_rate': 0.00016449948488669639, 'epoch': 0.3}
{'loss': 0.6847, 'learning_rate': 0.000136764169663272, 'epoch': 0.4}
{'loss': 0.0568, 'learning_rate': 0.00010519038181318999, 'epoch': 0.5}
{'loss': 0.4961, 'learning_rate': 7.307467669163655e-05, 'epoch': 0.6}
{'loss': 0.2652, 'learning_rate': 4.377019014049223e-05, 'epoch': 0.7}
{'loss': 0.3019, 'learning_rate': 2.03365443542764e-05, 'epoch': 0.8}
{'loss': 0.0916, 'learning_rate': 5.22039891260262e-06, 'epoch': 0.9}
{'loss': 0.4077, 'learning_rate': 0.0, 'epoch': 1.0}
{'train_runtime': 304.856, 'train_samples_per_second': 3.28, 'train_steps_per_second': 0.82, 'train_loss': 0.824412181854248, 'epoch': 1.0}


TrainOutput(global_step=250, training_loss=0.824412181854248, metrics={'train_runtime': 304.856, 'train_samples_per_second': 3.28, 'train_steps_per_second': 0.82, 'train_loss': 0.824412181854248, 'epoch': 1.0})

In [40]:
# Save trained model
trainer.model.save_pretrained(new_model)

Check the plots on tensorboard

In [41]:
%load_ext tensorboard
%tensorboard --logdir results/runs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


# # Use the text generation pipeline to ask questions
# # # Enter your prompt

In [54]:
from transformers import pipeline
import re

# Run text generation pipeline with our next model
prompt = "I am sharing an email body with you. Based on the text in the body, you need to classify the email in one of the following eight categories: 'Company Business, Strategy, etc.'; 'Purely Personal'; 'Personal but in professional context (e.g., it was good working with you)'; 'Logistic Arrangements (meeting scheduling, technical support, etc)'; 'Employment arrangements (job seeking, hiring, recommendations, etc)'; 'Document editing/checking (collaboration)'; 'Empty message (due to missing attachment)'; 'Empty message'."

# Load the text generation pipeline with the fine-tuned model and tokenizer
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

# Generate text based on the provided prompt
generated_text = pipe(f"<s>[INST] {prompt} [/INST]")[0]['generated_text']

# Print the generated text
print("Generated Text:", generated_text)

# Now classify the generated text into one of the categories
# You can use any classification model or function that you have
# Here's a simple example using regex pattern matching
categories = [
    "Company Business, Strategy, etc.",
    "Purely Personal",
    "Personal but in professional context (e.g., it was good working with you)",
    "Logistic Arrangements (meeting scheduling, technical support, etc)",
    "Employment arrangements (job seeking, hiring, recommendations, etc)",
    "Document editing/checking (collaboration)",
    "Empty message (due to missing attachment)",
    "Empty message"
]

# Pattern matching to classify the generated text
classified_category = None
for category in categories:
    if re.search(category, generated_text):
        classified_category = category
        break

# Print the classified category
print("Classified Category:", classified_category)


Generated Text: <s>[INST] I am sharing an email body with you. Based on the text in the body, you need to classify the email in one of the following eight categories: 'Company Business, Strategy, etc.'; 'Purely Personal'; 'Personal but in professional context (e.g., it was good working with you)'; 'Logistic Arrangements (meeting scheduling, technical support, etc)'; 'Employment arrangements (job seeking, hiring, recommendations, etc)'; 'Document editing/checking (collaboration)'; 'Empty message (due to missing attachment)'; 'Empty message'. [/INST] 
Classified Category: Company Business, Strategy, etc.


In [55]:
# Empty VRAM
del model
del pipe
del trainer
import gc
gc.collect()
gc.collect()

0

# # Store New Llama2 Model 

In [56]:
# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

#  Push Model to Hugging Face Hub

In [57]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [58]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [59]:
model.push_to_hub("Adignite/MailSense_Classifier-chat-llama7b", check_pr=True)

tokenizer.push_to_hub("Adignite/MailSense_Classifier-chat-llama7b",check_pr=True)



Thrown during validation:
`do_sample` is set to `False`. However, `temperature` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.


pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Adignite/MailSense_Classifier-chat-llama7b/commit/bf2374f3cf8d166c4a4ff9b83fb7620e7c0dd78f', commit_message='Upload tokenizer', commit_description='', oid='bf2374f3cf8d166c4a4ff9b83fb7620e7c0dd78f', pr_url=None, pr_revision=None, pr_num=None)