In [1]:
import json
from datasets import Dataset
import pandas as pd

# Define the path to your JSON data
data_path = '/home/bizon/zns_workspace/Safety_Evaluation_After_Edit/data/Edit_data/merged_data_part_0.json'

# Load JSON data
with open(data_path, 'r') as f:
    data = json.load(f)

# Filter entries where 'source' is 'ZsRE'
filtered_data = [entry for entry in data if entry.get('source') == 'ZsRE']

# Verify the number of filtered entries
print(f"Total filtered entries: {len(filtered_data)}")


Total filtered entries: 433


In [2]:
# Extract 'prompt' and 'target_new' from each entry
extracted_data = [
    {
        'prompt': entry['prompt'],
        'target': entry['target_new']
    }
    for entry in filtered_data
    if 'prompt' in entry and 'target_new' in entry
]

# Verify the extracted data
print("Sample extracted data:")
print(extracted_data[0])


Sample extracted data:
{'prompt': 'Which family does Epaspidoceras belong to?', 'target': 'Noctuidae'}


In [3]:
# Convert the extracted data to a pandas DataFrame
df = pd.DataFrame(extracted_data)

# Optionally, inspect the DataFrame
print(df.head())

# Create a Hugging Face Dataset from the DataFrame
dataset = Dataset.from_pandas(df)

# Verify the Dataset
print(dataset)


                                              prompt  \
0         Which family does Epaspidoceras belong to?   
1                  What species is ZIC3 specific to?   
2               What voice type is Louise Grandjean?   
3               Who is listed as Wang Jipeng father?   
4  What was the name of Charlotte of Schaumburg-L...   

                       target  
0                   Noctuidae  
1                        male  
2               mezzo soprano  
3               Wang Chonghua  
4  Charlotte of Bourbon-Parma  
Dataset({
    features: ['prompt', 'target'],
    num_rows: 433
})


In [4]:
# Access the first example in the Dataset
print("First example in the Dataset:")
print(dataset[0])

# Output all column names
print("Dataset columns:", dataset.column_names)


First example in the Dataset:
{'prompt': 'Which family does Epaspidoceras belong to?', 'target': 'Noctuidae'}
Dataset columns: ['prompt', 'target']


In [5]:
# Save the Dataset to disk
dataset.save_to_disk('formatted_dataset')

# Load the Dataset from disk later
loaded_dataset = Dataset.load_from_disk('formatted_dataset')
print("Loaded Dataset:")
print(loaded_dataset)


Loaded Dataset:
Dataset({
    features: ['prompt', 'target'],
    num_rows: 433
})


In [6]:
# Define a function to concatenate prompt and target
def concatenate_prompt_target(example):
    example['text'] = f"[INST] {example['prompt']} [/INST] {example['target']}"
    return example

# Apply the function to the Dataset
dataset = dataset.map(concatenate_prompt_target)

# Verify the new structure
print(dataset[0])




0ex [00:00, ?ex/s]

{'prompt': 'Which family does Epaspidoceras belong to?', 'target': 'Noctuidae', 'text': '[INST] Which family does Epaspidoceras belong to? [/INST] Noctuidae'}


In [7]:
from transformers import AutoTokenizer
model_path='/home/bizon/zns_workspace/24_09_Evaluation/hugging_cache/llama-2-7b-chat-hf/models--meta-llama--Llama-2-7b-chat-hf/snapshots/f5db02db724555f92da89c216ac04704f23d4590'
# Load the tokenizer for LLaMA (replace with the actual model name)
tokenizer = AutoTokenizer.from_pretrained(model_path,device_map='auto')  # Replace with the correct model identifier

# Define the tokenization function
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

# Apply tokenization to the Dataset
tokenizer.pad_token = tokenizer.unk_token
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Verify the tokenized Dataset
print(tokenized_dataset[0])


  0%|          | 0/1 [00:00<?, ?ba/s]

{'prompt': 'Which family does Epaspidoceras belong to?', 'target': 'Noctuidae', 'text': '[INST] Which family does Epaspidoceras belong to? [/INST] Noctuidae', 'input_ids': [1, 518, 25580, 29962, 8449, 3942, 947, 14055, 4692, 333, 542, 18464, 6852, 304, 29973, 518, 29914, 25580, 29962, 1939, 22999, 3898, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [8]:
from transformers import AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType
import torch
# Load the pre-trained LLaMA model
import os
os.environ['TOKENIZERS_PARALLELISM']='false'
model = AutoModelForCausalLM.from_pretrained(model_path,device_map='auto')

# Define LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"]  # Adjust based on model architecture
)

# Apply LoRA to the model
model = get_peft_model(model, peft_config)

def custom_data_collator(data):
    input_ids = torch.tensor([f['input_ids'] for f in data])
    attention_mask = torch.tensor([f['attention_mask'] for f in data])
    labels = torch.tensor([f['input_ids'] for f in data])  # Assuming labels are same as input_ids for causal LM
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_steps=10,
    save_steps=500,
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # Ideally, use a separate eval set
    tokenizer=tokenizer,
    data_collator=custom_data_collator
)




In [10]:

trainer.train()

Epoch,Training Loss,Validation Loss
1,4.2125,4.177972
2,4.167,4.116918
3,4.1324,4.093884


TrainOutput(global_step=327, training_loss=4.83524100860689, metrics={'train_runtime': 272.1645, 'train_samples_per_second': 4.773, 'train_steps_per_second': 1.201, 'total_flos': 6595881798205440.0, 'train_loss': 4.83524100860689, 'epoch': 3.0})

In [14]:
# Step 13: Save the Model (with LoRA adapters)
trainer.save_model('./results')

# Optional: Save LoRA adapters separately if needed
model.save_pretrained('./results/lora')

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel


In [2]:
# Replace 'huggingface/llama-base' with your specific model identifier
model_path='/home/bizon/zns_workspace/24_09_Evaluation/hugging_cache/llama-2-7b-chat-hf/models--meta-llama--Llama-2-7b-chat-hf/snapshots/f5db02db724555f92da89c216ac04704f23d4590'

tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)

# If the tokenizer does not have a pad token, you might need to set it
if tokenizer.pad_token is None:
    tokenizer.pad_token=tokenizer.unk_token
    


In [3]:
# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(model_path)

# Load the LoRA adapters
# Ensure that the 'lora' directory path is correct
model = PeftModel.from_pretrained(base_model, './results/lora',device_map='auto')

# Important: Resize token embeddings if you added a new pad token
model.resize_token_embeddings(len(tokenizer))


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Embedding(32000, 4096)

In [4]:
model.eval()
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs for inference.")
    model = torch.nn.DataParallel(model)


Using 4 GPUs for inference.


In [5]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# 8. Define Inference Function
def generate_response(prompt, max_length=100, temperature=0.7, top_p=0.9, top_k=50):
    inputs = tokenizer.encode(prompt, return_tensors='pt').to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=max_length,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            do_sample=True,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    if response.startswith(prompt):
        response = response[len(prompt):].strip()
    
    return response

# 9. Example Usage
if __name__ == "__main__":
    prompts = [
        "Which family does Epaspidoceras belong to?",
        "What is the common name for the family Noctuidae?",
        "Explain the significance of Epaspidoceras in paleontology.",
        "Describe the lifecycle of a butterfly.",
        "What adaptations help nocturnal moths survive?"
    ]
    
    for prompt in prompts:
        response = generate_response(prompt)
        print(f"Prompt: {prompt}")
        print(f"Response: {response}\n")

OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB (GPU 0; 15.73 GiB total capacity; 14.65 GiB already allocated; 127.69 MiB free; 14.66 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF