In [1]:
import os

# Redirect the Hugging Face cache to the project directory. This cell runs
# before the cells that import `transformers` / `datasets`.
HF_CACHE_PATH = '/cs/student/projects2/aisd/2024/shekchu/snlp'
os.environ['HF_HOME'] = HF_CACHE_PATH

# Read the value back from the environment to confirm what is in effect
cache_dir = os.environ.get('HF_HOME', 'Cache directory not set')
print(f"Model weights are stored in: {cache_dir}")

Model weights are stored in: /cs/student/projects2/aisd/2024/shekchu/snlp


In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Hugging Face Hub id of the checkpoint; shared by the tokenizer and the model
# so the two can never drift apart.
model_name = "microsoft/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Optional 8-bit loading (kept disabled):
# quantization_config = BitsAndBytesConfig(
#     load_in_8bit=True
# )

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda",
    # bfloat16 matches `bf16=True` in the TrainingArguments cell below; the
    # weights were previously loaded as float16, which disagreed with it.
    torch_dtype=torch.bfloat16,
    # NOTE(review): trust_remote_code executes code shipped with the checkpoint;
    # keep only if the pinned transformers version requires it for this model.
    trust_remote_code=True,
    attn_implementation="flash_attention_2",
    # quantization_config=quantization_config,
)
model.config.use_cache = False  # Disable the cache to avoid conflicts with gradient checkpointing
model.gradient_checkpointing_enable()
# Last expression of the cell: returns the model so the notebook prints its
# module tree. Trainer.train() switches the model back to training mode.
model.eval()

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.08s/it]


Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3FlashAttention2(
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3LongRoPEScaledRotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_features=307

In [8]:
from datasets import load_dataset

# Fetch the ConvFinQA data from the Hugging Face Hub (all available splits)
CONVFINQA_REPO = "FinGPT/fingpt-convfinqa"
dataset = load_dataset(path=CONVFINQA_REPO)

In [9]:
# # Get the first example from the test set
# test_example = dataset['test'][0]

# # Format the input (combining instruction and input text)
# input_text = f"{test_example['instruction']}\n{test_example['input']}"

# # Tokenize input
# inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

# # Generate response
# outputs = model.generate(
#     **inputs,
#     max_length=2000,
#     num_return_sequences=1,
#     temperature=0.1,
#     pad_token_id=tokenizer.pad_token_id,
# )

# # Decode the generated response
# generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# print("Input:\n", input_text)
# print("\nGenerated response:\n", generated_text)
# print("\nGround truth:\n", test_example['output'])

In [10]:
from transformers import TrainingArguments, Trainer
from torch.utils.data import Dataset
import torch

# Create a custom dataset class
class FinQADataset(Dataset):
    """Causal-LM fine-tuning examples built from instruction/input/output records.

    Each example is a single token sequence ``prompt + answer + EOS`` padded on
    the right to ``max_length``. ``labels`` is aligned position-by-position
    with ``input_ids`` (the model shifts them internally), with prompt and
    padding positions set to ``IGNORE_INDEX`` so the loss is computed on the
    answer tokens only.

    Args:
        dataset: Indexable collection of records with ``'instruction'``,
            ``'input'`` and ``'output'`` string fields.
        tokenizer: Callable returning a mapping with ``'input_ids'``; must
            expose ``pad_token_id`` and ``eos_token_id`` attributes.
        max_length: Fixed length of every returned tensor (default 512).
    """

    # Label value skipped by the cross-entropy loss.
    IGNORE_INDEX = -100

    def __init__(self, dataset, tokenizer, max_length=512):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of records in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for record ``idx``.

        All three are 1-D integer tensors of length ``max_length``.
        """
        item = self.dataset[idx]
        # Format input like we did for testing
        prompt_text = f"{item['instruction']}\n{item['input']}"
        target_text = item['output']

        # Tokenize without padding/truncation; both are applied manually below
        # so that prompt and answer share one budget of `max_length` tokens.
        prompt_ids = list(self.tokenizer(prompt_text)['input_ids'])
        target_ids = list(
            self.tokenizer(target_text, add_special_tokens=False)['input_ids']
        )

        # Terminate the answer so the model learns where to stop generating.
        eos_id = self.tokenizer.eos_token_id
        if eos_id is not None:
            target_ids.append(eos_id)

        # Keep the answer intact where possible and give the prompt whatever
        # room is left (right-truncating it, as the tokenizer did before).
        target_ids = target_ids[:self.max_length]
        prompt_ids = prompt_ids[:self.max_length - len(target_ids)]

        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            # Padded positions are masked out, so the concrete id is irrelevant.
            pad_id = eos_id if eos_id is not None else 0

        n_prompt = len(prompt_ids)
        n_real = n_prompt + len(target_ids)
        n_pad = self.max_length - n_real

        input_ids = prompt_ids + target_ids + [pad_id] * n_pad
        attention_mask = [1] * n_real + [0] * n_pad
        # Supervise answer tokens only; prompt and padding contribute no loss.
        labels = (
            [self.IGNORE_INDEX] * n_prompt
            + target_ids
            + [self.IGNORE_INDEX] * n_pad
        )

        return {
            'input_ids': torch.tensor(input_ids),
            'attention_mask': torch.tensor(attention_mask),
            'labels': torch.tensor(labels)
        }

# Wrap the 'train' split in the tokenizing dataset used for fine-tuning
train_dataset = FinQADataset(dataset=dataset['train'], tokenizer=tokenizer)

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./phi-3-5-finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # Effective batch of 1 x 4 = 4 per device; helps with memory
    learning_rate=2e-5,
    warmup_steps=100,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",  # One checkpoint at the end of every epoch
    bf16=True,  # Train with BF16 mixed precision instead of FP16
    fp16=False,  # Disable FP16
    fp16_full_eval=False,  # Disable FP16 evaluation (was True, contradicting this intent)
    gradient_checkpointing=True,  # Enable gradient checkpointing
)

# Assemble the Trainer from the pieces built in the cells above: the loaded
# model, its tokenizer, the tokenized training set and the run configuration.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell. As the
# cell's last expression, the value returned by train() is shown as output.
trainer.train()