<a href="https://colab.research.google.com/github/Azaidi317/LLM-Finetuning-Projects/blob/main/Finetuning_Llama2_on_finance_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install transformers datasets torch bitsandbytes peft accelerate wandb

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting wandb
  Downloading wandb-0.18.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.7 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.7-cp310-cp310-manylinux_2_17_x86_6

In [16]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType
)
import pandas as pd
from tqdm import tqdm
import wandb
from sklearn.model_selection import train_test_split
import os
os.environ["HUGGINGFACE_TOKEN"] = "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"

class EarningsCallTrainer:
    def __init__(self, hf_token):
        self.hf_token = hf_token
        self.base_model = "meta-llama/Llama-2-7b-chat-hf"
        self.max_length = 2048

        print("Initializing tokenizer and model...")
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.base_model,
            use_auth_token=hf_token,
            padding_side="right",
            use_fast=False
        )

        if not self.tokenizer.pad_token:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=False
        )

    def prepare_earnings_data(self):
        """Load and prepare earnings call dataset"""
        print("Loading dataset...")
        dataset = load_dataset("JanosAudran/financial-reports-sec", 'small_lite')

        def format_instruction(example):
            metadata = example.get('header', {})
            company = metadata.get('company_name', 'Unknown')
            date = metadata.get('date', 'Unknown')
            transcript = example.get('content', '')[:3000]

            formatted_text = f"""[INST] Analyze this earnings call transcript:
Company: {company}
Date: {date}

Transcript excerpt:
{transcript}

Analyze the following aspects:
1. Key financial metrics
2. Business performance
3. Future outlook
4. Risk factors
5. Overall sentiment
[/INST]"""

            return {"text": formatted_text}

        print("Formatting dataset...")
        formatted_dataset = dataset.map(
            format_instruction,
            remove_columns=dataset['train'].column_names
        )

        return formatted_dataset

    def prepare_model(self):
        """Prepare model for training"""
        print("Loading base model...")
        model = AutoModelForCausalLM.from_pretrained(
            self.base_model,
            quantization_config=self.bnb_config,
            device_map="auto",
            use_auth_token=self.hf_token,
            torch_dtype=torch.float16
        )

        # Prepare for k-bit training
        model = prepare_model_for_kbit_training(model)

        # Configure LoRA
        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=[
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj",
                "gate_proj",
                "up_proj",
                "down_proj",
            ],
            bias="none",
            lora_dropout=0.05,
            task_type=TaskType.CAUSAL_LM
        )

        # Get PEFT model
        model = get_peft_model(model, lora_config)

        return model

    def train(self,
              train_dataset,
              output_dir="./llama2_earnings_model",
              epochs=3,
              batch_size=4,
              gradient_accumulation_steps=4):
        """Train the model"""
        # Initialize wandb for tracking
        wandb.init(project="llama2-earnings-calls")

        # Prepare training arguments
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            warmup_steps=100,
            weight_decay=0.01,
            logging_steps=10,
            save_strategy="epoch",
            learning_rate=2e-4,
            fp16=True,
            optim="paged_adamw_8bit",
            max_grad_norm=0.3,
            max_steps=-1,
            group_by_length=True,
            report_to="wandb"
        )

        # Initialize trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            data_collator=self.data_collator
        )

        print("Starting training...")
        trainer.train()

        # Save model
        print("Saving model...")
        trainer.save_model(output_dir)
        self.tokenizer.save_pretrained(output_dir)

        return output_dir

    def data_collator(self, features):
        """Collate data for training"""
        batch = {
            "input_ids": torch.tensor([f["input_ids"] for f in features], dtype=torch.long),
            "attention_mask": torch.tensor([f["attention_mask"] for f in features], dtype=torch.long),
            "labels": torch.tensor([f["input_ids"] for f in features], dtype=torch.long),
        }
        return batch

    def evaluate_model(self, test_text):
        """Generate analysis for test text"""
        inputs = self.tokenizer(
            test_text,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_length
        ).to(self.model.device)

        outputs = self.model.generate(
            **inputs,
            max_length=2000,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )

        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

if __name__ == "__main__":
    import os
    from transformers import TrainingArguments

    # Initialize with your token
    HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
    if not HF_TOKEN:
        raise ValueError("Please set HUGGINGFACE_TOKEN")

    # Initialize trainer
    trainer = EarningsCallTrainer(hf_token=HF_TOKEN)

    # Prepare dataset
    dataset = trainer.prepare_earnings_data()

    # Prepare model
    model = trainer.prepare_model()

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./llama2_earnings_model",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        weight_decay=0.01,
        logging_steps=10,
        save_strategy="epoch",
        learning_rate=2e-4,
        fp16=True,
        optim="paged_adamw_8bit",
        max_grad_norm=0.3,
        max_steps=-1,
        group_by_length=True
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset['train'],
        data_collator=trainer.data_collator
    )

    # Start training
    print("Starting training...")
    trainer.train()

    # Save model
    output_dir = "./llama2_earnings_final"
    trainer.save_model(output_dir)
    trainer.tokenizer.save_pretrained(output_dir)
    print(f"Model saved to: {output_dir}")

Initializing tokenizer and model...
Loading dataset...
Formatting dataset...
Loading base model...




config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Starting training...


ValueError: No columns in the dataset match the model's forward method signature. The following columns have been ignored: [text]. Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`.

In [19]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import os

class FinancialLlamaTrainer:
    def __init__(self, hf_token):
        self.model_name = "meta-llama/Llama-2-7b-chat-hf"
        self.hf_token = hf_token

        # Initialize tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            use_auth_token=hf_token,
            trust_remote_code=True
        )
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # Configure 4-bit quantization
        self.bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )

    def load_model(self):
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=self.bnb_config,
            use_auth_token=self.hf_token,
            device_map="auto",
            trust_remote_code=True
        )

        # Prepare model for training
        model = prepare_model_for_kbit_training(model)

        # LoRA configuration
        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=[
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj",
                "gate_proj",
                "up_proj",
                "down_proj",
            ],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )

        return get_peft_model(model, lora_config)

    def prepare_dataset(self):
        dataset = load_dataset("JanosAudran/financial-reports-sec", 'small_lite')

        def format_sample(example):
            header = example.get('header', {})
            company = header.get('company_name', 'Unknown')
            date = header.get('date', 'Unknown')
            content = example.get('content', '')[:3000]

            return {
                "text": f"[INST] Analyze this earnings call transcript:\nCompany: {company}\nDate: {date}\n\nTranscript:\n{content}\n\nProvide a comprehensive analysis including key metrics, business performance, outlook, and risks. [/INST]"
            }

        return dataset.map(format_sample)

    def train(self, training_args):
        # Load model and dataset
        model = self.load_model()
        dataset = self.prepare_dataset()

        # Initialize trainer
        trainer = Trainer(
            model=model,
            train_dataset=dataset['train'],
            args=training_args,
            data_collator=lambda data: {'input_ids': torch.stack([torch.tensor(f['text']) for f in data])}
        )

        trainer.train()
        return trainer

if __name__ == "__main__":
    # Get Hugging Face token
    hf_token = os.getenv("HUGGINGFACE_TOKEN")
    if not hf_token:
        raise ValueError("Set HUGGINGFACE_TOKEN environment variable")

    # Initialize trainer
    trainer = FinancialLlamaTrainer(hf_token=hf_token)

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./llama2_financial",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=10,
        save_strategy="epoch",
        optim="paged_adamw_8bit"
    )

    # Train model
    trainer = trainer.train(training_args)

    # Save model
    trainer.save_model("./llama2_financial_final")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/200000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 37


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:


Abort: 

In [21]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import os

class FinancialLlamaTrainer:
    def __init__(self, hf_token):
        self.model_name = "meta-llama/Llama-2-7b-chat-hf"
        self.hf_token = hf_token

        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            use_auth_token=hf_token
        )
        self.tokenizer.pad_token = self.tokenizer.eos_token

        self.bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )

    def load_model(self):
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=self.bnb_config,
            use_auth_token=self.hf_token,
            device_map="auto"
        )

        model = prepare_model_for_kbit_training(model)

        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=[
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj",
            ],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )

        return get_peft_model(model, lora_config)

    def prepare_dataset(self):
        dataset = load_dataset("JanosAudran/financial-reports-sec", 'small_lite')

        def format_and_tokenize(example):
            header = example.get('header', {})
            company = header.get('company_name', 'Unknown')
            date = header.get('date', 'Unknown')
            content = example.get('content', '')[:3000]

            text = f"[INST] Analyze this earnings call transcript:\nCompany: {company}\nDate: {date}\n\nTranscript:\n{content}\n\nProvide a comprehensive analysis including key metrics, business performance, outlook, and risks. [/INST]"

            # Tokenize the text
            tokenized = self.tokenizer(
                text,
                truncation=True,
                max_length=2048,
                padding="max_length"
            )

            return tokenized

        tokenized_dataset = dataset.map(
            format_and_tokenize,
            remove_columns=dataset['train'].column_names
        )

        return tokenized_dataset

    def train(self, training_args):
        model = self.load_model()
        dataset = self.prepare_dataset()

        # Use proper data collator
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False
        )

        trainer = Trainer(
            model=model,
            train_dataset=dataset['train'],
            args=training_args,
            data_collator=data_collator
        )

        trainer.train()
        return trainer

if __name__ == "__main__":
    hf_token = os.getenv("HUGGINGFACE_TOKEN")
    if not hf_token:
        raise ValueError("Set HUGGINGFACE_TOKEN environment variable")

    trainer = FinancialLlamaTrainer(hf_token=hf_token)

    training_args = TrainingArguments(
        output_dir="./llama2_financial",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=10,
        save_strategy="epoch",
        optim="paged_adamw_8bit",
        report_to="none"
    )

    trainer = trainer.train(training_args)
    trainer.save_model("./llama2_financial_final")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/200000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,3.3061
20,2.6488
30,1.4835
40,0.4
50,0.0234
60,0.0
70,0.0


KeyboardInterrupt: 

In [5]:
pip install GPUtil

Collecting GPUtil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: GPUtil
  Building wheel for GPUtil (setup.py) ... [?25l[?25hdone
  Created wheel for GPUtil: filename=GPUtil-1.4.0-py3-none-any.whl size=7392 sha256=a6748185f6c878c085396d233f5716236a4a73e845f617bdb2ccb80f033f606f
  Stored in directory: /root/.cache/pip/wheels/a9/8a/bd/81082387151853ab8b6b3ef33426e98f5cbfebc3c397a9d4d0
Successfully built GPUtil
Installing collected packages: GPUtil
Successfully installed GPUtil-1.4.0


In [8]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import os
from datetime import datetime
import psutil
import GPUtil
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import numpy as np
import time

os.environ["HUGGINGFACE_TOKEN"] = "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"

class MemoryMonitor:
    def __init__(self):
        self.gpu_usage = []
        self.cpu_usage = []
        self.ram_usage = []
        self.timestamps = []

    def measure(self):
        # GPU Memory
        gpu = GPUtil.getGPUs()[0]
        self.gpu_usage.append(gpu.memoryUsed)

        # CPU and RAM
        self.cpu_usage.append(psutil.cpu_percent())
        self.ram_usage.append(psutil.Process().memory_info().rss / 1024 / 1024 / 1024)  # GB
        self.timestamps.append(time.time())

    def plot(self):
        plt.figure(figsize=(15, 5))

        # GPU Memory
        plt.subplot(1, 3, 1)
        plt.plot(self.timestamps, self.gpu_usage)
        plt.title('GPU Memory Usage')
        plt.ylabel('Memory (MB)')

        # CPU Usage
        plt.subplot(1, 3, 2)
        plt.plot(self.timestamps, self.cpu_usage)
        plt.title('CPU Usage')
        plt.ylabel('Usage %')

        # RAM Usage
        plt.subplot(1, 3, 3)
        plt.plot(self.timestamps, self.ram_usage)
        plt.title('RAM Usage')
        plt.ylabel('Usage (GB)')

        plt.tight_layout()
        plt.savefig('memory_usage.png')
        plt.close()

class TrainingMonitor:
    def __init__(self):
        self.training_loss = []
        self.validation_loss = []
        self.learning_rates = []

    def update(self, logs):
        if 'loss' in logs:
            self.training_loss.append(logs['loss'])
        if 'eval_loss' in logs:
            self.validation_loss.append(logs['eval_loss'])
        if 'learning_rate' in logs:
            self.learning_rates.append(logs['learning_rate'])

    def plot(self):
        plt.figure(figsize=(15, 5))

        # Training Loss
        plt.subplot(1, 3, 1)
        plt.plot(self.training_loss)
        plt.title('Training Loss')
        plt.ylabel('Loss')

        # Validation Loss
        if self.validation_loss:
            plt.subplot(1, 3, 2)
            plt.plot(self.validation_loss)
            plt.title('Validation Loss')
            plt.ylabel('Loss')

        # Learning Rate
        plt.subplot(1, 3, 3)
        plt.plot(self.learning_rates)
        plt.title('Learning Rate')
        plt.ylabel('LR')

        plt.tight_layout()
        plt.savefig('training_progress.png')
        plt.close()

class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.memory_monitor = MemoryMonitor()
        self.training_monitor = TrainingMonitor()

    def training_step(self, *args, **kwargs):
        self.memory_monitor.measure()
        logs = super().training_step(*args, **kwargs)
        self.training_monitor.update(logs)
        return logs

    def save_metrics(self, *args, **kwargs):
        super().save_metrics(*args, **kwargs)
        self.memory_monitor.plot()
        self.training_monitor.plot()

class FinancialLlamaTrainer:
    # ... [Previous initialization code remains the same]

    def train(self, training_args):
        model = self.load_model()
        dataset = self.prepare_dataset()

        # Add early stopping
        early_stopping = EarlyStoppingCallback(
            early_stopping_patience=3,
            early_stopping_threshold=0.01
        )

        # Use custom trainer with monitoring
        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["test"],
            data_collator=DataCollatorForLanguageModeling(
                tokenizer=self.tokenizer,
                mlm=False
            ),
            callbacks=[early_stopping]
        )

        print("Starting training...")
        trainer.train()

        return trainer

    def generate_sample_analysis(self, model, text):
        """Generate analysis using the fine-tuned model"""
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=self.max_length)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        outputs = model.generate(
            **inputs,
            max_new_tokens=500,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )

        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

if __name__ == "__main__":
    # Get Hugging Face token
    hf_token = os.getenv("HUGGINGFACE_TOKEN")
    if not hf_token:
        raise ValueError("Please set HUGGINGFACE_TOKEN environment variable")

    # Initialize trainer
    trainer = FinancialLlamaTrainer(hf_token="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")

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"./llama2_financial_{datetime.now().strftime('%Y%m%d_%H%M')}",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=2,
        warmup_steps=50,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=50,
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
        optim="paged_adamw_8bit",
        report_to="none"
    )

    # Start training
    trainer = trainer.train(training_args)

    # Save final model
    output_dir = f"./llama2_financial_final_{datetime.now().strftime('%Y%m%d_%H%M')}"
    trainer.save_model(output_dir)
    trainer.tokenizer.save_pretrained(output_dir)

    # Test the model
    test_text = """[INST] Analyze this earnings call transcript:
    Company: Example Corp
    Date: 2024-Q1

    Transcript:
    Our Q1 revenue grew 15% year-over-year to $1.2B. Operating margin improved to 28%.
    We're seeing strong demand in cloud services and AI solutions. However, supply chain
    challenges persist in our hardware division.
    [/INST]"""

    analysis = trainer.generate_sample_analysis(trainer.model, test_text)
    print("\nSample Analysis:")
    print(analysis)

    # Print training summary
    print("\nTraining Summary:")
    print(f"Final training loss: {trainer.state.log_history[-1]['loss']:.4f}")
    print(f"Best validation loss: {trainer.state.best_metric:.4f}")
    print(f"Training time: {trainer.state.total_flos/1e12:.2f} TFLOPS")

    # Save visualizations
    print("\nVisualizations saved:")
    print("- memory_usage.png")
    print("- training_progress.png")

TypeError: FinancialLlamaTrainer() takes no arguments

In [3]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datetime import datetime

class FinancialLlamaTrainer:
    def __init__(self):
        self.model_name = "meta-llama/Llama-2-7b-chat-hf"
        # Hardcode your token here
        self.hf_token = "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"  # Replace with your token
        self.max_length = 2048

        print("Initializing tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            use_auth_token=self.hf_token
        )
        self.tokenizer.pad_token = self.tokenizer.eos_token

        self.bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )

    def load_model(self):
        print("Loading base model...")
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=self.bnb_config,
            use_auth_token=self.hf_token,
            device_map="auto",
            torch_dtype=torch.float16
        )

        print("Preparing model for training...")
        model = prepare_model_for_kbit_training(model)

        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=[
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj",
            ],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )

        print("Applying LoRA...")
        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()

        return model

    def prepare_dataset(self):
        print("Loading dataset...")
        dataset = load_dataset("JanosAudran/financial-reports-sec", 'small_lite')

        print("Processing dataset...")
        def format_and_tokenize(example):
            header = example.get('header', {})
            company = header.get('company_name', 'Unknown')
            date = header.get('date', 'Unknown')
            content = example.get('content', '')

            if not content:
                return None

            text = f"""[INST] Analyze this earnings call transcript:
            Company: {company}
            Date: {date}

            Transcript:
            {content[:3000]}

            Provide a comprehensive analysis including:
            1. Key financial metrics
            2. Business performance
            3. Future outlook
            4. Risk factors
            5. Market position
            [/INST]"""

            tokenized = self.tokenizer(
                text,
                truncation=True,
                max_length=self.max_length,
                padding="max_length"
            )

            tokenized["labels"] = tokenized["input_ids"].copy()
            return tokenized

        processed_dataset = dataset.map(
            format_and_tokenize,
            remove_columns=dataset['train'].column_names,
            desc="Processing dataset"
        )

        split_dataset = processed_dataset["train"].train_test_split(
            test_size=0.1,
            seed=42
        )

        print(f"Dataset prepared: {len(split_dataset['train'])} training samples")
        return split_dataset

    def train(self, training_args):
        model = self.load_model()
        dataset = self.prepare_dataset()

        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["test"],
            data_collator=data_collator
        )

        print("Starting training...")
        trainer.train()

        return trainer

# Main execution
if __name__ == "__main__":
    # Initialize trainer
    trainer = FinancialLlamaTrainer()

    # Training arguments
    training_args = TrainingArguments(
      output_dir=f"./llama2_financial_{datetime.now().strftime('%Y%m%d_%H%M')}",
      num_train_epochs=3,
      per_device_train_batch_size=8,    # For A100
      per_device_eval_batch_size=8,
      gradient_accumulation_steps=2,
      warmup_steps=50,
      learning_rate=2e-4,
      fp16=True,
      logging_steps=10,
      # Match evaluation and save strategies
      evaluation_strategy="steps",
      eval_steps=100,                   # Evaluate every 100 steps
      save_strategy="steps",            # Changed from "epoch" to "steps"
      save_steps=100,                   # Save every 100 steps
      save_total_limit=2,              # Keep only last 2 checkpoints
      load_best_model_at_end=True,
      metric_for_best_model="loss",
      greater_is_better=False,
      optim="paged_adamw_8bit",
      report_to="none"
)

    # Start training
    trained_model = trainer.train(training_args)

    # Save final model
    output_dir = f"./llama2_financial_final_{datetime.now().strftime('%Y%m%d_%H%M')}"
    trained_model.save_model(output_dir)
    print(f"Model saved to: {output_dir}")

RuntimeError: Failed to import transformers.training_args because of the following error (look up to see its traceback):
generic_type: type "XlaShardingSpec" is already registered!

In [5]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datetime import datetime

class FinancialLlamaTrainer:
    def __init__(self):
        self.model_name = "meta-llama/Llama-2-7b-chat-hf"
        self.hf_token = "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"  # Replace with your token
        self.max_length = 2048

        print("Initializing tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            use_auth_token=self.hf_token
        )
        self.tokenizer.pad_token = self.tokenizer.eos_token

        self.bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )

    def load_model(self):
        print("Loading base model...")
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=self.bnb_config,
            use_auth_token=self.hf_token,
            device_map="auto",
            torch_dtype=torch.float16
        )

        model = prepare_model_for_kbit_training(model)

        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=[
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj",
            ],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )

        model = get_peft_model(model, lora_config)
        return model

    # Only showing the modified parts that need fixing

    def prepare_dataset(self):
        print("Loading dataset...")
        dataset = load_dataset("JanosAudran/financial-reports-sec", 'small_lite')

        def format_and_tokenize(example):
            try:
                # Extract text from sentence field
                sentence = example.get('sentence', '')
                if not sentence:
                    return None

                # Format the prompt
                text = f"""[INST] Analyze this financial statement:

Text: {sentence}

Analyze the sentiment and implications for the company's performance.
[/INST]"""

                # Tokenize the text
                encodings = self.tokenizer(
                    text,
                    truncation=True,
                    max_length=self.max_length,
                    padding='max_length',
                    return_tensors=None  # Important: don't return tensors yet
                )

                # Add labels
                encodings['labels'] = encodings['input_ids'].copy()

                return encodings

            except Exception as e:
                print(f"Error processing example: {str(e)}")
                return None

        # Process dataset
        print("Processing dataset...")
        processed_dataset = dataset.map(
            format_and_tokenize,
            remove_columns=dataset['train'].column_names,
            desc="Processing dataset"
        )

        # Verify the structure
        if len(processed_dataset['train']) > 0:
            sample = processed_dataset['train'][0]
            print("\nSample data structure:", list(sample.keys()))
            print(f"Input IDs length: {len(sample['input_ids'])}")

        # Split dataset
        split_dataset = processed_dataset["train"].train_test_split(
            test_size=0.1,
            seed=42
        )

        print(f"Dataset prepared: {len(split_dataset['train'])} training samples")
        return split_dataset

    def train(self, training_args):
        model = self.load_model()
        dataset = self.prepare_dataset()

        def collate_fn(examples):
            return {
                'input_ids': torch.tensor([ex['input_ids'] for ex in examples]),
                'attention_mask': torch.tensor([ex['attention_mask'] for ex in examples]),
                'labels': torch.tensor([ex['labels'] for ex in examples])
            }

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["test"],
            data_collator=collate_fn
        )

        print("Starting training...")
        return trainer.train()





# Main execution
if __name__ == "__main__":
    trainer = FinancialLlamaTrainer()

    training_args = TrainingArguments(
        output_dir=f"./llama2_financial_{datetime.now().strftime('%Y%m%d_%H%M')}",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=50,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=100,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
        optim="paged_adamw_8bit",
        report_to="none"
    )

    # Train
    trained_model = trainer.train(training_args)

    # Save
    output_dir = f"./llama2_financial_final_{datetime.now().strftime('%Y%m%d_%H%M')}"
    trained_model.save_model(output_dir)
    print(f"Model saved to: {output_dir}")

RuntimeError: Failed to import transformers.training_args because of the following error (look up to see its traceback):
generic_type: type "XlaShardingSpec" is already registered!

In [1]:
# First, we need to install the correct versions in this specific order
!pip uninstall -y torch torch_xla
!pip install torch==2.1.0 torch_xla[tpu]==2.1.0 -f https://storage.googleapis.com/libtpu-releases/index.html
!pip install transformers datasets peft accelerate

import os
import torch
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model

class TPULlamaTrainer:
    def __init__(self):
        self.model_name = "meta-llama/Llama-2-7b-chat-hf"
        self.hf_token = "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"  # Replace with your token
        self.max_length = 2048

        print("Setting up TPU...")
        self.device = xm.xla_device()

        print("Initializing tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            use_auth_token=self.hf_token
        )
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def prepare_model(self):
        print("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            use_auth_token=self.hf_token,
            torch_dtype=torch.bfloat16
        )

        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=[
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj",
            ],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )

        model = get_peft_model(model, lora_config)
        model = model.to(self.device)
        return model

    def prepare_dataset(self):
        print("Loading dataset...")
        dataset = load_dataset("JanosAudran/financial-reports-sec", 'small_lite')

        def format_and_tokenize(example):
            sentence = example.get('sentence', '')
            if not sentence:
                return None

            text = f"""[INST] Analyze this financial statement:
Text: {sentence}
Analyze the sentiment and implications for the company's performance.
[/INST]"""

            return self.tokenizer(
                text,
                truncation=True,
                max_length=self.max_length,
                padding='max_length'
            )

        print("Processing dataset...")
        processed_dataset = dataset.map(
            format_and_tokenize,
            remove_columns=dataset['train'].column_names
        )

        return processed_dataset["train"].train_test_split(test_size=0.1, seed=42)

# Main execution
def main():
    # Initialize trainer
    trainer = TPULlamaTrainer()

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./llama2_financial_tpu",
        num_train_epochs=3,
        per_device_train_batch_size=8,     # Adjusted for TPU
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=2,
        warmup_steps=100,
        learning_rate=2e-4,
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=100,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
        bf16=True,                         # Use bfloat16 for TPU
        report_to="none",
        dataloader_drop_last=True          # Important for TPU
    )

    # Prepare model and dataset
    model = trainer.prepare_model()
    dataset = trainer.prepare_dataset()

    # Initialize trainer
    tpu_trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"]
    )

    # Train
    tpu_trainer.train()

    # Save model
    tpu_trainer.save_model("./llama2_financial_tpu_final")

if __name__ == "__main__":
    # Verify TPU setup
    print("TPU Device:", xm.xla_device())
    print("XLA Version:", torch_xla.__version__)

    # Run training
    main()

Found existing installation: torch 2.1.0
Uninstalling torch-2.1.0:
  Successfully uninstalled torch-2.1.0
Found existing installation: torch-xla 2.1.0
Uninstalling torch-xla-2.1.0:
  Successfully uninstalled torch-xla-2.1.0
Looking in links: https://storage.googleapis.com/libtpu-releases/index.html
Collecting torch==2.1.0
  Using cached torch-2.1.0-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torch_xla==2.1.0 (from torch_xla[tpu]==2.1.0)
  Using cached torch_xla-2.1.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (7.3 kB)
Using cached torch-2.1.0-cp310-cp310-manylinux1_x86_64.whl (670.2 MB)
Using cached torch_xla-2.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (81.1 MB)
Installing collected packages: torch, torch_xla
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.5.1+cpu requires torch==2.5.1, but you have torch 2.1.0 which is i

RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_utils because of the following error (look up to see its traceback):
module 'torch.library' has no attribute 'register_fake'