In [1]:
import sys
import os

# Add the project's root directory to the Python path
# This allows the notebook to find the 'src' module
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
# --- 1. Setup and Imports ---
# Make sure you've installed everything from req.txt

import torch
import os
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Import your custom data scraper
from src.data_ingestion.scraper import get_stock_data

# Ensure CUDA is available
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. This script requires a GPU.")
print(f"CUDA available: {torch.cuda.is_available()}. Using GPU: {torch.cuda.get_device_name(0)}")

CUDA available: True. Using GPU: NVIDIA GeForce RTX 3050 Ti Laptop GPU


In [3]:
# --- 2. Configuration ---

# The finalized Hugging Face model ID
BASE_MODEL_ID = "chuanli11/Llama-3.2-3B-Instruct-uncensored"
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Directory to save the original, untouched model
INITIAL_MODEL_PATH = os.path.join(PROJECT_ROOT, "models", "initial", f"{BASE_MODEL_ID.replace('/', '_')}")
# Directory to save the fine-tuned adapters and training checkpoints
PROCESSED_MODEL_PATH = os.path.join(PROJECT_ROOT, "models", "processed", f"{BASE_MODEL_ID.replace('/', '_')}-timeseries-v1")

# Stock to train on
STOCK_YFINANCE_LINK = "https://finance.yahoo.com/quote/NVDA"
DATA_PERIOD = "3y" 

In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


In [5]:
print(f"Loading base model '{BASE_MODEL_ID}' directly from Hugging Face with quantization...")

# QLoRA configuration using bitsandbytes
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Load the base model directly from the Hub with all optimizations applied at once
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    low_cpu_mem_usage=True,
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token # Set padding token for training

print("Model and tokenizer loaded successfully.")

Loading base model 'chuanli11/Llama-3.2-3B-Instruct-uncensored' directly from Hugging Face with quantization...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model and tokenizer loaded successfully.


In [6]:
# --- 4. Ingest and Prepare Data (IMPROVED VERSION) ---

# Fetch data for fine-tuning
df = get_stock_data(yfinance_link=STOCK_YFINANCE_LINK, period=DATA_PERIOD)
# Extract the ticker symbol for use in the prompt
ticker_symbol = STOCK_YFINANCE_LINK.split("/")[-1]

# --- Define the new, more advanced formatting function ---
def format_training_prompt(row, context_days=60, prediction_days=5):
    """
    Creates a rich text prompt with statistical context and a one-shot example.
    """
    current_index = row.name
    if current_index < context_days:
        return None
    
    prediction_end_index = current_index + prediction_days
    if prediction_end_index > len(df):
        return None
    
    # --- 1. Get Data Slices ---
    context_df = df.iloc[current_index - context_days : current_index]
    context_prices = context_df['Close'].tolist()
    prediction_prices = df.iloc[current_index : prediction_end_index]['Close'].tolist()

    # --- 2. Calculate Statistics (new) ---
    mean = context_df['Close'].mean()
    std_dev = context_df['Close'].std()
    trend = "upward" if context_prices[-1] > context_prices[0] else "downward"

    # --- 3. Format data into strings ---
    context_str = ", ".join([f"{p:.2f}" for p in context_prices])
    prediction_str = ", ".join([f"{p:.2f}" for p in prediction_prices])

    # --- 4. Create the Rich Prompt (new) ---
    prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

You are a financial analyst specializing in time series forecasting.

Below is an example of a forecast.
---
EXAMPLE:
TASK: Time Series Forecast.
STOCK: AAPL
STATISTICS: mean=175.30, std_dev=5.40, trend=upward
CONTEXT_DAYS: 3
DATA: [170.10, 172.50, 176.80]
Predict the next 2 closing prices.

PREDICTION: [178.20, 177.90]
---

Now, perform the following task.

TASK: Time Series Forecast.
STOCK: {ticker_symbol}
STATISTICS: mean={mean:.2f}, std_dev={std_dev:.2f}, trend={trend}
CONTEXT_DAYS: {context_days}
DATA: [{context_str}]
Predict the next {prediction_days} closing prices.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

PREDICTION: [{prediction_str}]<|eot_id|>"""
    
    return {"text": prompt}

# --- Create the dataset ---
prompts = df.apply(format_training_prompt, axis=1).dropna().tolist()
dataset = Dataset.from_list(prompts)
print(f"Created {len(dataset)} training prompts.")
print("\nExample of the new, richer prompt:")
print(dataset[0]['text'])

Starting data ingestion for link: https://finance.yahoo.com/quote/NVDA
Extracted ticker: NVDA
Successfully scraped 752 data points.
Minor preprocessing complete. (Date formatting, 20-day MA calculated)
Data temporarily saved to: data/processed/NVDA_processed_data.csv
Created 669 training prompts.

Example of the new, richer prompt:
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

You are a financial analyst specializing in time series forecasting.

Below is an example of a forecast.
---
EXAMPLE:
TASK: Time Series Forecast.
STOCK: AAPL
STATISTICS: mean=175.30, std_dev=5.40, trend=upward
CONTEXT_DAYS: 3
DATA: [170.10, 172.50, 176.80]
Predict the next 2 closing prices.

PREDICTION: [178.20, 177.90]
---

Now, perform the following task.

TASK: Time Series Forecast.
STOCK: NVDA
STATISTICS: mean=15.45, std_dev=1.44, trend=upward
CONTEXT_DAYS: 60
DATA: [12.45, 12.58, 13.25, 12.88, 13.16, 13.82, 13.48, 13.53, 13.20, 13.40, 14.14, 14.28, 14.58, 13.76, 15.73, 16.31, 16.28, 16.65, 15.89

In [7]:
# --- 5. Configure PEFT (LoRA) and Trainer ---

# Prepare the model for k-bit training
model = prepare_model_for_kbit_training(model)

# LoRA configuration
lora_config = LoraConfig(
    r=32, # Rank of the update matrices. Lower is smaller, faster, but less expressive.
    lora_alpha=32, # Alpha parameter for scaling.
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    # Target modules can vary by model, you may need to experiment
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)

# Add LoRA adapters to the model
model = get_peft_model(model, lora_config)

# --- Training Arguments ---
# This is where we configure checkpointing
training_args = TrainingArguments(
    output_dir=PROCESSED_MODEL_PATH,
    per_device_train_batch_size=1, # Keep this low for 4GB VRAM
    gradient_accumulation_steps=4, # Simulate a larger batch size
    learning_rate=2e-4,
    max_grad_norm=0.3,
    num_train_epochs=3, # Start with 1 epoch, you can increase later
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    logging_steps=25, # Log progress every 25 steps
    save_strategy="steps", # Enable saving based on steps
    save_steps=100, # Save a checkpoint every 100 steps
    save_total_limit=3, # Only keep the last 3 checkpoints
    bf16=True, # Use bfloat16 for training if your GPU supports it (Ampere series like RTX 3050 Ti does)
    remove_unused_columns=False,
)

# --- Initialize Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=lambda data: {'input_ids': torch.stack([tokenizer(sample['text'], return_tensors="pt").input_ids.squeeze(0) for sample in data]),
                                'labels': torch.stack([tokenizer(sample['text'], return_tensors="pt").input_ids.squeeze(0) for sample in data])}
)

  trainer = Trainer(


In [8]:
from transformers.trainer_utils import get_last_checkpoint

In [9]:
last_checkpoint = get_last_checkpoint(PROCESSED_MODEL_PATH)

print(f"Starting training...")
if last_checkpoint:
    print(f"Resuming from checkpoint: {last_checkpoint}")

# The trainer will now start fresh if last_checkpoint is None, 
# or resume from the path if a checkpoint is found.
trainer.train(resume_from_checkpoint=last_checkpoint)

print("Training complete.")

# --- Save the final model adapter ---
final_model_path = os.path.join(PROCESSED_MODEL_PATH, "final_model")
trainer.save_model(final_model_path)

print(f"Final fine-tuned model adapters saved to: {final_model_path}")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
25,1.0949
50,0.7315
75,0.2378
100,0.0879
125,0.068
150,0.0634
175,0.0601
200,0.0558
225,0.058
250,0.0559


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Training complete.
Final fine-tuned model adapters saved to: /home/anonion/ftmodel/models/processed/chuanli11_Llama-3.2-3B-Instruct-uncensored-timeseries-v1/final_model
