In [1]:
import sys
import os

# Add the project's root directory to the Python path
# This allows the notebook to find the 'src' module
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
# --- 1. Setup and Imports ---
import torch
import os
import pandas as pd
from datetime import datetime, timedelta
from newsapi import NewsApiClient
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from dotenv import load_dotenv


# Ensure CUDA is available
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. This script requires a GPU.")
print(f"CUDA available: {torch.cuda.is_available()}. Using GPU: {torch.cuda.get_device_name(0)}")

CUDA available: True. Using GPU: NVIDIA GeForce RTX 3050 Ti Laptop GPU


In [5]:
# --- 2. Configuration ---
# --- Define the project root directory ---
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Use the same base model as before
BASE_MODEL_ID = "chuanli11/Llama-3.2-3B-Instruct-uncensored"

# Define a new path for our specialized sentiment model
PROCESSED_MODEL_PATH = os.path.join(
    PROJECT_ROOT, "models", "processed", f"{BASE_MODEL_ID.replace('/', '_')}-sentiment-v1"
)
INITIAL_MODEL_PATH = os.path.join(PROJECT_ROOT, "models", "initial", f"{BASE_MODEL_ID.replace('/', '_')}")

# Stock to analyze
TICKER = "NVDA"
STOCK_DATA_PATH = os.path.join(PROJECT_ROOT, "data", "processed", f"{TICKER}_processed_data.csv")

# News API key (load from your .env file)
load_dotenv()
NEWS_API_KEY = os.getenv("NEWS_API_KEY")

# --- Dynamic Article Fetching Logic ---
# Map volatility levels to the number of articles to fetch
# This is how we give more weight to significant time periods
VOLATILITY_TO_ARTICLE_COUNT = {
    "low": 10,
    "medium": 25,
    "high": 50,
}

In [6]:
# --- 3. Load Stock Data & Identify Volatility ---
df_stock = pd.read_csv(STOCK_DATA_PATH)
df_stock['Date'] = pd.to_datetime(df_stock['Date'])

# Calculate daily price change percentage
df_stock['price_change_pct'] = df_stock['Close'].pct_change().abs() * 100

# Define volatility levels based on the quantiles of price changes
low_thresh, high_thresh = df_stock['price_change_pct'].quantile([0.25, 0.75])

def get_volatility_level(change):
    if change >= high_thresh:
        return 'high'
    elif change >= low_thresh:
        return 'medium'
    else:
        return 'low'

df_stock['volatility'] = df_stock['price_change_pct'].apply(get_volatility_level)

# Create a lookup dictionary for quick access
volatility_map = df_stock.set_index('Date')['volatility'].to_dict()

print("Volatility levels calculated for each trading day.")
print(df_stock[['Date', 'Close', 'price_change_pct', 'volatility']].tail())

Volatility levels calculated for each trading day.
          Date       Close  price_change_pct volatility
727 2025-09-17  170.289993          2.624663     medium
728 2025-09-18  176.240005          3.494047       high
729 2025-09-19  176.669998          0.243981        low
730 2025-09-22  183.610001          3.928229       high
731 2025-09-23  178.429993          2.821201     medium


In [7]:
# --- 4. Dynamically Scrape News Data ---
newsapi = NewsApiClient(api_key=NEWS_API_KEY)
all_headlines = []

# We'll work with the last 30 days of data due to API limitations
thirty_days_ago = datetime.now() - timedelta(days=29)
date_range = pd.to_datetime(df_stock[df_stock['Date'] >= thirty_days_ago]['Date'])

for date in date_range:
    date_str = date.strftime('%Y-%m-%d')
    volatility = volatility_map.get(date, 'low')
    article_count = VOLATILITY_TO_ARTICLE_COUNT[volatility]
    
    print(f"Fetching {article_count} articles for {date_str} (Volatility: {volatility})...")
    
    try:
        articles = newsapi.get_everything(
            q=TICKER,
            from_param=date_str,
            to=date_str,
            language='en',
            sort_by='relevancy',
            page_size=article_count
        )
        for article in articles['articles']:
            all_headlines.append({'date': date, 'headline': article['title']})
    except Exception as e:
        print(f"Could not fetch news for {date_str}. Error: {e}")

df_news = pd.DataFrame(all_headlines)
print(f"\nSuccessfully fetched a total of {len(df_news)} headlines.")

Fetching 10 articles for 2025-08-28 (Volatility: low)...
Fetching 50 articles for 2025-08-29 (Volatility: high)...
Fetching 25 articles for 2025-09-02 (Volatility: medium)...
Fetching 10 articles for 2025-09-03 (Volatility: low)...
Fetching 10 articles for 2025-09-04 (Volatility: low)...
Fetching 25 articles for 2025-09-05 (Volatility: medium)...
Fetching 10 articles for 2025-09-08 (Volatility: low)...
Fetching 25 articles for 2025-09-09 (Volatility: medium)...
Fetching 50 articles for 2025-09-10 (Volatility: high)...
Fetching 10 articles for 2025-09-11 (Volatility: low)...
Fetching 10 articles for 2025-09-12 (Volatility: low)...
Fetching 10 articles for 2025-09-15 (Volatility: low)...
Fetching 25 articles for 2025-09-16 (Volatility: medium)...
Fetching 25 articles for 2025-09-17 (Volatility: medium)...
Fetching 50 articles for 2025-09-18 (Volatility: high)...
Fetching 10 articles for 2025-09-19 (Volatility: low)...
Fetching 50 articles for 2025-09-22 (Volatility: high)...
Fetching 25 

In [8]:
# --- 5. Label Data (Sentiment Score & Price Impact) ---
# First, load the base model (re-using code from the previous notebook)
# Ensure your model and tokenizer from the previous steps are loaded
# For brevity, assuming 'model' and 'tokenizer' are already loaded and ready

labeled_headlines = []
for index, row in df_news.iterrows():
    headline = row['headline']
    
    # --- A. Get Sentiment Score via Zero-Shot ---
    labeling_prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>
On a scale from -1.0 (extremely negative) to 1.0 (extremely positive), what is the sentiment of this financial headline? Respond with only the number.
HEADLINE: {headline}
SENTIMENT_SCORE:<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
    
    # This is a conceptual inference call.
    # inputs = tokenizer(labeling_prompt, return_tensors="pt").to("cuda")
    # outputs = model.generate(**inputs, max_new_tokens=5)
    # sentiment_score_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # For now, we'll simulate this with a placeholder
    sentiment_score = 0.0 # Replace with actual model inference
    
    # --- B. Get Next Day's Price Change ---
    # Find the stock data for the day of the headline
    stock_row_index = df_stock[df_stock['Date'] == row['date']].index
    if not stock_row_index.empty:
        next_day_index = stock_row_index[0] + 1
        if next_day_index < len(df_stock):
            next_day_change = df_stock.iloc[next_day_index]['Close'] / df_stock.iloc[stock_row_index[0]]['Close'] - 1
            next_day_change_pct = next_day_change * 100
            
            labeled_headlines.append({
                'headline': headline,
                'sentiment_score': sentiment_score,
                'next_day_price_change': next_day_change_pct
            })

df_labeled = pd.DataFrame(labeled_headlines)
print("Data labeling complete.")
df_labeled.head()

Data labeling complete.


Unnamed: 0,headline,sentiment_score,next_day_price_change
0,Did Nvidia Just Pop an AI Bubble? Here’s What ...,0.0,-3.324642
1,5 biggest takeaways from the Nvidia Q2 earning...,0.0,-3.324642
2,"QBTS or RGTI: Which Stock Will Win As IBM, MSF...",0.0,-3.324642
3,Nvidia Stock Down 1.2%. Why Growth May Slow An...,0.0,-3.324642
4,"Nvidia’s Q2 Growth Masks China Headwinds, But ...",0.0,-3.324642


In [9]:
# --- 6. Format Prompts for Fine-Tuning ---
def format_sentiment_prompt(row):
    prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Analyze the following financial headline. Provide its sentiment score from -1.0 to 1.0 and predict the stock's percentage change for the next trading day.
HEADLINE: {row['headline']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
SENTIMENT_SCORE: {row['sentiment_score']:.2f}, NEXT_DAY_PRICE_CHANGE: {row['next_day_price_change']:.2f}%<|eot_id|>"""
    return {"text": prompt}

prompts = df_labeled.apply(format_sentiment_prompt, axis=1).tolist()
#dataset = Dataset.from_list([{'text': p} for p in prompts])
dataset = Dataset.from_list(prompts)

print(f"Created {len(dataset)} training prompts for the sentiment model.")
print("\nExample sentiment prompt:")
print(dataset[0]['text'])

Created 391 training prompts for the sentiment model.

Example sentiment prompt:
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Analyze the following financial headline. Provide its sentiment score from -1.0 to 1.0 and predict the stock's percentage change for the next trading day.
HEADLINE: Did Nvidia Just Pop an AI Bubble? Here’s What the Market Says<|eot_id|><|start_header_id|>assistant<|end_header_id|>
SENTIMENT_SCORE: 0.00, NEXT_DAY_PRICE_CHANGE: -3.32%<|eot_id|>


In [10]:
# --- 7. Load Model and Configure Trainer ---

# --- Part A: Load Base Model and Tokenizer (from the previous notebook) ---
print(f"Loading base model '{BASE_MODEL_ID}' directly from Hugging Face with quantization...")

# QLoRA configuration using bitsandbytes
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Load the base model directly from the Hub with all optimizations
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    low_cpu_mem_usage=True,
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token
print("Base model and tokenizer loaded successfully.")


# --- Part B: Configure PEFT (LoRA) ---
# Prepare the model for k-bit training
model = prepare_model_for_kbit_training(model)

# LoRA configuration
lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)

# Add LoRA adapters to the model
model = get_peft_model(model, lora_config)
print("PEFT (LoRA) adapters added to the model.")


# --- Part C: Configure Trainer ---
# This is where we configure checkpointing
training_args = TrainingArguments(
    output_dir=PROCESSED_MODEL_PATH,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    max_grad_norm=0.3,
    num_train_epochs=3,
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    bf16=True,
    remove_unused_columns=False,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=lambda data: {'input_ids': torch.stack([tokenizer(sample['text'], return_tensors="pt").input_ids.squeeze(0) for sample in data]),
                                 'labels': torch.stack([tokenizer(sample['text'], return_tensors="pt").input_ids.squeeze(0) for sample in data])}
)
print("Trainer configured and ready.")

Loading base model 'chuanli11/Llama-3.2-3B-Instruct-uncensored' directly from Hugging Face with quantization...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Base model and tokenizer loaded successfully.
PEFT (LoRA) adapters added to the model.


  trainer = Trainer(


Trainer configured and ready.


In [11]:
from transformers.trainer_utils import get_last_checkpoint

In [12]:
last_checkpoint = get_last_checkpoint(PROCESSED_MODEL_PATH)

print(f"Starting training...")
if last_checkpoint:
    print(f"Resuming from checkpoint: {last_checkpoint}")

# The trainer will now start fresh if last_checkpoint is None, 
# or resume from the path if a checkpoint is found.
trainer.train(resume_from_checkpoint=last_checkpoint)

print("Training complete.")

# --- Save the final model adapter ---
final_model_path = os.path.join(PROCESSED_MODEL_PATH, "final_model")
trainer.save_model(final_model_path)

print(f"Final fine-tuned model adapters saved to: {final_model_path}")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,2.0429
20,1.0073
30,0.9222
40,0.7942
50,0.6954
60,0.656
70,0.6035
80,0.6297
90,0.6214
100,0.6663


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Training complete.
Final fine-tuned model adapters saved to: /home/anonion/ftmodel/models/processed/chuanli11_Llama-3.2-3B-Instruct-uncensored-sentiment-v1/final_model
