In [3]:
# Cell 1
!pip install nltk datasets transformers[torch] tokenizers evaluate rouge_score sentencepiece huggingface_hub scikit-learn matplotlib seaborn -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.1/566.1 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m93.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m78.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━

In [4]:
# Cell 2: 
import nltk
import evaluate
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from datasets import Dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Check GPU
print(f"GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

2025-11-04 08:14:48.614670: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762244088.810187      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762244088.866981      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


GPU Available: True
GPU Name: Tesla P100-PCIE-16GB
GPU Memory: 17.06 GB


In [6]:
# Cell 3: Load and Prepare Dataset 
# Load data
df = pd.read_csv('/kaggle/input/stock-market-sentiment-analyzer/ai_stock_sentiment_5k.csv')

print(f"Total samples loaded: {len(df)}")
print(f"\nDataset columns: {df.columns.tolist()}")
print(f"\nFirst row:")
print(df.iloc[0])

# Extract sentiment label and reason from format: <senti>Good<reason>explanation
def extract_sentiment_label(sentiment_str):
    """Extract 'Good'/'Bad'/'Neutral' from '<senti>Good<reason>...' """
    try:
        return sentiment_str.split('<reason>')[0].replace('<senti>', '').strip()
    except:
        return 'Neutral'

def extract_reason(sentiment_str):
    """Extract reason from '<senti>Good<reason>...' """
    try:
        return sentiment_str.split('<reason>')[1].strip()
    except:
        return "No reason provided"

df['sentiment_label'] = df['sentiment'].apply(extract_sentiment_label)
df['reason'] = df['sentiment'].apply(extract_reason)

# Create target format: "Sentiment: Good. Reason: ..."
df['target_text'] = df.apply(
    lambda row: f"Sentiment: {row['sentiment_label']}. Reason: {row['reason']}", 
    axis=1
)

# Create input format
df['input_text'] = "Analyze the financial sentiment: " + df['source']

# Check distribution
print(f"\nSentiment distribution:")
print(df['sentiment_label'].value_counts())

print(f"\nSample input-output pair:")
print(f"Input: {df['input_text'].iloc[0][:150]}...")
print(f"Target: {df['target_text'].iloc[0][:150]}...")

Total samples loaded: 4980

Dataset columns: ['source_name', 'source', 'sentiment']

First row:
source_name                                              Reuters
source         OpenAI announced Q1 2025 earnings that signifi...
sentiment      <senti>Good<reason>Strong earnings beat and ra...
Name: 0, dtype: object

Sentiment distribution:
sentiment_label
Good       2942
Bad        1636
Neutral     402
Name: count, dtype: int64

Sample input-output pair:
Input: Analyze the financial sentiment: OpenAI announced Q1 2025 earnings that significantly exceeded analyst expectations, driven by widespread adoption of ...
Target: Sentiment: Good. Reason: Strong earnings beat and raised guidance typically drive positive investor sentiment and stock price appreciation....


In [7]:
# Cell 4: Create Train/Test Split
from datasets import Dataset

# Create dataset dictionary
dataset_dict = {
    'input_text': df['input_text'].tolist(),
    'target_text': df['target_text'].tolist(),
    'sentiment_label': df['sentiment_label'].tolist()
}

# Convert to Hugging Face Dataset
dataset = Dataset.from_dict(dataset_dict)

# Split: 70% train, 30% test
dataset = dataset.train_test_split(test_size=0.3, seed=42)

print(f"Dataset splits:")
print(f"  Training: {len(dataset['train'])} samples")
print(f"  Testing: {len(dataset['test'])} samples")

Dataset splits:
  Training: 3486 samples
  Testing: 1494 samples


In [8]:
# Cell 5: Load Model and Tokenizer
MODEL_NAME = "google/flan-t5-base"

print(f"Loading model: {MODEL_NAME}")

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

print(f"Model loaded successfully")
print(f"Model parameters: {model.num_parameters():,}")
print(f"Tokenizer vocab size: {len(tokenizer)}")

Loading model: google/flan-t5-base


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Model loaded successfully
Model parameters: 247,577,856
Tokenizer vocab size: 32100


In [9]:
# Cell 6: Data Preprocessing and Tokenization
# Prefix for the task
prefix = "Analyze the financial sentiment: "

def preprocess_function(examples):
    """Tokenize inputs and labels"""
    # The inputs are already prefixed in our dataset
    inputs = examples["input_text"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)
    
    # The labels are the target sentiment + reason
    labels = tokenizer(text_target=examples["target_text"], 
                       max_length=128,         
                       truncation=True)
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print("Tokenizing datasets...")

# Apply preprocessing to entire dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

print("Tokenization complete")
print(f"Tokenized train samples: {len(tokenized_dataset['train'])}")
print(f"Tokenized test samples: {len(tokenized_dataset['test'])}")

Tokenizing datasets...


Map:   0%|          | 0/3486 [00:00<?, ? examples/s]

Map:   0%|          | 0/1494 [00:00<?, ? examples/s]

Tokenization complete
Tokenized train samples: 3486
Tokenized test samples: 1494


In [10]:
# Cell 7: Setup Evaluation Metrics (ROUGE and Custom)
# Download NLTK data
nltk.download("punkt", quiet=True)

# Load ROUGE metric
rouge_metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Compute ROUGE scores and custom sentiment accuracy"""
    preds, labels = eval_preds
    
    # Decode predictions and labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # ROUGE expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    # Compute ROUGE scores
    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    
    # Extract sentiment accuracy
    def extract_sentiment(text):
        """Extract sentiment from generated text"""
        try:
            if 'Sentiment:' in text:
                sentiment = text.split('Sentiment:')[1].split('.')[0].strip()
                if sentiment in ['Good', 'Bad', 'Neutral']:
                    return sentiment
            # Fallback
            text_lower = text.lower()
            if 'good' in text_lower or 'positive' in text_lower:
                return 'Good'
            elif 'bad' in text_lower or 'negative' in text_lower:
                return 'Bad'
            else:
                return 'Neutral'
        except:
            return 'Neutral'
    
    pred_sentiments = [extract_sentiment(pred) for pred in decoded_preds]
    true_sentiments = [extract_sentiment(label) for label in decoded_labels]
    
    sentiment_accuracy = accuracy_score(true_sentiments, pred_sentiments)
    result['sentiment_accuracy'] = sentiment_accuracy
    
    return result

print("Evaluation metrics configured")
print("Metrics: ROUGE-1, ROUGE-2, ROUGE-L, ROUGE-Lsum, Sentiment Accuracy")

Downloading builder script: 0.00B [00:00, ?B/s]

Evaluation metrics configured
Metrics: ROUGE-1, ROUGE-2, ROUGE-L, ROUGE-Lsum, Sentiment Accuracy


In [11]:
# Cell 8: 
# Training hyperparameters
L_RATE = 3e-4
BATCH_SIZE = 8  
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 2  
NUM_EPOCHS = 4

# Frequent checkpoints
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch", 
    save_strategy="epoch",  # Checkpoint after each epoch
    learning_rate=L_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    weight_decay=WEIGHT_DECAY,
    save_total_limit=SAVE_TOTAL_LIM,
    num_train_epochs=NUM_EPOCHS,
    predict_with_generate=True,
    fp16=False,  # Disable for stability
    push_to_hub=False,
    load_best_model_at_end=True,
    metric_for_best_model="sentiment_accuracy",
    greater_is_better=True,
    logging_steps=10,
    report_to="none",
    seed=42
)

print("Training configuration:")
print(f"  Epochs: {NUM_EPOCHS}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Learning rate: {L_RATE}")
print(f"  Checkpoints will be saved after each epoch")
print(f"  Total checkpoints to keep: {SAVE_TOTAL_LIM}")

Training configuration:
  Epochs: 4
  Batch size: 8
  Learning rate: 0.0003
  Checkpoints will be saved after each epoch
  Total checkpoints to keep: 2


In [12]:
# Cell 8.5: Clean Up Storage
import shutil
import os

# Remove old results folder if exists
if os.path.exists('./results'):
    print("Removing old checkpoints...")
    shutil.rmtree('./results')
    print("Old checkpoints removed")

# Check available space
import subprocess
result = subprocess.run(['df', '-h', '/kaggle/working'], capture_output=True, text=True)
print("\nAvailable storage:")
print(result.stdout)


Available storage:
Filesystem      Size  Used Avail Use% Mounted on
/dev/loop1       20G   72K   20G   1% /kaggle/working



In [13]:
# Cell 9: Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

print("Trainer initialized successfully")
print(f"Training on {len(tokenized_dataset['train'])} samples")
print(f"Evaluating on {len(tokenized_dataset['test'])} samples")

Trainer initialized successfully
Training on 3486 samples
Evaluating on 1494 samples


In [14]:
# Cell 10: 
print("="*60)
print("Starting training...")
print("="*60)

# Train the model
train_result = trainer.train()

print("\nTraining completed!")
print(f"Training loss: {train_result.training_loss:.4f}")
print(f"Training runtime: {train_result.metrics['train_runtime']:.2f} seconds")
print(f"Training samples per second: {train_result.metrics['train_samples_per_second']:.2f}")

Starting training...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Sentiment Accuracy
1,1.4183,1.271406,0.46881,0.278829,0.435336,0.435613,0.952477
2,1.2014,1.153397,0.483015,0.297761,0.452761,0.452864,0.951138
3,0.9911,1.122992,0.494488,0.306597,0.463262,0.463394,0.951807
4,0.8057,1.127825,0.494013,0.311405,0.464258,0.464223,0.959839


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].



Training completed!
Training loss: 1.1665
Training runtime: 1012.91 seconds
Training samples per second: 13.77


In [15]:
# Cell 11: Save Final Model
final_model_path = "./final_finetuned_model"
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

print(f"Final model saved to: {final_model_path}")

Final model saved to: ./final_finetuned_model


In [16]:
# Cell 12: Evaluate on Test Set
print("="*60)
print("Evaluating on test set...")
print("="*60)

eval_results = trainer.evaluate()

print("\nEvaluation Results:")
print(f"  ROUGE-1: {eval_results['eval_rouge1']:.4f}")
print(f"  ROUGE-2: {eval_results['eval_rouge2']:.4f}")
print(f"  ROUGE-L: {eval_results['eval_rougeL']:.4f}")
print(f"  ROUGE-Lsum: {eval_results['eval_rougeLsum']:.4f}")
print(f"  Sentiment Accuracy: {eval_results['eval_sentiment_accuracy']:.4f}")
print(f"  Evaluation Loss: {eval_results['eval_loss']:.4f}")

Evaluating on test set...



Evaluation Results:
  ROUGE-1: 0.4940
  ROUGE-2: 0.3114
  ROUGE-L: 0.4643
  ROUGE-Lsum: 0.4642
  Sentiment Accuracy: 0.9598
  Evaluation Loss: 1.1278


In [17]:
# Cell 13: Generate Detailed Predictions
print("Generating predictions on test set...")

test_data = dataset['test']
predictions = []
true_labels = []
generated_texts = []

for i, example in enumerate(test_data):
    # Tokenize input
    inputs = tokenizer(example['input_text'], return_tensors="pt", max_length=512, truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    # Generate prediction
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)
    
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract sentiments
    def extract_sentiment(text):
        try:
            if 'Sentiment:' in text:
                sentiment = text.split('Sentiment:')[1].split('.')[0].strip()
                if sentiment in ['Good', 'Bad', 'Neutral']:
                    return sentiment
            text_lower = text.lower()
            if 'good' in text_lower or 'positive' in text_lower:
                return 'Good'
            elif 'bad' in text_lower or 'negative' in text_lower:
                return 'Bad'
            else:
                return 'Neutral'
        except:
            return 'Neutral'
    
    pred_sentiment = extract_sentiment(generated_text)
    true_sentiment = example['sentiment_label']
    
    predictions.append(pred_sentiment)
    true_labels.append(true_sentiment)
    generated_texts.append(generated_text)
    
    if (i + 1) % 10 == 0:
        print(f"  Processed {i + 1}/{len(test_data)} samples...")

print("\nPrediction generation complete")

Generating predictions on test set...
  Processed 10/1494 samples...
  Processed 20/1494 samples...
  Processed 30/1494 samples...
  Processed 40/1494 samples...
  Processed 50/1494 samples...
  Processed 60/1494 samples...
  Processed 70/1494 samples...
  Processed 80/1494 samples...
  Processed 90/1494 samples...
  Processed 100/1494 samples...
  Processed 110/1494 samples...
  Processed 120/1494 samples...
  Processed 130/1494 samples...
  Processed 140/1494 samples...
  Processed 150/1494 samples...
  Processed 160/1494 samples...
  Processed 170/1494 samples...
  Processed 180/1494 samples...
  Processed 190/1494 samples...
  Processed 200/1494 samples...
  Processed 210/1494 samples...
  Processed 220/1494 samples...
  Processed 230/1494 samples...
  Processed 240/1494 samples...
  Processed 250/1494 samples...
  Processed 260/1494 samples...
  Processed 270/1494 samples...
  Processed 280/1494 samples...
  Processed 290/1494 samples...
  Processed 300/1494 samples...
  Processed

In [19]:
# Cell 14: Calculate Performance Metrics
print("="*60)
print("PERFORMANCE METRICS")
print("="*60)

# Overall accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f"\nOverall Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Per-class metrics
precision, recall, f1, support = precision_recall_fscore_support(
    true_labels, predictions, labels=['Good', 'Bad', 'Neutral'], average=None, zero_division=0
)

print(f"\nPer-Class Metrics:")
for i, label in enumerate(['Good', 'Bad', 'Neutral']):
    print(f"  {label}:")
    print(f"    Precision: {precision[i]:.4f}")
    print(f"    Recall:    {recall[i]:.4f}")
    print(f"    F1-Score:  {f1[i]:.4f}")
    print(f"    Support:   {support[i]}")

# Macro and Weighted averages
precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
    true_labels, predictions, average='macro', zero_division=0
)
precision_weighted, recall_weighted, f1_weighted, _ = precision_recall_fscore_support(
    true_labels, predictions, average='weighted', zero_division=0
)

print(f"\nAverage Metrics:")
print(f"  Macro F1:    {f1_macro:.4f}")
print(f"  Weighted F1: {f1_weighted:.4f}")

# Confusion Matrix
cm = confusion_matrix(true_labels, predictions, labels=['Good', 'Bad', 'Neutral'])
print(f"\nConfusion Matrix:")
print(cm)

PERFORMANCE METRICS

Overall Accuracy: 0.9585 (95.85%)

Per-Class Metrics:
  Good:
    Precision: 0.9730
    Recall:    0.9942
    F1-Score:  0.9835
    Support:   869
  Bad:
    Precision: 0.9571
    Recall:    0.9761
    F1-Score:  0.9665
    Support:   503
  Neutral:
    Precision: 0.8280
    Recall:    0.6311
    F1-Score:  0.7163
    Support:   122

Average Metrics:
  Macro F1:    0.8888
  Weighted F1: 0.9560

Confusion Matrix:
[[864   1   4]
 [  0 491  12]
 [ 24  21  77]]


In [20]:
# Cell 16: Show Sample Predictions
print("="*60)
print("SAMPLE PREDICTIONS (First 5 from Test Set)")
print("="*60)

for i in range(min(5, len(test_data))):
    print(f"\n{'='*60}")
    print(f"Sample {i+1}")
    print(f"{'='*60}")
    print(f"Input: {test_data[i]['input_text'][:200]}...")
    print(f"\nTrue Output: {test_data[i]['target_text'][:150]}...")
    print(f"\nPredicted Output: {generated_texts[i][:150]}...")
    print(f"\nTrue Sentiment: {true_labels[i]}")
    print(f"Predicted Sentiment: {predictions[i]}")
    print(f"Match: {'✓ Correct' if predictions[i] == true_labels[i] else '✗ Incorrect'}")

SAMPLE PREDICTIONS (First 5 from Test Set)

Sample 1
Input: Analyze the financial sentiment: AMD launches new Instinct MI400 series with performance claims surpassing NVIDIA's Blackwell chips in specific benchmarks. The chips feature advanced memory architectu...

True Output: Sentiment: Good. Reason: Competitive product launch and manufacturer support could help AMD gain market share in AI accelerators, positive for stock....

Predicted Output: Sentiment: Good. Reason: Competitive product launch and major customer commitments should boost AMD's market position and stock valuation....

True Sentiment: Good
Predicted Sentiment: Good
Match: ✓ Correct

Sample 2
Input: Analyze the financial sentiment: Intel announced disappointing Q1 2025 results and lowered full-year guidance, citing continued market share losses in data center chips to AMD and NVIDIA. The company'...

True Output: Sentiment: Bad. Reason: Ongoing competitive pressures and guidance reduction will likely lead to further mul

In [21]:
# Cell 17: Save Detailed Results
# Save predictions CSV
results_df = pd.DataFrame({
    'input': [ex['input_text'][:200] for ex in test_data],
    'true_output': [ex['target_text'] for ex in test_data],
    'predicted_output': generated_texts,
    'true_sentiment': true_labels,
    'predicted_sentiment': predictions,
    'correct': [p == t for p, t in zip(predictions, true_labels)]
})

results_df.to_csv('model_predictions.csv', index=False)
print("Detailed predictions saved to 'model_predictions.csv'")

# Save metrics summary
metrics_summary = {
    'Model': 'flan-t5-base',
    'Training Samples': len(tokenized_dataset['train']),
    'Test Samples': len(tokenized_dataset['test']),
    'Epochs': NUM_EPOCHS,
    'Accuracy': accuracy,
    'F1 (Macro)': f1_macro,
    'F1 (Weighted)': f1_weighted,
    'ROUGE-1': eval_results['eval_rouge1'],
    'ROUGE-2': eval_results['eval_rouge2'],
    'ROUGE-L': eval_results['eval_rougeL'],
}

metrics_df = pd.DataFrame([metrics_summary])
metrics_df.to_csv('training_metrics_summary.csv', index=False)
print("Metrics summary saved to 'training_metrics_summary.csv'")

Detailed predictions saved to 'model_predictions.csv'
Metrics summary saved to 'training_metrics_summary.csv'


In [22]:
# Cell 18: Create Downloadable Archives
import shutil

print("Creating downloadable archives...")

# Archive all checkpoints
shutil.make_archive('all_checkpoints', 'zip', './results')
print("  - all_checkpoints.zip (all training checkpoints)")

# Archive final model
shutil.make_archive('final_model', 'zip', './final_finetuned_model')
print("  - final_model.zip (final trained model)")

print("\nAll archives created successfully!")

Creating downloadable archives...
  - all_checkpoints.zip (all training checkpoints)
  - final_model.zip (final trained model)

All archives created successfully!


In [23]:
# Cell 19: Model Inference Example
print("="*60)
print("MODEL INFERENCE EXAMPLE")
print("="*60)

# Load the final model
inference_model = T5ForConditionalGeneration.from_pretrained(final_model_path)
inference_tokenizer = T5Tokenizer.from_pretrained(final_model_path)

# Example news article
example_article = """
Apple Inc. announced record quarterly earnings with iPhone revenue up 25% year-over-year. 
The company also announced a new $90 billion stock buyback program and raised its dividend by 4%.
CEO Tim Cook cited strong demand across all product categories.
"""

# Prepare input
input_text = "Analyze the financial sentiment: " + example_article
inputs = inference_tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

# Generate prediction
with torch.no_grad():
    outputs = inference_model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)

prediction = inference_tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"\nInput Article:")
print(example_article.strip())
print(f"\nModel Prediction:")
print(prediction)

MODEL INFERENCE EXAMPLE

Input Article:
Apple Inc. announced record quarterly earnings with iPhone revenue up 25% year-over-year. 
The company also announced a new $90 billion stock buyback program and raised its dividend by 4%.
CEO Tim Cook cited strong demand across all product categories.

Model Prediction:
Sentiment: Good. Reason: Strong earnings beat, massive buyback, and dividend increase are all highly bullish signals for investors.


In [25]:
# Cell 20: Final Summary
print("\n" + "="*60)
print("TRAINING AND EVALUATION COMPLETE")
print("="*60)
print(f"\nTest Set Performance:")
print(f"  Accuracy: {accuracy*100:.2f}%")
print(f"  F1-Score (Macro): {f1_macro:.4f}")
print(f"  F1-Score (Weighted): {f1_weighted:.4f}")
print(f"  ROUGE-L: {eval_results['eval_rougeL']:.4f}")

print(f"\nOutputs Saved:")
print(f"  - Final model: ./final_finetuned_model/")
print(f"  - All checkpoints: ./results/")
print(f"  - Predictions: model_predictions.csv")
print(f"  - Metrics: training_metrics_summary.csv")
print(f"  - Confusion matrix: confusion_matrix.png")
print(f"  - Downloadable archives: all_checkpoints.zip, final_model.zip")

print(f"\nCheckpoints saved (one per epoch):")
import os
checkpoints = [d for d in os.listdir('./results') if d.startswith('checkpoint-')]
for cp in sorted(checkpoints):
    print(f"  - {cp}")


TRAINING AND EVALUATION COMPLETE

Test Set Performance:
  Accuracy: 95.85%
  F1-Score (Macro): 0.8888
  F1-Score (Weighted): 0.9560
  ROUGE-L: 0.4643

Outputs Saved:
  - Final model: ./final_finetuned_model/
  - All checkpoints: ./results/
  - Predictions: model_predictions.csv
  - Metrics: training_metrics_summary.csv
  - Confusion matrix: confusion_matrix.png
  - Downloadable archives: all_checkpoints.zip, final_model.zip

Checkpoints saved (one per epoch):
  - checkpoint-1308
  - checkpoint-1744
