In [1]:
# Basic setup, environment, and initial imports
import os
import warnings
warnings.filterwarnings('ignore')
os.environ['TF_KERAS_NO_ATOMIC_CHECKPOINT'] = '1'

import random
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K

from dataclasses import dataclass
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from typing import List, Dict

from transformers import (
    DistilBertTokenizer,
    TFDistilBertForSequenceClassification,
)
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [2]:
# Enable mixed precision on TensorFlow if GPU supports it
if tf.config.experimental.list_physical_devices('GPU'):
    tf.keras.mixed_precision.set_global_policy('mixed_float16')

# Configure logging
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce GTX 1650, compute capability 7.5


In [3]:
@dataclass
class TFTrainingConfig:
    """Configuration for TensorFlow DistilBERT training."""
    labeled_file: str = '../Data/NLP/news_dataset_id2_labeled.csv'
    max_sequence_length: int = 128
    test_size: float = 0.25
    val_split: float = 0.3
    model_name: str = 'distilbert-base-uncased'
    num_labels: int = 3
    num_epochs: int = 8
    batch_size: int = 16
    learning_rate: float = 3e-5
    early_stopping_patience: int = 3
    augmentation_factor: int = 3
    output_dir: str = './tf_models'
    model_save_path: str = './tf_models/distilbert_final'
    history_save_path: str = './tf_models/training_history.json'

In [4]:
class FocalLoss(tf.keras.losses.Loss):
    """
    Keras implementation of Focal Loss.
    α = 1, γ = 2 by default.
    Expects logits input (from_logits=True).
    """
    def __init__(self, alpha=1.0, gamma=2.0, from_logits=True, name='focal_loss'):
        super().__init__(name=name)
        self.alpha = alpha
        self.gamma = gamma
        self.from_logits = from_logits

    def call(self, y_true, y_pred):
        y_true = tf.reshape(y_true, [-1])
        if self.from_logits:
            ce = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=tf.cast(y_true, tf.int32),
                logits=y_pred
            )
            pt = tf.exp(-ce)
        else:
            y_true_ohe = tf.one_hot(tf.cast(y_true, tf.int32), depth=y_pred.shape[-1])
            ce = -tf.reduce_sum(y_true_ohe * tf.math.log(y_pred + 1e-9), axis=-1)
            pt = tf.reduce_sum(y_true_ohe * y_pred, axis=-1)

        focal_factor = self.alpha * tf.pow(1 - pt, self.gamma)
        loss = focal_factor * ce
        return tf.reduce_mean(loss)

In [5]:
class DataAugmenter:
    """Simple text augmentation for minority classes."""
    @staticmethod
    def synonym_replacement(text: str, n: int = 2) -> str:
        words = text.split()
        if len(words) < 3:
            return text
        indices = random.sample(range(len(words)), min(n, len(words)//3))
        for i in indices:
            if i < len(words) - 1:
                words[i], words[i+1] = words[i+1], words[i]
        return ' '.join(words)

    @staticmethod
    def augment_text(text: str, method: str = 'synonym') -> str:
        if method == 'synonym':
            return DataAugmenter.synonym_replacement(text)
        else:
            return text

In [6]:
def prepare_training_text(df: pd.DataFrame) -> pd.DataFrame:
    """
    Combines 'title', 'description', and first 150 tokens of 'content' into 'training_text'.
    """
    def _combine_fields(row):
        parts = []
        for field in ['title', 'description', 'content']:
            if field in row and pd.notna(row[field]):
                text = str(row[field]).strip()
                if field == 'content':
                    text = ' '.join(text.split()[:150])
                if text:
                    parts.append(text)
        return ' '.join(parts)

    df['training_text'] = df.apply(_combine_fields, axis=1)
    df = df[df['training_text'].str.len() > 10].reset_index(drop=True)
    return df

In [7]:
# Initialize configuration and constants
config = TFTrainingConfig()
LABEL_MAPPING = {'Negative': 0, 'Neutral': 1, 'Positive': 2}

# Load and prepare data
raw_df = pd.read_csv(config.labeled_file)
logger.info(f"Raw dataset shape: {raw_df.shape}")

df = prepare_training_text(raw_df)
df['label'] = df['stock_sentiment'].map(LABEL_MAPPING)
df = df.dropna(subset=['label']).reset_index(drop=True)
df['label'] = df['label'].astype(int)

2025-06-03 10:28:04,797 - INFO - Raw dataset shape: (675, 8)


In [8]:
# Log class distribution
dist = df['stock_sentiment'].value_counts()
logger.info("Class distribution (pre-augmentation):")
for lab, count in dist.items():
    logger.info(f"  {lab}: {count}  ({count/len(df)*100:.1f}%)")

# Augment minority classes
class_counts = df['label'].value_counts()
max_count = class_counts.max()
augmented_rows = []

for label in [0, 2]:  # Negative and Positive
    class_df = df[df['label'] == label]
    current = len(class_df)
    target = min(max_count // 2, current * config.augmentation_factor)
    if target > current:
        augment_needed = target - current
        logger.info(f"Augmenting {label} ({current} → {target})")
        for _ in range(augment_needed):
            row = class_df.sample(1).iloc[0].copy()
            row['training_text'] = DataAugmenter.augment_text(row['training_text'])
            augmented_rows.append(row)

if augmented_rows:
    aug_df = pd.DataFrame(augmented_rows)
    df = pd.concat([df, aug_df], ignore_index=True)
    logger.info(f"Dataset shape after augmentation: {df.shape}")

2025-06-03 10:28:04,840 - INFO - Class distribution (pre-augmentation):
2025-06-03 10:28:04,841 - INFO -   Neutral: 466  (69.0%)
2025-06-03 10:28:04,841 - INFO -   Positive: 188  (27.9%)
2025-06-03 10:28:04,842 - INFO -   Negative: 21  (3.1%)
2025-06-03 10:28:04,844 - INFO - Augmenting 0 (21 → 63)
2025-06-03 10:28:04,856 - INFO - Augmenting 2 (188 → 233)
2025-06-03 10:28:04,872 - INFO - Dataset shape after augmentation: (762, 10)


In [9]:
# Split data
texts = df['training_text'].tolist()
labels = df['label'].tolist()

train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    texts, labels, test_size=config.test_size, random_state=42, stratify=labels
)

val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=config.val_split, random_state=42, stratify=temp_labels
)

logger.info(f"Final split → Train: {len(train_texts)}, Val: {len(val_texts)}, Test: {len(test_texts)}")

2025-06-03 10:28:04,886 - INFO - Final split → Train: 571, Val: 133, Test: 58


In [10]:
# Initialize tokenizer and tokenize data
tokenizer = DistilBertTokenizer.from_pretrained(config.model_name)

def batch_tokenize(texts: List[str]) -> Dict[str, np.ndarray]:
    enc = tokenizer(
        texts,
        truncation=True,
        padding='longest',
        max_length=config.max_sequence_length,
        return_tensors='np'
    )
    return {
        'input_ids': enc['input_ids'],
        'attention_mask': enc['attention_mask']
    }

logger.info("Tokenizing splits...")
train_enc = batch_tokenize(train_texts)
val_enc = batch_tokenize(val_texts)
test_enc = batch_tokenize(test_texts)

2025-06-03 10:28:05,297 - INFO - Tokenizing splits...


In [11]:
# Create TensorFlow datasets
def make_tf_dataset(encodings: Dict[str, np.ndarray], labels: List[int], batch_size: int, shuffle: bool = False):
    input_ids = encodings['input_ids']
    attention_mask = encodings['attention_mask']
    labels_arr = np.array(labels, dtype=np.int32)

    ds = tf.data.Dataset.from_tensor_slices(({
        'input_ids': input_ids,
        'attention_mask': attention_mask
    }, labels_arr))

    if shuffle:
        ds = ds.shuffle(buffer_size=len(labels_arr), seed=42)

    ds = ds.batch(batch_size, drop_remainder=False)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

train_ds = make_tf_dataset(train_enc, train_labels, config.batch_size, shuffle=True)
val_ds = make_tf_dataset(val_enc, val_labels, config.batch_size, shuffle=False)
test_ds = make_tf_dataset(test_enc, test_labels, config.batch_size, shuffle=False)

In [12]:
# Initialize and compile model
model = TFDistilBertForSequenceClassification.from_pretrained(
    config.model_name,
    num_labels=config.num_labels,
    problem_type="single_label_classification"
)

optimizer = tf.keras.optimizers.Adam(learning_rate=config.learning_rate)
loss_fn = FocalLoss(alpha=1.0, gamma=2.0, from_logits=True)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")]

model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [13]:
# Prepare output directories
Path(config.output_dir).mkdir(parents=True, exist_ok=True)
Path(config.model_save_path).mkdir(parents=True, exist_ok=True)

In [14]:
# Custom callback to track best validation accuracy
class BestModelTracker(tf.keras.callbacks.Callback):
    def __init__(self):
        super().__init__()
        self.best_val_acc = 0.0
        self.best_weights = None
        self.patience_counter = 0
        
    def on_epoch_end(self, epoch, logs=None):
        current_val_acc = logs.get('val_accuracy', 0)
        if current_val_acc > self.best_val_acc:
            self.best_val_acc = current_val_acc
            self.best_weights = self.model.get_weights()
            self.patience_counter = 0
            logger.info(f"New best validation accuracy: {current_val_acc:.4f}")
        else:
            self.patience_counter += 1
            
        if self.patience_counter >= config.early_stopping_patience:
            logger.info(f"Early stopping triggered after {epoch + 1} epochs")
            self.model.stop_training = True

early_stop = EarlyStopping(
    monitor='val_accuracy',
    patience=config.early_stopping_patience,
    mode='max',
    restore_best_weights=True,
    verbose=1
)

best_tracker = BestModelTracker()

In [15]:
# Train the model
logger.info("Starting training...")
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=config.num_epochs,
    callbacks=[early_stop, best_tracker]
)

2025-06-03 10:28:09,850 - INFO - Starting training...


Epoch 1/8

2025-06-03 10:29:32,696 - INFO - New best validation accuracy: 0.6316


Epoch 2/8

2025-06-03 10:30:42,935 - INFO - New best validation accuracy: 0.7218


Epoch 3/8

2025-06-03 10:31:53,296 - INFO - New best validation accuracy: 0.8045


Epoch 4/8

2025-06-03 10:33:03,362 - INFO - New best validation accuracy: 0.8421


Epoch 5/8
Epoch 6/8
Epoch 7/8


2025-06-03 10:36:32,856 - INFO - Early stopping triggered after 7 epochs


Epoch 7: early stopping


In [16]:
# Restore best weights if available
if best_tracker.best_weights is not None:
    model.set_weights(best_tracker.best_weights)
    logger.info(f"Restored best model weights (val_acc: {best_tracker.best_val_acc:.4f})")

2025-06-03 10:36:32,942 - INFO - Restored best model weights (val_acc: 0.8421)


In [17]:
# Save training history
with open(config.history_save_path, 'w') as f:
    json.dump(history.history, f, indent=2)

In [18]:
# Evaluate on test set
logger.info("Evaluating on test set...")
predictions = model.predict(test_ds).logits
pred_labels = np.argmax(predictions, axis=1)
true_labels = np.concatenate([y.numpy() for _, y in test_ds], axis=0)

test_acc = accuracy_score(true_labels, pred_labels)
test_f1_macro = f1_score(true_labels, pred_labels, average='macro')

logger.info(f"Test Accuracy: {test_acc:.4f}")
logger.info(f"Test F1 (macro): {test_f1_macro:.4f}")

# Classification report
label_names = ['Negative', 'Neutral', 'Positive']
report = classification_report(true_labels, pred_labels, target_names=label_names, output_dict=True)
print(classification_report(true_labels, pred_labels, target_names=label_names))

2025-06-03 10:36:32,977 - INFO - Evaluating on test set...




2025-06-03 10:36:36,463 - INFO - Test Accuracy: 0.7759
2025-06-03 10:36:36,464 - INFO - Test F1 (macro): 0.8024


              precision    recall  f1-score   support

    Negative       1.00      1.00      1.00         5
     Neutral       0.78      0.89      0.83        35
    Positive       0.69      0.50      0.58        18

    accuracy                           0.78        58
   macro avg       0.82      0.80      0.80        58
weighted avg       0.77      0.78      0.77        58



In [19]:
# Save final model and tokenizer
hf_export_dir = os.path.join(config.model_save_path, "hf_pretrained")
saved_model_dir = os.path.join(config.model_save_path, "saved_model")
weights_dir = os.path.join(config.model_save_path, "weights")

Path(hf_export_dir).mkdir(parents=True, exist_ok=True)
Path(saved_model_dir).mkdir(parents=True, exist_ok=True)
Path(weights_dir).mkdir(parents=True, exist_ok=True)

# Save in Hugging Face format (this usually works)
try:
    model.save_pretrained(hf_export_dir)
    tokenizer.save_pretrained(hf_export_dir)
    logger.info(f"✅ Hugging Face format saved successfully: {hf_export_dir}")
except Exception as e:
    logger.error(f"❌ Failed to save Hugging Face format: {e}")

# Try to save as TensorFlow SavedModel
try:
    model.save(saved_model_dir, save_format="tf")
    logger.info(f"✅ TensorFlow SavedModel saved successfully: {saved_model_dir}")
except Exception as e:
    logger.warning(f"⚠️ Failed to save TensorFlow SavedModel due to file locking: {e}")
    
    # Alternative: Save just the weights
    try:
        weights_path = os.path.join(weights_dir, "model_weights.h5")
        model.save_weights(weights_path)
        logger.info(f"✅ Model weights saved as alternative: {weights_path}")
        
        # Save model config for reconstruction
        config_path = os.path.join(weights_dir, "model_config.json")
        with open(config_path, 'w') as f:
            json.dump({
                'model_name': config.model_name,
                'num_labels': config.num_labels,
                'max_sequence_length': config.max_sequence_length
            }, f, indent=2)
        logger.info(f"✅ Model config saved: {config_path}")
        
    except Exception as e2:
        logger.error(f"❌ Failed to save weights as well: {e2}")
        logger.info("💡 You can still use the Hugging Face format if it saved successfully")

2025-06-03 10:36:37,147 - INFO - ✅ Hugging Face format saved successfully: ./tf_models/distilbert_final\hf_pretrained
























; Broken pipe [Op:SaveV2]
2025-06-03 10:36:56,861 - INFO - ✅ Model weights saved as alternative: ./tf_models/distilbert_final\weights\model_weights.h5
2025-06-03 10:36:56,862 - INFO - ✅ Model config saved: ./tf_models/distilbert_final\weights\model_config.json


In [20]:
# Save evaluation results
with open(os.path.join(config.model_save_path, "test_results.json"), 'w') as f:
    json.dump({
        'test_accuracy': float(test_acc),
        'test_f1_macro': float(test_f1_macro),
        'classification_report': report
    }, f, indent=2)

logger.info(f"🎯 Training complete! Final results:")
logger.info(f"   📊 Test Accuracy: {test_acc:.4f}")
logger.info(f"   📊 Test F1 (macro): {test_f1_macro:.4f}")
logger.info(f"   💾 Main model directory: {config.model_save_path}")
if os.path.exists(hf_export_dir):
    logger.info(f"   🤗 Hugging Face format: {hf_export_dir}")
if os.path.exists(saved_model_dir) and len(os.listdir(saved_model_dir)) > 0:
    logger.info(f"   🔧 TensorFlow SavedModel: {saved_model_dir}")
if os.path.exists(weights_dir) and len(os.listdir(weights_dir)) > 0:
    logger.info(f"   ⚖️ Model weights backup: {weights_dir}")

2025-06-03 10:36:57,028 - INFO - 🎯 Training complete! Final results:
2025-06-03 10:36:57,029 - INFO -    📊 Test Accuracy: 0.7759
2025-06-03 10:36:57,030 - INFO -    📊 Test F1 (macro): 0.8024
2025-06-03 10:36:57,031 - INFO -    💾 Main model directory: ./tf_models/distilbert_final
2025-06-03 10:36:57,032 - INFO -    🤗 Hugging Face format: ./tf_models/distilbert_final\hf_pretrained
2025-06-03 10:36:57,033 - INFO -    🔧 TensorFlow SavedModel: ./tf_models/distilbert_final\saved_model
2025-06-03 10:36:57,035 - INFO -    ⚖️ Model weights backup: ./tf_models/distilbert_final\weights
