# # Air-Senti-X Ensemble Training Notebook
#
# This notebook implements an end-to-end training pipeline for the Air-Senti-X project.
# We:
#
# 1. Preprocess the dataset: Clean, split, encode, and tokenize the tweets.
# 2. Build three BERT-based models with different architectures:
#    - BERT+LSTM
#    - BERT+BiLSTM
#    - BERT+CNN
# 3. Train each model with callbacks (TensorBoard logging, early stopping, and checkpoints).
# 4. Ensemble the predictions using weighted averaging and apply confidence filtering.
# 5. Evaluate the ensemble performance using accuracy, F1, and a confusion matrix.
# 6. Save the trained models in H5 format and convert the best model to ONNX (optional).
#
# **Project Directory Structure:**
# ```
# Air-Senti-X/
# ├── dataset/
# │   └── Tweets.csv
# ├── preprocessing/
# │   ├── data_cleaning.py
# │   ├── data_split_encode.py
# │   └── tokenize_bert.py
# ├── models/
# │   ├── architectures/
# │   │   ├── bert_lstm.py       # defines build_bert_lstm(max_len, num_labels)
# │   │   ├── bert_bilstm.py     # defines build_bert_bilstm(max_len, num_labels)
# │   │   └── bert_cnn.py        # defines build_bert_cnn(max_len, num_labels)
# │   └── saved/                 # directory to store trained models
# ├── utils/
# │   ├── evaluation.py        # includes get_emotion() and calculate_urgency()
# └── Train_Ensemble_All.ipynb   # This notebook
# ```
# 
# Make sure the dependencies are installed and your virtual environment is activated.


In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import joblib
import logging
from sklearn.metrics import classification_report, confusion_matrix
from datetime import datetime

# Import preprocessing functions
from preprocessing.data_cleaning import preprocess_dataset
from preprocessing.data_split_encode import split_and_encode
from preprocessing.tokenize_bert import bert_tokenize

# Import model builders
from models.architectures.bert_lstm import build_bert_lstm
from models.architectures.bert_bilstm import build_bert_bilstm
from models.architectures.bert_cnn import build_bert_cnn

from utils.evaluation import get_emotion, calculate_urgency

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dhars\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dhars\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


# ## Configuration and Logging Setup
#
# We configure our training parameters and set up logging: progress messages are printed to the console and also written to a log file (`training.log`).
#
# We'll also set up TensorBoard logging.

In [2]:
# --- Run configuration -------------------------------------------------------
# Input data
DATA_PATH = 'dataset/Tweets.csv'

# Tokenization / training settings: MAX_LEN is handed to bert_tokenize() and
# to the model builders; BATCH_SIZE and EPOCHS are handed to model.fit().
MAX_LEN, BATCH_SIZE, EPOCHS = 128, 32, 1

# Soft-voting weights, one entry per model (all equal).
ensemble_weights = [1.0] * 3

# Ensemble predictions whose top probability is below this are counted as
# low-confidence.
CONFIDENCE_THRESHOLD = 0.5

In [3]:
# Output locations: trained models go under models/saved, TensorBoard event
# files under a timestamped sub-folder of logs/fit (one folder per run).
MODEL_SAVE_DIR = os.path.join("models", "saved")
LOG_DIR = os.path.join("logs", "fit", datetime.now().strftime("%Y%m%d-%H%M%S"))

# Create both up front; exist_ok=True makes re-running the cell harmless.
for _out_dir in (MODEL_SAVE_DIR, LOG_DIR):
    os.makedirs(_out_dir, exist_ok=True)

In [4]:
# Set up logging to file.
#
# logging.basicConfig() silently does nothing when the root logger already has
# handlers (for example one installed by a previously imported library, or by
# an earlier run of this cell). In that case training.log would never receive
# any records. Detach and close whatever is attached to the root logger first
# so the file handler configured below is always installed.
logger = logging.getLogger()
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s',
    filename='training.log',
    filemode='w'  # start a fresh log each time this cell runs
)

In [5]:
# Shared TensorBoard callback: event files are written under LOG_DIR and
# histograms are recorded every epoch (histogram_freq=1).
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    histogram_freq=1,
    log_dir=LOG_DIR,
)

# ## Step 1: Data Preprocessing
#
# We clean the dataset, split into training and testing sets, and tokenize the tweet text using the BERT tokenizer.


In [6]:
def _announce(message):
    """Echo a progress message to the console and to the training log."""
    print(f"[INFO] {message}")
    logger.info(message)


# 1) Load and clean the raw CSV.
_announce("Cleaning dataset...")
df = preprocess_dataset(DATA_PATH)

# 2) Train/test split plus label encoding of the sentiment column.
_announce("Splitting dataset and encoding labels...")
train_df, test_df, train_labels, test_labels = split_and_encode(
    df, label_col='airline_sentiment'
)

# 3) Tokenize the tweet text of each split; bert_tokenize() yields the
#    input ids and the attention masks as a pair.
_announce("Tokenizing text using BERT...")
X_train_input_ids, X_train_attention_masks = bert_tokenize(
    train_df['text'].tolist(), max_len=MAX_LEN
)
X_test_input_ids, X_test_attention_masks = bert_tokenize(
    test_df['text'].tolist(), max_len=MAX_LEN
)

[INFO] Cleaning dataset...
[INFO] Splitting dataset and encoding labels...
[INFO] Tokenizing text using BERT...


# ## Step 2: Model Building and Training
#
# We build three models with different architectures: BERT+LSTM, BERT+BiLSTM, and BERT+CNN.
# Each model is compiled with an Adam optimizer and trained with early stopping and model checkpoint callbacks.
#
# We define two helper functions: `get_callbacks` builds the callbacks for a model, and `train_model` trains a model and returns it together with its predictions on the test set.

In [8]:
# Number of distinct label values seen in the training split; this is the
# value passed to every model builder as its num_labels argument.
num_labels = len(np.unique(train_labels))
logger.info("Number of classes: %s", num_labels)
print(f"[INFO] Number of classes: {num_labels}")

def get_callbacks(model_name):
    """Build the Keras callbacks used while fitting one model.

    Args:
        model_name: Identifier of the model being trained. It is used as the
            checkpoint file stem (``<MODEL_SAVE_DIR>/<model_name>.h5``) and as
            the TensorBoard run sub-directory (``<LOG_DIR>/<model_name>``).

    Returns:
        list: ``[ModelCheckpoint, EarlyStopping, TensorBoard]`` in that order.
    """
    # Keep only the weights of the epoch with the best validation accuracy.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath=os.path.join(MODEL_SAVE_DIR, f"{model_name}.h5"),
        monitor="val_accuracy",
        save_best_only=True,
        verbose=1
    )
    # Stop once validation loss has not improved for 2 epochs.
    # NOTE: early stopping watches val_loss while the checkpoint watches
    # val_accuracy, so the two may pick different "best" epochs.
    early_stop = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=2, restore_best_weights=True, verbose=1
    )
    # Use a fresh TensorBoard callback per model, writing to its own
    # sub-directory. Sharing a single callback/log_dir across all models makes
    # their training curves land in the same run, where they cannot be told
    # apart.
    tensorboard = tf.keras.callbacks.TensorBoard(
        log_dir=os.path.join(LOG_DIR, model_name), histogram_freq=1
    )
    return [checkpoint, early_stop, tensorboard]

def train_model(model_fn, model_name):
    """Build, compile and fit one model, then predict on the test split.

    Reads the notebook-level tokenized inputs, labels and training settings
    (MAX_LEN, num_labels, EPOCHS, BATCH_SIZE).

    Args:
        model_fn: Builder called as ``model_fn(MAX_LEN, num_labels)``; it must
            return an uncompiled Keras model taking
            ``[input_ids, attention_masks]`` as input.
        model_name: Name used in progress messages and passed to
            ``get_callbacks`` (which derives the checkpoint path from it).

    Returns:
        tuple: ``(model, preds)`` where ``preds`` is the output of
        ``model.predict`` on the test inputs (presumably per-class
        probabilities, one row per test sample -- confirm against the
        model builders).
    """
    status = f"Building and training {model_name}..."
    print(f"[INFO] {status}")
    logger.info(status)

    network = model_fn(MAX_LEN, num_labels)
    network.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    train_inputs = [X_train_input_ids, X_train_attention_masks]
    test_inputs = [X_test_input_ids, X_test_attention_masks]

    # 10% of the training data is held out by Keras for validation; the
    # checkpoint callback takes care of writing the model to disk.
    network.fit(
        train_inputs,
        train_labels,
        validation_split=0.1,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=get_callbacks(model_name),
        verbose=1,
    )

    test_outputs = network.predict(test_inputs)
    return network, test_outputs

[INFO] Number of classes: 3


In [9]:
# Fit the three architectures one after another; each call yields the trained
# model together with its predictions on the test split.
model_lstm, preds_lstm = train_model(
    model_fn=build_bert_lstm, model_name="bert_lstm"
)
model_bilstm, preds_bilstm = train_model(
    model_fn=build_bert_bilstm, model_name="bert_bilstm"
)
model_cnn, preds_cnn = train_model(
    model_fn=build_bert_cnn, model_name="bert_cnn"
)


[INFO] Building and training bert_lstm...


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1: val_accuracy improved from -inf to 0.79481, saving model to models\saved\bert_lstm.h5
[INFO] Building and training bert_bilstm...


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1: val_accuracy improved from -inf to 0.78961, saving model to models\saved\bert_bilstm.h5
[INFO] Building and training bert_cnn...


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1: val_accuracy improved from -inf to 0.78615, saving model to models\saved\bert_cnn.h5


# ## Step 3: Ensemble Predictions
#
# We combine the outputs of the three models by weighted averaging (soft voting).
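# We then determine the final predicted class for each sample and count how many
# predictions have a confidence below `CONFIDENCE_THRESHOLD`. These low-confidence
# predictions are only counted here; they are not filtered out of the evaluation.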

In [10]:
# Mark the start of the ensembling stage in training.log and on the console.
logger.info("Ensembling predictions...")
print("[INFO] Ensemble predictions...")

[INFO] Ensemble predictions...


In [11]:
# Soft voting over the three models' test-set predictions.
# np.stack (rather than np.array) fails clearly if the prediction arrays do
# not all share the same shape; the result has one leading axis per model.
all_preds = np.stack([preds_lstm, preds_bilstm, preds_cnn])

# Weighted mean across the model axis; np.average normalizes by the sum of
# the weights and checks that there is exactly one weight per model.
weighted_preds = np.average(all_preds, axis=0, weights=ensemble_weights)

# Predicted class = column with the highest averaged score. This is plain
# NumPy work, so no round trip through a TensorFlow tensor is needed.
final_preds = np.argmax(weighted_preds, axis=1)

# Confidence of each prediction = its highest averaged score; count how many
# fall below the configured threshold.
max_confidences = np.max(weighted_preds, axis=1)
low_confidence_count = int(np.count_nonzero(max_confidences < CONFIDENCE_THRESHOLD))

In [12]:
# Report how many ensemble predictions fell below CONFIDENCE_THRESHOLD.
print(f"[INFO] {low_confidence_count} predictions below confidence threshold.")
logger.info("Low-confidence predictions: %s", low_confidence_count)

[INFO] 185 predictions below confidence threshold.


# ## Step 4: Evaluation
#
# We evaluate the ensemble performance using a classification report and a confusion matrix.


In [13]:
# Score the ensemble's predicted classes against the encoded test labels.
print("[RESULT] Ensemble Classification Report:")
report = classification_report(test_labels, final_preds)
conf_matrix = confusion_matrix(test_labels, final_preds)

# Console: the report, then the confusion matrix under its own heading.
print(report, "Confusion Matrix:", conf_matrix, sep="\n")

# Log file: the same two results, each as a single multi-line record.
logger.info("Classification Report:\n%s", report)
logger.info("Confusion Matrix:\n%s", conf_matrix)

[RESULT] Ensemble Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.91      0.88      1811
           1       0.68      0.52      0.59       617
           2       0.74      0.76      0.75       458

    accuracy                           0.80      2886
   macro avg       0.76      0.73      0.74      2886
weighted avg       0.80      0.80      0.80      2886

Confusion Matrix:
[[1655  102   54]
 [ 229  318   70]
 [  66   46  346]]


# ## Step 5: Save Models and Convert Best Model to ONNX (Optional)
#
# We save each model in H5 format. We then attempt to convert the best model (based on average max confidence) to ONNX.
# For ONNX conversion, ensure you have installed `tf2onnx`.

In [14]:
# Write every trained model to <MODEL_SAVE_DIR>/<name>.h5.
# NOTE(review): these are the same paths the ModelCheckpoint callback in
# get_callbacks() writes to, so this replaces the "best val_accuracy"
# checkpoint with the model's current in-memory weights -- confirm that this
# is intended.
trained_models = {
    "bert_lstm": model_lstm,
    "bert_bilstm": model_bilstm,
    "bert_cnn": model_cnn,
}
for name, model in trained_models.items():
    save_path = os.path.join(MODEL_SAVE_DIR, f"{name}.h5")
    model.save(save_path)
    saved_msg = f"{name} model saved at {save_path}"
    logger.info(saved_msg)
    print(f"[INFO] {saved_msg}")

[INFO] bert_lstm model saved at models\saved\bert_lstm.h5
[INFO] bert_bilstm model saved at models\saved\bert_bilstm.h5
[INFO] bert_cnn model saved at models\saved\bert_cnn.h5


In [15]:
# Optional: Convert best model to ONNX
#
# "Best" is the model with the highest mean top-class score on the test set.
# NOTE(review): that measures how confident a model is, not how accurate it
# is -- confirm this is the intended selection criterion.
try:
    import tf2onnx

    # One table instead of three parallel lists, so name, model and
    # predictions can never get out of step.
    candidates = [
        ("bert_lstm", model_lstm, preds_lstm),
        ("bert_bilstm", model_bilstm, preds_bilstm),
        ("bert_cnn", model_cnn, preds_cnn),
    ]
    avg_confidences = [np.mean(np.max(pred, axis=1)) for _, _, pred in candidates]
    best_idx = int(np.argmax(avg_confidences))
    best_name, best_model, _ = candidates[best_idx]

    onnx_path = os.path.join(MODEL_SAVE_DIR, f"{best_name}.onnx")
    # Input signature for the converter: same shapes as the Keras model's two
    # inputs, declared as int32 and named input_ids / attention_mask.
    spec = (tf.TensorSpec(best_model.inputs[0].shape, tf.int32, name="input_ids"),
            tf.TensorSpec(best_model.inputs[1].shape, tf.int32, name="attention_mask"))
    model_proto, _ = tf2onnx.convert.from_keras(best_model, input_signature=spec, opset=13)
    with open(onnx_path, "wb") as f:
        f.write(model_proto.SerializeToString())
    logger.info(f"Best model converted to ONNX and saved at {onnx_path}")
    print(f"[INFO] Best model converted to ONNX and saved at {onnx_path}")
except ImportError:
    logger.warning("tf2onnx not installed. Skipping ONNX conversion.")
    print("[WARNING] tf2onnx not installed. Skipping ONNX conversion.")
except Exception as e:
    # The export is optional, so a failure must not abort the notebook; use
    # logger.exception so the full traceback ends up in the log file instead
    # of just the message text.
    logger.exception("Error during ONNX conversion: %s", e)
    print("[ERROR] Error converting model to ONNX:", e)

logger.info("Training and ensemble evaluation complete.")
print("[INFO] Training and ensemble evaluation complete.")

[INFO] Best model converted to ONNX and saved at models\saved\bert_bilstm.onnx
[INFO] Training and ensemble evaluation complete.
