In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/gru/keras/default/1/sentiment_analysis_model.h5
/kaggle/input/amazon-dataset/data.npz
/kaggle/input/kd/keras/default/1/student_model_final__kd.h5
/kaggle/input/emb/keras/default/1/fully_pruned_model_latest.h5


In [2]:

import tensorflow as tf
import numpy as np
from tensorflow.keras import layers, models, losses, optimizers
from tensorflow.keras.models import load_model
from tqdm import tqdm  # Import tqdm

# -----------------------------------------------------------------------------
# 2. Data Loading and Preprocessing
# -----------------------------------------------------------------------------
data = np.load("/kaggle/input/amazon-dataset/data.npz")
X_train_np, y_train_np = data["X_train"], data["y_train"]
X_val_np, y_val_np = data["X_val"], data["y_val"]
X_test_np, y_test_np = data["X_test"], data["y_test"]

X_train = tf.convert_to_tensor(X_train_np, dtype=tf.int32)
y_train = tf.convert_to_tensor(y_train_np, dtype=tf.float32)
X_val = tf.convert_to_tensor(X_val_np, dtype=tf.int32)
y_val = tf.convert_to_tensor(y_val_np, dtype=tf.float32)
X_test = tf.convert_to_tensor(X_test_np, dtype=tf.int32)
y_test = tf.convert_to_tensor(y_test_np, dtype=tf.float32)

batch_size = 1024 # Reduced batch size for single GPU
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size).repeat().prefetch(tf.data.AUTOTUNE)
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(batch_size).prefetch(tf.data.AUTOTUNE)
# -----------------------------------------------------------------------------
# 3. Load Models (Teacher and Pruned Student with Integrated Mapping Layers)
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
# 3. Load Models (Teacher and Pruned Student)
# -----------------------------------------------------------------------------
# Load the models
teacher_model = load_model('/kaggle/input/gru/keras/default/1/sentiment_analysis_model.h5')
original_student = load_model('/kaggle/input/emb/keras/default/1/fully_pruned_model_latest.h5')

# Define the input layer, matching the original student input shape
inputs = tf.keras.Input(shape=(X_train.shape[1],), dtype=tf.int32, name='input_layer')

# Directly reuse layers from the original student model
x = original_student.get_layer('embedding')(inputs)  # Embedding layer

# First Bidirectional GRU layer
x = original_student.get_layer('bidirectional')(x)

# Second Bidirectional GRU layer
x = original_student.get_layer('bidirectional_1')(x)

# Flatten the output to pass it to the dense layers
x = original_student.get_layer('flatten')(x)

# Apply dropout after flattening
x = original_student.get_layer('dropout')(x)

# First Dense layer
x = original_student.get_layer('dense')(x)

# Apply dropout after the first dense layer
x = original_student.get_layer('dropout_1')(x)

# Second Dense layer
x = original_student.get_layer('dense_1')(x)

# Final output layer (single output for binary classification)
outputs = original_student.get_layer('dense_2')(x)

# Create the new student model that should match the original model exactly
student_model = tf.keras.Model(inputs=inputs, outputs=outputs, name='student_with_same_architecture')

# Print out the summary to verify the architecture
student_model.summary()


teacher_dense_map_bigru1 = layers.Dense(64, name='teacher_mapping_bigru1')
teacher_dense_map_bigru2 = layers.Dense(128, name='teacher_mapping_bigru2')

student_dense_map_bigru1 = layers.Dense(64, name='student_mapping_bigru1')
student_dense_map_bigru2 = layers.Dense(128, name='student_mapping_bigru2')

In [3]:
# -----------------------------------------------------------------------------
# 5. Feature extraction functions (Teacher and Student)
# -----------------------------------------------------------------------------
def get_teacher_features(images):
    embedding_output = teacher_model.get_layer('embedding')(images)
    bigru1_output = teacher_model.get_layer('bidirectional')(embedding_output)
    t_feat1 = teacher_dense_map_bigru1(bigru1_output)

    bigru2_input = t_feat1
    bigru2_output = teacher_model.get_layer('bidirectional_1')(bigru2_input)
    t_feat2 = teacher_dense_map_bigru2(bigru2_output)
    return [t_feat1, t_feat2]

def get_student_features(images):
    embedding_output = student_model.get_layer('embedding')(images)
    bigru1_output = student_model.get_layer('bidirectional')(embedding_output)
    s_feat1 = student_dense_map_bigru1(bigru1_output)

    bigru2_input = s_feat1
    bigru2_output = student_model.get_layer('bidirectional_1')(bigru2_input)
    s_feat2 = student_dense_map_bigru2(bigru2_output)
    return [s_feat1, s_feat2]



In [4]:
# -----------------------------------------------------------------------------
# 6. Loss Functions and Attention-based Feature Distillation (AFD)
# -----------------------------------------------------------------------------
bce_loss = losses.BinaryCrossentropy(from_logits=False)
mse_loss = losses.MeanSquaredError()

def attention_distillation_loss(teacher_features, student_features):
    teacher_query = teacher_features  # Treat teacher features as queries
    student_key = student_features  # Treat student features as keys
    
    d_k = tf.cast(tf.shape(student_key)[-1], tf.float32)  # Feature dimension
    scale = tf.sqrt(d_k) + 1e-8  # Scaling factor to stabilize attention
    attn_scores = tf.matmul(teacher_query, student_key, transpose_b=True) / scale  # Attention scores (query x key^T)
    
    attn_scores = tf.clip_by_value(attn_scores, -50.0, 50.0)  # Clip extreme values
    attn_weights = tf.nn.softmax(attn_scores, axis=-1)  # Softmax to get the attention weights
    
    attn_output = tf.matmul(attn_weights, student_features)  # Weighted student features
    return mse_loss(teacher_features, attn_output)

def distillation_loss(teacher_out, student_out, teacher_feats, student_feats, labels, beta):
    feat_loss1 = attention_distillation_loss(teacher_feats[0], student_feats[0])
    feat_loss2 = mse_loss(teacher_feats[1], student_feats[1])
    hard_loss = bce_loss(labels, student_out)
    return hard_loss + beta * (feat_loss1 + feat_loss2) 


In [5]:
# -----------------------------------------------------------------------------
# 7. Training Setup
# -----------------------------------------------------------------------------
optimizer = optimizers.Adam(learning_rate=1e-4)
epochs = 20
beta = 0.5

# Create pruned_mask to identify pruned weights (weights = 0)
pruned_mask = [tf.cast(tf.abs(w) > 0, dtype=tf.float32) for w in student_model.trainable_variables]

train_loss_metric = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
val_loss_metric = tf.keras.metrics.Mean('val_loss', dtype=tf.float32)
val_accuracy_metric = tf.keras.metrics.BinaryAccuracy('val_accuracy')


In [6]:
# Early stopping parameters
patience = 3  # Number of epochs to wait for improvement
min_delta = 0.001  # Minimum change in validation loss to qualify as improvement
best_val_loss = float('inf')  # Track the best validation loss
wait = 0  # Counter for epochs without improvement

In [7]:
# -----------------------------------------------------------------------------
# 8. Training Step (Freeze Pruned Weights)
# -----------------------------------------------------------------------------
@tf.function
def train_step(images, labels):
    labels = tf.reshape(labels, (-1, 1))

    with tf.GradientTape() as tape:
        teacher_out = teacher_model(images, training=False)
        student_out = student_model(images, training=True)

        t_feats = get_teacher_features(images)
        s_feats = get_student_features(images)

        total_loss = distillation_loss(teacher_out, student_out, t_feats, s_feats, labels, beta)

    # Compute gradients
    grads = tape.gradient(total_loss, student_model.trainable_variables)

    # Mask gradients for pruned weights (freeze pruned weights)
    masked_grads = []
    for grad, mask in zip(grads, pruned_mask):
        masked_grads.append(grad * mask)  # Freeze pruned weights by masking gradients

    # Apply masked gradients
    optimizer.apply_gradients(zip(masked_grads, student_model.trainable_variables))
    train_loss_metric.update_state(total_loss)



In [8]:
# -----------------------------------------------------------------------------
# 9. Validation Step
# -----------------------------------------------------------------------------
@tf.function
def val_step(images, labels):
    student_out = student_model(images, training=False)
    loss = bce_loss(labels, student_out)
    val_loss_metric.update_state(loss)
    val_accuracy_metric.update_state(labels, student_out)

In [9]:
def layerwise_trainable_zero_nonzero_params(model):
    for layer in model.layers:
        if hasattr(layer, 'trainable_variables'):
            for var in layer.trainable_variables:
                total = tf.size(var).numpy()
                zero = np.sum(var.numpy() == 0)
                nonzero = total - zero
                print(f"Layer: {layer.name} | Total: {total} | Zero: {zero} | Non-Zero: {nonzero}")


In [10]:
# -----------------------------------------------------------------------------
# 10. Training Loop with Early Stopping
# -----------------------------------------------------------------------------
num_batches_per_epoch = len(X_train_np) // batch_size
num_val_batches = len(X_val_np) // batch_size  # Calculate validation batches

# Training Loop
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")

    train_loss_metric.reset_state()  # Reset the metrics here
    val_loss_metric.reset_state()
    val_accuracy_metric.reset_state()

    # Training loop with tqdm progress bar
    with tqdm(total=num_batches_per_epoch, desc="Training") as pbar:
        for step, (images, labels) in enumerate(train_dataset):  # Enumerate the dataset
            if step >= num_batches_per_epoch:  # Break the loop when it reaches the end
                break
            train_step(images, labels)
            pbar.set_postfix(loss=train_loss_metric.result().numpy())
            pbar.update(1)

    # Validation loop
    with tqdm(total=num_val_batches, desc="Validation") as pbar:
        for step, (images, labels) in enumerate(val_dataset):  # Enumerate the dataset
            if step >= num_val_batches:  # Break the loop when it reaches the end
                break
            val_step(images, labels)
            pbar.set_postfix(val_loss=val_loss_metric.result().numpy(), val_accuracy=val_accuracy_metric.result().numpy())
            pbar.update(1)

    # Get the current validation loss
    current_val_loss = val_loss_metric.result().numpy()

    # Early stopping logic
    if current_val_loss < best_val_loss - min_delta:
        print(f"Validation loss improved from {best_val_loss:.4f} to {current_val_loss:.4f}")
        best_val_loss = current_val_loss
        wait = 0  # Reset the wait counter
    else:
        wait += 1
        print(f"Validation loss did not improve. Patience: {wait}/{patience}")

    # Stop training if patience is exceeded
    if wait >= patience:
        print(f"Early stopping triggered at epoch {epoch+1}!")
        break

    print(f"Training Loss: {train_loss_metric.result().numpy()}")
    print(f"Validation Loss: {val_loss_metric.result().numpy()}, Validation Accuracy: {val_accuracy_metric.result().numpy()}")
    layerwise_trainable_zero_nonzero_params(student_model)
# -----------------------------------------------------------------------------
# 11. Save the Final Model
# -----------------------------------------------------------------------------
# Save the final student model
student_model.save('/kaggle/working/student_model_final.h5')  # Save as HDF5 file
print("Model saved successfully!")


Epoch 1/20


Training: 100%|██████████| 6835/6835 [32:52<00:00,  3.46it/s, loss=0.219]
Validation: 100%|██████████| 1464/1464 [00:56<00:00, 25.75it/s, val_accuracy=0.928, val_loss=0.199]


Validation loss improved from inf to 0.1986
Training Loss: 0.218705415725708
Validation Loss: 0.1986151784658432, Validation Accuracy: 0.9280778765678406
Layer: embedding | Total: 1920000 | Zero: 3392 | Non-Zero: 1916608
Layer: bidirectional | Total: 12288 | Zero: 11505 | Non-Zero: 783
Layer: bidirectional | Total: 12288 | Zero: 11488 | Non-Zero: 800
Layer: bidirectional | Total: 384 | Zero: 370 | Non-Zero: 14
Layer: bidirectional | Total: 12288 | Zero: 11346 | Non-Zero: 942
Layer: bidirectional | Total: 12288 | Zero: 11489 | Non-Zero: 799
Layer: bidirectional | Total: 384 | Zero: 358 | Non-Zero: 26
Layer: bidirectional_1 | Total: 24576 | Zero: 24058 | Non-Zero: 518
Layer: bidirectional_1 | Total: 12288 | Zero: 11971 | Non-Zero: 317
Layer: bidirectional_1 | Total: 384 | Zero: 379 | Non-Zero: 5
Layer: bidirectional_1 | Total: 24576 | Zero: 23691 | Non-Zero: 885
Layer: bidirectional_1 | Total: 12288 | Zero: 11709 | Non-Zero: 579
Layer: bidirectional_1 | Total: 384 | Zero: 358 | Non-Zero:

Training: 100%|██████████| 6835/6835 [32:37<00:00,  3.49it/s, loss=0.182]
Validation: 100%|██████████| 1464/1464 [00:55<00:00, 26.46it/s, val_accuracy=0.928, val_loss=0.201]


Validation loss did not improve. Patience: 1/3
Training Loss: 0.18198953568935394
Validation Loss: 0.20075227320194244, Validation Accuracy: 0.9278711080551147
Layer: embedding | Total: 1920000 | Zero: 3392 | Non-Zero: 1916608
Layer: bidirectional | Total: 12288 | Zero: 11505 | Non-Zero: 783
Layer: bidirectional | Total: 12288 | Zero: 11488 | Non-Zero: 800
Layer: bidirectional | Total: 384 | Zero: 370 | Non-Zero: 14
Layer: bidirectional | Total: 12288 | Zero: 11346 | Non-Zero: 942
Layer: bidirectional | Total: 12288 | Zero: 11489 | Non-Zero: 799
Layer: bidirectional | Total: 384 | Zero: 358 | Non-Zero: 26
Layer: bidirectional_1 | Total: 24576 | Zero: 24058 | Non-Zero: 518
Layer: bidirectional_1 | Total: 12288 | Zero: 11971 | Non-Zero: 317
Layer: bidirectional_1 | Total: 384 | Zero: 379 | Non-Zero: 5
Layer: bidirectional_1 | Total: 24576 | Zero: 23691 | Non-Zero: 885
Layer: bidirectional_1 | Total: 12288 | Zero: 11709 | Non-Zero: 579
Layer: bidirectional_1 | Total: 384 | Zero: 358 | Non

Training: 100%|██████████| 6835/6835 [32:40<00:00,  3.49it/s, loss=0.174]
Validation: 100%|██████████| 1464/1464 [01:21<00:00, 17.87it/s, val_accuracy=0.928, val_loss=0.203]


Validation loss did not improve. Patience: 2/3
Training Loss: 0.17373375594615936
Validation Loss: 0.20281369984149933, Validation Accuracy: 0.9275029301643372
Layer: embedding | Total: 1920000 | Zero: 3392 | Non-Zero: 1916608
Layer: bidirectional | Total: 12288 | Zero: 11505 | Non-Zero: 783
Layer: bidirectional | Total: 12288 | Zero: 11488 | Non-Zero: 800
Layer: bidirectional | Total: 384 | Zero: 370 | Non-Zero: 14
Layer: bidirectional | Total: 12288 | Zero: 11346 | Non-Zero: 942
Layer: bidirectional | Total: 12288 | Zero: 11489 | Non-Zero: 799
Layer: bidirectional | Total: 384 | Zero: 358 | Non-Zero: 26
Layer: bidirectional_1 | Total: 24576 | Zero: 24058 | Non-Zero: 518
Layer: bidirectional_1 | Total: 12288 | Zero: 11971 | Non-Zero: 317
Layer: bidirectional_1 | Total: 384 | Zero: 379 | Non-Zero: 5
Layer: bidirectional_1 | Total: 24576 | Zero: 23691 | Non-Zero: 885
Layer: bidirectional_1 | Total: 12288 | Zero: 11709 | Non-Zero: 579
Layer: bidirectional_1 | Total: 384 | Zero: 358 | Non

Training: 100%|██████████| 6835/6835 [32:44<00:00,  3.48it/s, loss=0.169]
Validation: 100%|██████████| 1464/1464 [00:56<00:00, 25.86it/s, val_accuracy=0.927, val_loss=0.205]


Validation loss did not improve. Patience: 3/3
Early stopping triggered at epoch 4!
Model saved successfully!


In [11]:
st_ml = tf.keras.models.load_model('/kaggle/working/student_model_final.h5')


In [12]:
st_ml.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

test_loss, test_accuracy = st_ml.evaluate(X_test, y_test, verbose=1)

print(f"Testing Loss: {test_loss:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

[1m46875/46875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m592s[0m 13ms/step - accuracy: 0.9275 - loss: 0.2051
Testing Loss: 0.2054
Testing Accuracy: 0.9273


In [13]:
st_ml.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

val_loss, val_accuracy = st_ml.evaluate(X_val, y_val, verbose=1)

print(f"Validation Loss: {val_loss:.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}")

[1m46875/46875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m600s[0m 13ms/step - accuracy: 0.9278 - loss: 0.2050
Validation Loss: 0.2055
Validation Accuracy: 0.9273
