# Checking the file

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

# ----------------------------
# Load Dataset
# ----------------------------
# The trace is parsed without a header row; the four columns are named explicitly.
file_path = '/kaggle/input/azure-functions-2021-dataset/AzureFunctionsInvocationTraceForTwoWeeksJan2021.txt'
trace_columns = ['app', 'func', 'end_timestamp', 'duration']
df = pd.read_csv(file_path, header=None, names=trace_columns, sep=',')

# ----------------------------
# Clean and convert columns
# ----------------------------
# Values that are not numeric are coerced to NaN/NaT here and dropped below.
end_epoch = pd.to_numeric(df['end_timestamp'], errors='coerce')
df['end_timestamp'] = pd.to_datetime(end_epoch, unit='s')
df['duration'] = pd.to_numeric(df['duration'], errors='coerce')

# Keep complete rows only, ordered by end time within each (app, func) pair.
df = df.dropna(subset=trace_columns)
df = df.sort_values(['app', 'func', 'end_timestamp'])
df = df.reset_index(drop=True)

# ----------------------------
# Cold start flag
# ----------------------------
# An invocation is flagged 1 when it is the first row of its (app, func) pair
# or when more than 1800 seconds separate it from the previous row of the same
# pair; otherwise it is flagged 0. This relies on df already being ordered by
# end_timestamp within each pair.
prev_end = df.groupby(['app', 'func'])['end_timestamp'].shift(1)
idle_seconds = (df['end_timestamp'] - prev_end).dt.total_seconds()

# Vectorized mask instead of a per-row Python lambda. NaN means "no previous
# invocation", which counts as a cold start. No temporary columns are added to
# df, so nothing has to be dropped afterwards.
df['cold_start_flag'] = (idle_seconds.isna() | (idle_seconds > 1800)).astype('int64')

# ----------------------------
# Feature engineering
# ----------------------------
# Calendar features taken from the invocation end time.
end_time = df['end_timestamp'].dt
df['hour'] = end_time.hour
df['dayofweek'] = end_time.dayofweek

# Encode categorical variables
# Each identifier column gets its own fitted LabelEncoder, which stays
# available under the names app_enc / func_enc.
app_enc = LabelEncoder().fit(df['app'])
func_enc = LabelEncoder().fit(df['func'])
df['app_enc'] = app_enc.transform(df['app'])
df['func_enc'] = func_enc.transform(df['func'])

# ----------------------------
# Log-transform duration to reduce skew
# ----------------------------
# log1p is undone later with expm1 when predictions are reported.
duration_log = np.log1p(df['duration'])
df['duration_log'] = duration_log

# ----------------------------
# Sliding window creation
# ----------------------------
# For every (app, func) series, one sample is the WINDOW_SIZE invocations that
# precede invocation i (columns in feature_cols). The targets are the
# log-duration and the cold-start flag of invocation i itself.
WINDOW_SIZE = 5
feature_cols = ['duration_log', 'app_enc', 'func_enc', 'hour', 'dayofweek']
window_parts, dur_parts, cold_parts = [], [], []

# Group on the same (app, func) key that the sort and the cold-start flag use.
# Grouping on func alone would merge the series of different apps whenever a
# function id repeats across apps, giving windows that are not chronological.
for _, func_df in df.groupby(['app_enc', 'func_enc'], sort=False):
    if len(func_df) <= WINDOW_SIZE:
        continue  # not enough history to form a single (window, target) pair

    feats = func_df[feature_cols].to_numpy(dtype=np.float64)
    # sliding_window_view returns (n_rows - WINDOW_SIZE + 1, n_features, WINDOW_SIZE).
    # The last window has no following row to predict, so it is dropped; the
    # transpose puts the time axis second: (samples, WINDOW_SIZE, n_features).
    windows = np.lib.stride_tricks.sliding_window_view(feats, WINDOW_SIZE, axis=0)
    window_parts.append(windows[:-1].transpose(0, 2, 1))
    dur_parts.append(func_df['duration_log'].to_numpy()[WINDOW_SIZE:])
    cold_parts.append(func_df['cold_start_flag'].to_numpy()[WINDOW_SIZE:])

if window_parts:
    # concatenate materializes the read-only strided views into one writable array
    X_windows = np.concatenate(window_parts, axis=0)
    y_dur = np.concatenate(dur_parts).reshape(-1, 1)
    y_cold = np.concatenate(cold_parts).reshape(-1, 1)
else:
    # No series was long enough; keep the expected ranks so shapes stay valid.
    X_windows = np.empty((0, WINDOW_SIZE, len(feature_cols)))
    y_dur = np.empty((0, 1))
    y_cold = np.empty((0, 1), dtype=int)

# ----------------------------
# Train/test split
# ----------------------------
# Split BEFORE fitting any scaler so the test rows cannot influence the
# min/max statistics (avoids test-set leakage). The split itself is unchanged:
# same test_size, random_state and shuffle.
# NOTE(review): windows taken from the same series overlap, so a shuffled split
# still shares invocations between train and test; consider a chronological split.
X_train, X_test, y_train_dur_raw, y_test_dur_raw, y_train_cold, y_test_cold = train_test_split(
    X_windows, y_dur, y_cold, test_size=0.2, random_state=42, shuffle=True
)

# ----------------------------
# Scale features and target
# ----------------------------
# Only duration_log (feature index 0) is scaled. The slice is flattened to a
# single column so every timestep shares one min/max; fitting on the 2-D
# (samples, WINDOW_SIZE) slice would learn a separate range per timestep.
# X_windows itself is left unscaled.
# NOTE(review): app_enc / func_enc / hour / dayofweek enter the model unscaled.
scaler_X = MinMaxScaler()
scaler_X.fit(X_train[:, :, 0].reshape(-1, 1))
X_train[:, :, 0] = scaler_X.transform(X_train[:, :, 0].reshape(-1, 1)).reshape(X_train.shape[0], -1)
X_test[:, :, 0] = scaler_X.transform(X_test[:, :, 0].reshape(-1, 1)).reshape(X_test.shape[0], -1)

# The target scaler is fit on the training targets only and reused for the rest.
scaler_y = MinMaxScaler()
y_train_dur = scaler_y.fit_transform(y_train_dur_raw)
y_test_dur = scaler_y.transform(y_test_dur_raw)
# Kept so code that reads the full scaled target under this name still works.
y_dur_scaled = scaler_y.transform(y_dur)

# ----------------------------
# LSTM Model (Multi-output)
# ----------------------------
# A single LSTM layer feeds two heads: 'duration_output' (one unit, no
# activation) and 'cold_output' (one sigmoid unit).
input_layer = Input(shape=(WINDOW_SIZE, len(feature_cols)))
x = LSTM(units=64, activation='tanh')(input_layer)
dur_output = Dense(units=1, name='duration_output')(x)
cold_output = Dense(units=1, name='cold_output', activation='sigmoid')(x)

# Losses and metrics are keyed by output-layer name.
head_losses = {'duration_output': 'mse', 'cold_output': 'binary_crossentropy'}
head_metrics = {'duration_output': 'mae', 'cold_output': 'accuracy'}

model = Model(inputs=input_layer, outputs=[dur_output, cold_output])
model.compile(optimizer='adam', loss=head_losses, metrics=head_metrics)

# ----------------------------
# Training with early stopping
# ----------------------------
# Training stops once val_loss has gone 5 epochs without improving, and the
# best weights seen are restored.
early_stop = EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=5)

# Targets are keyed by the names of the two output layers.
train_targets = {'duration_output': y_train_dur, 'cold_output': y_train_cold}
fit_options = dict(
    validation_split=0.1,
    epochs=50,
    batch_size=256,
    callbacks=[early_stop],
    verbose=2,
)
history = model.fit(X_train, train_targets, **fit_options)

# ----------------------------
# Prediction and inverse transform
# ----------------------------
# The target preprocessing is undone in reverse order: first the MinMax
# scaling (scaler_y), then the log1p transform (expm1).
y_pred_dur_scaled, y_pred_cold = model.predict(X_test, batch_size=256)
y_pred_dur = scaler_y.inverse_transform(y_pred_dur_scaled)
y_pred_dur = np.expm1(y_pred_dur)  # inverse log1p
y_test_dur_exp = np.expm1(scaler_y.inverse_transform(y_test_dur))

# ----------------------------
# Sample Output
# ----------------------------
# Preview at most 10 rows; the bound avoids an IndexError when the test set
# holds fewer than 10 samples.
n_preview = min(10, len(y_pred_dur))
for i in range(n_preview):
    print(f"True Duration: {y_test_dur_exp[i][0]:.3f}, Predicted: {y_pred_dur[i][0]:.3f}, Predicted Cold Start: {int(round(y_pred_cold[i][0]))}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# -------------------------
# Inverse transform predictions & ground truth
# -------------------------
# Start from the model's scaled output (y_pred_dur_scaled). y_pred_dur has
# already been inverse-scaled and expm1'd, so running it through scaler_y a
# second time would distort it. Predictions and ground truth go through the
# same two steps (undo MinMax scaling, then undo log1p), so both end up in the
# original duration units.
y_pred_dur_inv = np.expm1(scaler_y.inverse_transform(y_pred_dur_scaled))
y_true_dur_inv = np.expm1(scaler_y.inverse_transform(y_test_dur))

# Clip negatives (since durations can't be negative)
y_pred_dur_inv = np.clip(y_pred_dur_inv, 0, None)

# Select a sample for plotting (first 200 for clarity). The size is capped at
# the number of test rows so range(sample_size) always matches the sample
# arrays used by the plots.
sample_size = min(200, len(y_true_dur_inv))
true_sample = y_true_dur_inv[:sample_size].flatten()
pred_sample = y_pred_dur_inv[:sample_size].flatten()

# -------------------------
# Scatter plot: True vs Predicted durations
# -------------------------
# Both series are drawn against the same sample index so the points of one
# sample line up vertically.
sample_index = range(sample_size)
plt.figure(figsize=(10,6))
for series_values, series_color, series_label in (
    (true_sample, 'blue', 'True Duration'),
    (pred_sample, 'red', 'Predicted Duration'),
):
    plt.scatter(sample_index, series_values, color=series_color, label=series_label, alpha=0.6)
plt.title("True vs Predicted Durations (Sample of 200)")
plt.xlabel("Sample Index")
plt.ylabel("Duration (seconds)")
plt.legend()
plt.show()

# -------------------------
# Error distribution
# -------------------------
# Signed error per plotted sample: positive means the model over-predicted.
errors = pred_sample - true_sample
hist_style = dict(bins=50, color='orange', alpha=0.7)

plt.figure(figsize=(8,5))
plt.hist(errors, **hist_style)
plt.title("Prediction Errors Distribution")
plt.xlabel("Prediction Error (Predicted - True)")
plt.ylabel("Frequency")
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

# Ensure y_pred_cold is integer (0/1) and aligned with true labels.
# Thresholding at 0.5 is safe to repeat: 0/1 labels map to themselves.
y_pred_cold = (y_pred_cold > 0.5).astype(int).flatten()
y_true_cold = y_test_cold[:len(y_pred_cold)].flatten()

# -------------------------
# Confusion matrix
# -------------------------
# labels=[0,1] fixes the matrix to 2x2 even if one class is absent.
cm = confusion_matrix(y_true_cold, y_pred_cold, labels=[0,1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['No Cold Start', 'Cold Start'])

# Draw onto an explicit Axes. Without ax=, disp.plot() opens a figure of its
# own, which leaves a separately created figure blank and ignores its figsize.
cm_fig, cm_ax = plt.subplots(figsize=(6,6))
disp.plot(ax=cm_ax, cmap=plt.cm.Blues, values_format='d')
cm_ax.set_title("Cold Start Prediction Confusion Matrix")
plt.show()

# -------------------------
# True vs Predicted bar chart
# -------------------------
# Two bars per class, placed side by side: how often the class occurs in the
# true labels versus in the predictions.
labels = ['No Cold Start', 'Cold Start']
x = np.arange(len(labels))

# minlength=2 keeps a slot for each class even when one never occurs.
true_counts = np.bincount(y_true_cold, minlength=2)
pred_counts = np.bincount(y_pred_cold, minlength=2)

bar_width = 0.4
half_width = bar_width / 2
plt.figure(figsize=(6,4))
plt.bar(x - half_width, true_counts, width=bar_width, label='True', color='blue')
plt.bar(x + half_width, pred_counts, width=bar_width, label='Predicted', color='red')
plt.xticks(x, labels)
plt.ylabel("Counts")
plt.title("Cold Start: True vs Predicted Counts")
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# -------------------------
# Cold start metrics
# -------------------------
# Threshold at 0.5 here as well, so this cell works whether y_pred_cold still
# holds sigmoid probabilities or has already been converted to 0/1 labels
# (0/1 labels map to themselves). The classification metrics need discrete
# labels on both sides.
y_pred_cold_flat = (np.asarray(y_pred_cold).flatten() > 0.5).astype(int)
y_true_cold = y_test_cold[:len(y_pred_cold_flat)].flatten()

cold_accuracy = accuracy_score(y_true_cold, y_pred_cold_flat)
cold_precision = precision_score(y_true_cold, y_pred_cold_flat)
cold_recall = recall_score(y_true_cold, y_pred_cold_flat)
cold_f1 = f1_score(y_true_cold, y_pred_cold_flat)

print("Cold Start Metrics:")
print(f"Accuracy : {cold_accuracy:.4f}")
print(f"Precision: {cold_precision:.4f}")
print(f"Recall   : {cold_recall:.4f}")
print(f"F1-Score : {cold_f1:.4f}")

# -------------------------
# Duration metrics
# -------------------------
# Recompute both arrays from the scaled values so the metrics are in the
# original duration units and do not depend on arrays prepared by the plotting
# cell. Steps: undo the MinMax scaling, undo log1p, then clip predictions at 0
# because a duration cannot be negative.
y_true_dur = np.expm1(scaler_y.inverse_transform(y_test_dur)).flatten()
y_pred_dur_final = np.clip(
    np.expm1(scaler_y.inverse_transform(y_pred_dur_scaled)), 0, None
).flatten()

mae = mean_absolute_error(y_true_dur, y_pred_dur_final)
mse = mean_squared_error(y_true_dur, y_pred_dur_final)
rmse = np.sqrt(mse)
r2 = r2_score(y_true_dur, y_pred_dur_final)

print("\nDuration Metrics:")
print(f"MAE  : {mae:.4f}")
print(f"MSE  : {mse:.4f}")
print(f"RMSE : {rmse:.4f}")
print(f"R2   : {r2:.4f}")

In [None]:
# Report how many rows df currently holds.
num_rows, _ = df.shape
print(f"Number of rows in df: {num_rows}")

In [None]:
import zipfile
import os

# Output locations, relative to the current working directory.
model_path = "lstm_model.h5"
zip_path = "lstm_model.zip"

# Save model as H5
model.save(model_path)

# Create a ZIP file containing the saved model file
with zipfile.ZipFile(zip_path, mode='w') as zipf:
    zipf.write(model_path)

zip_location = os.path.abspath(zip_path)
print(f"Model saved and zipped at: {zip_location}")