In [None]:
import pandas as pd

# Read the bz2-compressed access trace from the working directory and
# preview its first rows.
TRACE_ARCHIVE = "azurefunctions-accesses-2020.csv.bz2"
df = pd.read_csv(TRACE_ARCHIVE, compression="bz2")
print(df.head())

In [None]:
# ----------------------------
# 1. Suppress TF logs & initialize GPU
# ----------------------------
import os
# Set before TensorFlow is imported so the setting is in place when the
# library loads.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # suppress INFO/WARNING/ERROR

import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
try:
    # Let TensorFlow grow GPU memory on demand instead of reserving it all.
    for gpu in physical_devices:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Memory growth can only be configured before the GPUs are initialised;
    # re-running this cell in a live kernel would otherwise abort here.
    print("Could not enable GPU memory growth:", err)

# ----------------------------
# 2. Import Libraries
# ----------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# ----------------------------
# 3. Load Dataset
# ----------------------------
df = pd.read_csv("/kaggle/input/azure-functions/azurefunctions-accesses-2020.csv")
print("Columns:", df.columns)
print(df.head())

# ----------------------------
# 4. Preprocessing
# ----------------------------
# Interpret the millisecond timestamps as datetimes and index the frame by them
df = df.assign(Timestamp=pd.to_datetime(df['Timestamp'], unit='ms')).set_index('Timestamp')

# One row per minute holding the number of records that fall in that minute;
# any missing value is replaced with 0
workload = (
    df.resample('1min')
    .size()
    .rename('invocations')
    .to_frame()
    .fillna(0)
)

# Visualise the per-minute workload
fig, ax = plt.subplots(figsize=(12,4))
workload['invocations'].plot(ax=ax, title="Azure Functions Invocations per Minute")
plt.show()

# ----------------------------
# 5./6. Train/Test Split (80/20) and Normalization
# ----------------------------
# Split chronologically BEFORE scaling: fitting the scaler on the whole series
# would leak the test period's min/max into the training data.
raw_values = workload['invocations'].values.reshape(-1, 1)
train_size = int(len(raw_values) * 0.8)

# The scaler learns its range from the training portion only; the test portion
# is transformed with those statistics and may therefore fall outside [0, 1].
scaler = MinMaxScaler()
train = scaler.fit_transform(raw_values[:train_size])
test = scaler.transform(raw_values[train_size:])

# Full scaled series, kept under its original name for any code that reads it.
values = np.concatenate([train, test], axis=0)

# ----------------------------
# 7. Sequence Creation
# ----------------------------
def create_sequences(data, seq_length=60):
    """Slice a series into sliding input windows and next-step targets.

    Window ``i`` is ``data[i:i + seq_length]`` and its target is
    ``data[i + seq_length]``.

    Args:
        data: Array-like whose first axis is time.
        seq_length: Number of consecutive steps in each input window (>= 1).

    Returns:
        (X, y) where ``X`` has shape ``(n, seq_length, *data.shape[1:])`` and
        ``y`` has shape ``(n, *data.shape[1:])`` with
        ``n = max(len(data) - seq_length, 0)``. When the series is too short
        the arrays are empty but keep these shapes, so downstream code that
        inspects ``X.shape[1]`` still works.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be >= 1, got {seq_length}")

    data = np.asarray(data)
    n_samples = len(data) - seq_length
    if n_samples <= 0:
        # Not enough points for a single window: return well-shaped empties.
        feature_shape = data.shape[1:]
        X = np.empty((0, seq_length) + feature_shape, dtype=data.dtype)
        y = np.empty((0,) + feature_shape, dtype=data.dtype)
        return X, y

    X = np.stack([data[i:i + seq_length] for i in range(n_samples)])
    # The targets are simply the series shifted by one window length.
    y = data[seq_length:].copy()
    return X, y

# Window length: the previous 60 minutes are used to predict the next minute
SEQ_LEN = 60  # past 60 minutes -> predict next minute
(X_train, y_train), (X_test, y_test) = (
    create_sequences(split, SEQ_LEN) for split in (train, test)
)

# ----------------------------
# 8. Reshape for LSTM
# ----------------------------
# Make the (samples, timesteps, features) layout explicit with one feature
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)

# ----------------------------
# 9. Build LSTM Base Model
# ----------------------------
# One LSTM layer that emits only its final state, a small ReLU layer, and a
# single linear output unit for the predicted (scaled) invocation count.
model = Sequential()
model.add(LSTM(64, input_shape=(SEQ_LEN,1), return_sequences=False))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))  # output: predicted invocations

model.compile(optimizer='adam', loss='mse')
model.summary()

# ----------------------------
# 10. Train Model
# ----------------------------
# The held-out windows are passed as validation data during fitting.
fit_options = dict(
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test),
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_options)

# ----------------------------
# 11. Evaluate Model
# ----------------------------
y_pred = model.predict(X_test)

# Overlay the first 200 held-out targets and predictions (both in scaled units)
fig, ax = plt.subplots(figsize=(12,6))
ax.plot(y_test[:200], label="Actual")
ax.plot(y_pred[:200], label="Predicted")
ax.set_title("Azure Functions - LSTM Base Model Prediction")
ax.legend()
plt.show()

# ----------------------------
# 12. Save Base Model
# ----------------------------
# Persist the trained network to an HDF5 file in the working directory
model.save("lstm_base_model.h5")
print("Base model saved as lstm_base_model.h5")


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

# Location of the Azure Functions access trace, then load it into memory
DATA_PATH = '/kaggle/input/azure-functions/azurefunctions-accesses-2020.csv'  # Update if your path differs
df = pd.read_csv(DATA_PATH)

# Show the available columns and a preview of the rows
print("Dataset columns:", list(df.columns))
print("Sample data:\n", df.head())

# Preprocessing function (Updated for actual columns)
def preprocess_data(df, function_id, sequence_length=10, target_column='count'):
    """
    Build scaled per-minute invocation windows for one Azure Functions app.

    Rows whose 'AnonAppName' equals ``function_id`` are bucketed into 1-minute
    bins. The number of non-null 'AnonFunctionInvocationId' values per bin is
    stored in a column named ``target_column``, min-max scaled, and cut into
    sliding windows.

    Args:
        df: DataFrame with 'AnonAppName', 'Timestamp' (epoch milliseconds) and
            'AnonFunctionInvocationId' columns. It is not modified.
        function_id: Value of 'AnonAppName' to select (e.g., '9gti3olh')
        sequence_length: Number of time steps in each LSTM sequence (>= 1)
        target_column: Name given to the derived per-minute count column
    Returns:
        sequences: Array of shape (n, sequence_length, 1) with input windows
        targets: Array of shape (n,) with the scaled count following each window
        scaler: MinMaxScaler fitted on the per-minute counts, for inverse
            transformation
        ``n`` is ``max(number_of_minutes - sequence_length, 0)``.
    Raises:
        ValueError: If ``sequence_length`` < 1 or no rows match ``function_id``.
    """
    if sequence_length < 1:
        raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")

    # Keep only the rows of the requested app and the two columns that are used
    df_func = df.loc[df['AnonAppName'] == function_id,
                     ['Timestamp', 'AnonFunctionInvocationId']]
    if df_func.empty:
        raise ValueError(f"No rows found for AnonAppName == {function_id!r}")

    # Work on a derived frame (assign) so the caller's data is never written to.
    # The count column is named after target_column so the parameter is honoured.
    per_minute = (
        df_func
        .assign(Timestamp=pd.to_datetime(df_func['Timestamp'], unit='ms'))
        .sort_values(by='Timestamp')
        .set_index('Timestamp')
        .resample('1min')
        .agg({'AnonFunctionInvocationId': 'count'})  # Invocation count per minute
        .rename(columns={'AnonFunctionInvocationId': target_column})
        .fillna(0)
    )

    # Normalize data
    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(per_minute[[target_column]])

    # Create sequences: window i covers steps [i, i + sequence_length) and its
    # target is the value at step i + sequence_length
    n_windows = len(data_scaled) - sequence_length
    if n_windows <= 0:
        # Series too short for a single window: return well-shaped empty arrays
        return (
            np.empty((0, sequence_length, data_scaled.shape[1]), dtype=data_scaled.dtype),
            np.empty((0,), dtype=data_scaled.dtype),
            scaler,
        )

    sequences = np.stack([data_scaled[i:i + sequence_length] for i in range(n_windows)])
    targets = data_scaled[sequence_length:, 0].copy()

    return sequences, targets, scaler

# Custom Dataset for LSTM
class WorkloadDataset(Dataset):
    """Pairs each input window with its target value as float32 tensors."""

    def __init__(self, sequences, targets):
        # Both inputs are copied into float32 tensors once, up front
        as_float = dict(dtype=torch.float32)
        self.sequences = torch.tensor(sequences, **as_float)
        self.targets = torch.tensor(targets, **as_float)

    def __len__(self):
        """Number of windows held by the dataset."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the ``(window, target)`` pair stored at position ``idx``."""
        window = self.sequences[idx]
        label = self.targets[idx]
        return window, label

# LSTM Model
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each LSTM layer.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values predicted per sequence.
    """

    def __init__(self, input_size=1, hidden_size=64, num_layers=2, output_size=1):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # batch_first=True: inputs are laid out as (batch, time, features)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a batch of sequences ``(batch, time, input_size)`` to
        predictions of shape ``(batch, output_size)``."""
        # With no initial state passed, nn.LSTM starts from zeros that match
        # the input's device and dtype. This avoids allocating zeros on the CPU
        # and copying them on every call, and keeps the model usable after
        # .double() / .half().
        out, _ = self.lstm(x)
        # Only the hidden state of the final time step feeds the output layer
        out = self.fc(out[:, -1, :])
        return out

# Training function
def train_lstm(model, train_loader, criterion, optimizer, num_epochs=10, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Train ``model`` and return the mean loss of every epoch.

    Args:
        model: Module mapping a batch of sequences to predictions whose last
            axis has size 1.
        train_loader: Iterable yielding ``(sequences, targets)`` tensor batches.
        criterion: Loss callable taking ``(predictions, targets)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Device on which the model and batches are placed.

    Returns:
        List with one average batch loss per epoch.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    model = model.to(device)
    model.train()
    losses = []

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        num_batches = 0
        for sequences, targets in train_loader:
            sequences, targets = sequences.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(sequences)
            # Drop only the trailing size-1 axis. A bare squeeze() would also
            # remove the batch axis of a final batch of size 1, leaving a 0-d
            # prediction that no longer matches the (1,) target.
            loss = criterion(outputs.squeeze(-1), targets)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            num_batches += 1
        if num_batches == 0:
            raise ValueError("train_loader yielded no batches; nothing to train on")
        avg_loss = epoch_loss / num_batches
        losses.append(avg_loss)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

    return losses

# Main execution
if __name__ == "__main__":
    # Train on the first app name that appears in the trace
    unique_functions = df['AnonAppName'].unique()
    function_id = unique_functions[0]  # e.g., '9gti3olh'
    print(f"Available functions: {unique_functions[:5]}...")  # Show first 5
    print(f"Training LSTM for function: {function_id}")

    # Turn that app's records into scaled windows of 10 steps each
    sequences, targets, scaler = preprocess_data(
        df,
        function_id=function_id,
        sequence_length=10,
        target_column='count',
    )
    print(f"Processed data shape: Sequences {sequences.shape}, Targets {targets.shape}")

    # Shuffled mini-batches over every available window
    dataset = WorkloadDataset(sequences, targets)
    train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

    # Network, loss and optimizer
    model = LSTMModel(input_size=1, hidden_size=64, num_layers=2, output_size=1)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Fit for 10 epochs and keep the per-epoch losses for plotting
    losses = train_lstm(model, train_loader, criterion, optimizer, num_epochs=10)

    # Persist the learned weights
    torch.save(model.state_dict(), '/kaggle/working/lstm_model.pth')
    print("Model saved to /kaggle/working/lstm_model.pth")

    # Training-loss curve
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(losses)
    ax.set_title('LSTM Training Loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    plt.show()

    # Single prediction from the last window.
    # NOTE(review): sequences[-1] is the window whose target is the final
    # observed minute (already seen during training), so this looks like a
    # sanity check rather than a true forecast of the next minute - confirm.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.eval()
    test_sequence = torch.tensor(sequences[-1:], dtype=torch.float32).to(device)
    with torch.no_grad():
        pred_scaled = model(test_sequence).cpu().numpy()
    # Map the scaled output back to an invocation count
    pred_original = scaler.inverse_transform(pred_scaled.reshape(-1, 1))[0, 0]
    print(f'Predicted next invocations (original scale): {pred_original:.2f}')