# 01 – Exploratory Data Analysis
Load logs and inspect basic distributions.


In [23]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
df = pd.read_csv('data/model_ready_dataset_g.csv', parse_dates=['Timestamp'])
df.head()

Unnamed: 0,Timestamp,Device Name,Traffic Volume (MB/s),Latency (ms),Bandwidth Allocated (MB/s),Bandwidth Used (MB/s),Date,total_avg_app_traffic,total_peak_app_traffic,total_logins,total_peak_user_usage,Event,Impact,Num_Config_Changes,Congestion Flag
0,2025-06-01 00:00:00,Router_A,27.03,10.0,100,28.11,2025-06-01,333.72,898.38,7,52.54,,,0.0,No
1,2025-06-01 00:00:00,Router_B,23.45,13.31,100,25.34,2025-06-01,333.72,898.38,7,52.54,,,0.0,No
2,2025-06-01 00:00:00,Router_C,21.66,16.97,100,22.5,2025-06-01,333.72,898.38,7,52.54,,,0.0,No
3,2025-06-01 01:00:00,Router_A,22.92,13.99,100,22.08,2025-06-01,333.72,898.38,7,52.54,,,0.0,No
4,2025-06-01 01:00:00,Router_B,24.22,24.95,100,24.01,2025-06-01,333.72,898.38,7,52.54,,,0.0,No


In [24]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import numpy as np

In [25]:
# Handle non-numeric columns - encode 'Impact' column
from sklearn.preprocessing import LabelEncoder
df['Impact'] = df['Impact'].fillna('None')
le_impact = LabelEncoder()
df['Impact_encoded'] = le_impact.fit_transform(df['Impact'])

def create_training_samples(data, hours=12):
    #data = data.sort_values('Timestamp')
    samples = []
    targets = []

    unique_times = data['Timestamp'].unique()

    for i in range(hours, len(unique_times)):
        window_start = unique_times[i - hours]
        window_end = unique_times[i]
        target_time = unique_times[i]

        # Get data for the window (last 12 hours) for all routers at each timestamp
        window_data = data[(data['Timestamp'] >= window_start) & (data['Timestamp'] < window_end)]

        # Pivot or reshape window data to create features:
        # We keep 'Router_A', 'Router_B', 'Router_C' separate features for each metric and timestamp

        # We'll pivot on Timestamp and Device Name to organize features:
        # For simplicity, flatten the features by concatenating timestamp and router info
        
        # Create a feature vector for this window
        features = []
        # For each timestamp in the window (sorted)
        time_indexed = sorted(window_data['Timestamp'].unique())
        for ts in time_indexed:
            ts_data = window_data[window_data['Timestamp'] == ts]
            for router in ['Router_A', 'Router_B', 'Router_C']:
                router_data = ts_data[ts_data['Device Name'] == router]
                if not router_data.empty:
                    row = router_data.iloc[0]
                    # Choose features: Traffic Volume, Latency, Bandwidth Used, Bandwidth Allocated
                    features.extend([
                        row['Traffic Volume (MB/s)'],
                        row['Latency (ms)'],
                        row['Bandwidth Used (MB/s)'],
                        row['Bandwidth Allocated (MB/s)'],
                        row['total_avg_app_traffic'],
                        row['total_peak_app_traffic'],
                        row['Impact_encoded'],
                        row['total_peak_user_usage'],
                        row['total_logins']
                    ])
                else:
                    # if data missing for router in this timestamp, fill with zeros or np.nan then fill with 0
                    features.extend([0,0,0,0,0,0,0,0,0])
        
        # Output is congestion probability for routers at target_time
        target_data = data[data['Timestamp'] == target_time]
        # Get congestion labels for each router (1 if congestion else 0)
        congestion_flags = []
        for router in ['Router_A', 'Router_B', 'Router_C']:
            cdata = target_data[target_data['Device Name'] == router]
            if not cdata.empty:
                flag = 1 if cdata.iloc[0]['Congestion Flag'] == 'Yes' else 0
            else:
                flag = 0
            congestion_flags.append(flag)
        
        samples.append(features)
        targets.append(congestion_flags)
    
    X = np.array(samples)
    y = np.array(targets)

    return X, y


In [26]:
from sklearn.metrics import accuracy_score, classification_report
# Use data from last N hours available in dataset for training
X, y = create_training_samples(df, hours=12)

# y has shape (samples, 3), for routers A, B, C congestion labels
# We will train 3 independent XGBClassifier models - one per router

models = {}
proba_preds = {}

for idx, router in enumerate(['Router_A', 'Router_B', 'Router_C']):
    y_router = y[:, idx]
    X_train, X_val, y_train, y_val = train_test_split(X, y_router, test_size=0.2, random_state=42)

    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    model.fit(X_train, y_train)

    # Save model
    models[router] = model

    # Get predictions and probabilities
    y_pred = model.predict(X_val)
    proba = model.predict_proba(X_val)[:, 1]
    proba_preds[router] = proba
    
    # Calculate and store accuracy
    accuracy = accuracy_score(y_val, y_pred)

    


# Print overall summary



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [27]:
def predict_congestion_proba(df, target_timestamp):
    #df = df.sort_values('Timestamp')
    # Get data for last 12 hours before target_timestamp (not including target_timestamp)
    #target_timestamp = target_timestamp + pd.Timedelta(hours=1)
    window_start = target_timestamp - pd.Timedelta(hours=12)
    window_end = target_timestamp

    window_data = df[(df['Timestamp'] >= window_start) & (df['Timestamp'] < window_end)]

    features = []
    time_indexed = sorted(window_data['Timestamp'].unique())
    for ts in time_indexed:
        ts_data = window_data[window_data['Timestamp'] == ts]
        for router in ['Router_A', 'Router_B', 'Router_C']:
            router_data = ts_data[ts_data['Device Name'] == router]
            if not router_data.empty:
                row = router_data.iloc[0]
                features.extend([
                    row['Traffic Volume (MB/s)'],
                    row['Latency (ms)'],
                    row['Bandwidth Used (MB/s)'],
                    row['Bandwidth Allocated (MB/s)'],
                    row['total_avg_app_traffic'],
                    row['total_peak_app_traffic'],
                    row['Impact_encoded'],
                    row['total_peak_user_usage'],
                    row['total_logins']
                ])
            else:
                features.extend([0,0,0,0,0,0,0,0,0])

    features = np.array(features).reshape(1, -1)

    proba_results = {}
    for router in ['Router_A', 'Router_B', 'Router_C']:
        proba = models[router].predict_proba(features)[0, 1]
        proba_results[router] = proba

    return proba_results

#Example usage:
#target_time = pd.Timestamp("2024-04-30 12:00:00")
#print(predict_congestion_proba(df, target_time))

In [29]:
def bandwidth_recommendation(window_data, congestion_probs):
    """
    Recommends bandwidth adjustments based on window data and congestion probabilities.
    
    Parameters:
    - window_ DataFrame containing the last 12 hours of data used for prediction
    - congestion_probs: Dict with congestion probabilities for each router
                       e.g., {'Router_A': 0.85, 'Router_B': 0.3, 'Router_C': 0.15}
    
    Returns:
    - Dict with recommendations in format:
      {
          'Router_A': {
              'action': 'increase_bandwidth',
              'amount': 25.5,  # MB/s change (+ for increase, - for decrease)
              'reason': 'High congestion probability (0.85)'
          }
      }
    """
    
    recommendations = {}
    
    for router in ['Router_A', 'Router_B', 'Router_C']:
        # Get router-specific data from the window
        router_data = window_data[window_data['Device Name'] == router]
        
        if router_data.empty:
            recommendations[router] = {
                'action': 'monitor',
                'amount': 0,
                'reason': 'No data available in window'
            }
            continue
        
        # Calculate key metrics
        congestion_prob = congestion_probs.get(router, 0.0)
        
        # Get most recent values (last hour average)
        current_allocated = router_data['Bandwidth Allocated (MB/s)'].mean()
        current_used = router_data['Bandwidth Used (MB/s)'].mean()
        avg_latency = router_data['Latency (ms)'].mean()
        avg_traffic = router_data['Traffic Volume (MB/s)'].mean()
        
        # Calculate utilization percentage
        utilization = (current_used / current_allocated) if current_allocated > 0 else 0
        
        # Decision logic based on congestion probability and utilization
        if congestion_prob >= 0.8:
            # CRITICAL: High congestion risk
            if utilization >= 0.9:
                # Very high utilization + high congestion risk
                amount = min(current_allocated * 0.4, 50)  # Increase by 40% or max 50 MB/s
                action = 'increase_bandwidth'
                reason = f'CRITICAL: High congestion probability ({congestion_prob:.2f}) with {utilization:.1%} utilization'
            else:
                # High congestion risk but moderate utilization
                amount = current_allocated * 0.25  # Increase by 25%
                action = 'increase_bandwidth'
                reason = f'HIGH RISK: Congestion probability ({congestion_prob:.2f}) requires bandwidth increase'
                
        elif congestion_prob >= 0.6:
            # MODERATE: Medium-high congestion risk
            if utilization >= 0.8:
                amount = current_allocated * 0.2  # Increase by 20%
                action = 'increase_bandwidth'
                reason = f'MODERATE RISK: Congestion probability ({congestion_prob:.2f}) with high utilization ({utilization:.1%})'
            elif avg_latency > 60:  # High latency indicator
                amount = current_allocated * 0.15  # Increase by 15%
                action = 'increase_bandwidth'
                reason = f'LATENCY CONCERN: High latency ({avg_latency:.1f}ms) with congestion risk ({congestion_prob:.2f})'
            else:
                amount = 0
                action = 'monitor_closely'
                reason = f'WATCH: Medium congestion risk ({congestion_prob:.2f}) - monitor for changes'
                
        elif congestion_prob >= 0.4:
            # LOW-MODERATE: Some risk
            if utilization >= 0.85:
                amount = current_allocated * 0.1  # Increase by 10%
                action = 'increase_bandwidth'
                reason = f'PREVENTIVE: High utilization ({utilization:.1%}) with moderate risk ({congestion_prob:.2f})'
            else:
                amount = 0
                action = 'monitor'
                reason = f'NORMAL: Moderate risk ({congestion_prob:.2f}) within acceptable range'
                
        elif congestion_prob <= 0.2:
            # LOW: Very low congestion risk
            if utilization <= 0.4:
                # Low utilization and low risk - consider reducing
                amount = -min(current_allocated * 0.15, 20)  # Decrease by 15% or max 20 MB/s
                action = 'decrease_bandwidth'
                reason = f'OPTIMIZE: Low utilization ({utilization:.1%}) and low risk ({congestion_prob:.2f})'
            elif utilization <= 0.6:
                amount = 0
                action = 'maintain'
                reason = f'EFFICIENT: Good utilization ({utilization:.1%}) with low risk ({congestion_prob:.2f})'
            else:
                amount = 0
                action = 'monitor'
                reason = f'STABLE: Acceptable utilization ({utilization:.1%}) with low risk'
        else:
            # DEFAULT: Normal range (0.2 - 0.4)
            if utilization >= 0.8:
                amount = current_allocated * 0.1  # Small increase
                action = 'increase_bandwidth'
                reason = f'PREVENTIVE: High utilization ({utilization:.1%}) requires small increase'
            else:
                amount = 0
                action = 'maintain'
                reason = f'NORMAL: Balanced operation with {congestion_prob:.2f} risk and {utilization:.1%} utilization'
        
        # Round the amount to 1 decimal place
        amount = round(amount, 1)
        
        recommendations[router] = {
            'action': action,
            'amount': amount,
            'reason': reason
        }
    
    return recommendations

# Usage example:
def get_bandwidth_recommendations(df, target_time, congestion_probs):
    """
    Complete workflow to get bandwidth recommendations
    """
    # Get window data (last 12 hours before target time)
    window_start = target_time - pd.Timedelta(hours=12)
    window_end = target_time
    window_data = df[(df['Timestamp'] >= window_start) & (df['Timestamp'] < window_end)]
    
    # Get recommendations
    recommendations = bandwidth_recommendation(window_data, congestion_probs)
    
    return recommendations

# Example usage:
target_time = pd.Timestamp("2025-06-04 01:00:00")
congestion_probs = predict_congestion_proba(df, target_time)
recommendations = get_bandwidth_recommendations(df, target_time, congestion_probs)

# Display results
print("BANDWIDTH RECOMMENDATIONS:")
print("=" * 50)
for router, rec in recommendations.items():
    print(f"{router}:")
    print(f"  Action: {rec['action']}")
    if rec['amount'] != 0:
        sign = "+" if rec['amount'] > 0 else ""
        print(f"  Change: {sign}{rec['amount']} MB/s")
    print(f"  Reason: {rec['reason']}")
    print()


BANDWIDTH RECOMMENDATIONS:
Router_A:
  Action: decrease_bandwidth
  Change: -15.0 MB/s
  Reason: OPTIMIZE: Low utilization (35.5%) and low risk (0.00)

Router_B:
  Action: maintain
  Reason: EFFICIENT: Good utilization (43.8%) with low risk (0.00)

Router_C:
  Action: maintain
  Reason: EFFICIENT: Good utilization (50.5%) with low risk (0.00)



In [30]:
from sklearn.metrics import accuracy_score

# Calculate accuracy for all 3 models
def get_model_accuracies(models, X, y):
    accuracies = {}
    
    for idx, router in enumerate(['Router_A', 'Router_B', 'Router_C']):
        y_router = y[:, idx]
        # Use same split as used during training
        X_train, X_val, y_train, y_val = train_test_split(X, y_router, test_size=0.2, random_state=42)
        
        # Get binary predictions from the trained model
        y_pred = models[router].predict(X_val)
        
        # Calculate accuracy
        accuracy = accuracy_score(y_val, y_pred)
        accuracies[router] = accuracy
        
        print(f"{router} Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    
    return accuracies

# Get accuracies for all models
model_accuracies = get_model_accuracies(models, X, y)

# Print summary
print("\n" + "="*50)
print("MODEL ACCURACY SUMMARY")
print("="*50)
for router, acc in model_accuracies.items():
    print(f"{router:10}: {acc:.4f} ({acc*100:.2f}%)")

# Optional: Calculate average accuracy across all routers
avg_accuracy = np.mean(list(model_accuracies.values()))
print(f"{'Average':10}: {avg_accuracy:.4f} ({avg_accuracy*100:.2f}%)")


Router_A Accuracy: 0.9286 (92.86%)
Router_B Accuracy: 0.8571 (85.71%)
Router_C Accuracy: 0.9000 (90.00%)

MODEL ACCURACY SUMMARY
Router_A  : 0.9286 (92.86%)
Router_B  : 0.8571 (85.71%)
Router_C  : 0.9000 (90.00%)
Average   : 0.8952 (89.52%)


In [31]:
from sklearn.metrics import brier_score_loss

def get_model_brier_score(models, X, y):
    brier_scores = {}
    
    for idx, router in enumerate(['Router_A', 'Router_B', 'Router_C']):
        y_router = y[:, idx]
        X_train, X_val, y_train, y_val = train_test_split(X, y_router, test_size=0.1, random_state=42)
        
        y_pred_proba = models[router].predict_proba(X_val)[:, 1]
        
        # Brier Score (lower is better, range 0-1)
        brier = brier_score_loss(y_val, y_pred_proba)
        brier_scores[router] = brier
        
        print(f"{router} Brier Score: {brier:.4f}")
    
    return brier_scores

get_model_brier_score(models, X, y)


Router_A Brier Score: 0.0476
Router_B Brier Score: 0.1091
Router_C Brier Score: 0.0347


{'Router_A': 0.047633019731645604,
 'Router_B': 0.10911783465568828,
 'Router_C': 0.03470525539417394}