# RNP CT-MON Route Change Detection Challenge

**Competition Result: 2nd Place**

## Overview

This notebook presents a machine learning approach to detect network route changes in traceroute data from the RNP (Rede Nacional de Ensino e Pesquisa) CT-MON monitoring system. The challenge involves identifying when network routes between source and destination pairs change based on Round-Trip Time (RTT) measurements and probe statistics.

## Problem Statement

Network route changes can indicate:
- Network failures or reconfigurations
- Load balancing events
- Routing protocol updates

Detecting these changes automatically is crucial for network monitoring and troubleshooting.

## 1. Environment Setup and Data Loading

In [None]:
# Core libraries
import numpy as np
import pandas as pd
import ast
import os

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score, roc_curve, accuracy_score, 
    confusion_matrix, classification_report, f1_score
)

# Enumerate every file found under the Kaggle input directory
for root, _subdirs, names in os.walk('/kaggle/input'):
    for data_path in (os.path.join(root, name) for name in names):
        print(data_path)

In [None]:
# Read the labelled training set
df = pd.read_csv("/kaggle/input/data-challenge-2025-rnp/train.csv")

# Count each class once and reuse the totals in the report below
unchanged_count = (df['route_changed'] == 0).sum()
changed_count = (df['route_changed'] == 1).sum()

print(f"Training data shape: {df.shape}")
print(f"\nClass distribution:")
print(f"  Route unchanged (0): {unchanged_count:,}")
print(f"  Route changed (1): {changed_count:,}")
print(f"  Imbalance ratio: {unchanged_count / changed_count:.2f}:1")

## 2. Data Preprocessing

### 2.1 Utility Functions

We define helper functions for data transformation and feature engineering.

In [None]:
def normalize_rtts(rtt_string):
    """
    Collapse one 'all_rtts' cell into a single mean RTT value.

    The 'all_rtts' column stores RTT measurements as string-encoded lists.
    The string is parsed with ``ast.literal_eval`` (literals only, no code
    execution) and the arithmetic mean of the parsed values is returned.

    Parameters:
    -----------
    rtt_string : str, list or tuple
        String representation of an RTT list (e.g., "[10.5, 11.2, 10.8]").
        An already-parsed list/tuple is accepted as well and averaged directly.

    Returns:
    --------
    float : Mean RTT value, or NaN when the input cannot be parsed, is not a
        list/tuple, is empty, or contains values that cannot be averaged
    """
    if isinstance(rtt_string, str):
        try:
            values = ast.literal_eval(rtt_string)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Malformed or non-literal text: treat as a missing measurement
            return np.nan
    else:
        # Covers pre-parsed lists as well as missing cells (None / float NaN)
        values = rtt_string

    if not isinstance(values, (list, tuple)):
        return np.nan

    # Guard the empty case explicitly: np.mean([]) would also give NaN but
    # emits a RuntimeWarning for every such row.
    if len(values) == 0:
        return np.nan

    try:
        return float(np.mean(values))
    except (TypeError, ValueError):
        # Non-numeric or ragged content that NumPy cannot average
        return np.nan


def engineer_features(df, route_stats=None):
    """
    Create engineered features from raw traceroute data.

    Features created:
    - Temporal: Time between samples, RTT changes over time
    - Statistical: Rolling means, standard deviations, z-scores
    - Network: Probe/reply ratios, sudden change indicators
    - Historical: Route change rates for src-dst pairs

    All per-route features (diffs, rolling windows) are computed with the
    rows of each (tr_src, tr_dst) pair ordered by 'seconds_since_start', so
    the result does not depend on the row order of the input. The returned
    frame keeps the caller's row order and index.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe with raw features. Must contain 'tr_src', 'tr_dst',
        'seconds_since_start', 'all_rtts', 'total_replies_last_hop' and
        'total_probes_sent'. The input is not modified.
    route_stats : pd.Series, optional
        Historical route change rates (from training data), indexed by
        (tr_dst, tr_src). When given, a 'route_change_rate' column is added.

    Returns:
    --------
    pd.DataFrame : New DataFrame with engineered features added and the raw
        'all_rtts' column replaced by its scalar mean 'norm_rtts'
    """
    route_keys = ['tr_src', 'tr_dst']

    # Work on a fresh frame with a positional index so that (a) the caller's
    # frame is untouched and (b) the original row order can be restored at
    # the end even if the incoming index has duplicates.
    caller_index = df.index
    df = df.reset_index(drop=True)

    # diff()/rolling() below operate on consecutive rows of each route, so the
    # rows must be in time order. A stable sort on the timestamp alone is
    # enough because groupby keeps the within-group row order.
    df = df.sort_values('seconds_since_start', kind='mergesort')

    # Temporal features
    df['seconds_since_last_sample'] = (
        df.groupby(route_keys)['seconds_since_start']
        .diff()
        .fillna(0)  # first sample of a route has no predecessor
    )

    # Normalize RTT arrays to scalar values
    df['norm_rtts'] = df['all_rtts'].apply(normalize_rtts)
    df = df.drop('all_rtts', axis=1)

    # RTT difference features
    df['rtts_diff'] = (
        df.groupby(route_keys)['norm_rtts']
        .diff()
        .abs()
        .fillna(0)
    )
    # Small epsilon avoids division by zero for a zero mean RTT
    df['rtts_diff_norm'] = df['rtts_diff'] / (df['norm_rtts'] + 1e-8)

    # Rolling statistics (window=5); transform keeps the result aligned with
    # the frame's own index
    df['rtts_mean_5'] = (
        df.groupby(route_keys)['norm_rtts']
        .transform(lambda rtts: rtts.rolling(5, min_periods=1).mean())
    )
    df['rtts_dist_from_mean_5'] = (df['rtts_mean_5'] - df['norm_rtts']).abs()

    # Statistics over all samples of each route
    df['rtts_mean'] = (
        df.groupby(route_keys)['norm_rtts']
        .transform('mean')
    )
    df['rtts_std'] = (
        df.groupby(route_keys)['norm_rtts']
        .transform('std')
    )
    # Distance of the current RTT from the route mean. This previously
    # subtracted 'rtts_diff', which did not match the feature's name or the
    # 5-sample variant above.
    df['rtts_dist_from_mean'] = (df['rtts_mean'] - df['norm_rtts']).abs()

    # Z-score normalization
    df['rtts_zscore'] = (
        (df['norm_rtts'] - df['rtts_mean']) / (df['rtts_std'] + 1e-8)
    )

    # Rolling standard deviation (window=10) for anomaly detection
    df['rtts_rolling_std'] = (
        df.groupby(route_keys)['norm_rtts']
        .transform(lambda rtts: rtts.rolling(window=10, min_periods=1).std())
    )

    # Replace NaN/zero rolling std by the smallest positive value observed in
    # this frame (0.001 when there is none) so the 2-sigma test below never
    # compares against a zero band.
    df['rtts_rolling_std'] = df['rtts_rolling_std'].fillna(0)
    positive_std = df.loc[df['rtts_rolling_std'] > 0, 'rtts_rolling_std']
    min_std = positive_std.min()
    min_std = min_std if pd.notna(min_std) else 0.001
    df['rtts_rolling_std'] = df['rtts_rolling_std'].replace(0, min_std)

    # Sudden change indicator (2-sigma rule)
    df['rtts_sudden_change'] = (
        (df['rtts_diff'] > df['rtts_rolling_std'] * 2).astype(int)
    )

    # Probe success ratio features
    df['ratio_replies_probes'] = (
        df['total_replies_last_hop'] / (df['total_probes_sent'] + 1e-8)
    )

    # Map the ratio into (0, 1]: a ratio of exactly 1.0 scores 1, and the
    # score decreases as the ratio moves away from 1.0 in either direction
    df['ratio_dist1'] = np.abs(df['ratio_replies_probes'] - 1)
    df['ratio_mono_dist1'] = 1 / (1 + df['ratio_dist1'])

    # Historical route change rate
    if route_stats is not None:
        # The mapped Index is assigned positionally; it is built from this
        # same frame, so row i of the result belongs to row i of df.
        df['route_change_rate'] = (
            df.set_index(['tr_dst', 'tr_src'])
            .index.map(route_stats)
        )
        # Routes absent from route_stats get the mean of the per-route rates
        df['route_change_rate'] = df['route_change_rate'].fillna(route_stats.mean())

    # Undo the internal time sort and hand back the caller's index
    df = df.sort_index()
    df.index = caller_index

    return df

### 2.2 Apply Feature Engineering

We apply feature engineering to the training data and compute historical route change rates.

In [None]:
# Derive the model features from the raw training rows
df = engineer_features(df)

# Mean of the label per (destination, source) pair, i.e. the share of that
# pair's samples whose route changed.
# NOTE(review): the rate is computed from the labels of every training row,
# including rows that later fall into the hold-out split, so hold-out metrics
# may be optimistic.
pair_keys = ['tr_dst', 'tr_src']
route_stats = df.groupby(pair_keys)['route_changed'].mean()
route_stats = route_stats.rename('route_change_rate')

# Attach each pair's rate to all of its rows
df = df.merge(route_stats, on=pair_keys)

# Quick look at a few of the new columns
preview_columns = [
    'norm_rtts', 'rtts_diff', 'rtts_diff_norm',
    'rtts_zscore', 'rtts_sudden_change', 'route_change_rate'
]
print(f"Engineered features shape: {df.shape}")
print(f"\nSample of engineered features:")
print(df[preview_columns].describe())

## 3. Model Training and Evaluation

### 3.1 Feature Selection

Based on domain knowledge and preliminary analysis, we select the following features:

1. **rtts_diff**: Absolute change in RTT between consecutive measurements
2. **rtts_diff_norm**: Normalized RTT change (relative to current RTT)
3. **seconds_since_last_sample**: Time elapsed between measurements
4. **route_change_rate**: Historical probability of route changes for this path
5. **ratio_mono_dist1**: Probe success ratio (transformed)
6. **rtts_dist_from_mean**: Absolute distance from the route's overall mean RTT (mean over all samples of the same source-destination pair)
7. **rtts_dist_from_mean_5**: Distance from 5-sample rolling mean
8. **rtts_zscore**: Standardized RTT value
9. **rtts_sudden_change**: Binary indicator for 2-sigma anomalies

In [None]:
# Columns fed to the model, in the order the booster sees them
features = [
    # consecutive-sample RTT change
    'rtts_diff_norm',
    'rtts_diff',
    # sampling gap and per-route history
    'seconds_since_last_sample',
    'route_change_rate',
    # probe success
    'ratio_mono_dist1',
    # RTT deviation statistics
    'rtts_dist_from_mean',
    'rtts_dist_from_mean_5',
    'rtts_zscore',
    'rtts_sudden_change',
]

# Feature matrix and binary target
X, y = df[features], df['route_changed']

print(f"Feature matrix shape: {X.shape}")
print(f"Target variable shape: {y.shape}")

### 3.2 Train-Test Split and Model Training

We use XGBoost with optimized hyperparameters. The parameters were obtained through Bayesian optimization (Optuna) to maximize F1-score.

In [None]:
# Train-test split with stratification
# NOTE(review): the same hold-out split is used for early stopping here and,
# in later cells, for threshold selection and final metrics, so those metrics
# are not an unbiased estimate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.17,
    random_state=42,
    stratify=y
)

# Class imbalance ratio (negatives per positive) in the training split
num_neg = (y_train == 0).sum()
num_pos = (y_train == 1).sum()
scale_pos_weight = num_neg / num_pos

# The parameter set below never contained 'scale_pos_weight', although the
# previous message claimed it was used. Keep that behaviour as the default
# and make the choice explicit; flip the flag to actually re-weight the
# positive class (the decision threshold will then need re-tuning).
APPLY_SCALE_POS_WEIGHT = False

print(f"Class imbalance: {scale_pos_weight:.2f}")
if APPLY_SCALE_POS_WEIGHT:
    print(f"Using scale_pos_weight={scale_pos_weight:.2f} to handle imbalance\n")
else:
    print("scale_pos_weight is NOT applied; imbalance is handled by threshold optimization\n")

# Create DMatrix objects for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Optimized hyperparameters (obtained via Optuna)
params = {
    'learning_rate': 0.03433162986829156,
    'max_depth': 15,
    'subsample': 0.5621189283541812,
    'colsample_bytree': 0.8666719898854709,
    'min_child_weight': 4,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'seed': 42
}
if APPLY_SCALE_POS_WEIGHT:
    params['scale_pos_weight'] = float(scale_pos_weight)

# Train model with early stopping; the last entry of `evals` ('test') is the
# one monitored for early stopping
evals = [(dtrain, 'train'), (dtest, 'test')]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=evals,
    early_stopping_rounds=50,
    verbose_eval=100
)

# Read these before slicing: the sliced booster does not carry them over
best_iteration = model.best_iteration
best_score = model.best_score

print(f"\nBest iteration: {best_iteration}")
print(f"Best score: {best_score:.6f}")

# xgb.train returns the booster from the LAST boosting round, not the best
# one, and Booster.predict then uses every tree. Keep only the rounds up to
# the best iteration so that all later predict() calls use the best model.
model = model[: best_iteration + 1]

### 3.3 Threshold Optimization

For imbalanced classification, the default threshold of 0.5 is often suboptimal. We search for the threshold that maximizes F1-score.

In [None]:
# Positive-class probabilities on the hold-out split
y_pred_prob = model.predict(dtest)

# F1-score for every candidate cut-off
thresholds = np.arange(0.1, 0.9, 0.01)
f1_scores = [
    f1_score(y_test, (y_pred_prob > cutoff).astype(int))
    for cutoff in thresholds
]

# Keep the first cut-off that reaches the highest F1; stay at the 0.5 default
# when no candidate scores above zero
best_f1 = 0
best_thresh = 0.5
top_idx = int(np.argmax(f1_scores))
if f1_scores[top_idx] > best_f1:
    best_f1 = f1_scores[top_idx]
    best_thresh = thresholds[top_idx]

print(f"Optimal threshold: {best_thresh:.3f}")
print(f"Maximum F1-score: {best_f1:.4f}")

# Hard labels at the selected cut-off
y_pred = (y_pred_prob > best_thresh).astype(int)

# F1 as a function of the cut-off, with the selected value marked
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(thresholds, f1_scores, linewidth=2)
ax.axvline(
    best_thresh,
    color='r',
    linestyle='--',
    label=f'Optimal threshold = {best_thresh:.3f}',
)
ax.set_xlabel('Classification Threshold', fontsize=12)
ax.set_ylabel('F1-Score', fontsize=12)
ax.set_title('F1-Score vs Classification Threshold', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

### 3.4 Model Performance Evaluation

In [None]:
# Hold-out metrics at the selected threshold
banner = "=" * 60

auc_value = roc_auc_score(y_test, y_pred_prob)
accuracy_value = accuracy_score(y_test, y_pred)
f1_value = f1_score(y_test, y_pred)

print(banner)
print("MODEL PERFORMANCE SUMMARY")
print(banner)

print(f"\nAUC-ROC: {auc_value:.6f}")
print(f"Accuracy: {accuracy_value:.6f}")
print(f"F1-Score: {f1_value:.6f}")

# How many samples were assigned to each class
predicted_negatives = (y_pred == 0).sum()
predicted_positives = (y_pred == 1).sum()
print(f"\nPrediction distribution:")
print(f"  Predicted negatives (0): {predicted_negatives:,}")
print(f"  Predicted positives (1): {predicted_positives:,}")

print("\n" + banner)
print("CLASSIFICATION REPORT")
print(banner)
print(classification_report(y_test, y_pred, digits=4))

print(banner)
print("CONFUSION MATRIX")
print(banner)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Rows are true labels, columns are predicted labels
tn, fp = cm[0, 0], cm[0, 1]
fn, tp = cm[1, 0], cm[1, 1]
print(f"\nTrue Negatives: {tn:,}")
print(f"False Positives: {fp:,}")
print(f"False Negatives: {fn:,}")
print(f"True Positives: {tp:,}")

### 3.5 Feature Importance Analysis

In [None]:
# Gain-based importance reported by the booster, highest first
importance = model.get_score(importance_type='gain')
importance_df = pd.DataFrame(
    list(importance.items()),
    columns=['feature', 'importance'],
)
importance_df = importance_df.sort_values('importance', ascending=False)

print("\nFeature Importance (sorted by gain):")
print(importance_df.to_string(index=False))

# Horizontal bar chart with the most important feature at the top
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importance_df['feature'], importance_df['importance'])
ax.set_xlabel('Importance (Gain)', fontsize=12)
ax.set_ylabel('Feature', fontsize=12)
ax.set_title('XGBoost Feature Importance', fontsize=14, fontweight='bold')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

### 3.6 ROC Curve Analysis

In [None]:
# ROC points and area under the curve on the hold-out split
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
roc_auc = roc_auc_score(y_test, y_pred_prob)

fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(fpr, tpr, linewidth=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
# Diagonal = performance of a random classifier
ax.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random classifier')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title(
    'Receiver Operating Characteristic (ROC) Curve',
    fontsize=14,
    fontweight='bold',
)
ax.legend(loc='lower right', fontsize=11)
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

## 4. Test Set Prediction and Submission

Apply the trained model to the test dataset and generate the competition submission.

In [None]:
# Read the unlabelled competition test set
test = pd.read_csv("/kaggle/input/data-challenge-2025-rnp/test.csv")
print(f"Test data shape: {test.shape}")

# Same feature pipeline as for training (with the training route rates),
# then order the rows by route and time and renumber the index
test_processed = (
    engineer_features(test, route_stats=route_stats)
    .sort_values(['tr_dst', 'tr_src', 'seconds_since_start'])
    .reset_index(drop=True)
)

print(f"Processed test data shape: {test_processed.shape}")

In [None]:
# Score the test rows and binarise with the threshold chosen on the hold-out split
dtest_final = xgb.DMatrix(test_processed[features])
test_pred_prob = model.predict(dtest_final)
test_pred = (test_pred_prob > best_thresh).astype(int)

# Class counts among the predictions
pred_unchanged = (test_pred == 0).sum()
pred_changed = (test_pred == 1).sum()

print(f"Test set predictions:")
print(f"  Predicted route unchanged (0): {pred_unchanged:,}")
print(f"  Predicted route changed (1): {pred_changed:,}")
print(f"  Positive rate: {pred_changed / len(test_pred):.4%}")

In [None]:
# Two-column submission: the row's tr_id as 'id' and the predicted label as 'target'
submission = (
    test_processed[['tr_id']]
    .rename(columns={'tr_id': 'id'})
    .assign(target=test_pred)
)

submission.to_csv('/kaggle/working/submission.csv', index=False)
print("Submission file created: /kaggle/working/submission.csv")
print(f"\nSubmission preview:")
print(submission.head(10))

## 5. Summary

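The model achieved an AUC-ROC of 0.996 and an F1-score of 0.73 on the 17% hold-out split, demonstrating strong performance on this imbalanced classification task. Because the same hold-out split was also used for early stopping and for selecting the classification threshold, these figures are likely somewhat optimistic. The most important features were RTT absolute changes, deviations from short-term rolling means, and historical route change patterns. Threshold optimization at 0.12 (instead of the default 0.5) significantly improved F1-score for the minority class.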