In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/whos-talking-classify-the-app-by-its-packets/sample_submission.csv
/kaggle/input/whos-talking-classify-the-app-by-its-packets/train.csv
/kaggle/input/whos-talking-classify-the-app-by-its-packets/test.csv


In [2]:
import pandas as pd
import numpy as np
import gc
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMClassifier
from scipy.stats import randint, uniform

# --- Configuration ---
QUICK_TEST = False 
N_ITER = 50 if not QUICK_TEST else 5
CV_FOLDS = 5 if not QUICK_TEST else 2
N_JOBS_FOR_SEARCH = 4 # Keep low to avoid OOM
RANDOM_STATE = 42

# Define feature columns
FEATURE_COLS = [f'tcp_len_{i}' for i in range(1, 31)]
TARGET_COL = "app_service"
KAGGLE_INPUT_PATH = '/kaggle/input/whos-talking-classify-the-app-by-its-packets/'

# --- 1. Feature Engineering Functions ---

def process_flow(row):
    """
    Applies the preprocessing logic from the competition description.
    Merges consecutive, same-direction, large (>1200) packets.
    """
    new_flow = []
    flow_data = row.values 
    i = 0
    
    while i < 30:
        packet = flow_data[i]
        if packet == 0:
            break
            
        sign = np.sign(packet)
        size = np.abs(packet)
        
        if size > 1200:
            current_sum = 0
            j = i
            while j < 30:
                next_packet = flow_data[j]
                next_sign = np.sign(next_packet)
                next_size = np.abs(next_packet)
                
                if next_sign == sign and next_size > 1200:
                    current_sum += next_size
                    j += 1
                else:
                    break
            
            new_flow.append(sign * current_sum)
            i = j
        
        else:
            new_flow.append(packet)
            i += 1
            
    padded_flow = new_flow + [0] * (30 - len(new_flow))
    return padded_flow[:30]


def extract_statistical_features(df):
    """
    Inspired by the malware notebook, this function extracts
    high-level statistical features from a flow (row).
    """
    print("Extracting statistical features...")
    # df is the (n_samples, 30) dataframe of packet lengths
    features = pd.DataFrame(index=df.index)
    
    # Replace 0s (padding) with NaN to ignore them in stats
    df_nan = df.replace(0, np.nan)
    
    # --- Overall Stats ---
    features['num_packets'] = df_nan.notna().sum(axis=1)
    features['total_bytes'] = df_nan.abs().sum(axis=1)
    features['avg_packet_size'] = df_nan.abs().mean(axis=1)
    features['std_packet_size'] = df_nan.abs().std(axis=1)
    
    # --- Client Stats (positive values) ---
    client_df = df_nan[df_nan > 0]
    features['num_client_packets'] = client_df.notna().sum(axis=1)
    features['total_client_bytes'] = client_df.sum(axis=1)
    features['avg_client_packet_size'] = client_df.mean(axis=1)
    features['std_client_packet_size'] = client_df.std(axis=1)
    features['max_client_packet'] = client_df.max(axis=1)
    features['min_client_packet'] = client_df.min(axis=1)

    # --- Server Stats (negative values) ---
    server_df = df_nan[df_nan < 0]
    features['num_server_packets'] = server_df.notna().sum(axis=1)
    # .abs() is important for sum/mean
    features['total_server_bytes'] = server_df.abs().sum(axis=1)
    features['avg_server_packet_size'] = server_df.abs().mean(axis=1)
    features['std_server_packet_size'] = server_df.abs().std(axis=1)
    features['max_server_packet'] = server_df.abs().max(axis=1) # Max *size*
    features['min_server_packet'] = server_df.abs().min(axis=1) # Min *size*

    # --- Ratio Stats ---
    # Handle division by zero
    features['ratio_client_server_packets'] = features['num_client_packets'] / (features['num_server_packets'] + 1e-6)
    features['ratio_client_server_bytes'] = features['total_client_bytes'] / (features['total_server_bytes'] + 1e-6)
    
    # --- Simple Sequence Stats ---
    features['first_packet'] = df.iloc[:, 0]
    features['second_packet'] = df.iloc[:, 1]
    
    # Fill all NaNs from stats (e.g., std of 1 packet, avg of 0 packets) with 0
    return features.fillna(0).astype('float32')

# --- 2. Load and Preprocess Training Data ---

print("Script started.")
print("Loading training data...")
try:
    train_df = pd.read_csv(f'{KAGGLE_INPUT_PATH}train.csv', usecols=[TARGET_COL] + FEATURE_COLS)
except FileNotFoundError:
    print("Error: train.csv not found.")
    raise

if QUICK_TEST:
    print("Running in QUICK_TEST mode. Subsampling training data.")
    train_df = train_df.sample(n=50000, random_state=RANDOM_STATE)

# Encode the target variable
le = LabelEncoder()
y = le.fit_transform(train_df[TARGET_COL].astype(str))
num_classes = len(le.classes_)

# --- 3. Combine Feature Sets ---

# Feature Set 1: Raw Features
print("1. Processing Raw Features...")
X_raw = train_df[FEATURE_COLS].fillna(0).astype('float32')
# Rename columns to avoid collision
X_raw.columns = [f'raw_{i}' for i in range(1, 31)]

# Feature Set 2: Statistical Features
print("2. Processing Statistical Features...")
X_stats = extract_statistical_features(train_df[FEATURE_COLS])
# Scale statistical features
scaler = StandardScaler()
X_stats = pd.DataFrame(scaler.fit_transform(X_stats), columns=X_stats.columns, index=X_stats.index).astype('float32')

# Feature Set 3: Merged Packet Features
print("3. Processing Merged Packet Features...")
X_merged = train_df[FEATURE_COLS].fillna(0).apply(process_flow, axis=1, result_type='expand')
X_merged.columns = [f'merged_{i}' for i in range(1, 31)]
X_merged = X_merged.astype('float32')

# Clean up original training data to save RAM
print("Cleaning up original train_df...")
del train_df
gc.collect()

# Combine all feature sets
print("Combining all feature sets...")
X = pd.concat([X_raw, X_stats, X_merged], axis=1)

print(f"Total features created: {X.shape[1]}")
print(f"Training data shape: {X.shape}, Memory usage: {X.memory_usage(deep=True).sum() / 1024**3:.2f} GB")

# Clean up intermediate dataframes
del X_raw, X_stats, X_merged
gc.collect()

# --- 4. Hyperparameter Optimization ---

print("Setting up RandomizedSearchCV...")
lgbm = LGBMClassifier(
    objective="multiclass",
    num_class=num_classes,
    metric="multi_logloss",
    random_state=RANDOM_STATE,
    n_jobs=-1  # Use all 77 cores for training *each* model
)

param_dist = {
    'n_estimators': randint(300, 1200),
    'learning_rate': uniform(0.01, 0.05),
    'num_leaves': randint(31, 61),
    'max_depth': [-1, 20, 30],
    'feature_fraction': uniform(0.5, 0.4), # 0.5 to 0.9
    'bagging_fraction': uniform(0.5, 0.4), # 0.5 to 0.9
    'bagging_freq': randint(1, 7),
    'min_child_samples': randint(20, 50)
}

random_search = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=param_dist,
    n_iter=N_ITER,        
    cv=CV_FOLDS,          
    scoring='accuracy',   
    n_jobs=N_JOBS_FOR_SEARCH, # <-- CRITICAL: Low job count for search
    random_state=RANDOM_STATE,
    verbose=2
)

print(f"Starting model training with {N_JOBS_FOR_SEARCH} parallel jobs.")
random_search.fit(X, y)

print(f"Best parameters found: {random_search.best_params_}")
print(f"Best cross-validation accuracy: {random_search.best_score_:.4f}")

best_model = random_search.best_estimator_

print("Cleaning up training data...")
del X, y, random_search, lgbm
gc.collect()

# --- 5. Load and Process Test Data ---

print("Loading test data...")
try:
    test_df = pd.read_csv(f'{KAGGLE_INPUT_PATH}test.csv')
except FileNotFoundError:
    print("Error: test.csv not found.")
    raise

test_ids = test_df['id']

print("Applying same feature engineering to test data...")
# Feature Set 1: Raw Features
print("1. Processing Raw Features (Test)...")
X_raw_test = test_df[FEATURE_COLS].fillna(0).astype('float32')
X_raw_test.columns = [f'raw_{i}' for i in range(1, 31)]

# Feature Set 2: Statistical Features
print("2. Processing Statistical Features (Test)...")
X_stats_test = extract_statistical_features(test_df[FEATURE_COLS])
# IMPORTANT: Use the *same scaler* fitted on the training data
X_stats_test = pd.DataFrame(scaler.transform(X_stats_test), columns=X_stats_test.columns, index=X_stats_test.index).astype('float32')

# Feature Set 3: Merged Packet Features
print("3. Processing Merged Packet Features (Test)...")
X_merged_test = test_df[FEATURE_COLS].fillna(0).apply(process_flow, axis=1, result_type='expand')
X_merged_test.columns = [f'merged_{i}' for i in range(1, 31)]
X_merged_test = X_merged_test.astype('float32')

# Clean up original test data
del test_df
gc.collect()

# Combine all feature sets for test data
print("Combining all feature sets (Test)...")
X_test = pd.concat([X_raw_test, X_stats_test, X_merged_test], axis=1)

print(f"Test data shape: {X_test.shape}")
del X_raw_test, X_stats_test, X_merged_test
gc.collect()

# --- 6. Generate Predictions ---

print("Generating predictions on the test set...")
predictions_encoded = best_model.predict(X_test)
predictions = le.inverse_transform(predictions_encoded)

del X_test, best_model, predictions_encoded
gc.collect()

# --- 7. Create Submission File ---

print("Creating submission file...")
submission_df = pd.DataFrame({'id': test_ids, TARGET_COL: predictions})
submission_df.to_csv('submission.csv', index=False)

print("Script finished successfully.")

Script started.
Loading training data...


  train_df = pd.read_csv(f'{KAGGLE_INPUT_PATH}train.csv', usecols=[TARGET_COL] + FEATURE_COLS)


1. Processing Raw Features...
2. Processing Statistical Features...
Extracting statistical features...


  return op(a, b)
  return op(a, b)


3. Processing Merged Packet Features...
Cleaning up original train_df...
Combining all feature sets...
Total features created: 80
Training data shape: (8248546, 80), Memory usage: 2.46 GB
Setting up RandomizedSearchCV...
Starting model training with 4 parallel jobs.
Fitting 5 folds for each of 50 candidates, totalling 250 fits


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}
Detailed tracebacks of the workers should have been printed to stderr in the executor process if faulthandler was not disabled.