In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.model_selection import train_test_split

In [2]:
folder_path = '/kaggle/input/cicids2017'
file_list = [file for file in os.listdir(folder_path) if file.endswith('.csv')]
merged_df = pd.concat([pd.read_csv(os.path.join(folder_path, file)).assign(SOURCE_FILE = file) for file in file_list], ignore_index = True)
merged_df.head(5)

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,SOURCE_FILE
0,22,1266342,41,44,2664,6954,456,0,64.97561,109.864573,...,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,Friday-WorkingHours-Afternoon-PortScan.pcap_IS...
1,22,1319353,41,44,2664,6954,456,0,64.97561,109.864573,...,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,Friday-WorkingHours-Afternoon-PortScan.pcap_IS...
2,22,160,1,1,0,0,0,0,0.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,Friday-WorkingHours-Afternoon-PortScan.pcap_IS...
3,22,1303488,41,42,2728,6634,456,0,66.536585,110.129945,...,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,Friday-WorkingHours-Afternoon-PortScan.pcap_IS...
4,35396,77,1,2,0,0,0,0,0.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,Friday-WorkingHours-Afternoon-PortScan.pcap_IS...


In [3]:
merged_df.shape

(2830743, 80)

In [4]:
merged_df.columns = merged_df.columns.str.strip()
merged_df.columns = merged_df.columns.str.upper()
merged_df.columns = merged_df.columns.str.replace(' ', '_')
merged_df.columns

Index(['DESTINATION_PORT', 'FLOW_DURATION', 'TOTAL_FWD_PACKETS',
       'TOTAL_BACKWARD_PACKETS', 'TOTAL_LENGTH_OF_FWD_PACKETS',
       'TOTAL_LENGTH_OF_BWD_PACKETS', 'FWD_PACKET_LENGTH_MAX',
       'FWD_PACKET_LENGTH_MIN', 'FWD_PACKET_LENGTH_MEAN',
       'FWD_PACKET_LENGTH_STD', 'BWD_PACKET_LENGTH_MAX',
       'BWD_PACKET_LENGTH_MIN', 'BWD_PACKET_LENGTH_MEAN',
       'BWD_PACKET_LENGTH_STD', 'FLOW_BYTES/S', 'FLOW_PACKETS/S',
       'FLOW_IAT_MEAN', 'FLOW_IAT_STD', 'FLOW_IAT_MAX', 'FLOW_IAT_MIN',
       'FWD_IAT_TOTAL', 'FWD_IAT_MEAN', 'FWD_IAT_STD', 'FWD_IAT_MAX',
       'FWD_IAT_MIN', 'BWD_IAT_TOTAL', 'BWD_IAT_MEAN', 'BWD_IAT_STD',
       'BWD_IAT_MAX', 'BWD_IAT_MIN', 'FWD_PSH_FLAGS', 'BWD_PSH_FLAGS',
       'FWD_URG_FLAGS', 'BWD_URG_FLAGS', 'FWD_HEADER_LENGTH',
       'BWD_HEADER_LENGTH', 'FWD_PACKETS/S', 'BWD_PACKETS/S',
       'MIN_PACKET_LENGTH', 'MAX_PACKET_LENGTH', 'PACKET_LENGTH_MEAN',
       'PACKET_LENGTH_STD', 'PACKET_LENGTH_VARIANCE', 'FIN_FLAG_COUNT',
       'SYN_FLAG_CO

In [5]:
merged_df[['FLOW_BYTES/S', 'FLOW_PACKETS/S']] = merged_df[['FLOW_BYTES/S', 'FLOW_PACKETS/S']].replace([np.inf, -np.inf], np.nan)

In [6]:
merged_df['ANOMALY'] = merged_df['LABEL'].apply(lambda s: 0 if s == "BENIGN" else 1)
y = merged_df['ANOMALY'].values

In [7]:
feature_cols = [c for c in merged_df.columns 
                if c not in ('LABEL', 'SOURCE_FILE', 'ANOMALY')]
X = merged_df[feature_cols].copy()

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.30,
    stratify = y,
    random_state = 42
)

In [9]:
def profile_dataset_and_assign_tags(X, y):
    n_samples, n_features = X.shape
    anomaly_ratio = np.mean(y)
    missing_ratio = X.isna().sum().sum() / (n_samples * n_features)

    skewness = stats.skew(X, axis = 0, nan_policy = "omit")
    kurtosis = stats.kurtosis(X, axis = 0, nan_policy = "omit")
    avg_skewness = np.nanmean(np.abs(skewness))
    avg_kurtosis = np.nanmean(np.abs(kurtosis))

    tags = []

    # Dimensionality
    if n_features <= 10:
        tags.append("low_dimensional")
    elif n_features <= 100:
        tags.append("medium_dimensional")
    else:
        tags.append("high_dimensional")

    # Sample Size
    if n_samples <= 500:
        tags.append("small_sample")
    elif n_samples <= 5000:
        tags.append("medium_sample")
    else:
        tags.append("large_sample")

    # Imbalance
    if anomaly_ratio < 0.05:
        tags.append("highly_imbalanced")
    elif anomaly_ratio < 0.20:
        tags.append("imbalanced")
    else:
        tags.append("balanced")

    # Noise Estimation
    if avg_skewness > 2.5 or avg_kurtosis > 10:
        tags.append("noisy")
    else:
        tags.append("low_noise")

    # Missing Data
    if missing_ratio > 0.1:
        tags.append("missing_data")

    # Structure assumption (tabular)
    tags.append("structured_data")

    # Build summary table
    metrics_df = pd.DataFrame({
        "Metric": [
            "n_samples", "n_features", "anomaly_ratio",
            "missing_value_ratio", "avg_skewness", "avg_kurtosis"
        ],
        "Value": [
            n_samples, n_features, round(anomaly_ratio, 6),
            round(missing_ratio, 6), round(avg_skewness, 4),
            round(avg_kurtosis, 4)
        ]
    })

    print("Dataset Profiling Summary")
    display(metrics_df)

    print("\n Assigned Tags:", tags)
    return tags, metrics_df

In [10]:
assigned_tags, metrics_df = profile_dataset_and_assign_tags(X_train, y_train)

all_dataset_results = []

all_dataset_results.append({
    "name": "CICIDS2017.csv",
    "tags": assigned_tags,
    "metrics": metrics_df.to_dict(orient = "records")
})

Dataset Profiling Summary


Unnamed: 0,Metric,Value
0,n_samples,1981520.0
1,n_features,78.0
2,anomaly_ratio,0.196996
3,missing_value_ratio,2.6e-05
4,avg_skewness,107.487
5,avg_kurtosis,81598.67



 Assigned Tags: ['medium_dimensional', 'large_sample', 'imbalanced', 'noisy', 'structured_data']
