In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the traffic capture and discard the two byte-count columns in one step
df = (
    pd.read_csv("network_traffic_unified.csv")  # Replace with your actual filename
    .drop(columns=['orig_bytes', 'resp_bytes'])
)

# Recast every remaining column except 'duration' as a pandas categorical;
# 'duration' keeps the dtype it was read with.
df = df.astype({name: 'category' for name in df.columns if name != 'duration'})

# ----------------------------
# 🔹 Basic Info
# ----------------------------
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\n--- Value Counts for Categorical Features ---")
# Frequency table for each categorical column, most common level first
for col in df.select_dtypes('category').columns:
    print(f"\n{col}:\n{df[col].value_counts()}")

# ----------------------------
# 🔸 Count Plots for Categorical Features
# ----------------------------
# One figure per categorical column: a bar for each level, height = row count.
for col in df.select_dtypes('category').columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=df, x=col, ax=ax)
    ax.set_title(f"Count Plot - {col}")
    # Tilt the level labels so they do not overlap on the x axis
    plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()

# ----------------------------
# 🔸 Box/Violin Plots: Duration by Category
# ----------------------------
# For each categorical column, show the distribution of 'duration' per level,
# first as a box plot and then as a violin plot (each in its own figure).
for col in df.select_dtypes('category').columns:
    for draw, kind in ((sns.boxplot, "Box"), (sns.violinplot, "Violin")):
        fig, ax = plt.subplots(figsize=(8, 4))
        draw(data=df, x=col, y='duration', ax=ax)
        ax.set_title(f"{kind} Plot - Duration by {col}")
        # Tilt the level labels so they do not overlap on the x axis
        plt.xticks(rotation=45)
        fig.tight_layout()
        plt.show()



# ----------------------------
# 🔹 Grouped Duration Stats
# ----------------------------
# Summary statistics (count, mean, std, quartiles, min/max) of 'duration'
# for every level of each categorical column.
for col in df.select_dtypes('category').columns:
    print(f"\n--- Duration stats grouped by {col} ---")
    # `observed` is passed explicitly: grouping by a categorical key without it
    # raises a FutureWarning on recent pandas because the default is changing.
    # The categories here were derived from the column values themselves, so
    # every category is observed and the table is the same either way.
    print(df.groupby(col, observed=True)['duration'].describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3245180 entries, 0 to 3245179
Data columns (total 5 columns):
 #   Column      Dtype   
---  ------      -----   
 0   proto       category
 1   service     category
 2   duration    float64 
 3   conn_state  category
 4   label       category
dtypes: category(4), float64(1)
memory usage: 37.1 MB
None

--- Value Counts for Categorical Features ---

proto:
proto
2    3241477
4       2974
5        388
0        241
3         53
1         47
Name: count, dtype: int64

service:
service
1    3244181
0        999
Name: count, dtype: int64

conn_state:
conn_state
0    3245180
Name: count, dtype: int64

label:
label
1    3240656
0       4524
Name: count, dtype: int64
