In [24]:
import pandas as pd

In [25]:
import pandas as pd

# Feature functions
def count_packet(df):
    """Return the number of rows in ``df`` (one row per packet)."""
    n_packets = len(df.index)
    return n_packets

def total_length(df):
    """Return the sum of the ``Length`` column over every row of ``df``."""
    lengths = df["Length"]
    return lengths.sum()

def session_duration(df):
    """Return the spread of the ``Time`` column: largest value minus smallest."""
    timestamps = df["Time"]
    latest = timestamps.max()
    earliest = timestamps.min()
    return latest - earliest

def packets_per_second(df):
    """Return rows per unit of session duration.

    Falls back to 0 when the duration is not strictly positive (for example
    a single-row frame), so the division never sees a zero denominator.
    """
    elapsed = session_duration(df)
    if elapsed > 0:
        return len(df) / elapsed
    return 0

def bytes_per_second(df):
    """Return the summed ``Length`` per unit of session duration.

    Falls back to 0 when the duration is not strictly positive, so the
    division never sees a zero denominator.
    """
    elapsed = session_duration(df)
    if elapsed > 0:
        return df["Length"].sum() / elapsed
    return 0

def compute_intervals(df):
    """Return the gaps between consecutive ``Time`` values.

    The rows are first ordered by ``Time`` and re-indexed from 0; the input
    frame itself is not modified. The leading NaN produced by ``diff()`` is
    removed, so a frame with ``n`` valid timestamps yields ``n - 1`` gaps.
    """
    ordered = df.sort_values("Time").reset_index(drop=True)
    gaps = ordered["Time"].diff()
    # The first row has no predecessor, so its difference is NaN.
    return gaps.dropna()

def average_packet_interval(df):
    """Return the mean gap between consecutive ``Time`` values."""
    return compute_intervals(df).mean()

def max_packet_interval(df):
    """Return the largest gap between consecutive ``Time`` values."""
    return compute_intervals(df).max()

def min_packet_interval(df):
    """Return the smallest gap between consecutive ``Time`` values."""
    return compute_intervals(df).min()

def std_packet_interval(df):
    """Return the standard deviation of the gaps between consecutive ``Time`` values."""
    return compute_intervals(df).std()

# Normalized time features (per session z-score normalization)
def zscore_normalized_interval_stats(df):
    """Return ``(mean, max, min)`` of the z-scored inter-packet gaps.

    The gaps come from ``compute_intervals`` and are standardised with their
    own mean and standard deviation, i.e. per call rather than globally.
    When there are fewer than two gaps, or the gaps have zero standard
    deviation, ``(0, 0, 0)`` is returned instead of dividing by zero.

    Note: the mean of a z-scored series is zero by construction, so the
    first element differs from 0 only by floating-point rounding.
    """
    gaps = compute_intervals(df)

    # Guard first: with fewer than two gaps there is nothing to standardise.
    too_few = len(gaps) < 2
    if too_few or gaps.std() == 0:
        return 0, 0, 0

    centered = gaps - gaps.mean()
    z = centered / gaps.std()
    return z.mean(), z.max(), z.min()

def average_packet_length(df):
    """Return the mean of the ``Length`` column."""
    lengths = df["Length"]
    return lengths.mean()

def max_packet_length(df):
    """Return the largest value in the ``Length`` column."""
    lengths = df["Length"]
    return lengths.max()

def min_packet_length(df):
    """Return the smallest value in the ``Length`` column."""
    lengths = df["Length"]
    return lengths.min()

def most_common_packet_length(df):
    """Return the most frequent value in the ``Length`` column.

    Returns:
        The first entry of ``Series.mode()``; when several values tie for
        the highest count this is the smallest of them, because ``mode()``
        returns its results sorted. ``None`` when the column has no values.
    """
    # Compute the mode once; the previous version evaluated it twice per call.
    modes = df["Length"].mode()
    if modes.empty:
        return None
    return modes.iloc[0]

# Feature collection
def get_results(df, label="ChatGPT"):
    """Collect every per-capture feature into one flat dict.

    Args:
        df: Frame with at least the ``Time`` and ``Length`` columns read by
            the feature functions.
        label: Value stored under the ``"Label"`` key.

    Returns:
        A dict mapping feature name to value. Keys are inserted in a fixed
        order, which becomes the column order when the dict is turned into
        a DataFrame row.
    """
    z_mean, z_max, z_min = zscore_normalized_interval_stats(df)

    features = {}
    features["Packet Count"] = count_packet(df)
    features["Pkts_per_sec"] = packets_per_second(df)
    features["Total Length"] = total_length(df)
    features["Bytes_per_sec"] = bytes_per_second(df)
    # Raw interval statistics are currently disabled; only their z-scored
    # counterparts below are emitted.
    # features["Avg Interval (s)"] = average_packet_interval(df)
    # features["Max Interval (s)"] = max_packet_interval(df)
    # features["Min Interval (s)"] = min_packet_interval(df)
    # features["Interval Std (s)"] = std_packet_interval(df)
    features["Norm Avg Interval"] = z_mean
    features["Norm Max Interval"] = z_max
    features["Norm Min Interval"] = z_min
    features["Avg Length (bytes)"] = average_packet_length(df)
    features["Max Length (bytes)"] = max_packet_length(df)
    features["Min Length (bytes)"] = min_packet_length(df)
    features["Most Common Length (bytes)"] = most_common_packet_length(df)
    features["Label"] = label
    return features

# Batch processing
def process_files(file_prefix="chatGPTdata", file_range=range(1, 31),
                  label="ChatGPT", data_dir="data"):
    """Build one feature row per capture file and return them as a DataFrame.

    For every ``i`` in ``file_range`` the file
    ``{data_dir}/{file_prefix}{i}.csv`` is read and passed to ``get_results``.

    Args:
        file_prefix: File name stem placed before the numeric suffix.
        file_range: Iterable of numeric suffixes to load.
        label: Class label written to every row. Defaults to ``"ChatGPT"``,
            which is what every row received before this parameter existed.
        data_dir: Directory containing the capture CSVs.

    Returns:
        A DataFrame with one row per file, in ``file_range`` order and with a
        fresh 0..n-1 index. Empty when ``file_range`` is empty.

    Raises:
        Whatever ``pd.read_csv`` raises for a missing or unreadable file.
    """
    # Collect plain dicts and build the frame once. Concatenating inside the
    # loop re-copies all earlier rows on every iteration and starts from an
    # empty frame, which pandas advises against.
    records = []
    for i in file_range:
        capture = pd.read_csv(f"{data_dir}/{file_prefix}{i}.csv")
        records.append(get_results(capture, label=label))
    return pd.DataFrame(records)

# Run

# Build the feature table with process_files' defaults: files
# data/chatGPTdata1.csv through data/chatGPTdata30.csv.
summary_df = process_files()
# Print the full table to the cell output.
print(summary_df)
# Persist the table; index=False keeps the row index out of the CSV.
summary_df.to_csv("chatGPT_summary.csv", index=False)

    

    Packet Count  Pkts_per_sec  Total Length  Bytes_per_sec  \
0           1157     26.156207        905918   20480.015901   
1           1369     26.597724       1009091   19605.203517   
2           2057     14.180938       1439098    9921.127786   
3            659     14.258249        421478    9119.177953   
4           1582     11.849449        847846    6350.510685   
5            412     15.297863        253578    9415.537363   
6           1502     43.415981       1157428   33455.973363   
7           1587     22.111717       1087830   15156.766637   
8           1982     58.136668        973849   28565.255374   
9            512     12.064895        304259    7169.634375   
10           927     50.267123        762014   41320.659945   
11          3537     77.807429       1481954   32600.234975   
12          1104     29.953383        858374   23289.134820   
13          1313     30.366261        953966   22062.742431   
14          1042     54.812982        832833   43810.03

In [22]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Column whose dtype is forced to float before the data is split and saved.
MIN_INTERVAL_COL = 'Min Interval (s)'

# Load the per-site summary CSVs
csv1 = pd.read_csv('chatGPT_summary.csv')
csv2 = pd.read_csv('linkedin_summary.csv')
csv3 = pd.read_csv('reddit_summary.csv')
csv4 = pd.read_csv('wikipedia_summary_updated.csv')

# Ensure data types are consistent
csv3[MIN_INTERVAL_COL] = csv3[MIN_INTERVAL_COL].astype(float)

# Combine and shuffle (fixed seed so the row order is reproducible)
combined_df = pd.concat([csv1, csv2, csv3, csv4], ignore_index=True)
combined_df = combined_df.sample(frac=1, random_state=123).reset_index(drop=True)

# Coerce the dtype once on the combined frame, before splitting. Assigning the
# column on the frames returned by train_test_split instead would write to
# derived selections (SettingWithCopyWarning) and rebind the global name `df`.
combined_df[MIN_INTERVAL_COL] = combined_df[MIN_INTERVAL_COL].astype(float)

# Stage 1: split off 60% training
train_df, temp_df = train_test_split(combined_df, test_size=0.4, random_state=42)

# Stage 2: split remaining 40% into 50/50 for validation and test (20% each)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Sanity check: the three splits must partition the combined data exactly.
assert len(train_df) + len(val_df) + len(test_df) == len(combined_df)

# Save to CSV
train_df.to_csv('train.csv', index=False)
val_df.to_csv('validation.csv', index=False)
test_df.to_csv('test.csv', index=False)

# Confirm output shapes
print(f"Train shape: {train_df.shape}")
print(f"Validation shape: {val_df.shape}")
print(f"Test shape: {test_df.shape}")
print(train_df)


Train shape: (57, 13)
Validation shape: (19, 13)
Test shape: (19, 13)
    Packet Count  Total Length  Norm Avg Interval  Norm Max Interval  \
34          1104        858374       8.488979e-17          29.308693   
7            859        604953                NaN                NaN   
53           957        812437                NaN                NaN   
27           512        304259      -1.906498e-17          17.156323   
19          3844       2591246                NaN                NaN   
77          2222       1434237                NaN                NaN   
25          1152        617800       9.766297e-18          26.764308   
69          1587       1087830       1.423830e-16          23.994177   
13          1058        698412                NaN                NaN   
24          1825       1616505                NaN                NaN   
3           2384       1226273                NaN                NaN   
17          2277       1823145                NaN                N