In [8]:
import pandas as pd

In [12]:
import pandas as pd

# Feature functions
def count_packet(df):
    """Return the number of rows in ``df`` (one row per packet)."""
    n_rows = df.shape[0]
    return n_rows

def total_length(df):
    """Return the sum of the ``Length`` column of ``df``."""
    lengths = df["Length"]
    return lengths.sum()

def compute_intervals(df):
    """Return the gaps between consecutive ``Time`` values of ``df``.

    The ``Time`` column is sorted ascending and re-indexed from 0, then
    differenced. NaN entries of the difference (including the first one,
    which has no predecessor) are dropped.

    Returns:
        pandas.Series of successive differences, named ``Time``.
    """
    ordered_times = df["Time"].sort_values().reset_index(drop=True)
    gaps = ordered_times.diff()
    # The first difference is always NaN because it has nothing to subtract.
    return gaps.dropna()

def average_packet_interval(df):
    """Return the mean of the gaps produced by ``compute_intervals(df)``."""
    return compute_intervals(df).mean()

def max_packet_interval(df):
    """Return the largest gap produced by ``compute_intervals(df)``."""
    return compute_intervals(df).max()

def min_packet_interval(df):
    """Return the smallest gap produced by ``compute_intervals(df)``."""
    return compute_intervals(df).min()

def std_packet_interval(df):
    """Return the standard deviation (pandas ``Series.std``) of the gaps
    produced by ``compute_intervals(df)``."""
    return compute_intervals(df).std()

# Normalized time features (per session z-score normalization)
def zscore_normalized_interval_stats(df):
    """Return ``(mean, max, min)`` of the z-scored packet intervals of ``df``.

    The intervals from ``compute_intervals(df)`` are standardized with their
    own mean and standard deviation. When fewer than two intervals exist, or
    their standard deviation is exactly zero, ``(0, 0, 0)`` is returned
    instead so that no division by zero takes place.
    """
    intervals = compute_intervals(df)

    # Too few intervals to standardize.
    if len(intervals) < 2:
        return 0, 0, 0

    spread = intervals.std()
    # Identical intervals: dividing by the spread would divide by zero.
    if spread == 0:
        return 0, 0, 0

    centered = intervals - intervals.mean()
    z = centered / spread
    return z.mean(), z.max(), z.min()

def average_packet_length(df):
    """Return the mean of the ``Length`` column of ``df``."""
    lengths = df["Length"]
    return lengths.mean()

def max_packet_length(df):
    """Return the largest value in the ``Length`` column of ``df``."""
    lengths = df["Length"]
    return lengths.max()

def min_packet_length(df):
    """Return the smallest value in the ``Length`` column of ``df``."""
    lengths = df["Length"]
    return lengths.min()

def most_common_packet_length(df):
    """Return the first mode of the ``Length`` column, or ``None`` if there is none.

    ``Series.mode()`` may return several values when there is a tie; only the
    first entry of that result is returned.
    """
    modes = df["Length"].mode()
    if modes.empty:
        return None
    return modes.iloc[0]

# Feature collection
def get_results(df, label="ChatGPT"):
    """Collect every per-capture feature of ``df`` into one dictionary.

    Args:
        df: packet table; the feature helpers read its ``Time`` and
            ``Length`` columns.
        label: value stored under the ``"Label"`` key.

    Returns:
        dict mapping feature name to value, in insertion order: packet
        count, total length, interval statistics, z-scored interval
        statistics, length statistics and finally the label.
    """
    z_mean, z_max, z_min = zscore_normalized_interval_stats(df)

    features = {}

    # Volume
    features["Packet Count"] = count_packet(df)
    features["Total Length"] = total_length(df)

    # Raw inter-packet timing
    features["Avg Interval (s)"] = average_packet_interval(df)
    features["Max Interval (s)"] = max_packet_interval(df)
    features["Min Interval (s)"] = min_packet_interval(df)
    features["Interval Std (s)"] = std_packet_interval(df)

    # Z-scored inter-packet timing
    features["Norm Avg Interval"] = z_mean
    features["Norm Max Interval"] = z_max
    features["Norm Min Interval"] = z_min

    # Packet sizes
    features["Avg Length (bytes)"] = average_packet_length(df)
    features["Max Length (bytes)"] = max_packet_length(df)
    features["Min Length (bytes)"] = min_packet_length(df)
    features["Most Common Length (bytes)"] = most_common_packet_length(df)

    features["Label"] = label
    return features

# Batch processing
def process_files(file_prefix="chatGPTdata", file_range=range(1, 31),
                  label="ChatGPT", data_dir="data"):
    """Build a feature table with one row per capture file.

    For every ``i`` in ``file_range`` the file
    ``{data_dir}/{file_prefix}{i}.csv`` is read and summarized with
    ``get_results``.

    Args:
        file_prefix: file name stem placed before the numeric suffix.
        file_range: iterable of numeric suffixes to process.
        label: class label written to the ``"Label"`` column of every row.
            Pass the matching label when ``file_prefix`` refers to another
            traffic source; otherwise every row is tagged "ChatGPT".
        data_dir: directory that holds the CSV files.

    Returns:
        pandas.DataFrame with one row per processed file (empty when
        ``file_range`` is empty).

    Raises:
        FileNotFoundError: propagated from ``pd.read_csv`` when a file in
            the range does not exist.
    """
    # Collect plain dicts and build the frame once: growing a DataFrame with
    # pd.concat inside the loop re-copies all previous rows on each iteration.
    records = [
        get_results(pd.read_csv(f"{data_dir}/{file_prefix}{i}.csv"), label=label)
        for i in file_range
    ]
    return pd.DataFrame(records)

# Run

# Build the feature table using the default arguments of process_files().
summary_df = process_files()
# Dump the whole table to the cell output for a quick visual check.
print(summary_df)
# index=False keeps the DataFrame row index out of the written CSV.
summary_df.to_csv("chatGPT_summary.csv", index=False)

    

    Packet Count  Total Length  Avg Interval (s)  Max Interval (s)  \
0           1157        905918          0.038265          4.988440   
1           1369       1009091          0.037625          9.353463   
2           2057       1439098          0.070551         15.034570   
3            659        421478          0.070241          5.140540   
4           1582        847846          0.084445          5.099727   
5            412        253578          0.065528          2.516216   
6           1502       1157428          0.023048          3.213679   
7           1587       1087830          0.045253          6.565021   
8           1982        973849          0.017210          1.455924   
9            512        304259          0.083047         11.884945   
10           927        762014          0.019915          4.614494   
11          3537       1481954          0.012856          2.824202   
12          1104        858374          0.033415         10.239505   
13          1313    

In [13]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the four per-source summary CSVs
csv1 = pd.read_csv('chatGPT_summary.csv')
csv2 = pd.read_csv('linkedin_summary.csv')
csv3 = pd.read_csv('reddit_summary.csv')
csv4 = pd.read_csv('wikipedia_summary_updated.csv')

# Ensure data types are consistent
csv3['Min Interval (s)'] = csv3['Min Interval (s)'].astype(float)

# Combine and shuffle (fixed seed so the row order is reproducible)
combined_df = pd.concat([csv1, csv2, csv3, csv4], ignore_index=True)
combined_df = combined_df.sample(frac=1, random_state=123).reset_index(drop=True)

# Stage 1: split off 60% training
train_df, temp_df = train_test_split(combined_df, test_size=0.4, random_state=42)

# Stage 2: split remaining 40% into 50/50 for validation and test (20% each)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Optional: ensure correct dtypes again.
# .assign builds a new frame for each split instead of writing a column into
# the objects returned by train_test_split, which can trigger pandas'
# SettingWithCopyWarning.
train_df, val_df, test_df = (
    split.assign(**{'Min Interval (s)': split['Min Interval (s)'].astype(float)})
    for split in (train_df, val_df, test_df)
)

# Sanity check: the three splits together must cover every combined row.
assert len(train_df) + len(val_df) + len(test_df) == len(combined_df)

# Save to CSV
train_df.to_csv('train.csv', index=False)
val_df.to_csv('validation.csv', index=False)
test_df.to_csv('test.csv', index=False)

# Confirm output shapes
print(f"Train shape: {train_df.shape}")
print(f"Validation shape: {val_df.shape}")
print(f"Test shape: {test_df.shape}")


Train shape: (57, 14)
Validation shape: (19, 14)
Test shape: (19, 14)
