In [24]:
import pandas as pd

In [2]:
import pandas as pd

# Feature functions
def count_packet(df):
    return len(df)

def total_length(df):
    return df["Length"].sum()

def session_duration(df):
    return df["Time"].max() - df["Time"].min()

def packets_per_second(df):
    dur = session_duration(df)
    return len(df) / dur if dur > 0 else 0

def bytes_per_second(df):
    dur = session_duration(df)
    return df["Length"].sum() / dur if dur > 0 else 0

def compute_intervals(df):
    df = df.sort_values("Time").reset_index(drop=True)
    intervals = df["Time"].diff().dropna()  # Drop the first NaN
    return intervals

def average_packet_interval(df):
    intervals = compute_intervals(df)
    return intervals.mean()

def max_packet_interval(df):
    intervals = compute_intervals(df)
    return intervals.max()

def min_packet_interval(df):
    intervals = compute_intervals(df)
    return intervals.min()

def std_packet_interval(df):
    intervals = compute_intervals(df)
    return intervals.std()

# Normalized time features (per session z-score normalization)
def zscore_normalized_interval_stats(df):
    intervals = compute_intervals(df)
    if len(intervals) < 2 or intervals.std() == 0:
        return 0, 0, 0  # Avoid divide-by-zero or underflow
    z = (intervals - intervals.mean()) / intervals.std()
    return z.mean(), z.max(), z.min()

def average_packet_length(df):
    return df["Length"].mean()

def max_packet_length(df):
    return df["Length"].max()

def min_packet_length(df):
    return df["Length"].min()

def most_common_packet_length(df):
    return df["Length"].mode().iloc[0] if not df["Length"].mode().empty else None

# Feature collection
def get_results(df, label="Reddit"):
    norm_avg, norm_max, norm_min = zscore_normalized_interval_stats(df)
    return {
        "Packet Count": count_packet(df),
        "Pkts_per_sec": packets_per_second(df),
        "Total Length": total_length(df),
        "Bytes_per_sec": bytes_per_second(df),
        # "Avg Interval (s)": average_packet_interval(df),
        # "Max Interval (s)": max_packet_interval(df),
        # "Min Interval (s)": min_packet_interval(df),
        # "Interval Std (s)": std_packet_interval(df),
        "Norm Avg Interval": norm_avg,
        "Norm Max Interval": norm_max,
        "Norm Min Interval": norm_min,
        "Avg Length (bytes)": average_packet_length(df),
        "Max Length (bytes)": max_packet_length(df),
        "Min Length (bytes)": min_packet_length(df),
        "Most Common Length (bytes)": most_common_packet_length(df),
        "Label": label
    }

# Batch processing
def process_files(file_prefix="redditData", file_range=range(1, 31)):
    total_df = pd.DataFrame()
    for i in file_range:
        df = pd.read_csv(f"data/{file_prefix}{i}.csv")
        results = get_results(df)
        total_df = pd.concat([total_df, pd.DataFrame([results])], ignore_index=True)
    return total_df

# Run

summary_df = process_files()
print(summary_df)
summary_df.to_csv("reddit_summary.csv", index=False)

    

    Packet Count  Pkts_per_sec  Total Length  Bytes_per_sec  \
0           1438     81.062951        831120   46851.905092   
1           4366    110.994752       4064459  103328.818038   
2           3146    181.102324       2132406  122753.872098   
3            142      3.095422         58731    1280.262136   
4           3769    129.395197       3627954  124552.884493   
5            761     38.677270        507542   25795.451929   
6           2645    116.231840       1685058   74048.163399   
7            572     36.622267        277808   17786.641106   
8           1274     39.917499        797796   24996.876941   
9           2996    110.767925       2064261   76319.729028   
10          1883     73.015913       1506475   58415.638450   
11           793     28.954919        454733   16603.728970   
12           793     28.954919        454733   16603.728970   
13          1564    173.436128       1229550  136348.076107   
14          1238     44.590676        694554   25016.66

In [3]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load CSVs
csv1 = pd.read_csv('chatGPT_summary.csv')
csv2 = pd.read_csv('linkedin_summary.csv')
csv3 = pd.read_csv('reddit_summary.csv')
csv4 = pd.read_csv('wikipedia_summary_updated.csv')

# Ensure data types are consistent
csv3['Min Interval (s)'] = csv3['Min Interval (s)'].astype(float)

# Combine and shuffle
combined_df = pd.concat([csv1, csv2, csv3, csv4], ignore_index=True)
combined_df = combined_df.sample(frac=1, random_state=123).reset_index(drop=True)

# Stage 1: split off 60% training
train_df, temp_df = train_test_split(combined_df, test_size=0.4, random_state=42)

# Stage 2: split remaining 40% into 50/50 for validation and test (20% each)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Optional: ensure correct dtypes again
for df in [train_df, val_df, test_df]:
    df['Min Interval (s)'] = df['Min Interval (s)'].astype(float)

# Save to CSV
train_df.to_csv('train.csv', index=False)
val_df.to_csv('validation.csv', index=False)
test_df.to_csv('test.csv', index=False)

# Confirm output shapes
print(f"Train shape: {train_df.shape}")
print(f"Validation shape: {val_df.shape}")
print(f"Test shape: {test_df.shape}")
print(train_df)


KeyError: 'Min Interval (s)'