In [23]:
import pandas as pd


In [25]:
# Extract Features
def count_packet(df):
    """Return the number of rows in ``df``.

    Each row is presumably one captured packet -- confirm against the CSV export.
    """
    return len(df.index)

def total_length(df):
    """Return the sum of the ``Length`` column of ``df``."""
    length_column = df["Length"]
    return length_column.sum()

def average_packet_interval(df):
    """Return the mean gap between consecutive ``Time`` values.

    The timestamps are sorted ascending first, so the row order of ``df``
    does not matter. With fewer than two rows there is no gap and the
    result is NaN.
    """
    ordered_times = df["Time"].sort_values()
    gaps = ordered_times.diff()
    return gaps.mean()

def max_packet_interval(df):
    """Return the largest gap between consecutive ``Time`` values.

    The timestamps are sorted ascending before the gaps are taken; with
    fewer than two rows the result is NaN.
    """
    ordered_times = df["Time"].sort_values()
    gaps = ordered_times.diff()
    return gaps.max()

def min_packet_interval(df):
    """Return the smallest gap between consecutive ``Time`` values.

    The timestamps are sorted ascending before the gaps are taken; with
    fewer than two rows the result is NaN.
    """
    ordered_times = df["Time"].sort_values()
    gaps = ordered_times.diff()
    return gaps.min()

def average_packet_length(df):
    """Return the arithmetic mean of the ``Length`` column of ``df``."""
    length_column = df["Length"]
    return length_column.mean()

def max_packet_length(df):
    """Return the largest value in the ``Length`` column of ``df``."""
    length_column = df["Length"]
    return length_column.max()

def min_packet_length(df):
    """Return the smallest value in the ``Length`` column of ``df``."""
    length_column = df["Length"]
    return length_column.min()

def most_common_packet_length(df):
    """Return the most frequent ``Length`` value, or ``None`` if there is none.

    When several values tie, the first entry of ``Series.mode()`` is returned.
    """
    modes = df["Length"].mode()
    if modes.empty:
        return None
    return modes.iloc[0]


def get_results(df, label="ChatGPT"):
    """Collect every extracted feature of ``df`` into one dictionary.

    Args:
        df: DataFrame passed unchanged to each feature function.
        label: Value stored under the ``"Label"`` key; set it to the actual
            website name the data belongs to.

    Returns:
        dict mapping feature names to values, with ``"Label"`` as the last key.
    """
    # Ordered (key, extractor) pairs; the order fixes the key order of the result.
    extractors = (
        ("Packet Count", count_packet),
        ("Total Length", total_length),
        ("Avg Interval (s)", average_packet_interval),
        ("Max Interval (s)", max_packet_interval),
        ("Min Interval (s)", min_packet_interval),
        ("Avg Length (bytes)", average_packet_length),
        ("Max Length (bytes)", max_packet_length),
        ("Min Length (bytes)", min_packet_length),
        ("Most Common Length (bytes)", most_common_packet_length),
    )
    features = {name: extract(df) for name, extract in extractors}
    features["Label"] = label
    return features

# Build Files 
def process_files(file_prefix="chatGPTdata", file_range=range(1, 21), label="ChatGPT"):
    """Build a one-row-per-file feature table from numbered CSV files.

    Reads ``f"{file_prefix}{i}.csv"`` for every ``i`` in ``file_range`` and
    runs ``get_results`` on each file.

    Args:
        file_prefix: Path prefix shared by the input files.
        file_range: Iterable of the numeric suffixes to read.
        label: Label forwarded to ``get_results`` for every file, so other
            sites can be processed without editing the function.

    Returns:
        DataFrame with one row of features per input file (empty when
        ``file_range`` is empty).

    Raises:
        FileNotFoundError: propagated from ``pd.read_csv`` when a file is missing.
    """
    # Collect plain records and build the frame once; concatenating onto a
    # growing DataFrame inside the loop re-copies all earlier rows each time.
    records = [
        get_results(pd.read_csv(f"{file_prefix}{i}.csv"), label)
        for i in file_range
    ]
    return pd.DataFrame(records)

# Run code
# Entry point: build the feature table and persist it as a CSV file.
if __name__ == "__main__":
    # Called with no arguments, so process_files uses its default prefix and range.
    summary_df = process_files()
    print(summary_df)
    # index=False keeps the row index out of the written file.
    summary_df.to_csv("chatGPT_summary.csv", index=False)
    

    Packet Count  Total Length  Avg Interval (s)  Max Interval (s)  \
0           1157        905918          0.038265          4.988440   
1           1369       1009091          0.037625          9.353463   
2           2057       1439098          0.070551         15.034570   
3            659        421478          0.070241          5.140540   
4           1582        847846          0.084445          5.099727   
5            412        253578          0.065528          2.516216   
6           1502       1157428          0.023048          3.213679   
7           1587       1087830          0.045253          6.565021   
8           1982        973849          0.017210          1.455924   
9            512        304259          0.083047         11.884945   
10           927        762014          0.019915          4.614494   
11          3537       1481954          0.012856          2.824202   
12          1104        858374          0.033415         10.239505   
13          1313    

In [28]:
# Combine the per-site summary CSVs, then split them into train and test files.
from sklearn.model_selection import train_test_split
csv1 = pd.read_csv('chatGPT_summary.csv')
csv2 = pd.read_csv('linkedin_summary.csv')
csv3 = pd.read_csv('reddit_summary.csv')
csv4 = pd.read_csv('wikipedia_summary.csv')

# Cast the reddit 'Min Interval (s)' column to float and write the file back.
# NOTE(review): this overwrites an input file in place -- confirm that is intended.
csv3['Min Interval (s)']=csv3['Min Interval (s)'].astype(float)
csv3.to_csv('reddit_summary.csv', index=False)
print(csv3)
combined_df = pd.concat([csv1, csv2, csv3, csv4], ignore_index=True)
# Shuffle all rows with a fixed seed so the row order is reproducible.
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)


# Split into train and test sets (75% training set, 25% testing set)
train_df, test_df = train_test_split(combined_df, test_size=0.25, random_state=42)
# DataFrame.astype returns new frames. Assigning a column on the split outputs
# instead writes into frames derived from combined_df, which can raise
# SettingWithCopyWarning.
train_df = train_df.astype({'Min Interval (s)': float})
test_df = test_df.astype({'Min Interval (s)': float})
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)
print(train_df)

    Packet Count  Total Length  Avg Interval (s)  Max Interval (s)  \
0             16          2194          3.829345         44.641063   
1           2603       1507206          0.012092          1.270366   
2              8           958          0.693080          4.829743   
3           2384       1226273          0.009591          1.390694   
4            729        225431          0.068412          1.699636   
5           2222       1434237          0.006531          0.902013   
6           3724       3425019          0.013319          1.273344   
7           2858       1347411          0.016167          1.992624   
8           2551       1706454          0.016709          1.013556   
9           8534       8053355          0.004077          1.261350   
10          1883        900887          0.014460          1.151370   
11          1173        748607          0.014721          1.535492   
12          1632        663671          0.024266          1.423649   
13          1058    