In [2]:
import pandas as pd


In [None]:
# Extract Features
def count_packet(df):
    """Return the number of rows in ``df``.

    Each row is presumably one captured packet -- confirm against the
    capture export format.
    """
    n_rows = len(df)
    return n_rows

def total_length(df):
    """Return the sum of the ``Length`` column of ``df``."""
    lengths = df["Length"]
    return lengths.sum()

def average_packet_interval(df):
    """Return the mean difference between consecutive ``Time`` values.

    Rows are ordered by ``Time`` first, so the input does not need to be
    pre-sorted. With fewer than two rows there is no gap and the result is NaN.
    """
    ordered = df.sort_values(by="Time")
    gaps = ordered["Time"].diff()  # first entry is NaN and is skipped by mean()
    return gaps.mean()

def max_packet_interval(df):
    """Return the largest difference between consecutive ``Time`` values.

    Rows are ordered by ``Time`` first, so the input does not need to be
    pre-sorted. With fewer than two rows there is no gap and the result is NaN.
    """
    ordered = df.sort_values(by="Time")
    gaps = ordered["Time"].diff()  # first entry is NaN and is skipped by max()
    return gaps.max()

def min_packet_interval(df):
    """Return the smallest difference between consecutive ``Time`` values.

    Rows are ordered by ``Time`` first, so the input does not need to be
    pre-sorted. With fewer than two rows there is no gap and the result is NaN.
    """
    ordered = df.sort_values(by="Time")
    gaps = ordered["Time"].diff()  # first entry is NaN and is skipped by min()
    return gaps.min()

def average_packet_length(df):
    """Return the arithmetic mean of the ``Length`` column of ``df``."""
    lengths = df["Length"]
    return lengths.mean()

def max_packet_length(df):
    """Return the largest value in the ``Length`` column of ``df``."""
    lengths = df["Length"]
    return lengths.max()

def min_packet_length(df):
    """Return the smallest value in the ``Length`` column of ``df``."""
    lengths = df["Length"]
    return lengths.min()

def most_common_packet_length(df):
    """Return the most frequent value in the ``Length`` column of ``df``.

    When several values tie for most frequent, the smallest one is returned
    (``Series.mode`` yields the modes in sorted order). Returns ``None`` when
    the column has no values to take a mode of.
    """
    # Compute the mode once instead of once for the emptiness check and once
    # more for the lookup.
    modes = df["Length"].mode()
    if modes.empty:
        return None
    return modes.iloc[0]


def get_results(df, label="Reddit"):  # pass the actual website name as the label
    """Build one summary record of features for ``df``.

    Args:
        df: DataFrame with ``Time`` and ``Length`` columns (the columns read
            by the feature helpers called below).
        label: Value stored under the ``"Label"`` key; defaults to "Reddit".

    Returns:
        dict mapping each feature column name to its computed value, with
        ``"Label"`` as the final key.
    """
    # (output column, extractor) pairs; the order fixes the column order of
    # any DataFrame built from the returned record.
    extractors = (
        ("Packet Count", count_packet),
        ("Total Length", total_length),
        ("Avg Interval (s)", average_packet_interval),
        ("Max Interval (s)", max_packet_interval),
        ("Min Interval (s)", min_packet_interval),
        ("Avg Length (bytes)", average_packet_length),
        ("Max Length (bytes)", max_packet_length),
        ("Min Length (bytes)", min_packet_length),
        ("Most Common Length (bytes)", most_common_packet_length),
    )
    record = {column: extract(df) for column, extract in extractors}
    record["Label"] = label
    return record

# Build Files 
def process_files(file_prefix="reddit_csv", file_range=range(1, 16), label="Reddit"):
    """Summarise a numbered series of CSV files, one output row per file.

    For every ``i`` in ``file_range`` the file ``f"{file_prefix}{i}.csv"`` is
    read and reduced to a single feature record via ``get_results``.

    Args:
        file_prefix: Path prefix placed before the file number.
        file_range: Iterable of file numbers to process.
        label: Label passed to ``get_results`` for every file. Defaults to
            "Reddit", matching the previous hard-wired behaviour; set it when
            ``file_prefix`` points at another site's captures.

    Returns:
        DataFrame with one row per processed file, in ``file_range`` order.
        It is empty when ``file_range`` yields nothing.

    Raises:
        Whatever ``pd.read_csv`` raises for a missing or unreadable file.
    """
    # Collect plain records and build the frame once; concatenating onto a
    # growing DataFrame inside the loop copies all earlier rows every time.
    records = []
    for i in file_range:
        df = pd.read_csv(f"{file_prefix}{i}.csv")
        records.append(get_results(df, label=label))
    return pd.DataFrame(records)

# Run code
if __name__ == "__main__":
    # Summarise the capture files using process_files' default prefix and
    # range, show the table, then save it without the index column.
    summary_df = process_files()
    print(summary_df)
    summary_df.to_csv(path_or_buf="reddit_summary.csv", index=False)
    

    Packet Count  Total Length  Avg Interval (s)  Max Interval (s)  \
0             16          2194          3.829345         44.641063   
1           2603       1507206          0.012092          1.270366   
2              8           958          0.693080          4.829743   
3           2384       1226273          0.009591          1.390694   
4            729        225431          0.068412          1.699636   
5           2222       1434237          0.006531          0.902013   
6           3724       3425019          0.013319          1.273344   
7           2858       1347411          0.016167          1.992624   
8           2551       1706454          0.016709          1.013556   
9           8534       8053355          0.004077          1.261350   
10          1883        900887          0.014460          1.151370   
11          1173        748607          0.014721          1.535492   
12          1632        663671          0.024266          1.423649   
13          1058    

In [None]:
# Combine the per-site summary CSVs, then split them into train and test files.
from sklearn.model_selection import train_test_split
csv1 = pd.read_csv('chatGPTdataSummary.csv')
csv2 = pd.read_csv('linkedin_summary.csv')
csv3 = pd.read_csv('reddit_summary.csv')
csv4 = pd.read_csv('wikipedia_summary.csv')


combined_df = pd.concat([csv1, csv2, csv3, csv4], ignore_index=True)
# Shuffle the rows reproducibly and renumber the index.
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)
# 'Unnamed: 0' is the column pandas creates when a CSV was saved with its
# index. A summary written with index=False has no such column, so ignore its
# absence instead of raising KeyError.
combined_df = combined_df.drop(columns='Unnamed: 0', errors='ignore')


# Split into train and test sets (75% training set, 25% testing set)
train_df, test_df = train_test_split(combined_df, test_size=0.25, random_state=42)

train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)
print(train_df)

    Packet Count  Total Length  Avg Interval (s)  Max Interval (s)  \
56             8           958          0.693080          4.829743   
46          3724       3425019          0.013319          1.273344   
16           786        592867          0.592949         45.006885   
34          1313        953966          0.032956          2.598565   
42           991       1182976          0.078634         16.110115   
28          1883        900887          0.014460          1.151370   
7           3619       2730920          0.012766          3.875907   
61           332        352540          0.289238         82.263481   
40          1502       1157428          0.023048          3.213679   
50           401        411863          0.330857         45.003131   
45          4187       2861188          0.021853          5.687310   
19          1173        748607          0.014721          1.535492   
55           957        812437          0.566170         45.008921   
39          2174    