In [1]:
import pandas as pd
import os

# Read the five cleaned activity logs in one pass; the targets on the left
# line up positionally with the paths on the right.
# NOTE(review): the logon file is named '*_clean.csv' while the others are
# '*_cleaned.csv' — confirm this matches the files on disk.
logon_df, device_df, email_df, file_df, http_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'logon_clean.csv',
        'device_cleaned.csv',
        'email_cleaned.csv',
        'file_cleaned.csv',
        'http_cleaned.csv',
    )
)

In [2]:
from collections import Counter

# Combine user frequencies across all datasets
user_counts = Counter()

for df in [logon_df, device_df, email_df, file_df, http_df]:
    # Frames without a 'user' column contribute nothing to the tally.
    if 'user' in df.columns:
        user_counts.update(df['user'])

# Get top 25 active users (most_common yields each user once, most active first)
top_users = [user for user, _ in user_counts.most_common(25)]

# List malicious users
# NOTE(review): this list is not referenced by any of the visible cells —
# confirm whether test_users was meant to include it.
malicious_users = ['RKD0604', 'TAP0551', 'WDD0366', 'MCF0600', 'MYD0978', 'PPF0435', 'RAB0589', 'RGG0064', 'KLH0596', 'KPC0073', 'LJR0523', 'LQC0479', 'MAR0955', 'MAS0025', 'FMG0527', 'FTM0406', 'GHL0460', 'HJB0742', 'JMB0308', 'JRG0207', 'BTL0226', 'CAH0936', 'DCH0843', 'EHB0824', 'EHD0584']

# Remove duplicates while keeping the ranking order. A plain set() would
# reorder the users by string hash, which changes between kernel sessions and
# makes every downstream sample file come out in a different row order.
test_users = list(dict.fromkeys(top_users))

print(test_users)

['JCR0172', 'MDM0625', 'BTW0005', 'DLM0051', 'HSB0196', 'AJR0319', 'TVS0050', 'MSS0001', 'TVS0006', 'QRM0739', 'HCS0003', 'GKO0078', 'OBH0499', 'HTH0007', 'KWC0004', 'THR0873', 'KBP0008', 'LBF0214', 'JDB0169', 'RZC0746', 'HRB0351', 'KKW0879', 'ATE0869', 'HPH0075', 'NAF0326']


In [3]:
def sample_per_user(df, user_col, activity_code, n=3, users=None, random_state=42):
    """Draw up to ``n`` rows per user from ``df`` and tag them with an activity code.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows. The caller's frame is not modified.
    user_col : str
        Name of the column holding the user identifier.
    activity_code : scalar
        Value written to an ``activity_code`` column on every returned row.
    n : int, default 3
        Maximum number of rows per user. A user with fewer than ``n`` rows
        contributes all of its rows, unsampled.
    users : iterable, optional
        Users to sample for, in output order. When omitted, the notebook-level
        ``test_users`` list is used.
    random_state : int, default 42
        Seed forwarded to ``DataFrame.sample`` for each user's draw.

    Returns
    -------
    pandas.DataFrame
        The per-user samples concatenated in ``users`` order, keeping the
        original index labels. When ``users`` is empty, an empty frame with
        the columns of ``df`` plus ``activity_code`` is returned.
    """
    if users is None:
        # Fall back to the global selection built earlier in the notebook.
        users = test_users

    # assign() returns a new frame, so filtered slices passed in by the caller
    # are never written to.
    tagged = df.assign(activity_code=activity_code)

    samples = []
    for user in users:
        user_df = tagged[tagged[user_col] == user]
        if len(user_df) >= n:
            samples.append(user_df.sample(n=n, random_state=random_state))
        else:
            samples.append(user_df)

    if not samples:
        # pd.concat raises ValueError on an empty list; return an empty frame
        # with the expected columns instead.
        return tagged.iloc[0:0]
    return pd.concat(samples)

# Step 3: Handle logon and logoff from the same file
# NOTE(review): logon activities are matched in lowercase while the device
# activities below are capitalised — confirm against the cleaned data.
logon_data = logon_df[logon_df['activity'] == 'logon']
logoff_data = logon_df[logon_df['activity'] == 'logoff']

logon_sample = sample_per_user(logon_data, 'user', activity_code=1)
logoff_sample = sample_per_user(logoff_data, 'user', activity_code=2)

# Step 4: Device connect/disconnect
device_connect_sample = sample_per_user(device_df[device_df['activity'] == 'Connect'], 'user', activity_code=3)
device_disconnect_sample = sample_per_user(device_df[device_df['activity'] == 'Disconnect'], 'user', activity_code=4)

email_sample = sample_per_user(email_df, 'user', activity_code=5)

file_sample = sample_per_user(file_df, 'user', activity_code=6)

http_sample = sample_per_user(http_df, 'user', activity_code=7)

# Save to CSV
# The combine step reads "sampledata/*_sample.csv", so the files are written
# into that directory (created on first run) rather than the working
# directory; otherwise a fresh Restart & Run All finds nothing to combine.
SAMPLE_DIR = "sampledata"
os.makedirs(SAMPLE_DIR, exist_ok=True)

samples_by_file = {
    "logon_sample.csv": logon_sample,
    "logoff_sample.csv": logoff_sample,
    "connect_sample.csv": device_connect_sample,
    "disconnect_sample.csv": device_disconnect_sample,
    "file_sample.csv": file_sample,
    "email_sample.csv": email_sample,
    "http_sample.csv": http_sample,
}
for filename, sample in samples_by_file.items():
    sample.to_csv(os.path.join(SAMPLE_DIR, filename), index=False)

print("[INFO] All sample files saved successfully.")

[INFO] All sample files saved successfully.


In [13]:
import glob

# Load all sample CSVs
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the row order of the combined dataset stable across runs and machines.
sample_files = sorted(glob.glob("sampledata/*_sample.csv"))

# Fail with a clear message instead of pd.concat's "No objects to concatenate".
if not sample_files:
    raise FileNotFoundError("No sample CSVs matched 'sampledata/*_sample.csv'")

# Load and concatenate all CSVs
dfs = [pd.read_csv(file) for file in sample_files]
all_data = pd.concat(dfs, ignore_index=True)

print(f"[INFO] Combined dataset shape: {all_data.shape}")
print(all_data.head())

all_data.to_csv("sample_data.csv", index=False)

[INFO] Combined dataset shape: (444, 19)
                         id                 date     user       pc activity  \
0  {U2G7-L9YT49HE-6525JFWT}  2011-02-16 09:51:54  JCR0172  PC-6713  Connect   
1  {K3R3-C3MC71YF-2488RIQV}  2010-04-24 11:24:43  JCR0172  PC-6713  Connect   
2  {Z4E8-V1QQ69DL-7721ZTUE}  2011-01-03 14:18:43  JCR0172  PC-6713  Connect   
3  {I8J7-L8WD87PG-2901JDTT}  2010-07-02 15:30:22  MDM0625  PC-4984  Connect   
4  {F1M0-S0VN97GB-3740KSYM}  2010-12-03 14:16:25  MDM0625  PC-4984  Connect   

   hour  dayofweek  activity_binary  activity_code   to   cc  bcc from  size  \
0     9          2              1.0              3  NaN  NaN  NaN  NaN   NaN   
1    11          5              1.0              3  NaN  NaN  NaN  NaN   NaN   
2    14          0              1.0              3  NaN  NaN  NaN  NaN   NaN   
3    15          4              1.0              3  NaN  NaN  NaN  NaN   NaN   
4    14          4              1.0              3  NaN  NaN  NaN  NaN   NaN   

   