In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
import os


In [2]:
# Malicious windows for DATA EXFILTRATION scenarios only
MALICIOUS_WINDOWS = {
    'ACM2278': ('2010-08-18 21:47:42', '2010-08-24 03:48:51'),
    'CMP2946': ('2011-02-07 12:28:06', '2011-03-04 12:30:25'),
    'CDE1846': ('2011-02-21 11:43:39', '2011-04-25 17:55:00'),
    'MBG3183': ('2010-10-12 13:21:59', '2010-10-12 13:22:56'),
}


In [3]:
FILES = {
    'logon': '../Dataset/r6.2/logon.csv',
    'device': '../Dataset/r6.2/device.csv',
    'http': '../Dataset/r6.2/http.csv',
    'email': '../Dataset/r6.2/email.csv',
    'file': '../Dataset/r6.2/file.csv'
}


In [4]:
def load_and_parse(filepath, date_col='date'):
    print(f"Loading {filepath}")
    
    if not os.path.exists(filepath):
        print(f"[!] Missing file: {filepath}")
        return pd.DataFrame()
    
    df = pd.read_csv(filepath, encoding='ISO-8859-1')
    df[date_col] = pd.to_datetime(
        df[date_col],
        format='%m/%d/%Y %H:%M:%S',
        errors='coerce'
    )
    df = df.dropna(subset=[date_col])
    return df


In [5]:
logon_df  = load_and_parse(FILES['logon'])
device_df = load_and_parse(FILES['device'])
http_df   = load_and_parse(FILES['http'])
email_df  = load_and_parse(FILES['email'])
file_df   = load_and_parse(FILES['file'])

print("All datasets loaded.")


Loading ../Dataset/r6.2/logon.csv
Loading ../Dataset/r6.2/device.csv
Loading ../Dataset/r6.2/http.csv
Loading ../Dataset/r6.2/email.csv
Loading ../Dataset/r6.2/file.csv
All datasets loaded.


In [6]:
# Convert user to categorical (huge RAM reduction)
for df in [logon_df, email_df, http_df, device_df, file_df]:
    if not df.empty:
        df['user'] = df['user'].astype('category')


In [7]:
sessions = []

logon_df = logon_df.sort_values(['user', 'date'])

for user, user_logs in logon_df.groupby('user'):
    user_logs = user_logs.reset_index(drop=True)
    
    i = 0
    while i < len(user_logs):
        row = user_logs.iloc[i]
        
        if row['activity'] == 'Logon':
            start = row['date']
            pc = row['pc']
            sid = row['id']
            end = None
            
            j = i + 1
            while j < len(user_logs):
                nxt = user_logs.iloc[j]
                if nxt['pc'] == pc:
                    if nxt['activity'] == 'Logoff':
                        end = nxt['date']
                        i = j
                        break
                    elif nxt['activity'] == 'Logon':
                        break
                j += 1
            
            if end is None:
                end = start + timedelta(hours=12)
            
            sessions.append({
                'id': sid,
                'user': user,
                'pc': pc,
                'session_start': start,
                'session_end': end,
                'duration_sec': (end - start).total_seconds()
            })
        i += 1

sessions_df = pd.DataFrame(sessions)
print(f"Sessions created: {len(sessions_df)}")


  for user, user_logs in logon_df.groupby('user'):


Sessions created: 1881109


In [8]:
for df in [email_df, http_df, device_df, file_df]:
    if not df.empty:
        df.sort_values(['user', 'date'], inplace=True)
        df.set_index('date', inplace=True)


In [9]:
email_groups  = dict(tuple(email_df.groupby('user')))
http_groups   = dict(tuple(http_df.groupby('user')))
device_groups = dict(tuple(device_df.groupby('user')))
file_groups   = dict(tuple(file_df.groupby('user')))


  email_groups  = dict(tuple(email_df.groupby('user')))
  http_groups   = dict(tuple(http_df.groupby('user')))
  device_groups = dict(tuple(device_df.groupby('user')))
  file_groups   = dict(tuple(file_df.groupby('user')))


In [10]:
final_rows = []

for user, user_sessions in sessions_df.groupby('user'):
    
    u_email  = email_groups.get(user, pd.DataFrame())
    u_http   = http_groups.get(user, pd.DataFrame())
    u_device = device_groups.get(user, pd.DataFrame())
    u_file   = file_groups.get(user, pd.DataFrame())
    
    for _, s in user_sessions.iterrows():
        start, end = s['session_start'], s['session_end']
        
        sess_email = u_email.loc[start:end] if not u_email.empty else u_email
        sess_http  = u_http.loc[start:end] if not u_http.empty else u_http
        sess_dev   = u_device.loc[start:end] if not u_device.empty else u_device
        sess_file  = u_file.loc[start:end] if not u_file.empty else u_file
        
        email_text = " | ".join(
            f"TO:{r.get('to','')} BODY:{r.get('content','')}"
            for _, r in sess_email.iterrows()
        )
        
        http_text = " | ".join(
            f"{r.get('url','')} {r.get('content','')}"
            for _, r in sess_http.iterrows()
        )
        
        files_usb = 0
        if 'to_removable_media' in sess_file.columns:
            files_usb = sess_file['to_removable_media'].sum()
        
        label = 0
        if user in MALICIOUS_WINDOWS:
            ms = pd.to_datetime(MALICIOUS_WINDOWS[user][0])
            me = pd.to_datetime(MALICIOUS_WINDOWS[user][1])
            if (start <= me) and (end >= ms):
                label = 1
        
        final_rows.append({
            'id': s['id'],
            'user': user,
            'session_start': start,
            'session_end': end,
            'duration': s['duration_sec'],
            'logon_activity': 1,
            'email_activity': len(sess_email),
            'email_content': email_text,
            'http_activity': len(sess_http),
            'http_content': http_text,
            'device_activity': len(sess_dev),
            'file_activity': len(sess_file),
            'files_copied_to_usb': files_usb,
            'label': label
        })


In [11]:
df_final = pd.DataFrame(final_rows)

output = "cert_r6.2_session_dataset.csv"
df_final.to_csv(output, index=False)

print("Dataset created")
print("Total sessions:", len(df_final))
print("Exfiltration sessions:", df_final['label'].sum())

df_final.head()


Dataset created
Total sessions: 1881109
Exfiltration sessions: 92


Unnamed: 0,id,user,session_start,session_end,duration,logon_activity,email_activity,email_content,http_activity,http_content,device_activity,file_activity,files_copied_to_usb,label
0,{Z9U4-E5DO62XF-0940IUMN},AAB0162,2010-01-04 07:41:00,2010-01-04 18:46:00,39900.0,1,9,TO:Tyrone.Axel.Prince@dtaa.com BODY:The forest...,95,http://barnesandnoble.com/Joseph_Szigeti/hubay...,0,0,0,0
1,{T4O2-L6DA48LD-1984MYGZ},AAB0162,2010-01-05 07:46:00,2010-01-05 18:40:00,39240.0,1,9,TO:Amos.Ahmed.Burch@dtaa.com;Michael.Darius.Pe...,95,http://pcmag.com/Bill_Ponsford/cripes/1996_Hav...,0,0,0,0
2,{V1K3-F2NB85JZ-3087FZKU},AAB0162,2010-01-06 07:45:00,2010-01-06 18:55:00,40200.0,1,9,TO:Germaine.Jane.Sutton@dtaa.com BODY:No Frank...,95,http://chase.com/Ediacara_biota/ediacara/Senax...,0,0,0,0
3,{K9G3-Z1VF57JC-7874ZYZM},AAB0162,2010-01-07 07:45:00,2010-01-07 18:43:00,39480.0,1,9,TO:Amos.Ahmed.Burch@dtaa.com;Candace.Tatum.Hug...,95,http://foxsports.com/Psittacosaurus/psittacosa...,0,0,0,0
4,{G1N9-G7RU74VO-2662MTBS},AAB0162,2010-01-08 07:50:00,2010-01-08 18:41:00,39060.0,1,9,TO:Kai_Mcleod@netzero.com BODY:During this per...,95,http://pnc.com/Magnetosphere_of_Jupiter/rj/Oev...,0,0,0,0
