In [1]:
import pandas as pd
import glob

df = pd.read_csv("malicious.csv")

# Convert date to datetime
df['date'] = pd.to_datetime(df['date'])
df['hour'] = df['date'].dt.hour
df['dayofweek'] = df['date'].dt.dayofweek

print(df.head())

                         id                date     user       pc    activity  \
0  {B1W4-B6UV58BM-0180ZJPC} 2011-01-05 22:18:00  FMG0527  PC-4256         NaN   
1  {J1T0-Y8BT51ME-2531GFMO} 2011-01-05 22:21:00  FMG0527  PC-4256  Disconnect   
2  {H5W6-N8BP07OO-9909ZNZE} 2011-01-05 22:48:00  FMG0527  PC-4256      Logoff   
3  {T6G6-X8PO85WD-5105HBYP} 2011-01-11 23:33:00  FMG0527  PC-4256       Logon   
4  {B2O5-V7VT00CH-8680KMLZ} 2011-01-11 23:43:00  FMG0527  PC-4256     Connect   

   hour  dayofweek  activity_binary  activity_code  to  cc  bcc  from  size  \
0    22          2              NaN            NaN NaN NaN  NaN   NaN   NaN   
1    22          2              NaN            NaN NaN NaN  NaN   NaN   NaN   
2    22          2              NaN            NaN NaN NaN  NaN   NaN   NaN   
3    23          1              NaN            NaN NaN NaN  NaN   NaN   NaN   
4    23          1              NaN            NaN NaN NaN  NaN   NaN   NaN   

   attachments                        

In [2]:
# Normalize activity column
df['activity'] = df['activity'].str.strip().str.capitalize()

# Initialize with NaN
df['activity_code'] = pd.NA

# Assign codes based on activity column
df.loc[df['activity'] == 'Logon', 'activity_code'] = 1
df.loc[df['activity'] == 'Logoff', 'activity_code'] = 2
df.loc[df['activity'] == 'Connect', 'activity_code'] = 3
df.loc[df['activity'] == 'Disconnect', 'activity_code'] = 4

# File Access: filename present
df.loc[df['filename'].notna() & (df['filename'] != ''), 'activity_code'] = 5

# Email: any of to/cc/bcc/from present
email_cols = ['to', 'cc', 'bcc', 'from']
email_condition = df[email_cols].notna().any(axis=1)
df.loc[email_condition, 'activity_code'] = 6

# HTTP: url present
df.loc[df['url'].notna() & (df['url'] != ''), 'activity_code'] = 7

# Save corrected data
df.to_csv("corrected_malicious.csv", index=False)

print("[INFO] Activity codes assigned and saved.")


[INFO] Activity codes assigned and saved.


In [5]:
# Load both CSV files
df_sample = pd.read_csv("sample_data.csv")
df_malicious = pd.read_csv("corrected_malicious.csv")

# Combine them (stacking them vertically)
combined_df = pd.concat([df_sample, df_malicious], ignore_index=True)

# Save the combined file
combined_df.to_csv("combined_dataset.csv", index=False)

print("[INFO] Combined CSV saved as 'combined_dataset.csv'")


[INFO] Combined CSV saved as 'combined_dataset.csv'


In [7]:
def generate_user_prompt(user_data):
    user_id = user_data['user'].iloc[0] if pd.notna(user_data['user'].iloc[0]) else "unknown"

    logs = []
    for _, row in user_data.iterrows():
        # Format date and time
        dt = pd.to_datetime(row['date'])
        time_str = dt.strftime("%H:%M")
        hour = dt.hour

        activity_code = int(row['activity_code']) if pd.notna(row['activity_code']) else -1
        summary = ""

        if activity_code == 1:  # Logon
            summary = f"Logged in at {time_str}"

        elif activity_code == 2:  # Logoff
            summary = f"Logged off at {time_str}"

        elif activity_code == 3:  # Device Connect
            summary = f"Connected a USB device at {time_str}"

        elif activity_code == 4:  # Device Disconnect
            summary = f"Disconnected a USB device at {time_str}"

        elif activity_code == 6:  # File Access
            filename = row['filename'] if pd.notna(row['filename']) else "a file"
            file_content = row.get('content') or ""
            file_content = str(file_content).strip()

            if len(file_content) > 300:
                file_content = file_content[:300] + "..."

            summary = f"Accessed file {filename} at {time_str}"
            if file_content:
                summary += f"\nFile content: {file_content}"

        elif activity_code == 5:  # Email
            to_raw = row.get('to')
            if pd.notna(to_raw):
                to_list = [email.strip() for email in str(to_raw).split(';') if email.strip()]
            else:
                to_list = []
        
            bcc_raw = row.get('bcc')
            if pd.notna(bcc_raw):
                bcc_list = [email.strip() for email in str(bcc_raw).split(';') if email.strip()]
            else:
                bcc_list = []
        
            from_email = row.get('from')
            if pd.isna(from_email):
                from_email = "unknown"
        
            size = int(row['size']) if pd.notna(row['size']) else 0
            content = row.get('content_clean') or row.get('content') or ""
            content = str(content).strip()
            if pd.isna(content):
                content = ""
        
            if len(content) > 300:
                content = content[:300] + "..."
        
            summary = f"Sent an email from {from_email} to {', '.join(to_list) if to_list else 'unknown'} at {time_str}"
            if size > 5000:
                summary += f" with a large attachment ({size} KB)"
            if bcc_list:
                summary += " using BCC"
            if content:
                summary += f"\nEmail content: {content}"


        elif activity_code == 7:  # HTTP
            url = row['url'] if pd.notna(row['url']) else "an external site"
            summary = f"Visited website {url} at {time_str}"

        else:
            summary = f"Performed unknown activity at {time_str}"

        # Add time-of-day label
        if hour < 6 or hour >= 18:
            summary += " (outside business hours)"
        else:
            summary += " during business hours"

        logs.append(summary)

    # Combine all logs for the user
    full_log = "\n".join(logs)

    return {
        "role": "user",
        "content": f"User: {user_id}\nActivities:\n{full_log}\nWhat is the risk level?"
    }

def generate_user_prompts(df):
    user_prompts = []
    for user_id, user_data in df.groupby('user'):
        prompt = generate_user_prompt(user_data)
        user_prompts.append(prompt)
    return user_prompts


In [11]:
import json
import pandas as pd
import os

df = pd.read_csv('combined_dataset.csv')

# Generate prompts for each user
prompts = []
for user_id, user_data in df.groupby('user'):
    prompt = generate_user_prompt(user_data)
    prompts.append(prompt)

# Print the generated prompts as JSON-like for few-shot use
print(json.dumps(prompts, indent=2))

# Optionally, save to a JSON file
with open('user_prompts.json', 'w') as f:
    json.dump(prompts, f, indent=4)

[
  {
    "role": "user",
    "content": "User: AJR0319\nActivities:\nSent an email from AJR5@charter.net to Molly-Zimmerman@hotmail.com at 14:06 with a large attachment (50966 KB) using BCC\nEmail content: 33 vegas campaign patrick undefeated hit lack or 26 influenced hometown recalled including assigned allowing met punch 8 from network increased together germany silver mat 1973 chicago hit straight effort erik czech boards which 2 left 33 spent old at became near easily asked segment strength going ... during business hours\nSent an email from Arthur.Jacob.Raymond@dtaa.com to Meghan.Brianna.Jensen@dtaa.com, Jaquelyn.Cassandra.Roberson@dtaa.com at 08:38 with a large attachment (40292 KB)\nEmail content: course already now surveillance 1974 roosevelt notorious joyce signed presentation 7 identification wanted cell department hill 1997 holding las patrols guard helicopters face rainbow newkill thick speed either bail chancellor 100 aunt assatas related listened short asylum faculty was