In [1]:
import pandas as pd

In [1]:
import pandas as pd
import json

# Activity codes whose description is just a fixed phrase plus the time.
_SIMPLE_ACTIVITY_PHRASES = {
    1: "Logged in",                  # Logon
    2: "Logged off",                 # Logoff
    3: "Connected a USB device",     # Device Connect
    4: "Disconnected a USB device",  # Device Disconnect
}


def _clean_text(value, limit=300):
    """Return *value* as stripped text capped at *limit* chars ("" if missing)."""
    # NaN is truthy, so a plain ``value or ""`` would let it through and
    # ``str()`` would then turn it into the literal text "nan".
    if pd.isna(value) or not value:
        return ""
    text = str(value).strip()
    if len(text) > limit:
        text = text[:limit] + "..."
    return text


def _split_addresses(raw):
    """Split a ';'-separated address field into a list of non-empty entries."""
    if pd.isna(raw):
        return []
    return [addr.strip() for addr in str(raw).split(';') if addr.strip()]


def _time_of_day_label(hour):
    """Return the suffix that marks an hour as inside or outside 06:00-17:59."""
    if hour < 6 or hour >= 18:
        return " (outside business hours)"
    return " during business hours"


def _describe_activity(row, time_str):
    """Return ``(headline, detail)`` for one activity row.

    ``headline`` is the single-line description; ``detail`` is either "" or a
    newline-prefixed content excerpt to place after the headline.
    """
    raw_code = row['activity_code']
    code = int(raw_code) if pd.notna(raw_code) else -1

    if code in _SIMPLE_ACTIVITY_PHRASES:
        return f"{_SIMPLE_ACTIVITY_PHRASES[code]} at {time_str}", ""

    if code == 6:  # File Access
        filename = row.get('filename')
        if pd.isna(filename):
            filename = "a file"
        file_content = _clean_text(row.get('content'))
        detail = f"\nFile content: {file_content}" if file_content else ""
        return f"Accessed file {filename} at {time_str}", detail

    if code == 5:  # Email
        to_list = _split_addresses(row.get('to'))
        bcc_list = _split_addresses(row.get('bcc'))

        from_email = row.get('from')
        if pd.isna(from_email):
            from_email = "unknown"

        raw_size = row.get('size')
        size = int(raw_size) if pd.notna(raw_size) else 0

        # Prefer the cleaned body; fall back to the raw one when it is absent.
        content = _clean_text(row.get('content_clean')) or _clean_text(row.get('content'))

        recipients = ', '.join(to_list) if to_list else 'unknown'
        headline = f"Sent an email from {from_email} to {recipients} at {time_str}"
        if size > 5000:
            headline += f" with a large attachment ({size} KB)"
        if bcc_list:
            headline += " using BCC"
        detail = f"\nEmail content: {content}" if content else ""
        return headline, detail

    if code == 7:  # HTTP
        url = row.get('url')
        if pd.isna(url):
            url = "an external site"
        return f"Visited website {url} at {time_str}", ""

    return f"Performed unknown activity at {time_str}", ""


def generate_user_prompt(user_data):
    """Render one user's activity rows as a text prompt asking for a risk level.

    Args:
        user_data: DataFrame holding the rows of a single user. The columns
            ``user``, ``date`` and ``activity_code`` are read directly; the
            columns ``filename``, ``content``, ``content_clean``, ``to``,
            ``bcc``, ``from``, ``size`` and ``url`` are optional and treated
            as missing when absent or NaN.

    Returns:
        A string of the form
        ``"User: <id>\\nActivities:\\n<one entry per row>\\nWhat is the risk level?"``.
        Each entry is a one-line description followed by a business-hours
        label (hours 06-17 count as business hours) and, for file and email
        rows with content, a second line holding at most 300 characters of it.
        An empty frame yields user ``"unknown"`` and no entries.
    """
    users = user_data['user']
    user_id = users.iloc[0] if len(users) and pd.notna(users.iloc[0]) else "unknown"

    logs = []
    for _, row in user_data.iterrows():
        dt = pd.to_datetime(row['date'])
        headline, detail = _describe_activity(row, dt.strftime("%H:%M"))
        # The label belongs to the headline; placing it after ``detail`` would
        # glue it onto the quoted file/email content.
        logs.append(headline + _time_of_day_label(dt.hour) + detail)

    full_log = "\n".join(logs)

    return f"User: {user_id}\nActivities:\n{full_log}\nWhat is the risk level?"

def generate_finetune_messages(df):
    """Build one chat-format fine-tuning sample per user, defaulting to a benign label.

    Args:
        df: DataFrame of activity rows with a ``user`` column and, optionally,
            a ``label`` column. Rows are grouped by ``user``.
            NOTE(review): pandas ``groupby`` skips rows whose key is NaN by
            default, so rows without a user appear to be dropped - confirm
            this is intended.

    Returns:
        A list of ``{"messages": [system, user, assistant]}`` dicts. The user
        message is the prompt from ``generate_user_prompt``; the assistant
        message is the group's first ``label`` value, or the low-risk default
        text when the column is absent or that value is missing.
    """
    default_label = "Normal activity. No insider threat detected. Risk Level : Low."
    has_label = 'label' in df.columns
    training_data = []

    for _, user_data in df.groupby('user'):
        prompt_text = generate_user_prompt(user_data)

        label = user_data['label'].iloc[0] if has_label else default_label
        # A NaN label would otherwise be serialized as a float "NaN" target.
        if pd.isna(label):
            label = default_label

        sample = {
            "messages": [
                {"role": "system", "content": "You are a cybersecurity analyst detecting insider threats from activity logs."},
                {"role": "user", "content": prompt_text},
                {"role": "assistant", "content": label}
            ]
        }

        training_data.append(sample)

    return training_data
    
def generate_finetune_messages_malicious(df):
    """Build one chat-format fine-tuning sample per user, defaulting to a high-risk label.

    Args:
        df: DataFrame of activity rows with a ``user`` column and, optionally,
            a ``label`` column. Rows are grouped by ``user``.
            NOTE(review): pandas ``groupby`` skips rows whose key is NaN by
            default, so rows without a user appear to be dropped - confirm
            this is intended.

    Returns:
        A list of ``{"messages": [system, user, assistant]}`` dicts. The user
        message is the prompt from ``generate_user_prompt``; the assistant
        message is the group's first ``label`` value, or the high-risk default
        text when the column is absent or that value is missing.
    """
    default_label = "Malicious activity detected. Insider threat detected. Risk Level : High."
    has_label = 'label' in df.columns
    training_data = []

    for _, user_data in df.groupby('user'):
        prompt_text = generate_user_prompt(user_data)

        label = user_data['label'].iloc[0] if has_label else default_label
        # A NaN label would otherwise be serialized as a float "NaN" target.
        if pd.isna(label):
            label = default_label

        sample = {
            "messages": [
                {"role": "system", "content": "You are a cybersecurity analyst detecting insider threats from activity logs."},
                {"role": "user", "content": prompt_text},
                {"role": "assistant", "content": label}
            ]
        }

        training_data.append(sample)

    return training_data
    
# Benign examples, built from the normal-activity sample.
normal_df = pd.read_csv("train_sample.csv")
finetune_data = generate_finetune_messages(normal_df)

# Malicious examples, built from the malicious-activity sample.
malicious_df = pd.read_csv("training_malicious.csv")
finetune_data_malicious = generate_finetune_messages_malicious(malicious_df)

# Benign samples first, malicious samples after them.
combined_finetune_data = [*finetune_data, *finetune_data_malicious]

# One JSON object per line (JSON Lines layout); the file name keeps its
# ".json" extension.
with open("finetune_data.json", "w") as f:
    f.writelines(json.dumps(example) + "\n" for example in combined_finetune_data)

print("Fine-tuning data saved to finetune_data.json")

Fine-tuning data saved to finetune_data.json
