In [4]:
import pandas as pd

# Load the raw dataset from the notebook's working directory; the summary in the
# next cell shows two text columns, `instruction` and `response`.
df = pd.read_csv("full_dataset.csv")

In [5]:
# Summary of the text columns. Per the recorded output, both columns have 54,927
# non-null values; `instruction` is almost unique (54,907 distinct) while
# `response` has only 36,030 distinct values, i.e. many responses are repeated.
df.describe()

Unnamed: 0,instruction,response
count,54927,54927
unique,54907,36030
top,Question: In the context of smart contract dev...,```python\n# Import necessary libraries\nimpor...
freq,2,2


In [17]:
import json
import os
import re
import time

import requests
from tqdm import tqdm

def get_llm_category(instruction, response, max_retries=3, timeout=60, default_category=1):
    """Classify one attack description into one of eight categories via an LLM.

    Sends `instruction` to the DeepInfra OpenAI-compatible chat-completions
    endpoint and parses the category number out of the model's reply.

    Parameters:
    instruction (str): Attack description to classify; embedded in the prompt.
    response (str): Accepted for interface compatibility only; it is NOT sent
        to the model (only the instruction is used for categorization).
    max_retries (int): Maximum number of attempts before giving up.
    timeout (float): Per-request timeout in seconds, so a stalled connection
        cannot hang a long batch run indefinitely.
    default_category: Value returned when every attempt fails. Defaults to 1
        (Malware) for backward compatibility. Note that this makes failures
        indistinguishable from a genuine "Malware" answer; pass None to be
        able to detect them.

    Returns:
    int: Category number in 1..8, or `default_category` if no attempt produced
        a usable answer.

    Raises:
    RuntimeError: If the DEEPINFRA_API_KEY environment variable is not set.
    """
    # Never hard-code the bearer token in the notebook; read it from the
    # environment and fail fast, otherwise every request would be rejected and
    # every record would silently fall back to the default category.
    api_key = os.environ.get("DEEPINFRA_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the DEEPINFRA_API_KEY environment variable before calling get_llm_category()."
        )

    # Only send instruction for categorization
    messages = [{
        "role": "user",
        "content": f"""You are assisting in categorizing cyber attack descriptions. 
This is purely for research and educational categorization purposes.
Based on the following attack description, classify it into exactly ONE of these categories:

1. Malware: Malicious software designed to harm systems, steal data, or disrupt operations. This includes ransomware, spyware, Trojans, and worms.
2. Social Engineering and Phishing: Attacks that manipulate people into revealing sensitive information or taking harmful actions. This includes fraudulent messages, impersonation, and scams to trick victims.
3. Man-in-the-Middle (MITM) Attacks: Attacks where hackers intercept and manipulate communication between two parties to steal sensitive information. Often occurs on unsecured networks.
4. Denial-of-Service (DoS/DDoS) Attacks: Attacks that overwhelm a system, website, or service with excessive traffic to make it inoperable.
5. Zero-Day Exploits: Attacks exploiting previously unknown vulnerabilities in software, hardware, or firmware before a patch is available.
6. Password Attacks: Attacks aimed at guessing or stealing passwords to gain unauthorised access to accounts or systems.
7. Internet of Things (IoT) Attacks: Attacks that exploit vulnerabilities in connected devices like smart home devices or industrial systems.
8. Injection Attacks: Attacks where malicious code is inserted into applications to manipulate data or systems. This includes SQL injection and cross-site scripting.

{instruction}

Respond with ONLY the category number (1-8), nothing else."""
    }]

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    data = {
        "model": "meta-llama/Meta-Llama-3-70B-Instruct",
        "messages": messages
    }

    for attempt in range(max_retries):
        try:
            http_response = requests.post(
                "https://api.deepinfra.com/v1/openai/chat/completions",
                headers=headers,
                json=data,
                timeout=timeout
            )
        except requests.RequestException as exc:
            print(f"Error during API call: {str(exc)}")
        else:
            if http_response.status_code == 200:
                try:
                    result = http_response.json()['choices'][0]['message']['content'].strip()
                except (KeyError, IndexError, TypeError, ValueError, AttributeError) as exc:
                    # Body was not JSON or did not have the expected chat-completion shape.
                    print(f"Unexpected API response payload: {exc!r}")
                else:
                    # Take the first stand-alone digit in 1..8. Joining every digit in
                    # the reply would turn e.g. "Category 3 (of 1-8)" into 318.
                    match = re.search(r"(?<!\d)[1-8](?!\d)", result)
                    if match:
                        return int(match.group())
                    print(f"Could not parse a category from model reply: {result!r}")
            else:
                print(f"API request failed with status code: {http_response.status_code}")

        # Exponential backoff before the next attempt (skipped after the last one).
        if attempt < max_retries - 1:
            time.sleep(2 ** attempt)

    return default_category  # Default to Malware category if all retries fail

# Lookup table: category number (1-8, as returned by the classifier) -> display label
CATEGORY_NAMES = dict(enumerate(
    (
        "Malware",
        "Social Engineering and Phishing",
        "Man-in-the-Middle (MITM) Attacks",
        "Denial-of-Service (DoS/DDoS) Attacks",
        "Zero-Day Exploits",
        "Password Attacks",
        "Internet of Things (IoT) Attacks",
        "Injection Attacks",
    ),
    start=1,
))

def categorize_attacks(dataframe, num_records=None, save_output=True):
    """
    Categorize cyber attacks using LLM API

    Calls get_llm_category() once per row (in batches of 10 with a one-second
    pause between batches), adds the resulting category number and its
    CATEGORY_NAMES label as new columns, prints a distribution summary and
    optionally writes the result to a CSV file in the working directory.

    Parameters:
    dataframe (pd.DataFrame): Input dataframe containing attack records; must
        have 'instruction' and 'response' columns. It is not modified.
    num_records (int): Number of records to process (None for all records)
    save_output (bool): Whether to save results to CSV
        ('categorized_attacks.csv', or 'categorized_attacks_<num_records>.csv'
        when num_records is given)

    Returns:
    pd.DataFrame: Copy of the processed rows with two extra columns,
        'attack_category_num' (int64) and 'attack_category' (label).
    """

    print(f"Starting attack categorization for {'all' if num_records is None else num_records} records...")

    # Slice first, then copy, so only the rows actually processed are duplicated
    # and the caller's dataframe is never modified.
    selected = dataframe if num_records is None else dataframe.head(num_records)
    df_to_process = selected.copy()

    # Process in smaller batches to show progress and handle rate limits
    batch_size = 10  # Smaller batch size due to API rate limits

    # Collect results positionally; writing them back by index label would
    # misbehave when the input has duplicate index labels.
    category_nums = []
    with tqdm(total=len(df_to_process)) as pbar:
        for start in range(0, len(df_to_process), batch_size):
            batch = df_to_process.iloc[start:start + batch_size]

            category_nums.extend(
                get_llm_category(instruction, response)
                for instruction, response in zip(batch['instruction'], batch['response'])
            )

            pbar.update(len(batch))

            # Add small delay to respect rate limits
            time.sleep(1)

    # Assign both columns once, after the loop. This keeps the category number
    # an integer (creating the column batch-by-batch upcasts it to float) and
    # guarantees the columns exist even when there were no rows to process.
    df_to_process['attack_category_num'] = pd.Series(
        category_nums, index=df_to_process.index, dtype='int64'
    )
    df_to_process['attack_category'] = df_to_process['attack_category_num'].map(CATEGORY_NAMES)

    # Display distribution of categories
    print("\nAttack Category Distribution:")
    category_counts = df_to_process['attack_category'].value_counts()
    print(category_counts)

    # Calculate and display percentage distribution
    print("\nPercentage Distribution:")
    percentage_dist = (category_counts / len(df_to_process) * 100).round(2)
    print(percentage_dist)

    # Display sample of categorized data
    print("\nSample of Categorized Data:")
    print(df_to_process[['instruction', 'attack_category']].head())

    if save_output:
        # `is not None` (rather than truthiness) so num_records=0 cannot
        # overwrite the file produced by a full run.
        suffix = "" if num_records is None else "_" + str(num_records)
        output_filename = f'categorized_attacks{suffix}.csv'
        df_to_process.to_csv(output_filename, index=False)
        print(f"\nCategorized data saved to '{output_filename}'")

    return df_to_process


In [20]:
# Process entire dataset
# NOTE(review): the output recorded below starts with "Starting attack categorization
# for 1000 records...", but this call leaves num_records=None, for which
# categorize_attacks prints "all" and processes every row of `df`. The saved output
# therefore appears to come from a different invocation or input; re-run this cell
# (or pass the intended num_records / sample) so code and output agree.
df_all = categorize_attacks(df)

Starting attack categorization for 1000 records...


100%|██████████| 1000/1000 [18:57<00:00,  1.14s/it]


Attack Category Distribution:
attack_category
Malware                                 494
Zero-Day Exploits                       191
Man-in-the-Middle (MITM) Attacks         79
Social Engineering and Phishing          78
Injection Attacks                        70
Password Attacks                         48
Internet of Things (IoT) Attacks         21
Denial-of-Service (DoS/DDoS) Attacks     19
Name: count, dtype: int64

Percentage Distribution:
attack_category
Malware                                 49.4
Zero-Day Exploits                       19.1
Man-in-the-Middle (MITM) Attacks         7.9
Social Engineering and Phishing          7.8
Injection Attacks                        7.0
Password Attacks                         4.8
Internet of Things (IoT) Attacks         2.1
Denial-of-Service (DoS/DDoS) Attacks     1.9
Name: count, dtype: float64

Sample of Categorized Data:
                                             instruction  \
24869  Question: In the context of malware analysis, ...


