In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer

class CybersecurityAnnotator:
    def __init__(self):
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.mlb = MultiLabelBinarizer()
        self.classifier = MultiOutputClassifier(LogisticRegression())

    def prepare_data(self, df):
        # Predefined attack types
        attack_types = [
            'Phishing', 'Ransomware', 'DDoS', 'Malware',
            'Data Breach', 'Insider Threat', 'Social Engineering',
            'Zero-Day Exploit'
        ]

        # Manual labeling function (can be replaced with more advanced method)
        def label_incident(text):
            text = text.lower()
            labels = []
            if 'phishing' in text or 'email' in text:
                labels.append('Phishing')
            if 'ransomware' in text or 'encrypted' in text:
                labels.append('Ransomware')
            if 'ddos' in text or 'disrupted' in text:
                labels.append('DDoS')
            if 'malware' in text or 'infected' in text:
                labels.append('Malware')
            if 'data breach' in text or 'exposed' in text:
                labels.append('Data Breach')
            if 'insider' in text or 'sold' in text:
                labels.append('Insider Threat')
            if 'social engineering' in text:
                labels.append('Social Engineering')
            if 'zero-day' in text or 'vulnerability' in text:
                labels.append('Zero-Day Exploit')
            return labels if labels else ['Unclassified']

        # Apply labeling
        df['attack_types'] = df['Statements'].apply(label_incident)
        return df

    def train_model(self, df):
        # Prepare features
        X = self.vectorizer.fit_transform(df['Statements'])

        # Prepare labels
        y = self.mlb.fit_transform(df['attack_types'])

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        # Train model
        self.classifier.fit(X_train, y_train)

        # Evaluate
        accuracy = self.classifier.score(X_test, y_test)
        print(f"Model Accuracy: {accuracy:.2f}")

        return self

    def predict(self, incidents):
        # Transform new incidents
        X_new = self.vectorizer.transform(incidents)

        # Predict
        predictions = self.classifier.predict(X_new)

        # Convert back to attack type labels
        return self.mlb.inverse_transform(predictions)

    def annotate_incidents(self, df):
        # Prepare data with initial labeling
        df = self.prepare_data(df)

        # Train model
        self.train_model(df)

        # Predict attack types for all incidents
        df['ml_attack_types'] = self.predict(df['Statements'])

        return df

# Example usage
def main():
    # Load CSV
    df = pd.read_csv('/content/Data Annotaters Task Statements - Sheet1.csv')

    # Initialize and run annotator
    annotator = CybersecurityAnnotator()
    annotated_df = annotator.annotate_incidents(df)

    # Save annotated data
    annotated_df.to_csv('ml_annotated_incidents.csv', index=False)

if __name__ == "__main__":
    main()

Model Accuracy: 0.06


In [6]:
annoted_csv = pd.read_csv('/content/ml_annotated_incidents.csv')

In [7]:
annoted_csv.head()

Unnamed: 0,Statements,Annotated_Date,Company,Industry,Attack Type,Attack Vector,Target/Impact,attack_types,ml_attack_types
0,Statements,Unknown,Unknown,Unknown,Other,Other,Other,['Unclassified'],()
1,"On March 5, 2024, a phishing email was sent to...",2024-03-05,TechCorp,Tech,Phishing,Email,Login Credentials,['Phishing'],()
2,"A ransomware attack on April 12, 2024, encrypt...",Unknown,Unknown,Unknown,Ransomware,Other,Financial Data,['Ransomware'],()
3,Unauthorized access was detected in the HR dat...,Unknown,Unknown,Unknown,Unauthorized Access,Unauthorized Access,Other,['Unclassified'],()
4,"On June 15, 2024, a Distributed Denial of Serv...",2024-06-15,Unknown,Unknown,Other,Other,Other,['DDoS'],()


In [None]:
import pandas as pd
import numpy as np
import re

def annotate_cybersecurity_incidents(csv_path):
    # Read CSV file
    df = pd.read_csv(csv_path)

    # Define attack type classification function
    def classify_attack(description):
        attack_types = {
            'Phishing': ['phishing', 'email'],
            'Ransomware': ['ransomware', 'encrypted'],
            'DDoS': ['ddos', 'disrupted', 'downtime'],
            'Malware': ['malware', 'infected', 'infection'],
            'Data Breach': ['data breach', 'exposed', 'compromise'],
            'Insider Threat': ['insider', 'sold', 'leaking'],
            'Social Engineering': ['social engineering', 'manipulate'],
            'Zero-Day Exploit': ['zero-day', 'vulnerability']
        }

        # Multi-label classification
        detected_attacks = []
        for attack, keywords in attack_types.items():
            if any(keyword in str(description).lower() for keyword in keywords):
                detected_attacks.append(attack)

        return detected_attacks or ['Unclassified']

    # Create new columns for annotations
    df['attack_types'] = df['Statements'].apply(classify_attack)
    df['severity'] = df['attack_types'].apply(lambda x: 'High' if set(x) & {'Data Breach', 'Ransomware', 'Insider Threat'} else 'Medium')

    return df

# Usage
annotated_df = annotate_cybersecurity_incidents('/content/Data Annotaters Task Statements - Sheet1.csv')
annotated_df.to_csv('annotated_incidents.csv', index=False)

In [None]:
mannual_annoted = pd.read_csv('/content/annotated_incidents.csv')

In [None]:
mannual_annoted.tail(15)

Unnamed: 0,Statements,attack_types,severity
135,A DDoS attack disrupted the services of MediaH...,['DDoS'],Medium
136,"On July 18, 2035, unauthorized access to the c...",['Unclassified'],Medium
137,A spear-phishing campaign targeted the IT exec...,['Phishing'],Medium
138,"On September 6, 2035, a data breach exposed pe...",['Data Breach'],High
139,Malware infected the communication systems on ...,['Malware'],Medium
140,"On November 9, 2035, phishing attempts led to ...",['Phishing'],Medium
141,A ransomware attack targeted the research and ...,['Ransomware'],High
142,"On January 17, 2036, unauthorized access was d...",['Unclassified'],Medium
143,A DDoS attack disrupted the online banking por...,['DDoS'],Medium
144,"On March 22, 2036, malware was identified in t...",['Malware'],Medium


In [None]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer

import pandas as pd
import numpy as np
import re
from datetime import datetime

class AdvancedCybersecurityAnnotator:
    def __init__(self):
        # More comprehensive date extraction method
        self.month_map = {
            'January': 1, 'February': 2, 'March': 3, 'April': 4,
            'May': 5, 'June': 6, 'July': 7, 'August': 8,
            'September': 9, 'October': 10, 'November': 11, 'December': 12
        }

    def extract_date(self, text):
        try:
            # Expand regex to handle more variations
            date_patterns = [
                r'On\s+(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2}),\s+(\d{4})',
                r'(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2}),\s+(\d{4})'
            ]

            for pattern in date_patterns:
                date_match = re.search(pattern, text, re.IGNORECASE)
                if date_match:
                    month, day, year = date_match.groups()[-3:]
                    month = month.capitalize()
                    return pd.Timestamp(
                        year=int(year),
                        month=self.month_map[month],
                        day=int(day)
                    )
            return None
        except Exception as e:
            print(f"Date extraction error: {text} - {e}")
            return None

    def assess_severity(self, text, outcomes):
        # Severity assessment based on multiple factors
        severity_factors = {
            'High': [
                'data breach', 'exposed', 'compromised',
                'ransomware', 'encrypted',
                'zero-day', 'critical vulnerability'
            ],
            'Medium': [
                'disrupted', 'downtime', 'infected',
                'phishing', 'malware'
            ],
            'Low': [
                'minor', 'limited', 'detected', 'prevented'
            ]
        }

        text_lower = text.lower()

        # Check outcomes
        if any(out in ['Information Exposed', 'Financial Loss'] for out in outcomes):
            return 'High'

        # Check text for severity indicators
        for severity, keywords in severity_factors.items():
            if any(keyword in text_lower for keyword in keywords):
                return severity

        return 'Low'


    def classify_industry(self, text):
        industries = {
            'Finance': ['finance', 'bank', 'payment', 'accounting', 'credit'],
            'Healthcare': ['health', 'medical', 'hospital', 'clinic', 'patient'],
            'Technology': ['tech', 'software', 'it', 'computer', 'digital', 'web'],
            'Retail': ['shop', 'retail', 'store', 'customer', 'sales'],
            'Manufacturing': ['manufacture', 'production', 'factory', 'industrial'],
            'Education': ['edu', 'school', 'learning', 'university'],
            'Logistics': ['logistics', 'supply', 'transport', 'shipping']
        }

        text_lower = text.lower()
        for industry, keywords in industries.items():
            if any(keyword in text_lower for keyword in keywords):
                return industry
        return 'Other'

    def identify_targets(self, text):
        targets = {
            'System': ['database', 'system', 'network', 'server', 'infrastructure'],
            'Department': ['hr', 'finance', 'marketing', 'sales', 'it', 'support'],
            'Personal': ['employee', 'user', 'customer', 'executive', 'account'],
            'Data': ['records', 'information', 'credentials', 'data']
        }

        text_lower = text.lower()
        identified_targets = []
        for target, keywords in targets.items():
            if any(keyword in text_lower for keyword in keywords):
                identified_targets.append(target)

        return identified_targets or ['Unspecified']

    def assess_outcomes(self, text):
        outcomes = {
            'Information Exposed': ['exposed', 'breach', 'leaked', 'compromised'],
            'Systems Disrupted': ['disrupted', 'downtime', 'outage', 'locked'],
            'Financial Loss': ['ransom', 'financial', 'theft', 'monetary'],
            'Operational Impact': ['operations', 'infrastructure', 'critical', 'disabled']
        }

        text_lower = text.lower()
        identified_outcomes = []
        for outcome, keywords in outcomes.items():
            if any(keyword in text_lower for keyword in keywords):
                identified_outcomes.append(outcome)

        return identified_outcomes or ['No Significant Impact']



    def annotate_incidents(self, df):
        # Add annotation columns
        df['extracted_date'] = df['Statements'].apply(self.extract_date)

        # Existing methods
        df['industry'] = df['Statements'].apply(self.classify_industry)
        df['targets'] = df['Statements'].apply(self.identify_targets)
        df['outcomes'] = df['Statements'].apply(self.assess_outcomes)

        # New severity assessment
        df['severity'] = df.apply(
            lambda row: self.assess_severity(row['Statements'], row['outcomes']),
            axis=1
        )

        return df

# Example usage
def main():
    # Load CSV
    df = pd.read_csv('Data Annotaters Task Statements - Sheet1.csv')

    # Initialize and run advanced annotator
    annotator = AdvancedCybersecurityAnnotator()
    annotated_df = annotator.annotate_incidents(df)

    # Save annotated data
    annotated_df.to_csv('advanced_annotated_incidents.csv', index=False)

    # Print summary
    print(annotated_df[['extracted_date', 'industry', 'targets', 'outcomes', 'severity']].head())

if __name__ == "__main__":
    main()

  extracted_date    industry                               targets  \
0     2024-03-05  Technology                      [Personal, Data]   
1     2024-04-12     Finance                    [Department, Data]   
2     2024-05-20  Technology  [System, Department, Personal, Data]   
3     2024-06-15  Technology                         [Unspecified]   
4     2024-07-03  Technology                          [Department]   

                               outcomes severity  
0               [No Significant Impact]   Medium  
1  [Financial Loss, Operational Impact]     High  
2               [No Significant Impact]      Low  
3                   [Systems Disrupted]   Medium  
4               [No Significant Impact]   Medium  


In [None]:
read_csv = pd.read_csv('/content/advanced_annotated_incidents.csv')

In [None]:
read_csv.head()

Unnamed: 0,Statements,extracted_date,industry,targets,outcomes
0,"On March 5, 2024, a phishing email was sent to...",2024-03-05,Technology,"['Personal', 'Data']",['No Significant Impact']
1,"A ransomware attack on April 12, 2024, encrypt...",,Finance,"['Department', 'Data']","['Financial Loss', 'Operational Impact']"
2,Unauthorized access was detected in the HR dat...,,Technology,"['System', 'Department', 'Personal', 'Data']",['No Significant Impact']
3,"On June 15, 2024, a Distributed Denial of Serv...",2024-06-15,Technology,['Unspecified'],['Systems Disrupted']
4,"A malware infection was identified on July 3, ...",,Technology,['Department'],['No Significant Impact']


In [2]:
import pandas as pd
import re
import dateutil.parser

# --- Define Annotation Mappings ---
# Industries (Add missing companies and industries as needed)
company_to_industry = {
    "TechCorp": "Tech",
    "FinanceSolutions Inc.": "Finance",
    "HealthFirst": "Healthcare",
    "WebServices Ltd.": "Tech",
    "MarketGurus": "Marketing",
    "ShopSafe": "Retail",
    "Innovatech": "Tech",
    "GlobalEnterprises": "Corporate",
    "WebSecure": "Tech",
    "DataSolutions": "Tech",
    "VisionaryTech": "Tech",
    "AutoManufacture Inc.": "Manufacturing",
    "UtilityCorp": "Utilities",
    "MediaStream": "Media",
    "FinanceHub": "Finance",
    "ConsultingExperts": "Tech",
    "DevSolutions": "Tech",
    "SecureNetworks": "Tech",
    "HealthCarePlus": "Healthcare",
    "TechDefend": "Tech",
    "ConstructionPro": "Manufacturing",
    "PeopleFirst": "HR",
    "LogiTech": "Logistics",
    "EduOnline": "Education",
    "ServiceConnect": "Tech",
    "LawSecure": "Legal",
    "PharmaCorp": "Pharmaceutical",
    "MoneyMatters": "Finance",
    "CodeBase": "Tech",
    "ShopEase": "E-Commerce",
    "BioResearch": "Research",
    "SalesPros": "Sales",
    "PayrollSecure": "HR",
    "CreativeDesigns": "Creative",
    "SupportHub": "Support",
    "LogiMaster": "Logistics",
    "MarketWise": "Marketing",
    "CloudServices": "Tech",
    "InventoryPlus": "Inventory",
    "LegalDocs": "Legal",
    "TradeSecure": "Corporate",
    "ITSupport": "Tech",
    "AppTech": "Tech",
    "SecureTech": "Tech",
    "DataSafe": "Tech",
    "HRConnect": "HR",
    "FirewallSecure": "Tech",
    "MediaHub": "Media",
    "SalesSecure": "Sales",
    "LegalEase": "Legal",
    "ITSupportPro": "Tech",
    "SecureSettings": "Tech",
    "EmployeePortal": "HR",
    "ShopOnline": "E-Commerce",
    "DataStore": "Tech",
    "HREngage": "HR",
    "CustomerCare": "CustomerService",
    "OfficeSecure": "Tech",
    "FinanceGuard": "Finance",
    "ITInfra": "Tech",
    "InventorySafe": "Inventory",
    "CloudStore": "Tech",
    "AnalyticsHub": "Tech",
    "ExecSecure": "Corporate",
    "ProcureManage": "Procurement",
    "SecurityPro": "Security",
    "FinSoftware": "Finance",
    "PortalOnline": "Tech",
    "BankNet": "Banking",
    "CRMPro": "Tech",
    "FinancePro": "Finance",
    "ConfidentialInc": "Tech",
    "MarketingHub": "Marketing",
    "NetSecure": "Tech",
    "SupportSolutions": "Support",
    "ProjectManage": "Tech",
    "FinanceDept": "Finance",
    "LogiManage": "Logistics",
    "StreamFast": "Media",
    "BillMaster": "Tech",
    "RnDInnovate": "Research",
}

# Attack Types (Order matters! More specific first)
attack_types = [
    ("spear-phishing", "Spear-Phishing"),
    ("phishing email", "Phishing"),
    ("ransomware attack", "Ransomware"),
    ("ddos attack", "DDoS"),
    ("malware infection", "Malware"),
    ("insider threat", "Insider Threat"),
    ("data breach", "Data Breach"),
    ("zero-day vulnerability", "Zero-Day Exploit"),
    ("social engineering", "Social Engineering"),
    ("unauthorized access", "Unauthorized Access"),
    ("email spoofing", "Email Spoofing"),
    ("malware", "Malware"),
    ("phishing", "Phishing"),
]

# Attack Vectors (Order matters! More specific first)
attack_vectors = [
    ("email", "Email"),
    ("zero-day vulnerability", "Exploit"),
    ("social engineering", "Social Engineering"),
    ("unauthorized access", "Unauthorized Access"),
    ("physical access", "Physical Access"),
]

# Target/Impact (Order matters! More specific first)
targets = [
    ("login credentials", "Login Credentials"),
    ("financial data", "Financial Data"),
    ("customer information", "Customer Information"),
    ("employee data", "Employee Data"),
    ("service disruption", "Service Disruption"),
    ("data exfiltration", "Data Exfiltration"),
    ("encrypted", "System Encryption"),
    ("proprietary code", "Proprietary Code/Trade Secrets"),
    ("operational disruption", "Operational Disruption"),
]

# --- Helper Functions ---
def extract_date(statement):
    date_match = re.search(r"On (\w+ \d{1,2}, \d{4})", statement)
    if date_match:
        date_str = date_match.group(1)
        try:
            parsed_date = dateutil.parser.parse(date_str)
            return parsed_date.strftime("%Y-%m-%d")
        except:
            pass
    return "Unknown"

def extract_company(statement):
    company_match = re.search(r"at ([A-Za-z0-9\s]+),", statement)
    return company_match.group(1).strip() if company_match else "Unknown"

def get_industry(company):
    return company_to_industry.get(company, "Unknown")

def extract_attack_type(statement):
    for pattern, label in attack_types:
        if re.search(pattern, statement, re.IGNORECASE):
            return label
    return "Other"

def extract_attack_vector(statement):
    for pattern, label in attack_vectors:
        if re.search(pattern, statement, re.IGNORECASE):
            return label
    return "Other"

def extract_target(statement):
    for pattern, label in targets:
        if re.search(pattern, statement, re.IGNORECASE):
            return label
    return "Other"

# --- Main Processing ---
def main():
    # Load input CSV
    df = pd.read_csv('Data Annotaters Task Statements - Sheet1.csv',
                     names=['Statements'],
                     header=None)

    # Apply annotation functions
    df['Annotated_Date'] = df['Statements'].apply(extract_date)
    df['Company'] = df['Statements'].apply(extract_company)
    df['Industry'] = df['Company'].apply(get_industry)
    df['Attack Type'] = df['Statements'].apply(extract_attack_type)
    df['Attack Vector'] = df['Statements'].apply(extract_attack_vector)
    df['Target/Impact'] = df['Statements'].apply(extract_target)

    # Save output CSV
    df.to_csv('annotated_cybersecurity.csv', index=False)

if __name__ == "__main__":
    main()

In [3]:
data_annotation = pd.read_csv('/content/annotated_cybersecurity.csv')

In [4]:
data_annotation.head()

Unnamed: 0,Statements,Annotated_Date,Company,Industry,Attack Type,Attack Vector,Target/Impact
0,Target/Impact,Unknown,Unknown,Unknown,Other,Other,Other
1,Other,Unknown,Unknown,Unknown,Other,Other,Other
2,Login Credentials,Unknown,Unknown,Unknown,Other,Other,Login Credentials
3,Financial Data,Unknown,Unknown,Unknown,Other,Other,Financial Data
4,Other,Unknown,Unknown,Unknown,Other,Other,Other
