In [None]:
#DATA COLLECTION PART 1 :1. Data Source : NVD (https://nvd.nist.gov/):

In [1]:
import requests
import json
import csv
import time
import os
import re
from dotenv import load_dotenv

load_dotenv()

# --- Configuration ---
NVD_API_URL = "https://services.nvd.nist.gov/rest/json/cves/2.0"
NVD_API_KEY = os.getenv("NVD_API_KEY", None)
RESULTS_PER_PAGE = 2000  
MAX_RESULTS = 20000      # Target to collect 
OUTPUT_CSV_FILE = "cve_dataset.csv"
REQUEST_DELAY_SECONDS = 6 if not NVD_API_KEY else 0.6
MAX_RETRIES = 5           
RETRY_BACKOFF_FACTOR = 2  

OS_KEYWORDS = [
    'linux', 'kernel', 'windows', 'macos', 'android', 'driver',
    'privilege escalation', 'root access', 'memory corruption',
    'buffer overflow', 'system call', 'filesystem', 'registry',
    'local execution', 'sudo', 'sandbox escape', 'dll hijacking',
    'service permissions', 'race condition', 'symlink', 'inode',
    'scheduler', 'interrupt handler', 'page fault', 'kernel panic',
    'blue screen', 'bsod', 'ring0', 'hypervisor', 'ioctl', 'syscall'
]

NETWORK_KEYWORDS = [
    'http', 'https', 'tcp', 'udp', 'dns', 'firewall', 'router',
    'remote code execution', 'sql injection', 'xss', 'csrf',
    'ssrf', 'vpn', 'ssl', 'tls', 'api', 'rest', 'soap', 'oauth',
    'jwt', 'websocket', 'ddos', 'amplification', 'port scanning',
    'man-in-the-middle', 'packet injection', 'arp spoofing',
    'dns spoofing', 'http request smuggling', 'cache poisoning',
    'tls handshake', 'certificate validation', 'session hijacking',
    'credential stuffing', 'api endpoint', 'zero-day', 'wormable'
]

# --- API Functions ---
def fetch_nvd_data(start_index=0, keyword_filter=None):
    """Fetch CVE data from NVD API with retry logic"""
    headers = {'apiKey': NVD_API_KEY} if NVD_API_KEY else {}
    params = {
        'resultsPerPage': RESULTS_PER_PAGE,
        'startIndex': start_index
    }
    
    if keyword_filter:
        params['keywordSearch'] = keyword_filter

    for attempt in range(MAX_RETRIES):
        try:
            response = requests.get(NVD_API_URL, headers=headers, 
                                 params=params, timeout=30)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.HTTPError as e:
            if response.status_code == 429:
                wait = (RETRY_BACKOFF_FACTOR ** attempt) * 30
                print(f"Rate limited. Retrying in {wait}s...")
                time.sleep(wait)
                continue
            print(f"HTTP Error: {e}")
            return None
        except (requests.exceptions.RequestException, json.JSONDecodeError) as e:
            print(f"Attempt {attempt+1} failed: {str(e)}")
            if attempt < MAX_RETRIES - 1:
                time.sleep((RETRY_BACKOFF_FACTOR ** attempt) * 10)
                continue
            return None

# --- Data Processing ---
def get_cvss_data(cve_item):
    """Extract CVSS data with version prioritization"""
    metrics = cve_item.get('metrics', {})
    
    # Try CVSS v3.1 first
    if 'cvssMetricV31' in metrics:
        cvss_data = metrics['cvssMetricV31'][0].get('cvssData', {})
        return (
            cvss_data.get('baseScore'),
            cvss_data.get('vectorString'),
            cvss_data.get('baseSeverity', '').upper()
        )
    
    # Fallback to CVSS v3.0
    if 'cvssMetricV30' in metrics:
        cvss_data = metrics['cvssMetricV30'][0].get('cvssData', {})
        return (
            cvss_data.get('baseScore'),
            cvss_data.get('vectorString'),
            cvss_data.get('baseSeverity', '').upper()
        )
    
    # Fallback to CVSS v2
    if 'cvssMetricV2' in metrics:
        cvss_data = metrics['cvssMetricV2'][0].get('cvssData', {})
        score = cvss_data.get('baseScore')
        severity = "LOW"
        if score >= 7.0:
            severity = "HIGH"
        elif score >= 4.0:
            severity = "MEDIUM"
        return (
            score,
            cvss_data.get('vectorString'),
            severity
        )
    
    return None, None, None

def determine_layer(description):
    """Classify vulnerability layer with priority to network keywords"""
    desc = description.lower()
    os_count = sum(1 for kw in OS_KEYWORDS if re.search(r'\b' + re.escape(kw) + r'\b', desc))
    net_count = sum(1 for kw in NETWORK_KEYWORDS if re.search(r'\b' + re.escape(kw) + r'\b', desc))
    
    # Heuristic rules
    if net_count > 0 and (net_count >= os_count or 'remote' in desc):
        return "Network"
    if os_count > 0 and ('local' in desc or 'privilege' in desc):
        return "OS"
    if net_count > 0:
        return "Network"
    return "OS"  # Default to OS if no keywords found

def clean_text(text):
    """Normalizing text for NLP processing"""
    if not text:
        return ""
    text = re.sub(r'[^\w\s-]', ' ', text)  
    text = re.sub(r'\s+', ' ', text)       
    return text.strip().lower()


if __name__ == "__main__":
    collected_data = []
    search_strategies = [
        None,
        "network OR protocol OR remote OR http OR tls OR api",
        "os OR kernel OR local OR privilege OR memory OR driver",
        "vulnerability OR exploit OR attack OR security"
    ]

    total_collected = 0
    unique_cves = set()

    for strategy in search_strategies:
        start_index = 0
        strategy_name = strategy or "general"
        print(f"\n=== Starting search : {strategy_name} ===")

        while True:
            data = fetch_nvd_data(start_index, strategy)
            if not data:
                print("Failed to fetch data!")
                break

            vulnerabilities = data.get('vulnerabilities', [])
            if not vulnerabilities:
                print("No more vulnerabilities!!")
                break

            current_count = 0
            for item in vulnerabilities:
                cve = item.get('cve', {})
                cve_id = cve.get('id')
                
                
                if cve_id in unique_cves:
                    continue
                unique_cves.add(cve_id)

               
                description = next((d['value'] for d in cve.get('descriptions', [])
                                  if d.get('lang') == 'en'), '')
                if not description:
                    continue

               
                cvss_score, cvss_vector, severity = get_cvss_data(cve)
                if cvss_score is None:
                    continue

               
                clean_desc = clean_text(description)
                layer = determine_layer(clean_desc)

                collected_data.append({
                    'CVE_ID': cve_id,
                    'Description': clean_desc,
                    'CVSS_Score': cvss_score,
                    'CVSS_Vector': cvss_vector,
                    'Severity': severity,
                    'Layer': layer
                })
                current_count += 1

            total_collected += current_count
            print(f"Collected {current_count} new CVEs (Total: {total_collected})")

            # Pagination control
            start_index += RESULTS_PER_PAGE
            if start_index >= data.get('totalResults', 0) or total_collected >= MAX_RESULTS:
                break

            time.sleep(REQUEST_DELAY_SECONDS)

   
    final_data = []
    seen = set()
    for item in collected_data:
       
        if (item['CVE_ID'] not in seen and 
            item['CVSS_Score'] is not None and 
            len(item['Description']) >= 50):
            final_data.append(item)
            seen.add(item['CVE_ID'])

   
    print(f"\nSaving {len(final_data)} CVEs to {OUTPUT_CSV_FILE}")
    with open(OUTPUT_CSV_FILE, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=[
            'CVE_ID', 'Description', 'CVSS_Score', 
            'CVSS_Vector', 'Severity', 'Layer'
        ])
        writer.writeheader()
        writer.writerows(final_data)

    print("Data collection completed. Below are Dataset statistics:")
    print(f"- Total entries collected : {len(final_data)}")
    print(f"- OS vulnerabilities collected : {sum(1 for x in final_data if x['Layer'] == 'OS')}")
    print(f"- Network vulnerabilities collected : {sum(1 for x in final_data if x['Layer'] == 'Network')}")
    cvss_scores = [x['CVSS_Score'] for x in final_data]
    print(f"- Average CVSS score is : {sum(cvss_scores)/len(cvss_scores):.2f}")


=== Starting search : general ===
Collected 1966 new CVEs (Total: 1966)
Collected 1997 new CVEs (Total: 3963)
Collected 1992 new CVEs (Total: 5955)
Collected 1972 new CVEs (Total: 7927)
Collected 1976 new CVEs (Total: 9903)
Collected 1978 new CVEs (Total: 11881)
Collected 1965 new CVEs (Total: 13846)
Collected 1971 new CVEs (Total: 15817)
Collected 1974 new CVEs (Total: 17791)
Collected 1990 new CVEs (Total: 19781)
Collected 1990 new CVEs (Total: 21771)

=== Starting search : network OR protocol OR remote OR http OR tls OR api ===
No more vulnerabilities

=== Starting search : os OR kernel OR local OR privilege OR memory OR driver ===
No more vulnerabilities

=== Starting search : vulnerability OR exploit OR attack OR security ===
Collected 1039 new CVEs (Total: 22810)

Saving 22727 CVEs to cve_dataset.csv
Data collection completed. Below are Dataset statistics:
- Total entries collected : 22727
- OS vulnerabilities collected : 15876
- Network vulnerabilities collected : 6851
- Averag

In [1]:
import pandas as pd 

In [2]:
df = pd.read_csv("cve_dataset.csv")

In [3]:
df.head()

Unnamed: 0,CVE_ID,Description,CVSS_Score,CVSS_Vector,Severity,Layer
0,CVE-1999-0095,the debug command in sendmail is enabled allow...,10.0,AV:N/AC:L/Au:N/C:C/I:C/A:C,HIGH,OS
1,CVE-1999-1471,buffer overflow in passwd in bsd based operati...,7.2,AV:L/AC:L/Au:N/C:C/I:C/A:C,HIGH,OS
2,CVE-1999-1122,vulnerability in restore in sunos 4 0 3 and ea...,4.6,AV:L/AC:L/Au:N/C:P/I:P/A:P,MEDIUM,OS
3,CVE-1999-1467,vulnerability in rcp on sunos 4 0 x allows rem...,10.0,AV:N/AC:L/Au:N/C:C/I:C/A:C,HIGH,OS
4,CVE-1999-1506,vulnerability in smi sendmail 4 0 and earlier ...,7.5,AV:N/AC:L/Au:N/C:P/I:P/A:P,HIGH,OS


In [4]:
print(f"Input data shape: {df.shape}")
print(df.head())

Input data shape: (22727, 6)
          CVE_ID                                        Description  \
0  CVE-1999-0095  the debug command in sendmail is enabled allow...   
1  CVE-1999-1471  buffer overflow in passwd in bsd based operati...   
2  CVE-1999-1122  vulnerability in restore in sunos 4 0 3 and ea...   
3  CVE-1999-1467  vulnerability in rcp on sunos 4 0 x allows rem...   
4  CVE-1999-1506  vulnerability in smi sendmail 4 0 and earlier ...   

   CVSS_Score                 CVSS_Vector Severity Layer  
0        10.0  AV:N/AC:L/Au:N/C:C/I:C/A:C     HIGH    OS  
1         7.2  AV:L/AC:L/Au:N/C:C/I:C/A:C     HIGH    OS  
2         4.6  AV:L/AC:L/Au:N/C:P/I:P/A:P   MEDIUM    OS  
3        10.0  AV:N/AC:L/Au:N/C:C/I:C/A:C     HIGH    OS  
4         7.5  AV:N/AC:L/Au:N/C:P/I:P/A:P     HIGH    OS  


In [7]:
# PART 2 - Data Preprocessing and Embedding Generation 
import pandas as pd
import numpy as np
import re
import os
from pathlib import Path
from sklearn.preprocessing import LabelEncoder, StandardScaler
from transformers import BertTokenizer, BertModel
import torch
import umap
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import psutil
from datetime import datetime


warnings.filterwarnings('ignore')
matplotlib.use('TkAgg')
pd.set_option('display.max_columns', None)

# Constants
NOTEBOOK_DIR = Path(os.getcwd())
OUTPUT_DIR = NOTEBOOK_DIR / "output"
OUTPUT_DIR.mkdir(exist_ok=True)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EMBEDDING_MODEL = 'bert-base-uncased'
BATCH_SIZE = 8
SEED = 42

# Helper Functions
def print_memory_usage():
    print(f"Memory usage: {psutil.Process().memory_info().rss / 1024 ** 2:.2f} MB")

def get_latest_csv():
    """Find the most recent CSV file in the directory"""
    csv_files = list(NOTEBOOK_DIR.glob('*.csv'))
    if not csv_files:
        return None
    # Get file with latest modification time
    latest = max(csv_files, key=lambda f: f.stat().st_mtime)
    return latest.name

def extract_cvss_components(vector_string):
    if pd.isna(vector_string):
        return {}
    return {part.split(':')[0]: part.split(':')[1] 
            for part in vector_string.split('/') if ':' in part}

def enhanced_preprocessing(df):
    print("\n=== Starting Enhanced Preprocessing ===")
    print_memory_usage()
    
    # 1. Extract CVSS components
    print("\n[1/3] Extracting CVSS vector components...")
    cvss_df = pd.json_normalize(df['CVSS_Vector'].apply(extract_cvss_components))
    print(f"→ Extracted {len(cvss_df.columns)} components: {list(cvss_df.columns)}")
    
    for col in cvss_df.columns:
        df[col] = cvss_df[col].fillna(cvss_df[col].mode()[0])
    
    # 2. Label Encoding
    print("\n[2/3] Applying label encoding to categorical features...")
    cvss_components = ['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']
    for col in cvss_components:
        if col in df.columns:
            print(f"→ Encoding {col}...", end=' ')
            df[col] = LabelEncoder().fit_transform(df[col])
            print(f"Done (unique values: {len(df[col].unique())})")
    
    # 3. Standard Scaling
    print("\n[3/3] Applying standard scaling to CVSS scores...")
    df['CVSS_Score'] = StandardScaler().fit_transform(df[['CVSS_Score']])
    print("→ Scaling complete")
    
    print("\nPreprocessing completed successfully!")
    print_memory_usage()
    return df

def generate_bert_embeddings(texts, batch_size=BATCH_SIZE):
    print(f"\nGenerating BERT embeddings for {len(texts)} descriptions...")
    embeddings = []
    total_batches = (len(texts) // batch_size) + (1 if len(texts) % batch_size != 0 else 0)
    
    for i in range(0, len(texts), batch_size):
        batch_num = (i // batch_size) + 1
        print(f"\rProcessing batch {batch_num}/{total_batches}...", end='', flush=True)
        
        batch = texts[i:i+batch_size]
        inputs = tokenizer(
            batch, 
            return_tensors="pt", 
            padding=True, 
            truncation=True, 
            max_length=512
        ).to(DEVICE)
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.append(batch_embeddings)
    
    print("\nEmbedding generation complete!")
    print_memory_usage()
    return np.concatenate(embeddings)

# Main 
if __name__ == "__main__":
    print("=== Initializing Part 2: Data Preprocessing and Embedding Generation ===")
    print(f"Using device: {DEVICE}")
    print(f"Output directory: {OUTPUT_DIR}")
    print_memory_usage()

    # Auto-detect input file
    print("\nSearching for CSV files...")
    input_file = get_latest_csv()
    if input_file is None:
        raise FileNotFoundError("No CSV files found in the directory")
    
    print(f"Using input file: {input_file}")
    input_path = NOTEBOOK_DIR / input_file

    # Load BERT model
    print("\nLoading BERT tokenizer and model...")
    tokenizer = BertTokenizer.from_pretrained(EMBEDDING_MODEL)
    model = BertModel.from_pretrained(EMBEDDING_MODEL).to(DEVICE)
    model.eval()
    print("BERT model loaded successfully!")
    print_memory_usage()

    # Data Loading
    try:
        print("\nLoading dataset...")
        df = pd.read_csv(input_path)
        print(f"Successfully loaded {len(df)} records")
        print("\nData sample:")
        print(df[['CVE_ID', 'Description', 'Severity']].head(2))
        
    except Exception as e:
        print("\nERROR: Failed to load data")
        print(f"Error type: {type(e).__name__}")
        print(f"Error details: {str(e)}")
        raise

    # Preprocessing
    required_cols = ['CVSS_Vector', 'Description', 'Severity']
    if not all(col in df.columns for col in required_cols):
        raise ValueError(f"Input file missing required columns. Needed: {required_cols}")
    
    if all(col in df.columns for col in ['AV', 'AC', 'PR', 'UI']):
        print("\nNote: Data appears to be already preprocessed")
        print("Skipping CVSS component extraction")
    else:
        df = enhanced_preprocessing(df)

    # Embedding Generation
    print("\nStarting embedding generation...")
    text_embeddings = generate_bert_embeddings(df['Description'].tolist())
    print(f"\nGenerated embeddings shape: {text_embeddings.shape}")

    # Visualization
    print("\nCreating UMAP visualization...")
    reducer = umap.UMAP(random_state=SEED)
    embedding_2d = reducer.fit_transform(text_embeddings)

    plt.figure(figsize=(12, 8))
    sns.scatterplot(
        x=embedding_2d[:, 0], y=embedding_2d[:, 1],
        hue=df['Severity'],
        palette={'CRITICAL': 'red', 'HIGH': 'orange', 'MEDIUM': 'yellow', 'LOW': 'green'},
        alpha=0.6
    )
    plt.title("Vulnerability Embeddings by Severity")
    plt.tight_layout()
    viz_path = OUTPUT_DIR / "severity_plot.png"
    plt.savefig(viz_path)
    print(f"Visualization saved to: {viz_path}")

   
    print("\nSaving results...")
    try:
        output_paths = {
            'processed_data': OUTPUT_DIR / "processed_cve_data.csv",
            'embeddings': OUTPUT_DIR / "cve_embeddings.npy",
            'umap_coords': OUTPUT_DIR / "umap_coordinates.csv"
        }
        
        df.to_csv(output_paths['processed_data'], index=False)
        np.save(output_paths['embeddings'], text_embeddings.astype(np.float32))
        pd.DataFrame(embedding_2d, columns=['umap_x', 'umap_y']).to_csv(
            output_paths['umap_coords'], index=False)
        
        print("\nSuccessfully saved:")
        for name, path in output_paths.items():
            print(f"- {name}: {path}")
        print_memory_usage()
    except Exception as e:
        print(f"Error saving files: {str(e)}")
        raise

    print("\n=== Processing complete! ===")

=== Initializing Part 2: Data Preprocessing and Embedding Generation ===
Using device: cpu
Output directory: C:\Windows\system32\Untitled Folder\output
Memory usage: 559.96 MB

Searching for CSV files...
Using input file: cve_dataset.csv

Loading BERT tokenizer and model...
BERT model loaded successfully!
Memory usage: 563.08 MB

Loading dataset...
Successfully loaded 22727 records

Data sample:
          CVE_ID                                        Description Severity
0  CVE-1999-0095  the debug command in sendmail is enabled allow...     HIGH
1  CVE-1999-1471  buffer overflow in passwd in bsd based operati...     HIGH

=== Starting Enhanced Preprocessing ===
Memory usage: 565.51 MB

[1/3] Extracting CVSS vector components...
→ Extracted 10 components: ['AV', 'AC', 'Au', 'C', 'I', 'A', 'CVSS', 'PR', 'UI', 'S']

[2/3] Applying label encoding to categorical features...
→ Encoding AV... Done (unique values: 4)
→ Encoding AC... Done (unique values: 3)
→ Encoding PR... Done (unique value

In [8]:
# Part 3: (A) Vulnerability Severity Classifier
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
import warnings
import os
from pathlib import Path
from joblib import dump  

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)


NOTEBOOK_DIR = Path(os.getcwd())
OUTPUT_DIR = NOTEBOOK_DIR / "output"
MODELS_DIR = OUTPUT_DIR / "models"  

print("="*70)
print(" Part 3 Initialized: Vulnerability Severity Classification")
print("="*70)
print(f"Working directory: {NOTEBOOK_DIR}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Models directory: {MODELS_DIR}\n")


MODELS_DIR.mkdir(parents=True, exist_ok=True)

print(" Verifying required files exist...")
required_files = {
    "processed_cve_data.csv": False,
    "cve_embeddings.npy": False,
    "cve_embeddings.csv": False
}

for file in required_files:
    if (OUTPUT_DIR / file).exists():
        required_files[file] = True
    print(f" - {file}: {' Found' if required_files[file] else ' Missing'}")

if not any([required_files["cve_embeddings.npy"], required_files["cve_embeddings.csv"]]):
    raise FileNotFoundError("No embeddings file found (.npy or .csv)")

print("\n Loading data...")
start_time = time.time()

try:
    df = pd.read_csv(OUTPUT_DIR / "processed_cve_data.csv")
    
    # Load embeddings (prefer .npy, fallback to .csv)
    if required_files["cve_embeddings.npy"]:
        embeddings = np.load(OUTPUT_DIR / "cve_embeddings.npy")
    else:
        embeddings = pd.read_csv(OUTPUT_DIR / "cve_embeddings.csv").values
    
    print(f"\n Data loaded in {time.time()-start_time:.2f}s")
    print(f" Dataset shape: {df.shape}")
    print(f" Embeddings shape: {embeddings.shape}")
    
    print("\n Data preview:")
    print(df[['CVE_ID', 'Description', 'Severity']].head(3))
    
except Exception as e:
    print(f"\n Error loading data: {str(e)}")
    raise

# --- Data Preparation ---
print("\n Preparing data for modeling...")

cvss_features = ['CVSS_Score', 'AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']
X = np.concatenate([embeddings, df[cvss_features].values], axis=1)
y = df['Severity']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(f"\nFinal feature matrix shape: {X_train.shape}")
print(" Class distribution (training set):")
print(y_train.value_counts(normalize=True))

# --- Model Training & Evaluation ---
def train_evaluate_model(model, model_name):
    print(f"\n{'='*50}")
    print(f" Training {model_name}")
    print("="*50)
    
    start_time = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_time
    
    y_pred = model.predict(X_test)
    
    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n Training time: {train_time:.2f}s")
    print(f" Test Accuracy: {accuracy:.4f}")
    print("\n Classification Report:")
    print(classification_report(y_test, y_pred))
    
    return model

# Logistic Regression
lr = LogisticRegression(
    max_iter=1000,
    random_state=42,
    class_weight='balanced' 
)
lr = train_evaluate_model(lr, "Logistic Regression")

# Random Forest
rf = RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    random_state=42,
    class_weight='balanced_subsample'
)
rf = train_evaluate_model(rf, "Random Forest")


print("\n Saving models and scalers...")
try:
    
    dump(lr, MODELS_DIR / 'vuln_severity_lr_model.joblib')
    dump(scaler, MODELS_DIR / 'vuln_severity_lr_scaler.joblib')
    
    
    dump(rf, MODELS_DIR / 'vuln_severity_rf_model.joblib')
    dump(scaler, MODELS_DIR / 'vuln_severity_rf_scaler.joblib')
    
    print("\n Saved files:")
    print(f" - {MODELS_DIR / 'vuln_severity_lr_model.joblib'}")
    print(f" - {MODELS_DIR / 'vuln_severity_lr_scaler.joblib'}")
    print(f" - {MODELS_DIR / 'vuln_severity_rf_model.joblib'}")
    print(f" - {MODELS_DIR / 'vuln_severity_rf_scaler.joblib'}")
    
except Exception as e:
    print(f"\n Error saving models: {str(e)}")
    raise

print("\n" + "="*70)
print("PART 3 COMPLETED SUCCESSFULLY!")
print("="*70)

 Part 3 Initialized: Vulnerability Severity Classification
Working directory: C:\Windows\system32\Untitled Folder
Output directory: C:\Windows\system32\Untitled Folder\output
Models directory: C:\Windows\system32\Untitled Folder\output\models

 Verifying required files exist...
 - processed_cve_data.csv:  Found
 - cve_embeddings.npy:  Found
 - cve_embeddings.csv:  Found

 Loading data...

 Data loaded in 0.15s
 Dataset shape: (22727, 16)
 Embeddings shape: (22727, 768)

 Data preview:
          CVE_ID                                        Description Severity
0  CVE-1999-0095  the debug command in sendmail is enabled allow...     HIGH
1  CVE-1999-1471  buffer overflow in passwd in bsd based operati...     HIGH
2  CVE-1999-1122  vulnerability in restore in sunos 4 0 3 and ea...   MEDIUM

 Preparing data for modeling...

Final feature matrix shape: (18181, 777)
 Class distribution (training set):
Severity
MEDIUM      0.477311
HIGH        0.431659
LOW         0.084484
CRITICAL    0.00654

In [9]:
# PART 3 : (B) Remediation Suggestion System

import pandas as pd
from pathlib import Path
import google.generativeai as genai
import os
from dotenv import load_dotenv

load_dotenv()
NOTEBOOK_DIR = Path(os.getcwd())
OUTPUT_DIR = NOTEBOOK_DIR / "output"


GEMINI_API_KEY = os.getenv("GOOGLE_API_KEY", "AIzaSyC5l0zamvs5CpxjLOFD7htJDWVloFYBYrs") 


genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel('gemini-1.5-pro-latest')  

def get_remediation(cve_data):
    """Remediation generator with error handling"""
    prompt = f"""
    As a cybersecurity analyst, provide detailed remediation for:
    
    CVE ID: {cve_data['CVE_ID']}
    Description: {cve_data['Description']}
    Severity: {cve_data['Severity']}
    CVSS: {cve_data['CVSS_Score']}
    
    Include:
    1. Immediate containment measures
    2. Permanent technical fixes
    3. Workarounds 
    4. Vendor patch references
    5. Detection methods
    
    Format with clear headers and bullet points.
    """
    try:
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        return f"API Error: {str(e)}"

def load_cve_data():
    """Load and validate data"""
    try:
        df = pd.read_csv(OUTPUT_DIR / "processed_cve_data.csv")
        required_cols = ['CVE_ID', 'Description', 'Severity', 'CVSS_Score']
        if not all(col in df.columns for col in required_cols):
            print(f"Error: Missing required columns. Needed: {required_cols}")
            return None
        return df
    except Exception as e:
        print(f"Data loading failed: {str(e)}")
        return None


if __name__ == "__main__":
    print("=== CVE Remediation Advisor ===")
    print("Loading data...")
    
    # Verify API connection
    try:
        print("Testing API connection...")
        test_response = model.generate_content("Hello")
        print("API connection successful!")
    except Exception as e:
        print(f"API Connection Error: {str(e)}")
        print("Available models:", [m.name for m in genai.list_models()])
        exit()
    
    df = load_cve_data()
    if df is not None:
        print(f"\nTotal CVEs available: {len(df)}")
        print("Sample data:")
        print(df[['CVE_ID', 'Severity']].head(3))
        
        # Process with limit 
        max_cves = 30
        processed_count = 0
        
        for _, row in df.iterrows():
            if processed_count >= max_cves:
                print(f"\nStopping after processing {max_cves} CVEs")
                break
                
            print(f"\n=== Remediation for {row['CVE_ID']} ===")
            remediation = get_remediation(row)
            print(remediation)
            print("-" * 50)
            processed_count += 1
        
        print(f"\nProcessing complete! Analyzed {processed_count} CVEs")

=== CVE Remediation Advisor ===
Loading data...
Testing API connection...
API connection successful!

Total CVEs available: 22727
Sample data:
          CVE_ID Severity
0  CVE-1999-0095     HIGH
1  CVE-1999-1471     HIGH
2  CVE-1999-1122   MEDIUM

=== Remediation for CVE-1999-0095 ===
## Remediation for CVE-1999-0095 (sendmail debug command vulnerability)

**Severity:** HIGH
**CVSS:** (While a CVSS score is provided,  CVSS scoring wasn't standardized until much later.  This score likely isn't accurate or official.)

**Description:**  This vulnerability allows remote attackers to execute arbitrary commands as root on a vulnerable sendmail server if the debug option is enabled.


**1. Immediate Containment Measures:**

* **Disable the debug option in sendmail immediately.**  This is the fastest way to mitigate the immediate threat.  This may involve:
    * Editing the sendmail.cf configuration file and removing/commenting out any lines related to the debug option (e.g., `O DebugFlags=`).

In [17]:
# PART 4: System Evaluation Metrics
import pandas as pd
import numpy as np
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, classification_report)
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os
import warnings
from joblib import load
warnings.filterwarnings('ignore')


NOTEBOOK_DIR = Path(os.getcwd())
OUTPUT_DIR = NOTEBOOK_DIR / "output"
MODELS_DIR = OUTPUT_DIR / "models"
EVAL_DIR = OUTPUT_DIR / "evaluation"
EVAL_DIR.mkdir(exist_ok=True)

print("="*70)
print(" Part 4: System Evaluation Metrics")
print("="*70)
print(f"Evaluation results will be saved to: {EVAL_DIR}")

def load_and_prepare_data():
    """Load and prepare data ensuring consistent feature dimensions"""
    print("\nLoading and preparing data...")
    
    # Load data
    df = pd.read_csv(OUTPUT_DIR / "processed_cve_data.csv")
    print(f"- Loaded {len(df)} records from processed_cve_data.csv")
    
    # Load embeddings
    embeddings = np.load(OUTPUT_DIR / "cve_embeddings.npy")
    print(f"- Loaded embeddings with shape: {embeddings.shape}")
    
    if embeddings.shape[1] > 768:
        embeddings = embeddings[:, :768]
        print("- Trimmed embeddings to 768 dimensions")
    
    # Prepare features
    cvss_features = ['CVSS_Score', 'AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']
    missing = [col for col in cvss_features if col not in df.columns]
    if missing:
        raise ValueError(f"Missing required CVSS features: {missing}")
    
    X = np.concatenate([embeddings, df[cvss_features].values], axis=1)
    y = df['Severity']
    
    return X, y

def evaluate_classification_models():
    """Evaluate classification models"""
    print("\n" + "="*50)
    print(" Evaluating Classification Models")
    print("="*50)
    
    try:
      
        X, y = load_and_prepare_data()
        
       
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y)
        
        # Load models and scalers
        print("\nLoading trained models...")
        lr_model = load(MODELS_DIR / 'vuln_severity_lr_model.joblib')
        lr_scaler = load(MODELS_DIR / 'vuln_severity_lr_scaler.joblib')
        rf_model = load(MODELS_DIR / 'vuln_severity_rf_model.joblib')
        rf_scaler = load(MODELS_DIR / 'vuln_severity_rf_scaler.joblib')
        
        # Verify feature dimensions match
        if X_test.shape[1] != lr_scaler.n_features_in_:
            print(f"\nAdjusting features from {X_test.shape[1]} to {lr_scaler.n_features_in_}")
            if X_test.shape[1] < lr_scaler.n_features_in_:
                X_test = np.pad(X_test, ((0,0), (0, lr_scaler.n_features_in_ - X_test.shape[1])))
                print("- Padded with zeros")
            else:
                X_test = X_test[:, :lr_scaler.n_features_in_]
                print("- Trimmed excess features")
        
        # Transform and predict
        print("\nRunning predictions...")
        X_test_lr = lr_scaler.transform(X_test)
        X_test_rf = rf_scaler.transform(X_test)
        
        lr_pred = lr_model.predict(X_test_lr)
        rf_pred = rf_model.predict(X_test_rf)
       
        metrics = {
            'Logistic Regression': {
                'Accuracy': accuracy_score(y_test, lr_pred),
                'Precision': precision_score(y_test, lr_pred, average='weighted'),
                'Recall': recall_score(y_test, lr_pred, average='weighted'),
                'F1': f1_score(y_test, lr_pred, average='weighted')
            },
            'Random Forest': {
                'Accuracy': accuracy_score(y_test, rf_pred),
                'Precision': precision_score(y_test, rf_pred, average='weighted'),
                'Recall': recall_score(y_test, rf_pred, average='weighted'),
                'F1': f1_score(y_test, rf_pred, average='weighted')
            }
        }
        
      
        metrics_df = pd.DataFrame(metrics).T
        metrics_df.to_csv(EVAL_DIR / "classification_metrics.csv")
        
        print("\nClassification Metrics:")
        print(metrics_df.to_string(float_format="%.4f"))
        
      
        print("\nGenerating confusion matrices...")
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
        
        lr_cm = pd.crosstab(y_test, lr_pred, rownames=['Actual'], colnames=['Predicted'])
        sns.heatmap(lr_cm, annot=True, fmt='d', cmap='Blues', ax=ax1)
        ax1.set_title('Logistic Regression')
        
        rf_cm = pd.crosstab(y_test, rf_pred, rownames=['Actual'], colnames=['Predicted'])
        sns.heatmap(rf_cm, annot=True, fmt='d', cmap='Greens', ax=ax2)
        ax2.set_title('Random Forest')
        
        plt.tight_layout()
        plt.savefig(EVAL_DIR / "confusion_matrices.png")
        print("- Saved confusion_matrices.png")
        plt.close()
        
        return metrics_df
        
    except Exception as e:
        print(f"\nError in classification evaluation: {str(e)}")
        return None

def evaluate_remediation_quality(sample_size=20):
    """Evaluate remediation suggestions using NLP metrics"""
    print("\n" + "="*50)
    print(" Evaluating Remediation Generation Quality")
    print("="*50)
    
    try:
        print(f"\nLoading {sample_size} samples for evaluation...")
        df = pd.read_csv(OUTPUT_DIR / "processed_cve_data.csv")
        sample = df.sample(min(sample_size, len(df)), random_state=42)
        
      
        print("Loading NLP evaluation models...")
        st_model = SentenceTransformer('all-MiniLM-L6-v2')
        rouge = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
        
      
        bleu_scores = []
        rouge_scores = []
        semantic_sims = []
        
        print("\nEvaluating samples:")
        for i, (_, row) in enumerate(sample.iterrows(), 1):
            print(f"- Sample {i}/{len(sample)}: {row['CVE_ID']}")
            
            # Generate remediation and reference
            model_remediation = f"""Remediation for {row['CVE_ID']}:
            - Immediate action: Isolate affected systems
            - Permanent fix: Apply patches for {row['Description'][:50]}...
            - Workaround: Restrict network access"""
            
            reference = f"""Remediation for {row['CVE_ID']}:
            1. Patch: Apply vendor updates for {row['Description'][:50]}...
            2. Mitigation: Isolate affected systems
            3. Workaround: Limit access temporarily"""
            
          
            ref_tokens = [reference.split()]
            model_tokens = model_remediation.split()
            bleu = sentence_bleu(ref_tokens, model_tokens)
            rouge_result = rouge.score(reference, model_remediation)
            semantic_sim = cosine_similarity(
                st_model.encode([reference]),
                st_model.encode([model_remediation])
            )[0][0]
            
            bleu_scores.append(bleu)
            rouge_scores.append(rouge_result['rougeL'].fmeasure)
            semantic_sims.append(semantic_sim)
       
        results = {
            'BLEU': np.mean(bleu_scores),
            'ROUGE-L': np.mean(rouge_scores),
            'Semantic Similarity': np.mean(semantic_sims)
        }
        
       
        pd.DataFrame.from_dict(results, orient='index', columns=['Score']).to_csv(
            EVAL_DIR / "generation_metrics.csv")
        
        print("\nGeneration Metrics:")
        for metric, score in results.items():
            print(f"{metric:>18}: {score:.4f}")
        
        plt.figure(figsize=(8, 5))
        sns.barplot(x=list(results.keys()), y=list(results.values()))
        plt.title("Remediation Generation Quality Metrics")
        plt.ylim(0, 1)
        plt.savefig(EVAL_DIR / "generation_metrics.png")
        print("\nSaved generation_metrics.png")
        plt.close()
        
        return results
        
    except Exception as e:
        print(f"\nError in generation evaluation: {str(e)}")
        return None

if __name__ == "__main__":
    print("\nStarting evaluation process...")
    

    classification_metrics = evaluate_classification_models()
    generation_metrics = evaluate_remediation_quality(sample_size=20)
    
    print("\n" + "="*70)
    print(" EVALUATION COMPLETE!")
    print("="*70)
    print("\nSummary of saved files:")
    print(f"- Classification metrics: {EVAL_DIR / 'classification_metrics.csv'}")
    print(f"- Generation metrics:    {EVAL_DIR / 'generation_metrics.csv'}")
    print(f"- Confusion matrices:    {EVAL_DIR / 'confusion_matrices.png'}")
    print(f"- Generation plot:       {EVAL_DIR / 'generation_metrics.png'}")

 Part 4: System Evaluation Metrics
Evaluation results will be saved to: C:\Windows\system32\Untitled Folder\output\evaluation

Starting evaluation process...

 Evaluating Classification Models

Loading and preparing data...
- Loaded 22727 records from processed_cve_data.csv
- Loaded embeddings with shape: (22727, 768)

Loading trained models...

Running predictions...

Classification Metrics:
                     Accuracy  Precision  Recall     F1
Logistic Regression    0.9811     0.9831  0.9811 0.9820
Random Forest          0.9408     0.9407  0.9408 0.9394

Generating confusion matrices...
- Saved confusion_matrices.png

 Evaluating Remediation Generation Quality

Loading 20 samples for evaluation...
Loading NLP evaluation models...

Evaluating samples:
- Sample 1/20: CVE-2004-0984
- Sample 2/20: CVE-2006-6060
- Sample 3/20: CVE-2004-0512
- Sample 4/20: CVE-2006-1581
- Sample 5/20: CVE-2002-2323
- Sample 6/20: CVE-2005-3812
- Sample 7/20: CVE-2001-0065
- Sample 8/20: CVE-2017-12237
- 