# AI-Powered Threat Hunting with Machine Learning

This notebook provides advanced threat hunting capabilities using:
- **Machine Learning** for anomaly detection and behavior analysis
- **Claude AI** for hypothesis generation and pattern analysis
- **Multi-source data correlation** from security tools
- **MITRE ATT&CK framework** mapping
- **Automated hunting workflows**

## Hunting Techniques
1. Behavioral anomaly detection using ML
2. Network traffic pattern analysis
3. User and entity behavior analytics (UEBA)
4. Threat intelligence correlation
5. Lateral movement detection
6. Data exfiltration detection

In [None]:
# Import required libraries
import json
import os
import sys
from datetime import datetime, timedelta, timezone

import boto3
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from anthropic import Anthropic
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Add local modules to path
sys.path.append('/home/sagemaker-user/lib')

# Import security tool integrations
from security_integrations.crowdstrike_client import CrowdStrikeClient
from security_integrations.microsoft_client import MicrosoftSecurityClient
from security_integrations.proofpoint_client import ProofpointClient
from security_integrations.threat_hunting import ThreatHuntingEngine

# Plot defaults for the notebook: seaborn's 'whitegrid' style plus a default
# figure size that applies to figures created without an explicit figsize.
default_figsize = (14, 8)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = default_figsize

# Reaching this line means every import above succeeded.
print("‚úì Libraries imported successfully")

## 1. Initialize Security Data Sources

In [None]:
# Build one client per security platform. The constructors are called with no
# arguments, so credentials/configuration must come from wherever those
# classes look by default (not visible in this notebook).
crowdstrike = CrowdStrikeClient()
microsoft = MicrosoftSecurityClient()
proofpoint = ProofpointClient()

# The hunting engine receives all three clients; the later cells only talk to
# the engine, never to the clients directly.
hunting_engine = ThreatHuntingEngine(
    crowdstrike=crowdstrike,
    microsoft=microsoft,
    proofpoint=proofpoint
)

# Set hunting time window (last 7 days)
hunt_days = 7

# datetime.utcnow() is deprecated since Python 3.12. Take a timezone-aware UTC
# "now" instead, then drop tzinfo before formatting so isoformat() does not
# emit "+00:00" in front of the literal 'Z' suffix. The resulting string has
# the same shape as before, e.g. "2024-01-01T12:00:00.123456Z".
window_start = datetime.now(timezone.utc) - timedelta(days=hunt_days)
start_time = window_start.replace(tzinfo=None).isoformat() + 'Z'

print(f"‚úì Threat hunting engine initialized")
print(f"Hunting window: Last {hunt_days} days (from {start_time})")

## 2. Data Collection and Aggregation

Collect security data from all sources for analysis

In [None]:
print("üîç Collecting security data from all sources...\n")

# Collect endpoint data from CrowdStrike
print("Collecting CrowdStrike data...")
cs_data = await hunting_engine.collect_crowdstrike_data(
    start_time=start_time,
    include_detections=True,
    include_host_activity=True,
    include_network_activity=True
)
print(f"  - Detections: {len(cs_data['detections'])}")
print(f"  - Hosts monitored: {len(cs_data['hosts'])}")
print(f"  - Network events: {len(cs_data['network_events'])}")

# Collect identity data from Microsoft
print("\nCollecting Microsoft data...")
ms_data = await hunting_engine.collect_microsoft_data(
    start_time=start_time,
    include_sign_ins=True,
    include_alerts=True,
    include_risky_users=True
)
print(f"  - Sign-in events: {len(ms_data['sign_ins'])}")
print(f"  - Alerts: {len(ms_data['alerts'])}")
print(f"  - Risky users: {len(ms_data['risky_users'])}")

# Collect email threat data from Proofpoint
print("\nCollecting Proofpoint data...")
pp_data = await hunting_engine.collect_proofpoint_data(
    interval=f"P{hunt_days}D"
)
print(f"  - Threat events: {pp_data['total_events']}")
print(f"  - Top clickers: {len(pp_data['top_clickers'])}")
print(f"  - VAP users: {len(pp_data['vap_users'])}")

# Aggregate all data
aggregated_data = hunting_engine.aggregate_data(cs_data, ms_data, pp_data)

print(f"\n‚úì Data collection complete")
print(f"Total events aggregated: {len(aggregated_data)}")

## 3. Behavioral Anomaly Detection

Use machine learning to detect anomalous behavior patterns

In [None]:
print("ü§ñ Running ML-based anomaly detection...\n")

# Turn the aggregated events into a feature table for the model.
features_df = hunting_engine.prepare_ml_features(aggregated_data)

print(f"Feature matrix shape: {features_df.shape}")
print(f"Features: {', '.join(features_df.columns)}\n")

# --- User behavior anomaly detection -----------------------------------------
print("Detecting user behavior anomalies...")
user_feature_columns = [
    'login_hour',
    'login_country_diversity',
    'failed_login_count',
    'successful_login_count',
    'ip_diversity',
    'application_diversity',
]
# Missing feature values are replaced with 0 before fitting.
user_features = features_df[user_feature_columns].fillna(0)

# Fit the Isolation Forest and label every row in one pass. contamination=0.1
# and the fixed random_state are the notebook's chosen settings.
iso_forest = IsolationForest(
    n_estimators=100,
    contamination=0.1,
    random_state=42,
)
user_anomalies = iso_forest.fit_predict(user_features)

# Store the per-row score on the feature table, then keep only the rows the
# model labelled -1, ordered by ascending score.
features_df['user_anomaly_score'] = iso_forest.score_samples(user_features)
flagged_mask = user_anomalies == -1
anomalous_users = features_df[flagged_mask].sort_values('user_anomaly_score')

print(f"Anomalous users detected: {len(anomalous_users)}\n")

# Show the first ten rows of the sorted result.
print("Top 10 Anomalous User Behaviors:")
print("=" * 80)
for _, user_row in anomalous_users.head(10).iterrows():
    # 'user_principal_name' may be absent from the feature table, hence .get().
    print(f"\nUser: {user_row.get('user_principal_name', 'Unknown')}")
    print(f"  Anomaly Score: {user_row['user_anomaly_score']:.4f}")
    print(f"  Failed Logins: {user_row['failed_login_count']:.0f}")
    print(f"  Country Diversity: {user_row['login_country_diversity']:.0f}")
    print(f"  IP Diversity: {user_row['ip_diversity']:.0f}")
    print(f"  Unusual Login Hours: {user_row['login_hour']:.1f}")

## 4. Network Traffic Pattern Analysis

In [None]:
print("üåê Analyzing network traffic patterns...\n")

# Network events gathered from CrowdStrike; an absent key is treated as
# "no events".
network_data = cs_data.get('network_events', [])
bytes_per_mb = 1024 * 1024

if not network_data:
    # Note: beaconing_results / exfiltration_results are NOT defined on this
    # path; later cells guard their use with `if network_data`.
    print("No network data available for analysis")
else:
    network_df = pd.DataFrame(network_data)

    # --- Beaconing (possible C2 communication) ------------------------------
    print("Detecting beaconing patterns...")
    beaconing_results = hunting_engine.detect_beaconing(
        network_df,
        time_threshold=60,  # seconds
        count_threshold=10,
    )

    if not beaconing_results:
        print("No beaconing patterns detected")
    else:
        print(f"\n‚ö†Ô∏è Potential C2 beaconing detected: {len(beaconing_results)} patterns")
        # Only the first five patterns are printed.
        for beacon_hit in beaconing_results[:5]:
            print(f"\n  Host: {beacon_hit['host']}")
            print(f"  Destination: {beacon_hit['destination_ip']}:{beacon_hit['destination_port']}")
            print(f"  Connection frequency: Every {beacon_hit['avg_interval']:.1f} seconds")
            print(f"  Connection count: {beacon_hit['connection_count']}")

    # --- Data exfiltration ---------------------------------------------------
    print("\nDetecting potential data exfiltration...")
    exfiltration_results = hunting_engine.detect_data_exfiltration(
        network_df,
        byte_threshold=10 * bytes_per_mb,  # 10 MB
    )

    if not exfiltration_results:
        print("No suspicious data exfiltration detected")
    else:
        print(f"\n‚ö†Ô∏è Potential data exfiltration detected: {len(exfiltration_results)} events")
        # Only the first five events are printed; bytes are shown in MB.
        for exfil_event in exfiltration_results[:5]:
            print(f"\n  Host: {exfil_event['host']}")
            print(f"  Destination: {exfil_event['destination_ip']}")
            print(f"  Bytes transferred: {exfil_event['bytes_out'] / bytes_per_mb:.2f} MB")
            print(f"  Timestamp: {exfil_event['timestamp']}")

## 5. Lateral Movement Detection

In [None]:
print("üîÄ Detecting lateral movement...\n")

# Build network graph of host-to-host communications. A missing
# 'network_events' key is treated as an empty event list.
G = hunting_engine.build_network_graph(cs_data.get('network_events', []))

print(f"Network graph: {G.number_of_nodes()} hosts, {G.number_of_edges()} connections")

# Detect lateral movement patterns
lateral_movement = hunting_engine.detect_lateral_movement(
    G,
    min_connections=3,
    time_window_hours=1
)

if lateral_movement:
    print(f"\n‚ö†Ô∏è Potential lateral movement detected: {len(lateral_movement)} patterns\n")

    # Summarize the first five patterns, listing at most five targets each.
    for i, pattern in enumerate(lateral_movement[:5], 1):
        print(f"{i}. Source: {pattern['source_host']}")
        print(f"   Targets: {', '.join(pattern['target_hosts'][:5])}")
        if len(pattern['target_hosts']) > 5:
            print(f"   ... and {len(pattern['target_hosts']) - 5} more")
        print(f"   Protocol: {pattern['protocol']}")
        print(f"   Risk Score: {pattern['risk_score']}/10")
        print()

    # Visualize lateral movement. We are already inside the
    # `if lateral_movement:` branch, so no second emptiness check is needed.
    fig, ax = plt.subplots(figsize=(12, 8))

    # Create subgraph for the first lateral movement pattern: its source host
    # plus up to ten of its targets.
    pattern = lateral_movement[0]
    source_host = pattern['source_host']
    nodes = [source_host] + pattern['target_hosts'][:10]
    subgraph = G.subgraph(nodes)

    # Draw network graph
    pos = nx.spring_layout(subgraph)
    nx.draw_networkx_nodes(subgraph, pos, node_color='lightblue',
                           node_size=500, ax=ax)
    # Re-draw the source host larger and in red on top of the base layer.
    # G.subgraph() silently drops nodes that are not in G, so only highlight
    # the source when it actually made it into the subgraph; otherwise the
    # draw call would fail on a node that has no layout position.
    if source_host in subgraph:
        nx.draw_networkx_nodes(subgraph, pos, nodelist=[source_host],
                               node_color='red', node_size=700, ax=ax)
    nx.draw_networkx_edges(subgraph, pos, edge_color='gray',
                           arrows=True, ax=ax)
    nx.draw_networkx_labels(subgraph, pos, font_size=8, ax=ax)

    ax.set_title('Lateral Movement Pattern Visualization')
    ax.axis('off')
    plt.tight_layout()
    plt.show()
else:
    print("No lateral movement patterns detected")

## 6. AI-Powered Hypothesis Generation

Use Claude AI to generate threat hunting hypotheses based on findings

In [None]:
print("ü§ñ Generating AI-powered hunting hypotheses...\n")

# Compile findings for AI analysis
findings = {
    'anomalous_users': anomalous_users.head(10).to_dict('records'),
    'beaconing': beaconing_results if network_data else [],
    'lateral_movement': lateral_movement if lateral_movement else [],
    'risky_users': ms_data.get('risky_users', []),
    'top_clickers': pp_data.get('top_clickers', []),
}

# Generate hypotheses with Claude
hypotheses = await hunting_engine.generate_hunting_hypotheses(findings)

print("=" * 80)
print("AI-GENERATED THREAT HUNTING HYPOTHESES")
print("=" * 80)

for i, hypothesis in enumerate(hypotheses, 1):
    print(f"\n{i}. {hypothesis['title']}")
    print(f"   Priority: {hypothesis['priority']}")
    print(f"   MITRE ATT&CK: {', '.join(hypothesis['mitre_tactics'])}")
    print(f"   \n   Description: {hypothesis['description']}")
    print(f"   \n   Hunting Approach:")
    for step in hypothesis['hunting_steps']:
        print(f"     ‚Ä¢ {step}")
    print(f"   \n   Expected Indicators:")
    for indicator in hypothesis['expected_indicators']:
        print(f"     ‚Ä¢ {indicator}")

## 7. Execute Hypothesis-Driven Hunt

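Run an automated hunt for one of the hypotheses generated above. Change `hypothesis_index` in the cell below to investigate a different hypothesis.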

In [None]:
# Select hypothesis to investigate (user can change this)
hypothesis_index = 0  # First hypothesis

if hypotheses:
    selected_hypothesis = hypotheses[hypothesis_index]
    
    print(f"üéØ Executing hunt for: {selected_hypothesis['title']}\n")
    print("=" * 80)
    
    # Execute automated hunt
    hunt_results = await hunting_engine.execute_hunt(
        hypothesis=selected_hypothesis,
        data_sources={
            'crowdstrike': cs_data,
            'microsoft': ms_data,
            'proofpoint': pp_data
        }
    )
    
    # Display results
    print(f"\nHunt Status: {hunt_results['status']}")
    print(f"Evidence Found: {len(hunt_results['evidence'])} items\n")
    
    if hunt_results['evidence']:
        print("EVIDENCE DISCOVERED:")
        print("=" * 80)
        
        for i, evidence in enumerate(hunt_results['evidence'][:10], 1):
            print(f"\n{i}. {evidence['description']}")
            print(f"   Source: {evidence['source']}")
            print(f"   Timestamp: {evidence['timestamp']}")
            print(f"   Confidence: {evidence['confidence']}%")
            if evidence.get('indicators'):
                print(f"   Indicators: {', '.join(evidence['indicators'])}")
        
        # AI summary of findings
        print("\n" + "=" * 80)
        print("AI ANALYSIS OF FINDINGS")
        print("=" * 80)
        print(hunt_results['ai_summary'])
        
        # Recommended actions
        print("\n" + "=" * 80)
        print("RECOMMENDED ACTIONS")
        print("=" * 80)
        for action in hunt_results['recommended_actions']:
            print(f"‚Ä¢ {action}")
    else:
        print("No evidence found for this hypothesis.")
else:
    print("No hypotheses available")

## 8. MITRE ATT&CK Mapping

In [None]:
print("üó∫Ô∏è Mapping findings to MITRE ATT&CK Framework...\n")

# Collect everything to be mapped. hunt_results is only read when hypotheses
# exist; falsy lateral-movement results are passed as [].
mitre_findings = {
    'anomalies': anomalous_users.to_dict('records'),
    'lateral_movement': lateral_movement or [],
    'hunt_results': hunt_results if hypotheses else {},
}
mitre_mapping = hunting_engine.map_to_mitre_attack(findings=mitre_findings)

# Textual listing: one heading per tactic, one bullet per technique.
print("MITRE ATT&CK Techniques Observed:")
print("=" * 80)

for tactic_name, observed in mitre_mapping.items():
    print(f"\n{tactic_name.upper()}:")
    for entry in observed:
        print(f"  ‚Ä¢ {entry['id']}: {entry['name']}")
        print(f"    Evidence count: {entry['evidence_count']}")
        print(f"    Confidence: {entry['confidence']}%")

# Heatmap of the same mapping; skipped entirely when there is nothing to plot.
mitre_df = hunting_engine.create_mitre_heatmap_data(mitre_mapping)

if not mitre_df.empty:
    # NOTE(review): fmt='d' formats annotations as integers, so mitre_df is
    # presumably all-integer; confirm against create_mitre_heatmap_data.
    heatmap_options = dict(
        annot=True,
        fmt='d',
        cmap='YlOrRd',
        cbar_kws={'label': 'Evidence Count'},
    )
    plt.figure(figsize=(14, 8))
    sns.heatmap(mitre_df, **heatmap_options)
    plt.title('MITRE ATT&CK Technique Coverage')
    plt.xlabel('Techniques')
    plt.ylabel('Tactics')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

## 9. Generate Threat Hunting Report

In [None]:
print("üìù Generating threat hunting report...\n")

# Generate comprehensive report from the outputs of the earlier cells.
# beaconing_results and hunt_results are only read when the cells that define
# them actually produced them (network data present / hypotheses present).
report = hunting_engine.generate_hunting_report(
    hunt_period=f"Last {hunt_days} days",
    anomalies=anomalous_users,
    network_analysis={
        'beaconing': beaconing_results if network_data else [],
        'lateral_movement': lateral_movement if lateral_movement else []
    },
    hypotheses=hypotheses,
    hunt_results=hunt_results if hypotheses else {},
    mitre_mapping=mitre_mapping
)

print("=" * 80)
print("THREAT HUNTING REPORT")
print("=" * 80)
print(report['content'])

# Save report
s3_client = boto3.client('s3')
bucket_name = os.getenv('SAGEMAKER_NOTEBOOKS_BUCKET', 'sagemaker-infosec-notebooks')

# datetime.utcnow() is deprecated since Python 3.12; an aware UTC "now"
# yields the same YYYY-MM-DD date string.
# NOTE(review): the key contains only the date, so two reports saved on the
# same UTC day use the same key.
report_date = datetime.now(timezone.utc).strftime('%Y-%m-%d')
report_key = f"threat-hunting-reports/{report_date}.md"

s3_client.put_object(
    Bucket=bucket_name,
    Key=report_key,
    Body=report['content'],
    ContentType='text/markdown'
)

print(f"\n‚úì Report saved to s3://{bucket_name}/{report_key}")

## 10. Save ML Models for Future Use

In [None]:
import joblib

# Persist the fitted Isolation Forest locally, then copy it to S3.
print("üíæ Saving ML models...\n")

model_dir = '/tmp/threat-hunting-models'
os.makedirs(model_dir, exist_ok=True)

# Local file the model is serialized to before upload.
local_model_path = f'{model_dir}/user_anomaly_detector.pkl'
joblib.dump(iso_forest, local_model_path)
print("‚úì Saved user anomaly detection model")

# The destination bucket name is derived from the notebooks bucket by
# replacing 'notebooks' with 'models'. If bucket_name does not contain
# 'notebooks', the name is unchanged and the upload goes to that same bucket.
models_bucket = bucket_name.replace('notebooks', 'models')
s3_client.upload_file(
    Filename=local_model_path,
    Bucket=models_bucket,
    Key='threat-hunting/user_anomaly_detector.pkl',
)

print("‚úì Models uploaded to S3")
print("\nModels can be loaded in future sessions for real-time threat detection")