# In [4]:
import boto3
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, timezone
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import os
import json
import concurrent.futures

# --- Configuration ---
# Average CPU utilisation, in percent, intended as the cut-off below which a
# running instance is considered underutilised.
CPU_THRESHOLD = 10  # %
# AWS regions that the per-region checks are run against.
REGIONS = ['us-west-2', 'us-east-1', 'eu-west-1', 'ap-south-1', 'eu-central-1']
# Directory that receives the per-scenario CSV files; created up front so the
# checks can write into it without testing for its existence.
os.makedirs('scenario_csvs', exist_ok=True)

# --- Scenario 1: Security Group Misconfigurations ---
# TCP/UDP ports whose exposure to the whole internet is reported.
_SENSITIVE_PORTS = (22, 3389, 80, 443)
# CIDR blocks meaning "any address" for IPv4 and IPv6 respectively.
_WORLD_OPEN_CIDRS = ('0.0.0.0/0', '::/0')


def _exposed_sensitive_ports(perm):
    """Return the sensitive ports that the ingress permission *perm* covers."""
    protocol = perm.get('IpProtocol')
    if protocol == '-1':
        # "All traffic" rules carry no FromPort/ToPort but open every port.
        return list(_SENSITIVE_PORTS)
    if protocol not in ('tcp', 'udp', '6', '17'):
        # For ICMP the From/To fields hold type/code, not port numbers.
        return []
    from_port = perm.get('FromPort')
    if from_port is None:
        return []
    to_port = perm.get('ToPort')
    if to_port is None:
        to_port = from_port
    return [port for port in _SENSITIVE_PORTS if from_port <= port <= to_port]


def _world_open_cidrs(perm):
    """Return the world-open IPv4/IPv6 CIDR blocks listed on *perm*."""
    cidrs = [r.get('CidrIp') for r in perm.get('IpRanges', [])]
    cidrs += [r.get('CidrIpv6') for r in perm.get('Ipv6Ranges', [])]
    return [cidr for cidr in cidrs if cidr in _WORLD_OPEN_CIDRS]


def _find_risky_rules(security_groups, region):
    """Build one report row per (group, open CIDR, sensitive port) combination."""
    risky_rules = []
    for sg in security_groups:
        for perm in sg.get('IpPermissions', []):
            ports = _exposed_sensitive_ports(perm)
            if not ports:
                continue
            for cidr in _world_open_cidrs(perm):
                for port in ports:
                    risky_rules.append({
                        'region': region,
                        'group_id': sg['GroupId'],
                        'port': port,
                        'protocol': perm.get('IpProtocol'),
                        'cidr': cidr,
                        'group_name': sg.get('GroupName', ''),
                    })
    return risky_rules


def check_security_groups(region):
    """Report security-group ingress rules that open sensitive ports to anyone.

    Every security group in *region* is inspected. A rule is reported when it
    allows traffic from ``0.0.0.0/0`` or ``::/0`` and its port range (or an
    all-traffic ``-1`` protocol) covers port 22, 3389, 80 or 443. Findings are
    written to ``scenario_csvs/security_group_misconfigs_<region>.csv``.

    Args:
        region: AWS region name, e.g. ``'us-east-1'``.

    Returns:
        None. Results are written to disk; failures are printed, not raised,
        so that one failing region does not abort the other checks.
    """
    output_path = f'scenario_csvs/security_group_misconfigs_{region}.csv'
    try:
        ec2 = boto3.client('ec2', region_name=region)
        # Paginate so that accounts with many groups are scanned completely.
        paginator = ec2.get_paginator('describe_security_groups')
        security_groups = [
            sg
            for page in paginator.paginate()
            for sg in page['SecurityGroups']
        ]

        risky_rules = _find_risky_rules(security_groups, region)

        if risky_rules:
            df = pd.DataFrame(risky_rules)
            df.to_csv(output_path, index=False)
        else:
            # Drop findings left over from an earlier run; the merge step
            # picks up every per-region file it finds in the directory.
            if os.path.exists(output_path):
                os.remove(output_path)
            print(f"✅ No risky security group rules found in {region}.")
    except Exception as e:
        print(f"❌ Error in security group check for {region}: {e}")

# --- Scenario 2: EC2 Underutilization + AI Ethical Recommendation ---
def _collect_instance_metrics(ec2, cw, region):
    """Return one dict per running instance with its 7-day average CPU usage."""
    now = datetime.now(timezone.utc)
    # Paginate so that large fleets are not cut off after the first page.
    paginator = ec2.get_paginator('describe_instances')
    pages = paginator.paginate(
        Filters=[{'Name': 'instance-state-name', 'Values': ['running']}]
    )

    instance_data = []
    for page in pages:
        for reservation in page['Reservations']:
            for instance in reservation['Instances']:
                instance_id = instance['InstanceId']
                launch_time = instance['LaunchTime']
                if launch_time.tzinfo is None:
                    # Make the timestamp aware so it can be subtracted from `now`.
                    launch_time = launch_time.replace(tzinfo=timezone.utc)
                tags = {t['Key']: t['Value'] for t in instance.get('Tags', [])}

                metrics = cw.get_metric_statistics(
                    Namespace='AWS/EC2',
                    MetricName='CPUUtilization',
                    Dimensions=[{'Name': 'InstanceId', 'Value': instance_id}],
                    StartTime=now - timedelta(days=7),
                    EndTime=now,
                    Period=3600,
                    Statistics=['Average'],
                )
                datapoints = metrics['Datapoints']
                # NOTE(review): an instance without any datapoints is treated
                # as 0% CPU and therefore as idle - confirm this is intended.
                avg_cpu = (
                    float(np.mean([dp['Average'] for dp in datapoints]))
                    if datapoints else 0.0
                )

                instance_data.append({
                    'region': region,
                    'instance_id': instance_id,
                    'instance_type': instance['InstanceType'],
                    'launch_time': launch_time,
                    'running_time_hours': round(
                        (now - launch_time).total_seconds() / 3600, 2
                    ),
                    'avg_cpu': round(avg_cpu, 2),
                    'tag_name': tags.get('Name', 'unknown'),
                })
    return instance_data


def _recommend(row):
    """Map a prediction and its confidence to a recommendation string."""
    if row['prediction'] == 0:
        if row['confidence'] > 0.85:
            return 'suggest_termination'
        if row['confidence'] > 0.65:
            return 'consider_downgrade'
    return 'keep'


def _score_instances(df):
    """Add label, prediction, confidence and recommendation columns to *df*."""
    # 0 = underutilised, 1 = in use.
    df['label'] = (df['avg_cpu'] >= CPU_THRESHOLD).astype(int)
    df['instance_type_enc'] = LabelEncoder().fit_transform(df['instance_type'])
    X = df[['avg_cpu', 'instance_type_enc', 'running_time_hours']]
    y = df['label']

    if y.nunique() > 1:
        clf = RandomForestClassifier(n_estimators=100, random_state=42)
        clf.fit(X, y)
        df['prediction'] = clf.predict(X)
        # Confidence is the probability of the *predicted* class. Using the
        # probability of class 1 would keep it below 0.5 for every "idle"
        # prediction, so no termination/downgrade advice could ever fire.
        df['confidence'] = clf.predict_proba(X).max(axis=1).round(2)
    else:
        # A classifier cannot be trained on a single class; the label is a
        # direct function of the threshold, so it is taken at face value.
        df['prediction'] = y
        df['confidence'] = 1.0

    df['recommendation'] = df.apply(_recommend, axis=1)
    return df


def check_ec2_instances(region):
    """Find underutilised running EC2 instances in *region* and recommend actions.

    For every running instance the hourly ``CPUUtilization`` averages of the
    last 7 days are fetched from CloudWatch and averaged. Instances whose
    average is below ``CPU_THRESHOLD`` are labelled idle (0), the others in
    use (1). When both labels occur, a random forest is fitted on the data.
    Each row gets a recommendation of ``'suggest_termination'``,
    ``'consider_downgrade'`` or ``'keep'``. The result is written to
    ``scenario_csvs/predicted_unused_<region>.csv``.

    Args:
        region: AWS region name, e.g. ``'us-east-1'``.

    Returns:
        None. Results are written to disk; failures are printed, not raised,
        so that one failing region does not abort the other checks.
    """
    output_path = f'scenario_csvs/predicted_unused_{region}.csv'
    try:
        ec2 = boto3.client('ec2', region_name=region)
        cw = boto3.client('cloudwatch', region_name=region)
        instance_data = _collect_instance_metrics(ec2, cw, region)

        if instance_data:
            df = _score_instances(pd.DataFrame(instance_data))
            df.to_csv(output_path, index=False)
        else:
            # Drop results left over from an earlier run; the merge step
            # picks up every per-region file it finds in the directory.
            if os.path.exists(output_path):
                os.remove(output_path)
            print(f"✅ No EC2 instances found in {region}.")

    except Exception as e:
        print(f"❌ Error in EC2 analysis for {region}: {e}")

# --- Scenario 3: IAM Permission Overreach ---
def _has_wildcard(value):
    """Return True if a policy Action/Resource value is, or contains, ``'*'``."""
    if isinstance(value, str):
        return value == '*'
    if isinstance(value, list):
        return '*' in value
    return False


def _load_policy_statements(iam, policy_arn):
    """Fetch the default version of a managed policy and return its statements as a list."""
    version = iam.get_policy(PolicyArn=policy_arn)['Policy']['DefaultVersionId']
    document = iam.get_policy_version(
        PolicyArn=policy_arn, VersionId=version
    )['PolicyVersion']['Document']
    statements = document['Statement']
    # A policy with a single statement may hold a dict instead of a list.
    if isinstance(statements, dict):
        statements = [statements]
    return statements


def analyze_iam_permissions():
    """Report IAM users whose attached managed policies grant wildcard access.

    Every managed policy attached directly to a user is inspected. A risk row
    is recorded for each non-Deny statement whose ``Action`` is or contains
    ``'*'``, and another for each whose ``Resource`` is or contains ``'*'``.
    Findings are written to ``scenario_csvs/iam_permission_risks.csv``.

    Returns:
        None. Results are written to disk; failures are printed, not raised.
    """
    output_path = 'scenario_csvs/iam_permission_risks.csv'
    try:
        iam = boto3.client('iam')
        user_pages = iam.get_paginator('list_users').paginate()
        policy_paginator = iam.get_paginator('list_attached_user_policies')

        # Several users usually share a policy; fetch each document only once.
        statements_by_arn = {}
        risks = []
        for user_page in user_pages:
            for user in user_page['Users']:
                user_name = user['UserName']
                for policy_page in policy_paginator.paginate(UserName=user_name):
                    for p in policy_page['AttachedPolicies']:
                        policy_arn = p['PolicyArn']
                        if policy_arn not in statements_by_arn:
                            statements_by_arn[policy_arn] = _load_policy_statements(
                                iam, policy_arn
                            )

                        for s in statements_by_arn[policy_arn]:
                            # A Deny statement restricts access, so a wildcard
                            # in it is not a permission overreach.
                            if s.get('Effect') == 'Deny':
                                continue
                            if _has_wildcard(s.get('Action', '')):
                                risks.append({
                                    'user': user_name,
                                    'policy_name': p['PolicyName'],
                                    'policy_arn': policy_arn,
                                    'issue': 'Wildcard action (*) found',
                                })
                            if _has_wildcard(s.get('Resource', '')):
                                risks.append({
                                    'user': user_name,
                                    'policy_name': p['PolicyName'],
                                    'policy_arn': policy_arn,
                                    'issue': 'Wildcard resource (*) found',
                                })

        if risks:
            df = pd.DataFrame(risks)
            df.to_csv(output_path, index=False)
        else:
            # Drop findings left over from an earlier run so they are not
            # copied to the top-level report again.
            if os.path.exists(output_path):
                os.remove(output_path)
            print("✅ No IAM permission risks found.")
    except Exception as e:
        print(f"❌ Error while analyzing IAM permissions: {e}")

# --- Merge Scenario Files Safely ---
def safe_read_csv(filepath):
    """Load *filepath* as a DataFrame, tolerating empty or unreadable files.

    Args:
        filepath: Path of the CSV file to load.

    Returns:
        The parsed DataFrame, or None when the file is zero bytes long or
        when inspecting/parsing it raises (the error is printed).
    """
    frame = None
    try:
        has_content = os.path.getsize(filepath) > 0
        if has_content:
            frame = pd.read_csv(filepath)
    except Exception as e:
        print(f"⚠️ Failed to read {filepath}: {e}")
    return frame

def _merge_prefixed_csvs(prefix, output_path, directory='scenario_csvs'):
    """Concatenate every non-empty ``<prefix>*.csv`` in *directory* into *output_path*."""
    # Sort the listing: os.listdir order is arbitrary, and the merged file
    # should have the same row order on every run.
    filenames = sorted(
        name for name in os.listdir(directory)
        if name.startswith(prefix) and name.endswith('.csv')
    )
    frames = [safe_read_csv(f'{directory}/{name}') for name in filenames]
    frames = [df for df in frames if df is not None and not df.empty]
    if frames:
        pd.concat(frames, ignore_index=True).to_csv(output_path, index=False)


def merge_scenario_csvs():
    """Combine the per-region scenario CSVs into one top-level CSV per scenario.

    Reads from ``scenario_csvs/`` and writes, when there is data:

    * ``security_group_misconfigs.csv`` from ``security_group_misconfigs_*``
    * ``predicted_unused_instances.csv`` from ``predicted_unused_*``
    * ``iam_permission_risks.csv`` as a copy of the single IAM report

    Files that are missing, empty or unreadable are skipped.
    """
    _merge_prefixed_csvs('security_group_misconfigs_', 'security_group_misconfigs.csv')
    _merge_prefixed_csvs('predicted_unused_', 'predicted_unused_instances.csv')

    iam_file = 'scenario_csvs/iam_permission_risks.csv'
    # Checked first so that a missing IAM report does not print a read warning.
    if os.path.exists(iam_file):
        df_iam = safe_read_csv(iam_file)
        if df_iam is not None and not df_iam.empty:
            df_iam.to_csv('iam_permission_risks.csv', index=False)

    print("✅ All scenario outputs are merged and saved as top-level CSVs.")

# --- Run All ---
def run_all_scenarios():
    """Run every scenario check, then merge the results.

    The two per-region checks are fanned out over a thread pool for all
    entries of REGIONS. Once the pool has drained, the IAM analysis and the
    CSV merge run sequentially.
    """
    print("🚀 Starting checks in parallel for regions...")
    regional_checks = (check_security_groups, check_ec2_instances)
    with concurrent.futures.ThreadPoolExecutor() as pool:
        # Queue all security-group jobs first, then all EC2 jobs; leaving the
        # `with` block waits until every queued job has finished.
        for regional_check in regional_checks:
            for region in REGIONS:
                pool.submit(regional_check, region)

    analyze_iam_permissions()
    merge_scenario_csvs()
    print("🏁 All scenario checks completed.")

# --- Main Entry Point ---
# Run the checks only when this file is executed directly, not when it is
# imported as a module.
if __name__ == '__main__':
    run_all_scenarios()


# --- Captured output of the run above ---
# 🚀 Starting checks in parallel for regions...
# ✅ No EC2 instances found in us-west-2.
# ✅ No risky security group rules found in eu-west-1.
# ✅ No risky security group rules found in us-west-2.
# ✅ No risky security group rules found in eu-central-1.
# ✅ No EC2 instances found in eu-west-1.
# ✅ No risky security group rules found in ap-south-1.
# ✅ No EC2 instances found in eu-central-1.
# ✅ No EC2 instances found in ap-south-1.
# ✅ No IAM permission risks found.
# ✅ All scenario outputs are merged and saved as top-level CSVs.
# 🏁 All scenario checks completed.
