<a href="https://colab.research.google.com/github/Ananya-Ahuja/Anomalyze/blob/ananya_ahuja/Financial_Banking_IPDR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import pytz

In [2]:
# Timezone for India (Asia/Kolkata); used below to localize the naive
# timestamps produced by get_financial_time_pattern.
IST = pytz.timezone('Asia/Kolkata')

def get_financial_time_pattern(start_dt, duration_days):
    """
    Draw one IST-localized session start timestamp with a banking-style hour mix.

    Hour buckets and the share of draws each receives:
    - 60%: peak banking hours, 09:00-11:59 and 15:00-17:59
    - 30%: off-peak hours, 12:00-14:59 and 18:00-19:59
    - 10%: odd (anomalous-looking) hours, 20:00-08:59

    Args:
        start_dt: datetime for the first day of the window. It is handed to
            ``IST.localize`` after adjustment, so it is expected to be naive.
        duration_days: length of the window in days; the day offset is drawn
            uniformly from ``0 .. duration_days - 1``.

    Returns:
        The datetime returned by ``IST.localize`` for the sampled day, hour,
        minute and second.
    """
    peak_hours = (9, 10, 11, 15, 16, 17)
    offpeak_hours = (12, 13, 14, 18, 19)
    odd_hours = tuple(range(0, 9)) + tuple(range(20, 24))

    # One uniform draw selects the bucket; the hour is then uniform within it.
    roll = random.random()
    if roll >= 0.9:
        bucket = odd_hours
    elif roll >= 0.6:
        bucket = offpeak_hours
    else:
        bucket = peak_hours
    hour = random.choice(bucket)

    # Draw order (day, minute, second) is kept fixed so seeded runs reproduce.
    chosen_day = start_dt + timedelta(days=random.randint(0, duration_days - 1))
    minute = random.randint(0, 59)
    second = random.randint(0, 59)
    return IST.localize(chosen_day.replace(hour=hour, minute=minute, second=second))

In [3]:
def generate_bank_src_ip():
    """
    Build a random customer-side IPv4 address as a dotted string.

    The first octet comes from a fixed set of prefixes (49, 14, 117, 203, 182)
    meant to resemble national broadband ISP ranges; each of the remaining
    three octets is drawn uniformly from 1-254.
    """
    prefix_templates = (
        "49.{}.{}.{}",
        "14.{}.{}.{}",
        "117.{}.{}.{}",
        "203.{}.{}.{}",
        "182.{}.{}.{}",
    )
    # Pick the prefix first, then the three trailing octets, in that order.
    chosen = random.choice(prefix_templates)
    octets = [random.randint(1, 254) for _ in range(3)]
    return chosen.format(*octets)

In [4]:
def generate_bank_dest_ip():
    """
    Build a random server-side IPv4 address as a dotted string.

    The first three octets come from a fixed list of prefixes that this
    notebook uses to stand in for major Indian bank servers (HDFC, ICICI, BOI);
    the final octet is drawn uniformly from 1-254.
    """
    subnet_templates = (
        "103.120.104.{}", "103.120.105.{}", "103.120.106.{}", "103.120.107.{}",
        "203.171.210.{}", "203.171.211.{}", "203.189.92.{}", "203.27.235.{}",
        "103.109.134.{}", "103.109.135.{}", "103.183.72.{}", "103.183.73.{}",
    )
    # Subnet is chosen before the host octet.
    chosen = random.choice(subnet_templates)
    host_octet = random.randint(1, 254)
    return chosen.format(host_octet)

In [5]:
def inject_financial_anomalies(records, anomaly_ratio=0.05):
    """
    Relabel a random subset of records as anomalies and distort their fields.

    Each selected record gets ``is_anomaly = 1`` and one of four anomaly types:
    - ``rapid_transactions``: very short session (1-10 s) with tiny volumes
      (bot-like login attempts)
    - ``large_single_tx``: downlink volume multiplied by 20, duration at least 300 s
    - ``odd_hour_activity``: start hour moved into 02:00-05:59
    - ``suspicious_destination``: destination replaced by a 45.32-63.x.x address

    Whenever a type rewrites volumes or duration, the dependent fields
    ``total_volume`` and ``end_time`` are re-derived so every record stays
    internally consistent (total == uplink + downlink, end == start + duration).

    Args:
        records: list of record dicts; the dicts are mutated in place.
        anomaly_ratio: fraction of records to turn into anomalies. The count is
            ``int(len(records) * anomaly_ratio)``.

    Returns:
        The same ``records`` list that was passed in.

    Raises:
        ValueError: if the ratio selects fewer than 0 or more than
            ``len(records)`` records.
    """
    time_fmt = "%Y-%m-%d %H:%M:%S"

    n = int(len(records) * anomaly_ratio)
    if not 0 <= n <= len(records):
        # random.sample would raise ValueError here anyway; fail with a clearer message.
        raise ValueError(
            f"anomaly_ratio={anomaly_ratio!r} selects {n} of {len(records)} records; "
            "it must select between 0 and len(records) records"
        )

    for i in random.sample(range(len(records)), n):
        rec = records[i]
        typ = random.choice([
            'rapid_transactions', 'large_single_tx', 'odd_hour_activity', 'suspicious_destination'
        ])
        rec['is_anomaly'] = 1
        rec['anomaly_type'] = typ

        if typ == 'rapid_transactions':
            # many tiny sessions in short time
            rec['session_duration'] = random.randint(1, 10)
            rec['uplink_volume'] = random.randint(100, 1000)
            rec['downlink_volume'] = random.randint(200, 2000)

        elif typ == 'large_single_tx':
            # one huge data volume (e.g., bulk file download)
            rec['downlink_volume'] *= 20
            rec['session_duration'] = max(rec['session_duration'], 300)

        elif typ == 'odd_hour_activity':
            # move the start into the 02:00-05:59 window and shift the end with it
            moved = datetime.strptime(rec['start_time'], time_fmt).replace(
                hour=random.randint(2, 5)
            )
            rec['start_time'] = moved.strftime(time_fmt)
            rec['end_time'] = (
                moved + timedelta(seconds=rec['session_duration'])
            ).strftime(time_fmt)

        elif typ == 'suspicious_destination':
            rec['dest_ip'] = f"45.{random.randint(32,63)}.{random.randint(1,254)}.{random.randint(1,254)}"

        if typ in ('rapid_transactions', 'large_single_tx'):
            # Volumes and/or duration changed: refresh the fields derived from them.
            # Only fields the record already carries are touched, so minimal
            # records without timing/total columns keep their original schema.
            if 'total_volume' in rec:
                rec['total_volume'] = rec['uplink_volume'] + rec['downlink_volume']
            if 'start_time' in rec and 'end_time' in rec:
                began = datetime.strptime(rec['start_time'], time_fmt)
                rec['end_time'] = (
                    began + timedelta(seconds=rec['session_duration'])
                ).strftime(time_fmt)
    return records

In [6]:
def generate_financial_ipdr_dataset(
    num_records=10000,
    start_date="2024-01-01",
    duration_days=30,
    seed=42,
    anomaly_ratio=0.05
):
    """
    Generate a synthetic Financial/Banking network IPDR dataset.

    Each record is one TCP session to port 443 or 8443 with:
    - a start time sampled by ``get_financial_time_pattern``
    - a session duration drawn uniformly from 60-600 seconds
    - uplink volume of 500-20,000 bytes and downlink volume of 1,000-50,000 bytes
    - source/destination addresses from the customer and bank IP generators
    After generation, ``anomaly_ratio`` of the records are turned into labelled
    anomalies by ``inject_financial_anomalies``.

    Args:
        num_records: number of sessions to generate; 0 yields an empty frame
            that still carries every column.
        start_date: first day of the window, formatted ``YYYY-MM-DD``.
        duration_days: number of days over which sessions are spread.
        seed: value used to seed both ``random`` and ``numpy.random`` (global
            state) so that runs are reproducible.
        anomaly_ratio: fraction of records converted to anomalies (default 5%).

    Returns:
        pandas.DataFrame with the IPDR fields, the ``is_anomaly`` /
        ``anomaly_type`` labels, and the derived columns ``bytes_per_second``,
        ``hour`` and ``is_peak_hour``.
    """
    time_fmt = "%Y-%m-%d %H:%M:%S"
    # Explicit column list keeps the schema stable even when no rows are generated.
    columns = [
        "src_ip", "src_port", "dest_ip", "dest_port", "protocol",
        "start_time", "end_time", "session_duration",
        "uplink_volume", "downlink_volume", "total_volume",
        "is_anomaly", "anomaly_type",
    ]

    random.seed(seed)
    np.random.seed(seed)
    start_dt = datetime.strptime(start_date, "%Y-%m-%d")

    records = []
    for _ in range(num_records):
        # Session timing
        start_time = get_financial_time_pattern(start_dt, duration_days)
        session_duration = random.randint(60, 600)  # 1–10 minutes, uniform
        end_time = start_time + timedelta(seconds=session_duration)

        # Data volumes: small query vs statement download
        uplink = random.randint(500, 20000)        # 0.5KB–20KB
        downlink = random.randint(1000, 50000)     # 1KB–50KB

        rec = {
            # IPDR fields
            "src_ip": generate_bank_src_ip(),
            "src_port": random.randint(1024, 65535),
            "dest_ip": generate_bank_dest_ip(),
            "dest_port": random.choice([443, 8443]),   # HTTPS/secure
            "protocol": "TCP",
            "start_time": start_time.strftime(time_fmt),
            "end_time": end_time.strftime(time_fmt),
            "session_duration": session_duration,
            "uplink_volume": uplink,
            "downlink_volume": downlink,
            "total_volume": uplink + downlink,
            # Labels for testing
            "is_anomaly": 0,
            "anomaly_type": "normal"
        }
        records.append(rec)

    # Inject anomalies
    records = inject_financial_anomalies(records, anomaly_ratio=anomaly_ratio)

    df = pd.DataFrame(records, columns=columns)

    # Anomaly injection may rewrite volumes, durations and start times, so the
    # dependent columns are recomputed from the primary fields before any
    # feature is derived from them.
    start_ts = pd.to_datetime(df['start_time'], format=time_fmt)
    df['end_time'] = (
        start_ts + pd.to_timedelta(df['session_duration'], unit='s')
    ).dt.strftime(time_fmt)
    df['total_volume'] = df['uplink_volume'] + df['downlink_volume']

    # Derived features
    df['bytes_per_second'] = df['total_volume'] / df['session_duration']
    df['hour'] = start_ts.dt.hour
    df['is_peak_hour'] = df['hour'].between(9, 11) | df['hour'].between(15, 17)
    return df

if __name__ == "__main__":
    # Generate 10,000 records using the generator's default start date,
    # window length and seed.
    df_fin = generate_financial_ipdr_dataset(num_records=10000)
    # Summary for a quick sanity check: row count, anomaly count and share,
    # column names, and the first five rows.
    print("Financial/Banking IPDR Dataset")
    print("Total records:", len(df_fin))
    print("Anomalies:", df_fin['is_anomaly'].sum(), f"({df_fin['is_anomaly'].mean()*100:.1f}%)")
    print("Columns:", df_fin.columns.tolist())
    print(df_fin.head())
    # Write the dataset to the current working directory; index=False keeps the
    # DataFrame row index out of the CSV.
    df_fin.to_csv("financial_banking_ipdr_dataset.csv", index=False)

Financial/Banking IPDR Dataset
Total records: 10000
Anomalies: 500 (5.0%)
Columns: ['src_ip', 'src_port', 'dest_ip', 'dest_port', 'protocol', 'start_time', 'end_time', 'session_duration', 'uplink_volume', 'downlink_volume', 'total_volume', 'is_anomaly', 'anomaly_type', 'bytes_per_second', 'hour', 'is_peak_hour']
           src_ip  src_port          dest_ip  dest_port protocol  \
0  49.174.190.229     36765  103.120.105.152       8443      TCP   
1  182.108.57.115     39642  203.171.210.208        443      TCP   
2    117.27.24.98      7362  203.171.211.217       8443      TCP   
3  182.76.213.161     41559  203.171.211.148        443      TCP   
4    49.98.72.117     42684   203.171.211.42       8443      TCP   

            start_time             end_time  session_duration  uplink_volume  \
0  2024-01-24 12:17:15  2024-01-24 12:22:03               288           5072   
1  2024-01-07 09:14:32  2024-01-07 09:15:59                87          18890   
2  2024-01-23 13:27:21  2024-01-23 13