<a href="https://colab.research.google.com/github/Ananya-Ahuja/Anomalies/blob/main/mac_ip_mismatch_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

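# MAC-IP Mismatch Dataset

This notebook generates two synthetic CSV files: a table of DHCP lease records, and a firewall log in which a fraction of the records carry a source MAC address that does not match the DHCP lease for the source IP (flagged in the `Is Anomaly` column).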

## DHCP Lease Records

In [1]:
import pandas as pd
import random
from datetime import datetime, timedelta
import ipaddress

### Helper Functions for creating the columns of the dataset

In [2]:
def random_ip():
    """Return a random address from 192.168.1.1 through 192.168.1.50 as a string."""
    host_octet = random.randint(1, 50)
    return f"192.168.1.{host_octet}"

In [3]:
def random_mac():
    """Return a random unicast MAC address as six colon-separated hex octets.

    The least-significant bit of the first octet (the individual/group bit)
    is cleared so the address is always an individual (unicast) address.
    A group/multicast address is not a valid hardware address for a single
    device, so it must never appear as a device or source MAC.

    Returns
    -------
    str
        Lower-case address such as ``'3e:a1:00:7f:c2:19'``.
    """
    octets = [random.randint(0, 255) for _ in range(6)]
    # Clear the I/G bit: 0 = individual (unicast), 1 = group (multicast).
    octets[0] &= 0xFE
    return ':'.join(f'{octet:02x}' for octet in octets)

In [4]:
def random_port():
    """Return a random port number between 1 and 65535, both inclusive."""
    lowest_port, highest_port = 1, 65535
    return random.randint(lowest_port, highest_port)

In [5]:
def random_protocol():
    """Pick one of 'TCP', 'UDP' or 'ICMP' uniformly at random."""
    protocols = ['TCP', 'UDP', 'ICMP']
    return random.choice(protocols)

In [6]:
def random_action():
    """Pick one of 'allow', 'deny', 'drop' or 'reset' uniformly at random."""
    actions = ['allow', 'deny', 'drop', 'reset']
    return random.choice(actions)

In [7]:
def random_bytes():
    """Return a random byte count between 40 and 1500, both inclusive."""
    smallest, largest = 40, 1500
    return random.randint(smallest, largest)

### Main Function of Dataset Generator

In [8]:
def generate_dhcp_leases(num_leases=200, num_devices=50):
    """Generate synthetic DHCP lease records.

    A pool of ``num_devices`` devices is created first. Each device gets one
    distinct MAC address and one hostname, and keeps both across all of its
    leases. Every lease then pairs a randomly chosen device with a random IP
    address and a random lease window.

    Parameters
    ----------
    num_leases : int, default 200
        Number of lease rows to generate. Zero or a negative value yields an
        empty frame that still carries the expected columns.
    num_devices : int, default 50
        Number of distinct devices (MAC addresses) the leases are drawn from.

    Returns
    -------
    pandas.DataFrame
        Columns ``'MAC Address'``, ``'IP Address'``, ``'Lease Start'``,
        ``'Lease End'`` and ``'Hostname'``. Lease times are ISO-8601 strings.

    Raises
    ------
    ValueError
        If leases are requested but ``num_devices`` is less than 1.

    Notes
    -----
    Lease starts are drawn 0-700 hours after a base time 30 days before the
    call, and each lease lasts 1-72 hours. No attempt is made to keep two
    leases for the same IP address from overlapping in time.
    """
    columns = ['MAC Address', 'IP Address', 'Lease Start', 'Lease End', 'Hostname']
    if num_leases <= 0:
        # Keep the schema so downstream column lookups still work.
        return pd.DataFrame(columns=columns)
    if num_devices < 1:
        raise ValueError("num_devices must be at least 1 when leases are requested")

    base_time = datetime.now() - timedelta(days=30)

    # One hostname per device. The dict keeps insertion order and silently
    # rejects a duplicate MAC, so exactly num_devices distinct devices result.
    devices = {}
    while len(devices) < num_devices:
        devices.setdefault(random_mac(), f"host-{random.randint(1000,9999)}")
    macs = list(devices)

    data = []
    for _ in range(num_leases):
        mac = random.choice(macs)
        ip = random_ip()
        lease_start = base_time + timedelta(hours=random.randint(0, 700))
        lease_end = lease_start + timedelta(hours=random.randint(1, 72))
        data.append({
            'MAC Address': mac,
            'IP Address': ip,
            'Lease Start': lease_start.isoformat(),
            'Lease End': lease_end.isoformat(),
            'Hostname': devices[mac],
        })
    return pd.DataFrame(data, columns=columns)

In [9]:
# Seed the RNG so the random MAC/IP/hostname draws repeat on every
# Restart & Run All (lease times still depend on the current clock).
SEED = 42
random.seed(SEED)

dhcp_df = generate_dhcp_leases(200)
dhcp_df.to_csv('dhcp_lease_records.csv', index=False)
print("DHCP lease records saved to dhcp_lease_records.csv")

DHCP lease records saved to dhcp_lease_records.csv


## Enhanced Firewall Log with MAC Address

### Helper functions for generating the columns

In [10]:
# Helper functions are the same as above

In [11]:
def generate_firewall_log_with_mac(dhcp_df, num_records=1000, anomaly_ratio=0.1):
    """Generate a synthetic firewall log labelled against DHCP lease records.

    For every record a source IP is picked, then one of that IP's leases, and
    the record's timestamp is drawn from inside that lease window. A normal
    record carries the MAC of that lease, so a lease for the (IP, MAC) pair
    is active at the record's timestamp. An anomalous record instead carries
    a MAC that never held any lease on that IP, so its label cannot contradict
    the lease table.

    Parameters
    ----------
    dhcp_df : pandas.DataFrame
        Lease table with columns ``'IP Address'``, ``'MAC Address'``,
        ``'Lease Start'`` and ``'Lease End'``; the two time columns must be
        ISO-8601 strings.
    num_records : int, default 1000
        Number of log rows to generate. Zero or a negative value yields an
        empty frame that still carries the expected columns.
    anomaly_ratio : float, default 0.1
        Probability that a record is turned into a MAC/IP mismatch.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by time, with columns ``'Timestamp'`` (ISO-8601 string),
        ``'Source IP'``, ``'Destination IP'``, ``'Source Port'``,
        ``'Destination Port'``, ``'Protocol'``, ``'Action'``, ``'Bytes'``,
        ``'Source MAC'``, ``'Destination MAC'`` and ``'Is Anomaly'``.

    Raises
    ------
    ValueError
        If records are requested but ``dhcp_df`` has no rows, or if a lease
        ends before it starts.

    Notes
    -----
    * If every known MAC has held a lease on the chosen IP there is no MAC
      left to spoof with, and the record is emitted as normal.
    * Destination IP and destination MAC are drawn independently at random
      from the lease table and are not checked against each other.
    * ``'Is Anomaly'`` is ground truth for validation/testing only.
    """
    columns = [
        'Timestamp', 'Source IP', 'Destination IP', 'Source Port',
        'Destination Port', 'Protocol', 'Action', 'Bytes',
        'Source MAC', 'Destination MAC', 'Is Anomaly',
    ]
    if num_records <= 0:
        # Keep the schema so downstream column lookups still work.
        return pd.DataFrame(columns=columns)
    if dhcp_df.empty:
        raise ValueError("dhcp_df must contain at least one lease record")

    # Build lease lookup: {ip: [(start, end, mac), ...]}
    lease_dict = {}
    for ip, mac, start, end in zip(dhcp_df['IP Address'], dhcp_df['MAC Address'],
                                   dhcp_df['Lease Start'], dhcp_df['Lease End']):
        lease_dict.setdefault(ip, []).append(
            (datetime.fromisoformat(start), datetime.fromisoformat(end), mac)
        )
    all_ips = list(lease_dict)
    all_macs = list(dict.fromkeys(dhcp_df['MAC Address']))

    # Per IP, the MACs that never leased it: the only unambiguous spoof candidates.
    foreign_macs = {}
    for ip, leases in lease_dict.items():
        leased_here = {mac for _, _, mac in leases}
        foreign_macs[ip] = [mac for mac in all_macs if mac not in leased_here]

    records = []
    for _ in range(num_records):
        ip = random.choice(all_ips)
        lease_start, lease_end, expected_mac = random.choice(lease_dict[ip])
        # Place the event inside the lease so the expected MAC really is the
        # holder of this IP at the record's timestamp.
        span_seconds = int((lease_end - lease_start).total_seconds())
        timestamp = lease_start + timedelta(seconds=random.randint(0, span_seconds))

        candidates = foreign_macs[ip]
        is_anomaly = bool(candidates) and random.random() < anomaly_ratio
        src_mac = random.choice(candidates) if is_anomaly else expected_mac

        records.append({
            'Timestamp': timestamp,
            'Source IP': ip,
            'Destination IP': random.choice(all_ips),
            'Source Port': random.randint(1024, 65535),
            'Destination Port': random.randint(1, 65535),
            'Protocol': random.choice(['TCP', 'UDP', 'ICMP']),
            'Action': random.choice(['allow', 'deny', 'drop', 'reset']),
            'Bytes': random.randint(40, 1500),
            'Source MAC': src_mac,
            'Destination MAC': random.choice(all_macs),
            'Is Anomaly': is_anomaly,  # For validation/testing only
        })

    # Sort on the datetime objects (not their strings) so the log is
    # chronological, then serialise the timestamps.
    records.sort(key=lambda record: record['Timestamp'])
    for record in records:
        record['Timestamp'] = record['Timestamp'].isoformat()
    return pd.DataFrame(records, columns=columns)

In [12]:
# Build a 1000-row firewall log from the DHCP leases (anomaly_ratio 0.1)
# and write it next to the notebook.
firewall_df = generate_firewall_log_with_mac(
    dhcp_df,
    num_records=1000,
    anomaly_ratio=0.1,
)
firewall_df.to_csv(path_or_buf='enhanced_firewall_log_with_mac.csv', index=False)
print("Enhanced firewall log saved to enhanced_firewall_log_with_mac.csv")

Enhanced firewall log saved to enhanced_firewall_log_with_mac.csv
