<a href="https://colab.research.google.com/github/Ananya-Ahuja/Anomalies/blob/palak/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import random

np.random.seed(42)

num_devices = 100
num_days = 30
hours_per_day = 24
records = []
device_types = ['corporate', 'student', 'family']
protocols = ['TCP', 'UDP', 'ICMP', 'unknown']
actions = ['allow', 'block', 'unknown']
applications = ['web', 'email', 'vpn', 'unknown']
countries = ['US', 'IN', 'CN', 'DE', 'unknown']
threat_types = ['malware', 'phishing', 'none', 'unknown']
rule_names = ['rule1', 'rule2', 'rule3', 'unknown']

for device_id in range(num_devices):
    device_type = np.random.choice(device_types)
    is_dormant = np.random.rand() < 0.15  # 15% dormant spike devices
    spike_day = np.random.randint(0, num_days) if is_dormant else -1
    spike_hour = np.random.randint(0, hours_per_day) if is_dormant else -1
    for day in range(num_days):
        for hour in range(hours_per_day):
            timestamp = pd.Timestamp('2024-01-01') + pd.Timedelta(days=day, hours=hour)
            if device_type == 'corporate':
                base = 10 if 8 <= hour <= 18 else 2
            elif device_type == 'student':
                base = 7 if 9 <= hour <= 15 else 1
            else:
                base = 6 if 18 <= hour <= 22 else 1
            if is_dormant and day == spike_day and hour == spike_hour:
                activity = np.random.randint(50, 100)
                label = 1  # Spike
            elif is_dormant:
                activity = 0
                label = 0
            else:
                activity = np.random.poisson(base)
                label = 0
            record = {
                'timestamp': timestamp,
                'src_ip': f"10.0.{device_id%10}.{random.randint(1,254)}",
                'dst_ip': f"192.168.{random.randint(0,10)}.{random.randint(1,254)}",
                'src_port': random.choice([80, 443, 22, 8080, 53, np.nan]),
                'dst_port': random.choice([80, 443, 22, 8080, 53, np.nan]),
                'protocol': random.choice(protocols),
                'action': random.choice(actions),
                'application': random.choice(applications),
                'bytes_sent': np.random.randint(0, 10000) if activity > 0 else 0,
                'bytes_received': np.random.randint(0, 10000) if activity > 0 else 0,
                'duration': np.random.randint(1, 3600) if activity > 0 else 0,
                'user_id': f"user_{random.randint(1, 500)}",
                'device_id': f"device_{device_id}",
                'country': random.choice(countries),
                'session_id': f"sess_{random.randint(1, 10000)}",
                'threat_type': random.choice(threat_types),
                'rule_name': random.choice(rule_names),
                'activity_count': activity,
                'device_type': device_type,
                'label': label  # 1 = spike, 0 = normal
            }
            # Randomly set some fields to unknown/missing
            for field in ['protocol', 'action', 'application', 'country', 'threat_type', 'rule_name']:
                if np.random.rand() < 0.05:
                    record[field] = 'unknown'
            for field in ['src_port', 'dst_port']:
                if np.random.rand() < 0.05:
                    record[field] = np.nan
            records.append(record)

df = pd.DataFrame(records)
df.to_csv('synthetic_firewall_log.csv', index=False)
print(f"Dataset shape: {df.shape}")
df.head()


Dataset shape: (72000, 20)


Unnamed: 0,timestamp,src_ip,dst_ip,src_port,dst_port,protocol,action,application,bytes_sent,bytes_received,duration,user_id,device_id,country,session_id,threat_type,rule_name,activity_count,device_type,label
0,2024-01-01 00:00:00,10.0.0.26,192.168.8.235,,22.0,unknown,allow,unknown,0,0,0,user_462,device_0,DE,sess_2248,phishing,unknown,0,family,0
1,2024-01-01 01:00:00,10.0.0.85,192.168.0.49,,,TCP,block,unknown,0,0,0,user_132,device_0,US,sess_7205,phishing,unknown,0,family,0
2,2024-01-01 02:00:00,10.0.0.148,192.168.1.142,8080.0,,ICMP,allow,unknown,2047,2747,976,user_307,device_0,DE,sess_4364,none,rule1,1,family,0
3,2024-01-01 03:00:00,10.0.0.184,192.168.1.227,,53.0,unknown,allow,unknown,8792,8433,2042,user_412,device_0,CN,sess_1752,malware,rule2,1,family,0
4,2024-01-01 04:00:00,10.0.0.116,192.168.8.200,53.0,8080.0,TCP,allow,web,0,0,0,user_39,device_0,IN,sess_2511,malware,unknown,0,family,0


In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Load data
df = pd.read_csv('synthetic_firewall_log.csv', parse_dates=['timestamp'])

# Fill missing numeric fields
for col in ['src_port', 'dst_port']:
    df[col] = df[col].fillna(-1)

# Categorical fields: fill unknowns
cat_fields = ['protocol', 'action', 'application', 'country', 'threat_type', 'rule_name', 'device_type']
for col in cat_fields:
    df[col] = df[col].fillna('unknown')
    df[col] = df[col].astype(str)

# Encode categorical variables
encoders = {}
for col in cat_fields:
    enc = LabelEncoder()
    df[col] = enc.fit_transform(df[col])
    encoders[col] = enc

# Sort and aggregate per device per hour
df['hour'] = df['timestamp'].dt.floor('H')
agg_cols = ['bytes_sent', 'bytes_received', 'duration', 'activity_count']
group_cols = ['device_id', 'hour']
df_agg = df.groupby(group_cols)[agg_cols + cat_fields].mean().reset_index()

# Scale numerical features
scaler = MinMaxScaler()
df_agg[agg_cols] = scaler.fit_transform(df_agg[agg_cols])

df_agg.head()


  df['hour'] = df['timestamp'].dt.floor('H')


Unnamed: 0,device_id,hour,bytes_sent,bytes_received,duration,activity_count,protocol,action,application,country,threat_type,rule_name,device_type
0,device_0,2024-01-01 00:00:00,0.0,0.0,0.0,0.0,3.0,0.0,1.0,1.0,2.0,3.0,1.0
1,device_0,2024-01-01 01:00:00,0.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,2.0,3.0,1.0
2,device_0,2024-01-01 02:00:00,0.20472,0.274727,0.271186,0.010753,0.0,0.0,1.0,1.0,1.0,0.0,1.0
3,device_0,2024-01-01 03:00:00,0.879288,0.843384,0.56738,0.010753,3.0,0.0,1.0,0.0,0.0,1.0,1.0
4,device_0,2024-01-01 04:00:00,0.0,0.0,0.0,0.0,1.0,0.0,3.0,2.0,0.0,3.0,1.0


In [3]:
import numpy as np

def create_sequences(df, seq_length=24):
    X, y = [], []
    devices = df['device_id'].unique()
    feature_cols = [col for col in df.columns if col not in ['device_id', 'hour']]
    for device in devices:
        device_df = df[df['device_id'] == device].sort_values('hour')
        vals = device_df[feature_cols].values
        for i in range(len(vals) - seq_length):
            X.append(vals[i:i+seq_length])
            # For unsupervised, y can be the next activity_count or None
            y.append(vals[i+seq_length][feature_cols.index('activity_count')])
    return np.array(X), np.array(y)

X, y = create_sequences(df_agg, seq_length=24)
print(f"Shape of X: {X.shape}, Shape of y: {y.shape}")


Shape of X: (69600, 24, 11), Shape of y: (69600,)


In [4]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

seq_length = 24
n_features = X.shape[2]

model = Sequential([
    LSTM(64, input_shape=(seq_length, n_features), return_sequences=False),
    Dense(32, activation='relu'),
    Dense(1)
])

model.compile(optimizer='adam', loss='mse')
model.summary()

# Train only on normal samples (label==0)
normal_idx = (y < 30)  # or use df['label'] if available
X_train, y_train = X[normal_idx], y[normal_idx]

history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.1)


  super().__init__(**kwargs)


Epoch 1/10
[1m979/979[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 23ms/step - loss: 0.0020 - val_loss: 9.9768e-04
Epoch 2/10
[1m979/979[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 23ms/step - loss: 0.0011 - val_loss: 9.4499e-04
Epoch 3/10
[1m979/979[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 22ms/step - loss: 9.3637e-04 - val_loss: 7.5943e-04
Epoch 4/10
[1m979/979[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 22ms/step - loss: 7.8018e-04 - val_loss: 6.2659e-04
Epoch 5/10
[1m979/979[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 22ms/step - loss: 6.9001e-04 - val_loss: 6.1727e-04
Epoch 6/10
[1m979/979[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 23ms/step - loss: 5.9298e-04 - val_loss: 5.2444e-04
Epoch 7/10
[1m979/979[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 21ms/step - loss: 6.6110e-04 - val_loss: 5.1498e-04
Epoch 8/10
[1m979/979[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 22ms/step - loss: 5.

In [5]:
# Predict on all data
y_pred = model.predict(X)
errors = np.abs(y_pred.flatten() - y)

# Set threshold as mean + 3*std of errors on normal data
threshold = errors[normal_idx].mean() + 3 * errors[normal_idx].std()
anomalies = errors > threshold

print(f"Anomalies detected: {anomalies.sum()} out of {len(anomalies)}")


[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step
Anomalies detected: 671 out of 69600


In [7]:
import numpy as np
import pandas as pd
import random

# Settings
num_devices = 120
num_days = 40
hours_per_day = 24
device_types = ['corporate', 'student', 'family']
protocols = ['TCP', 'UDP', 'ICMP', 'GRE', 'ESP', 'unknown']
actions = ['allow', 'block', 'reset', 'unknown']
applications = ['web', 'email', 'vpn', 'ssh', 'rdp', 'game', 'unknown']
countries = ['US', 'IN', 'CN', 'DE', 'RU', 'BR', 'unknown']
threat_types = ['malware', 'phishing', 'none', 'ransomware', 'scan', 'unknown']
rule_names = ['rule1', 'rule2', 'rule3', 'rule4', 'rule5', 'unknown']
os_versions = ['Windows10', 'Ubuntu20.04', 'macOS13', 'Android12', 'iOS16', 'unknown']
vpn_statuses = ['active', 'inactive', 'unknown']

def random_ip():
    return f"{random.randint(1, 254)}.{random.randint(0, 255)}.{random.randint(0, 255)}.{random.randint(1, 254)}"

records = []
for device_id in range(num_devices):
    device_type = np.random.choice(device_types)
    is_dormant = np.random.rand() < 0.18  # 18% dormant spike devices
    spike_day = np.random.randint(0, num_days) if is_dormant else -1
    spike_hour = np.random.randint(0, hours_per_day) if is_dormant else -1
    for day in range(num_days):
        for hour in range(hours_per_day):
            timestamp = pd.Timestamp('2024-01-01') + pd.Timedelta(days=day, hours=hour)
            # Activity patterns
            if device_type == 'corporate':
                base = 12 if 8 <= hour <= 18 else 2
            elif device_type == 'student':
                base = 8 if 9 <= hour <= 15 else 2
            else:  # family
                base = 7 if 18 <= hour <= 23 else 1
            # Dormant device spike
            if is_dormant and day == spike_day and hour == spike_hour:
                activity = np.random.randint(60, 120)
                label = 1  # Spike
            elif is_dormant:
                activity = 0
                label = 0
            else:
                activity = np.random.poisson(base)
                label = 0
            record = {
                'timestamp': timestamp,
                'src_ip': random_ip(),
                'dst_ip': random_ip(),
                'src_port': random.choice([80, 443, 22, 8080, 53, 3389, 123, np.nan]),
                'dst_port': random.choice([80, 443, 22, 8080, 53, 3389, 123, np.nan]),
                'protocol': random.choice(protocols),
                'action': random.choice(actions),
                'application': random.choice(applications),
                'bytes_sent': np.random.randint(0, 50000) if activity > 0 else 0,
                'bytes_received': np.random.randint(0, 50000) if activity > 0 else 0,
                'duration': np.random.randint(1, 7200) if activity > 0 else 0,
                'user_id': f"user_{random.randint(1, 1000)}",
                'device_id': f"device_{device_id}",
                'country': random.choice(countries),
                'session_id': f"sess_{random.randint(1, 20000)}",
                'threat_type': random.choice(threat_types),
                'rule_name': random.choice(rule_names),
                # Extra fields for complexity
                'device_type': device_type,
                'label': label,
                'os_version': random.choice(os_versions),
                'vpn_status': random.choice(vpn_statuses),
            }
            # Inject unknown/missing values
            for field in ['protocol', 'action', 'application', 'country', 'threat_type', 'rule_name', 'os_version', 'vpn_status']:
                if np.random.rand() < 0.07:
                    record[field] = 'unknown'
            for field in ['src_port', 'dst_port']:
                if np.random.rand() < 0.07:
                    record[field] = np.nan
            records.append(record)

df_complex = pd.DataFrame(records)
df_complex.to_csv('complex_synthetic_firewall_log.csv', index=False)
print(f"Complex synthetic firewall log shape: {df_complex.shape}")
print(df_complex.head())


Complex synthetic firewall log shape: (115200, 21)
            timestamp           src_ip          dst_ip  src_port  dst_port  \
0 2024-01-01 00:00:00     162.0.223.57   95.196.34.117    8080.0      22.0   
1 2024-01-01 01:00:00  252.226.127.239    153.54.18.65      22.0      22.0   
2 2024-01-01 02:00:00   146.23.210.233     58.105.2.20      22.0       NaN   
3 2024-01-01 03:00:00   208.96.135.124  38.138.215.164     123.0    3389.0   
4 2024-01-01 04:00:00     12.133.48.74   49.19.225.183     443.0     123.0   

  protocol   action application  bytes_sent  bytes_received  ...   user_id  \
0  unknown    reset         rdp       44133           39902  ...  user_712   
1     ICMP    block         rdp       46669           23626  ...  user_767   
2      UDP    reset         web        8644           20768  ...  user_246   
3      UDP    allow        game        8498           23824  ...  user_966   
4     ICMP  unknown       email       27251           11703  ...  user_734   

  device_id

In [8]:
import pandas as pd

# Load your test dataset (replace with your actual file if needed)
df_test = pd.read_csv('complex_synthetic_firewall_log.csv', parse_dates=['timestamp'])

# If you want to split your existing dataset:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df_test, test_size=0.2, random_state=42)


In [9]:
# Fill missing numeric fields
for col in ['src_port', 'dst_port']:
    df_test[col] = df_test[col].fillna(-1)

# Categorical fields: fill unknowns
cat_fields = ['protocol', 'action', 'application', 'country', 'threat_type', 'rule_name', 'device_type']
for col in cat_fields:
    df_test[col] = df_test[col].fillna('unknown').astype(str)

# Use encoders from training
for col in cat_fields:
    # Map unseen categories to 'unknown'
    known_classes = set(encoders[col].classes_)
    df_test[col] = df_test[col].apply(lambda x: x if x in known_classes else 'unknown')
    df_test[col] = encoders[col].transform(df_test[col])

# Aggregate per device per hour
df_test['hour'] = df_test['timestamp'].dt.floor('H')
agg_cols = ['bytes_sent', 'bytes_received', 'duration', 'activity_count']
group_cols = ['device_id', 'hour']
df_test_agg = df_test.groupby(group_cols)[agg_cols + cat_fields].mean().reset_index()

# Scale numerical features using the scaler from training
df_test_agg[agg_cols] = scaler.transform(df_test_agg[agg_cols])


  df_test['hour'] = df_test['timestamp'].dt.floor('H')


KeyError: "Columns not found: 'activity_count'"

In [10]:
print(df_test.columns)


Index(['timestamp', 'src_ip', 'dst_ip', 'src_port', 'dst_port', 'protocol',
       'action', 'application', 'bytes_sent', 'bytes_received', 'duration',
       'user_id', 'device_id', 'country', 'session_id', 'threat_type',
       'rule_name', 'device_type', 'label', 'os_version', 'vpn_status',
       'hour'],
      dtype='object')


In [11]:
print(df.columns)

Index(['timestamp', 'src_ip', 'dst_ip', 'src_port', 'dst_port', 'protocol',
       'action', 'application', 'bytes_sent', 'bytes_received', 'duration',
       'user_id', 'device_id', 'country', 'session_id', 'threat_type',
       'rule_name', 'activity_count', 'device_type', 'label', 'hour'],
      dtype='object')
