### Step 1: Generate Synthetic Data
We'll use the same approach as before but adapt it for the LOF model.

Generate Synthetic Data: Create realistic synthetic data for normal and anomalous outbound connections.
Feature Engineering: Transform the raw data into features suitable for the LOF model.
Model Training: Train the LOF model on the synthetic data.
Real-Time Monitoring: Use the trained model to predict anomalies in real-time.
Let's start by generating the synthetic data and then proceed with the rest of the steps.

In [1]:
import numpy as np
import pandas as pd

# Set random seed for reproducibility
np.random.seed(42)

# Define process rules for normal and anomalous behavior
process_rules_normal = {
    'chrome.exe': ['151.101.1.69', '172.217.16.195'],
    'firefox.exe': ['151.101.1.69', '172.217.16.195'],
    'svchost.exe': ['93.184.216.34'],
    'explorer.exe': ['172.217.16.195']
}

process_rules_anomalous = {
    'svchost.exe': ['198.51.100.2'],  # Legitimate process, unusual destination
    'unknown_process.exe': ['203.0.113.1'],  # Unrecognized process
}

def generate_connections(process_rules, n, anomalous=False):
    data = {'pid': [], 'process_name': [], 'src_ip': [], 'src_port': [], 'dst_ip': [],
            'dst_port': [], 'duration': [], 'bytes_sent': [], 'bytes_received': [], 'timestamp': []}

    for process_name, dst_ips in process_rules.items():
        for _ in range(n):
            data['pid'].append(np.random.randint(1000, 5000))
            data['process_name'].append(process_name)
            data['src_ip'].append('192.168.1.100')
            data['src_port'].append(np.random.randint(1024, 65535))
            data['dst_ip'].append(np.random.choice(dst_ips))
            data['dst_port'].append(np.random.randint(80, 443) if not anomalous else np.random.randint(1000, 65535))
            data['duration'].append(np.random.exponential(scale=1.0) if not anomalous else np.random.exponential(scale=10.0))
            data['bytes_sent'].append(np.random.exponential(scale=500) if not anomalous else np.random.exponential(scale=10000))
            data['bytes_received'].append(np.random.exponential(scale=500) if not anomalous else np.random.exponential(scale=10000))
            data['timestamp'].append(pd.Timestamp('2023-01-01') + pd.Timedelta(minutes=np.random.randint(0, 1440)))

    return pd.DataFrame(data)

# Generate normal connections
df_normal = generate_connections(process_rules_normal, 500)

# Generate anomalous connections with varied characteristics
df_anomalous_legit = generate_connections({'svchost.exe': ['198.51.100.2']}, 20, anomalous=True)
df_anomalous_unknown = generate_connections({'unknown_process.exe': ['203.0.113.1']}, 30, anomalous=True)

# Combine the data
df = pd.concat([df_normal, df_anomalous_legit, df_anomalous_unknown], ignore_index=True)
print(df.tail())

       pid         process_name         src_ip  src_port       dst_ip  \
2045  3142  unknown_process.exe  192.168.1.100     61881  203.0.113.1   
2046  3447  unknown_process.exe  192.168.1.100     60943  203.0.113.1   
2047  2024  unknown_process.exe  192.168.1.100      4797  203.0.113.1   
2048  1848  unknown_process.exe  192.168.1.100     63360  203.0.113.1   
2049  1622  unknown_process.exe  192.168.1.100     30822  203.0.113.1   

      dst_port   duration    bytes_sent  bytes_received           timestamp  
2045      9102   8.509564   6131.851638     3244.091242 2023-01-01 06:51:00  
2046     30713   3.306930    807.715203     1567.584315 2023-01-01 11:49:00  
2047     25956  19.410234  11854.694766    14613.909998 2023-01-01 15:53:00  
2048     21511  17.285092   7024.754290    21673.886764 2023-01-01 01:13:00  
2049     61262   7.473897  11343.226504     3371.332004 2023-01-01 22:16:00  


### Step 2: Feature Engineering
We'll preprocess the data to make it suitable for the LOF model.

In [2]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define the feature columns
numeric_features = ['src_port', 'dst_port', 'duration', 'bytes_sent', 'bytes_received']
categorical_features = ['process_name', 'dst_ip']

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

# Apply the preprocessing pipeline
X = preprocessor.fit_transform(df)

### Step 3: Model Training
Train the LOF model on the preprocessed data.

In [3]:
from sklearn.neighbors import LocalOutlierFactor

# Initialize the Local Outlier Factor model
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1)

# Fit the model to the preprocessed data and predict
y_pred = lof.fit_predict(X)
X_scores = lof.negative_outlier_factor_

# Count the number of errors (anomalies detected)
n_errors = (y_pred == -1).sum()
print(f"Number of anomalies detected: {n_errors}")

Number of anomalies detected: 205


### Step 4: Real-Time Monitoring
Use the trained model to predict anomalies in real-time. Note that LOF does not have predict, decision_function, or score_samples methods when used for outlier detection, so we'll need to fit the model again on the combined dataset including the new data point.

In [5]:
# Example new data point representing an anomalous outbound connection
anomalous_new_data = {
    'pid': [4100],  # Random PID within normal range
    'process_name': ['unknown_process.exe'],  # Anomalous process
    'src_ip': ['192.168.1.100'],  # Source IP (system IP)
    'src_port': [5555],  # Unusual source port
    'dst_ip': ['203.0.113.1'],  # Anomalous destination IP
    'dst_port': [12345],  # Unusual destination port
    'duration': [15.0],  # Longer than usual duration
    'bytes_sent': [50000],  # Large amount of data sent
    'bytes_received': [20000],  # Large amount of data received
    'timestamp': [pd.Timestamp('2023-01-01 12:00:00')]  # Normal timestamp
}

new_df_anomalous = pd.DataFrame(anomalous_new_data)

# Combine the new data point with the existing data
df_combined = pd.concat([df, new_df_anomalous], ignore_index=True)

# Preprocess the combined data
X_combined = preprocessor.transform(df_combined)

# Fit the LOF model to the combined data and predict
y_pred_combined = lof.fit_predict(X_combined)
new_data_prediction = y_pred_combined[-1]

# Output the result
print('Anomaly' if new_data_prediction == -1 else 'Normal')

Anomaly


In [6]:
# Example new data point representing an ambiguous outbound connection
ambiguous_new_data = {
    'pid': [2500],  # Random PID within normal range
    'process_name': ['svchost.exe'],  # Known process
    'src_ip': ['192.168.1.100'],  # Source IP (system IP)
    'src_port': [8080],  # Normal source port
    'dst_ip': ['198.51.100.2'],  # Unusual destination IP for svchost.exe
    'dst_port': [80],  # Standard HTTP port
    'duration': [2.0],  # Normal duration
    'bytes_sent': [1000],  # Normal data sent
    'bytes_received': [1500],  # Normal data received
    'timestamp': [pd.Timestamp('2023-01-01 12:00:00')]  # Normal timestamp
}

new_df_ambiguous = pd.DataFrame(ambiguous_new_data)

# Combine the new data point with the existing data
df_combined = pd.concat([df, new_df_ambiguous], ignore_index=True)

# Preprocess the combined data
X_combined = preprocessor.transform(df_combined)

# Fit the LOF model to the combined data and predict
y_pred_combined = lof.fit_predict(X_combined)
new_data_prediction = y_pred_combined[-1]

# Output the result
print('Anomaly' if new_data_prediction == -1 else 'Normal')

Anomaly


In [4]:
# Example new data point representing a normal outbound connection
normal_new_data = {
    'pid': [1501],  # Random PID within normal range
    'process_name': ['chrome.exe'],  # Normal process
    'src_ip': ['192.168.1.100'],  # Source IP (system IP)
    'src_port': [5000],  # A typical source port
    'dst_ip': ['151.101.1.69'],  # A typical destination IP for chrome.exe
    'dst_port': [80],  # A standard HTTP port
    'duration': [1.5],  # Normal duration
    'bytes_sent': [300],  # Normal data sent
    'bytes_received': [400],  # Normal data received
    'timestamp': [pd.Timestamp('2023-01-01 12:00:00')]  # Normal timestamp
}

new_df_normal = pd.DataFrame(normal_new_data)

# Combine the new data point with the existing data
df_combined = pd.concat([df, new_df_normal], ignore_index=True)

# Preprocess the combined data
X_combined = preprocessor.transform(df_combined)

# Fit the LOF model to the combined data and predict
y_pred_combined = lof.fit_predict(X_combined)
new_data_prediction = y_pred_combined[-1]

# Output the result
print('Anomaly' if new_data_prediction == -1 else 'Normal')

Normal
