<a href="https://colab.research.google.com/github/7vckingrck/VKR/blob/test/Volkov_AK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
from datetime import datetime, timedelta
import random
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import sqlite3
import os
import argparse

def generate_network_data(num_records=100000, start_date="2023-01-01", end_date="2025-03-28",
                          anomaly_rate=0.05, seed=None):
    """Generate a synthetic table of network connection records.

    Every record gets a random timestamp between ``start_date`` and
    ``end_date``, a private-range IP, a MAC address, a service name, a
    duration, a traffic volume, a device type and a signal strength. A random
    subset of rows is then overwritten with "anomalous" values: durations of
    one hour or more, traffic of 1000 MB or more, and a service drawn from
    SSH/FTP/ICMP only.

    Args:
        num_records: Number of rows to generate (must be >= 0).
        start_date: Inclusive lower bound of the timestamps, ``YYYY-MM-DD``.
        end_date: Inclusive upper bound of the timestamps, ``YYYY-MM-DD``.
        anomaly_rate: Probability (0..1) that a row is turned into an anomaly.
        seed: Optional seed. When given, the output is fully reproducible.

    Returns:
        pandas.DataFrame with the columns ``Timestamp``, ``WAN IP``,
        ``MAC Address``, ``Service``, ``Time Spent (s)``,
        ``Traffic Volume (MB)``, ``Device_Type`` and ``Signal_Strength``.

    Raises:
        ValueError: If ``num_records`` is negative, ``anomaly_rate`` is outside
            [0, 1], or ``end_date`` precedes ``start_date``.
    """
    if num_records < 0:
        raise ValueError("num_records must be non-negative")
    if not 0.0 <= anomaly_rate <= 1.0:
        raise ValueError("anomaly_rate must be between 0 and 1")

    services = ["HTTP", "HTTPS", "SSH", "FTP", "DNS", "SMTP", "ICMP"]
    mac_prefixes = ["00:1A:79", "00:0D:3A", "00:24:BE"]
    ip_ranges = ["192.168.1.", "10.0.0.", "172.16.0."]
    device_types = ["Smartphone", "Laptop", "IoT", "Tablet"]

    start = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")
    span_seconds = int((end - start).total_seconds())
    if span_seconds < 0:
        raise ValueError("end_date must not be earlier than start_date")

    # Private generators: seeding them never disturbs the global `random` /
    # `np.random` state that other code may rely on.
    py_rng = random.Random(seed)
    np_rng = np.random.default_rng(seed)

    data = pd.DataFrame({
        "Timestamp": [start + timedelta(seconds=py_rng.randint(0, span_seconds))
                      for _ in range(num_records)],
        "WAN IP": [f"{py_rng.choice(ip_ranges)}{py_rng.randint(1, 254)}"
                   for _ in range(num_records)],
        "MAC Address": [f"{py_rng.choice(mac_prefixes)}:{py_rng.randint(0, 255):02X}:"
                        f"{py_rng.randint(0, 255):02X}:{py_rng.randint(0, 255):02X}"
                        for _ in range(num_records)],
        "Service": np_rng.choice(services, num_records),
        "Time Spent (s)": np_rng.integers(1, 3600, num_records),
        "Traffic Volume (MB)": np.round(np_rng.uniform(0.1, 500, num_records), 2),
        "Device_Type": np_rng.choice(device_types, num_records),
        "Signal_Strength": np_rng.integers(-90, -30, num_records),
    })

    anomaly_mask = np_rng.random(num_records) < anomaly_rate
    # ndarray.sum() counts in C; the builtin sum() would loop over every element.
    n_anomalies = int(anomaly_mask.sum())
    if n_anomalies:
        data.loc[anomaly_mask, 'Time Spent (s)'] = np_rng.integers(3600, 86400, n_anomalies)
        data.loc[anomaly_mask, 'Traffic Volume (MB)'] = np_rng.uniform(1000, 5000, n_anomalies)
        data.loc[anomaly_mask, 'Service'] = np_rng.choice(["SSH", "FTP", "ICMP"], n_anomalies)

    return data

def load_and_prepare_data(filepath):
    """Load connection records from a CSV file and add cyclic hour features.

    Rows with a non-positive ``Time Spent (s)`` or a negative
    ``Traffic Volume (MB)`` are dropped. ``Timestamp`` is parsed to datetime
    and the hour of day is encoded as ``Hour_sin`` / ``Hour_cos`` so that
    23:00 and 00:00 end up close to each other.

    Args:
        filepath: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        pandas.DataFrame containing the surviving rows plus the two new
        columns. The original row labels are kept, so the index may have gaps.
    """
    df = pd.read_csv(filepath)
    is_valid = (df['Time Spent (s)'] > 0) & (df['Traffic Volume (MB)'] >= 0)
    # Copy the filtered rows: assigning columns on a slice of the frame that
    # was read triggers SettingWithCopyWarning and may not stick.
    df = df[is_valid].copy()
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])

    hour_angle = 2 * np.pi * df['Timestamp'].dt.hour / 24
    df['Hour_sin'] = np.sin(hour_angle)
    df['Hour_cos'] = np.cos(hour_angle)
    return df

def generate_features(df):
    """Add per-device behavioural features and return the frame with NaNs set to 0.

    The new columns are written onto ``df`` itself; the returned object is the
    result of ``df.fillna(0)``.

    Columns added:
        Connections_per_MAC_1h: number of records of the same MAC address that
            fall into the same clock hour as the row.
        Traffic_per_connection: traffic volume divided by duration (a tiny
            epsilon keeps the division finite for zero durations).
        Traffic_std: standard deviation of the traffic volume per MAC address
            (NaN for MACs with a single record, hence 0 after ``fillna``).
        IP_rotation_rate: distinct WAN IPs used by the MAC address divided by
            the number of records of that MAC address.

    Args:
        df: DataFrame with ``MAC Address``, ``WAN IP``, ``Time Spent (s)``,
            ``Traffic Volume (MB)`` and a datetime ``Timestamp`` column.

    Returns:
        pandas.DataFrame with the extra columns and every NaN replaced by 0.
    """
    # Bucket timestamps by flooring to the hour. A Timedelta is used instead of
    # a frequency string because the 'H' alias is deprecated in recent pandas.
    hour_bucket = df['Timestamp'].dt.floor(pd.Timedelta(hours=1))
    df['Connections_per_MAC_1h'] = (
        df.groupby(['MAC Address', hour_bucket])['MAC Address'].transform('count')
    )

    df['Traffic_per_connection'] = df['Traffic Volume (MB)'] / (df['Time Spent (s)'] + 1e-6)

    by_mac = df.groupby('MAC Address')
    df['Traffic_std'] = by_mac['Traffic Volume (MB)'].transform('std')

    # Both operands must be row-aligned. Dividing by groupby(...).size() (which
    # is indexed by MAC address) never matches the row index and yields NaN for
    # every row, i.e. a constant 0 after fillna.
    distinct_ips = by_mac['WAN IP'].transform('nunique')
    records_per_mac = by_mac['MAC Address'].transform('count')
    df['IP_rotation_rate'] = distinct_ips / records_per_mac

    return df.fillna(0)

class AnomalyDetector:
    """Majority-vote ensemble of three unsupervised outlier detectors.

    Each detector (Isolation Forest, One-Class SVM, Local Outlier Factor in
    novelty mode) sits behind its own ``RobustScaler`` in a pipeline. The
    pipelines are stored in ``self.models`` keyed by detector name.
    """

    def __init__(self):
        # Bare estimators first; each one is wrapped with its own scaler below.
        estimators = {
            'IsolationForest': IsolationForest(n_estimators=500, contamination='auto', random_state=42),
            'OneClassSVM': OneClassSVM(nu=0.05, kernel='rbf', gamma='scale'),
            'LOF': LocalOutlierFactor(n_neighbors=50, contamination=0.05, novelty=True),
        }
        self.models = {
            label: make_pipeline(RobustScaler(), estimator)
            for label, estimator in estimators.items()
        }

    def fit(self, X):
        """Fit every pipeline of the ensemble on the feature matrix ``X``."""
        for pipeline in self.models.values():
            pipeline.fit(X)

    def predict(self, X):
        """Return 1 for rows the ensemble flags as anomalous, 0 otherwise.

        Every pipeline votes with its own prediction (scikit-learn outlier
        detectors answer -1 for outliers and +1 for inliers); a row counts as
        an anomaly when the average vote is negative.
        """
        votes = np.array([pipeline.predict(X) for pipeline in self.models.values()])
        flagged = votes.mean(axis=0) < 0
        return flagged.astype(int)

def build_network_graph(df):
    """Build an undirected graph linking devices that appeared behind the same IP.

    Every distinct ``MAC Address`` becomes a node tagged ``type='device'``.
    For each ``WAN IP`` that was used by more than one MAC address, every pair
    of those MAC addresses is connected; the edge ``weight`` counts how many
    different IPs the two devices have in common.

    Args:
        df: DataFrame with ``MAC Address`` and ``WAN IP`` columns.

    Returns:
        networkx.Graph with device nodes and weighted co-occurrence edges.
    """
    graph = nx.Graph()
    graph.add_nodes_from(df['MAC Address'].unique(), type='device')

    macs_by_ip = df.groupby('WAN IP')['MAC Address'].unique()
    for _ip, shared_macs in macs_by_ip.items():
        if len(shared_macs) <= 1:
            # A single device behind this IP produces no pairs.
            continue
        for position, left in enumerate(shared_macs):
            # Only look at later entries so each unordered pair is visited once.
            for right in shared_macs[position + 1:]:
                if graph.has_edge(left, right):
                    graph[left][right]['weight'] += 1
                else:
                    graph.add_edge(left, right, weight=1)
    return graph

def plot_3d_anomalies(df, anomalies):
    """Show a 3D scatter of duration / traffic / hourly connections, anomalies in red.

    Args:
        df: DataFrame with ``Time Spent (s)``, ``Traffic Volume (MB)`` and
            ``Connections_per_MAC_1h`` columns.
        anomalies: One flag per row of ``df``. Booleans or 0/1 integers are
            accepted (any non-zero value marks an anomaly). A pandas Series
            whose index differs from ``df.index`` is aligned by label first;
            any other sequence is matched by position.

    Raises:
        ValueError: If the flags cannot be matched one-to-one with the rows.
    """
    if isinstance(anomalies, pd.Series) and not anomalies.index.equals(df.index):
        anomalies = anomalies.reindex(df.index)
        if anomalies.isna().any():
            raise ValueError("anomalies does not provide a flag for every row of df")

    # Coerce to a real boolean mask. With 0/1 integers `~x` is a bitwise NOT
    # (-1/-2) and `df[x]` is treated as a column lookup, which raises KeyError.
    is_anomaly = np.asarray(anomalies).astype(bool)
    if is_anomaly.shape != (len(df),):
        raise ValueError("anomalies must contain exactly one flag per row of df")

    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111, projection='3d')

    normal = df[~is_anomaly]
    anomaly = df[is_anomaly]

    ax.scatter(normal['Time Spent (s)'], normal['Traffic Volume (MB)'], normal['Connections_per_MAC_1h'],
               c='blue', alpha=0.3, label='Normal')
    ax.scatter(anomaly['Time Spent (s)'], anomaly['Traffic Volume (MB)'], anomaly['Connections_per_MAC_1h'],
               c='red', alpha=0.7, label='Anomaly')

    ax.set_xlabel('Time Spent (s)')
    ax.set_ylabel('Traffic (MB)')
    ax.set_zlabel('Connections/Hour')
    ax.legend()
    plt.show()

class StreamingAnomalyDetector:
    """Score records one at a time with an Isolation Forest refit per window.

    Incoming records are buffered. Whenever ``window_size`` records have been
    collected, the scaler and the forest are refit on that window and the
    buffer is emptied. Until the first window is complete there is no model,
    so records are reported as normal.

    Args:
        window_size: Number of buffered records that triggers a (re)fit.
        features: Optional list of keys to pull from each record, in model
            column order. When omitted, a record is taken to be the numeric
            feature vector itself (a dict contributes its values in order).
    """

    def __init__(self, window_size=1000, features=None):
        self.window = []
        self.window_size = window_size
        # Kept on the instance; the class must not depend on a module-level
        # `features` name, which this file never defines.
        self.features = list(features) if features is not None else None
        self.model = IsolationForest(n_estimators=100)
        self.scaler = RobustScaler()
        self._fitted = False

    def process_record(self, record):
        """Buffer ``record``, refit when the window is full, and score the record.

        Returns:
            int: -1 if the record is classified as an anomaly, 1 otherwise
            (always 1 while no model has been trained yet).
        """
        self.window.append(self._to_vector(record))
        if len(self.window) >= self.window_size:
            self._train_model()
            self.window = []
        return self._predict(record)

    def _to_vector(self, record):
        """Return the record's feature values as a plain list."""
        if self.features is not None:
            return [record[name] for name in self.features]
        if isinstance(record, dict):
            return list(record.values())
        return list(record)

    def _train_model(self):
        """Refit scaler and forest on the buffered window."""
        # Plain float arrays are used for both fit and transform so the scaler
        # never sees feature names in one call and not in the other.
        X = np.asarray(self.window, dtype=float)
        self.model.fit(self.scaler.fit_transform(X))
        self._fitted = True

    def _predict(self, record):
        """Score a single record; reports 1 (normal) before the first fit."""
        if not self._fitted:
            # Transforming with an unfitted scaler would raise NotFittedError.
            return 1
        x = self.scaler.transform(np.asarray([self._to_vector(record)], dtype=float))
        return int(self.model.predict(x)[0])

class DatabaseManager:
    """Thin wrapper around a SQLite database storing connection records.

    The ``connections`` table is created on construction if it does not exist.
    The object can be used as a context manager, which closes the connection
    on exit::

        with DatabaseManager("network_data.db") as db:
            db.save_record(record)

    Args:
        db_path: Path of the SQLite file (``":memory:"`` for a throwaway DB).
    """

    def __init__(self, db_path="network_data.db"):
        self.conn = sqlite3.connect(db_path)
        self._init_db()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying SQLite connection."""
        self.conn.close()

    def _init_db(self):
        """Create the ``connections`` table when it is missing."""
        # `with self.conn` commits on success and rolls back on error.
        with self.conn:
            self.conn.execute("""
            CREATE TABLE IF NOT EXISTS connections (
                timestamp TEXT, mac TEXT, ip TEXT, service TEXT,
                duration INTEGER, traffic REAL, device_type TEXT,
                signal_strength INTEGER, is_anomaly INTEGER DEFAULT 0
            )""")

    def save_record(self, record):
        """Insert one row and commit it.

        Args:
            record: Sequence of nine values in table column order: timestamp,
                mac, ip, service, duration, traffic, device_type,
                signal_strength, is_anomaly.
        """
        values = tuple(self._to_sql_value(value) for value in record)
        with self.conn:
            self.conn.execute("""
            INSERT INTO connections VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, values)

    @staticmethod
    def _to_sql_value(value):
        """Convert NumPy scalars to native Python values so SQLite stores them as numbers."""
        if isinstance(value, np.generic):
            return value.item()
        return value

def _run_analysis(df, X, y):
    """Fit the ensemble on a train split, report metrics on the test split, plot and graph."""
    X_train, X_test, _y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    detector = AnomalyDetector()
    detector.fit(X_train)
    y_pred = detector.predict(X_test)

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # X_test.index holds row *labels*, so the rows are selected with .loc
    # (.iloc would treat the labels as positions). The flags are passed as a
    # boolean mask because 0/1 integers are not valid for boolean indexing.
    plot_3d_anomalies(df.loc[X_test.index], y_test.astype(bool))

    G = build_network_graph(df)
    print(f"\nNetwork Graph: {len(G.nodes())} nodes, {len(G.edges())} edges")


def _run_stream(df, features, sample_size=1000, window_size=200):
    """Replay a random sample of rows through the streaming detector and store the results."""
    # The window is smaller than the sample so the model is refit several
    # times during the replay; with a window as large as the sample only the
    # very last record could ever be scored.
    stream_processor = StreamingAnomalyDetector(window_size=window_size)
    db = DatabaseManager()
    try:
        # Never ask for more rows than exist (df.sample would raise).
        sample = df.sample(min(sample_size, len(df)))
        for _, row in sample.iterrows():
            # Only the numeric model features go to the detector.
            prediction = stream_processor.process_record(row[features])
            flagged = prediction == -1

            db.save_record((
                str(row['Timestamp']), row['MAC Address'], row['WAN IP'],
                row['Service'], int(row['Time Spent (s)']), float(row['Traffic Volume (MB)']),
                row['Device_Type'], int(row['Signal_Strength']),
                int(flagged),  # store 1/0, not the detector's raw -1/+1
            ))

            if flagged:
                print(f"Anomaly detected: {row['MAC Address']} at {row['Timestamp']}")
    finally:
        db.conn.close()


def main(mode='analyze'):
    """Run the anomaly-detection demo.

    Generates ``data.csv`` in the working directory when it is missing, loads
    it, derives the model features and then runs the requested mode.

    Args:
        mode: ``'analyze'`` trains the ensemble on a train/test split, prints
            a classification report and confusion matrix against heuristic
            labels, plots the test rows and builds the device graph.
            ``'stream'`` replays a random sample of rows through the
            streaming detector and writes every row to the SQLite database.
            Any other value only prepares the data.
    """
    if not os.path.exists("data.csv"):
        df = generate_network_data()
        df.to_csv("data.csv", index=False)

    df = load_and_prepare_data("data.csv")
    df = generate_features(df)

    features = [
        'Time Spent (s)', 'Traffic Volume (MB)', 'Connections_per_MAC_1h',
        'Traffic_per_connection', 'Traffic_std', 'IP_rotation_rate',
        'Hour_sin', 'Hour_cos'
    ]

    if mode == 'analyze':
        X = df[features]
        # Heuristic reference labels: very high traffic or very many hourly connections.
        y = ((df['Traffic Volume (MB)'] > 1000) | (df['Connections_per_MAC_1h'] > 100)).astype(int)
        _run_analysis(df, X, y)
    elif mode == 'stream':
        _run_stream(df, features)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Network anomaly detection demo (batch analysis or streaming replay)."
    )
    parser.add_argument('--mode', choices=['analyze', 'stream'], default='analyze',
                        help="'analyze' evaluates the ensemble on a train/test split; "
                             "'stream' replays records through the streaming detector")
    # parse_known_args ignores arguments it does not recognise. When this code
    # runs as a notebook cell, the kernel launcher puts its own options into
    # sys.argv, and parse_args() would abort with SystemExit.
    args, _unknown = parser.parse_known_args()

    main(mode=args.mode)