In [20]:
import re
import pandas as pd

# Load the log records from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer="./demo_dataset.csv")

# Check for invalid IP addresses
# Compiled once at import time instead of on every call.
_IPV4_PATTERN = re.compile(r'^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$')


def is_valid_ip(ip):
    """Return True if *ip* is a dotted-quad IPv4 address string.

    Each of the four octets must be in the range 0-255; leading zeros
    (e.g. ``"01"``) are accepted by the pattern.

    Args:
        ip: The value to check. Anything that is not a ``str`` is
            reported as invalid rather than raising.

    Returns:
        bool: True when the whole string is a valid address.
    """
    if not isinstance(ip, str):
        return False
    # fullmatch (unlike match + '$') also rejects a trailing newline.
    return _IPV4_PATTERN.fullmatch(ip) is not None

# Keep the rows whose source_ip, rendered as text, fails is_valid_ip.
invalid_ips = data.loc[
    ~data['source_ip'].astype(str).apply(is_valid_ip)
]

# Check for invalid port numbers
def is_valid_port(port):
    """Return True if *port* is an integer in the range 0-65535.

    Args:
        port: The value to check; anything ``int()`` accepts (ints,
            numeric strings, integral floats).

    Returns:
        bool: False when the value cannot be converted to an integer,
        has a fractional part, or falls outside 0-65535.
    """
    try:
        number = int(port)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported objects; ValueError: non-numeric
        # text or NaN; OverflowError: +/- infinity.
        return False
    # int() truncates floats, so 80.5 would otherwise pass as port 80.
    if isinstance(port, float) and port != number:
        return False
    return 0 <= number <= 65535

# Keep the rows whose destination_port fails is_valid_port.
invalid_ports = data.loc[
    ~data['destination_port'].apply(is_valid_port)
]

# Flag rows whose protocol is not one of the accepted names
# (the comparison is an exact, case-sensitive match).
valid_protocols = [
    'TCP', 'TLS', 'SSH', 'POP3', 'DNS',
    'HTTPS', 'SMTP', 'FTP', 'UDP', 'HTTP',
]
invalid_protocols = data.loc[
    ~data['protocol'].isin(valid_protocols)
]

# Check for invalid bytes transferred
def is_valid_bytes(bytes):
    """Return True if *bytes* is a non-negative integer count.

    Args:
        bytes: The value to check; anything ``int()`` accepts (ints,
            numeric strings, integral floats). The parameter name
            shadows the builtin ``bytes`` and is kept only for
            backward compatibility; it is not rebound here.

    Returns:
        bool: False when the value cannot be converted to an integer,
        has a fractional part, or is negative.
    """
    try:
        count = int(bytes)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported objects; ValueError: non-numeric
        # text or NaN; OverflowError: +/- infinity.
        return False
    # int() truncates floats, so 10.5 would otherwise pass as 10.
    if isinstance(bytes, float) and bytes != count:
        return False
    return count >= 0

# Keep the rows whose bytes_transferred fails is_valid_bytes.
invalid_bytes = data.loc[
    ~data['bytes_transferred'].apply(is_valid_bytes)
]

# Check for invalid threat levels
def is_valid_threat_level(threat_level):
    """Return True if *threat_level* is an integer in the range 0-2.

    Args:
        threat_level: The value to check; anything ``int()`` accepts
            (ints, numeric strings, integral floats).

    Returns:
        bool: False when the value cannot be converted to an integer,
        has a fractional part, or falls outside 0-2.
    """
    try:
        level = int(threat_level)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported objects; ValueError: non-numeric
        # text or NaN; OverflowError: +/- infinity.
        return False
    # int() truncates floats, so 1.5 would otherwise pass as level 1.
    if isinstance(threat_level, float) and threat_level != level:
        return False
    return 0 <= level <= 2

# Keep the rows whose threat_level fails is_valid_threat_level.
invalid_threat_levels = data.loc[
    ~data['threat_level'].apply(is_valid_threat_level)
]

# Remove every row flagged by any of the checks in one pass: the row
# labels of all five "invalid_*" frames are combined, then dropped.
data = data.drop(
    invalid_ips.index
    .union(invalid_ports.index)
    .union(invalid_protocols.index)
    .union(invalid_bytes.index)
    .union(invalid_threat_levels.index),
    errors='ignore',
)

# Summarize every column of the remaining rows.
print(data.describe(include='all'))

            log_id     source_ip destination_port protocol bytes_transferred  \
count    77.000000            77               77       77                77   
unique         NaN            68                6        9                73   
top            NaN  192.168.1.55               80     HTTP              1024   
freq           NaN             3               22       22                 4   
mean     46.519481           NaN              NaN      NaN               NaN   
std      28.591317           NaN              NaN      NaN               NaN   
min       1.000000           NaN              NaN      NaN               NaN   
25%      22.000000           NaN              NaN      NaN               NaN   
50%      45.000000           NaN              NaN      NaN               NaN   
75%      70.000000           NaN              NaN      NaN               NaN   
max     100.000000           NaN              NaN      NaN               NaN   

       threat_level  
count            