In [6]:
import json
import csv
import re

In [None]:
# Path of the JSON log export that parse_json_to_csv reads.
# Placeholder: set this to a real file path before running the conversion cell.
input_json_file = ''
# Path of the CSV file that parse_json_to_csv opens in write mode (created or overwritten).
# Placeholder: set this to a real file path before running the conversion cell.
output_csv_file = ''

In [None]:
# One row per CSV column taken from the parsed "Message Received" payload:
# (CSV header, payload key, default used when the key is absent).
# Payload keys are matched case-insensitively because the logs are not
# consistent, e.g. "Denial of Service Attacked" vs "denial of service attacked".
# Keeping header, key and default together prevents the header row and the
# data rows from drifting out of step.
_MESSAGE_COLUMNS = (
    # General
    ('IP Address', 'interfaceIPAddress', 'N/A'),
    ('Device Name', 'deviceName', 'N/A'),
    ('Device ID', 'deviceID', 'N/A'),
    ('Device Owner', 'entityName', 'N/A'),
    ('Device Owner ID', 'owningEntityID', 'N/A'),
    ('Force Identifier', 'deviceForceIdentifier', 'N/A'),
    # Position
    ('Latitude', 'latitude', 'N/A'),
    ('Longitude', 'longitude', 'N/A'),
    ('Elevation', 'elevation', 'N/A'),
    # Attack / status flags and network information, in CSV column order
    ('Hacked', 'hacked', 'False'),
    ('Denial of Service Attacked', 'Denial of Service Attacked', 'False'),
    ('Delay Attacked', 'Delay Attacked', 'False'),
    ('Interception Attacked', 'Interception Attacked', 'False'),
    ('Repaired', 'repaired', 'False'),
    ('Packet Degradation', 'Packet Degradation', 'False'),
    ('Discovered', 'Discovered', 'False'),
    ('Kernel Panicked', 'Kernel Panicked', 'False'),
    ('Disrupted', 'Disrupted', 'False'),
    ('Outbound Network Traffic', 'Outbound Network Traffic', 'N/A'),
    ('Inbound Network Traffic', 'Inbound Network Traffic', 'N/A'),
    ('Exploited', 'Exploited', 'False'),
    ('Injection Attacked', 'Injection Attacked', 'False'),
    ('Disabled', 'Disabled', 'False'),
    ('Powered On', 'Powered On', 'False'),
    # Device resources
    ('CPU Utilization', 'CPU Utilization', 'N/A'),
    ('Device Removed', 'Device Removed', 'False'),
    ('Disk Utilization', 'Disk Utilization', 'N/A'),
    # Cyber effects
    ('EffectType', 'effect type', 'N/A'),
    ('TargetDevice', 'targets', 'N/A'),
    ('InitiatingDevice', 'initiating device id', 'N/A'),
    ('InitiatingName', 'initiatingName', 'N/A'),
    ('EffectStatus', 'cyber effect status', 'N/A'),
)

# Text that separates the log prefix from the JSON payload.
_MESSAGE_MARKER = "Message Received"


def _message_row(entry, parsed_log):
    """Build one CSV data row from a log entry and its decoded payload."""
    # Lowercase the payload keys once per entry instead of once per column.
    normalized = {key.lower(): value for key, value in parsed_log.items()}
    row = [entry['data'].get('time', 'N/A')]
    row.extend(
        normalized.get(key.lower(), default)
        for _header, key, default in _MESSAGE_COLUMNS
    )
    return row


def parse_json_to_csv(input_json_file, output_csv_file):
    """Convert "Message Received" log entries from a JSON file into a CSV file.

    The input must be a JSON document that iterates over entries shaped like
    ``{"data": {"time": ..., "log": "<prefix> Message Received <json object>"}}``.
    For every entry whose log text contains the marker, the JSON object after
    the first marker is decoded and written as one CSV row: the entry's
    ``time`` followed by the columns listed in ``_MESSAGE_COLUMNS``.

    Skipping rules (the function is best-effort and never raises):
      * entries without ``data``/``log`` are skipped silently;
      * entries without the marker, with an undecodable payload, or that fail
        for any other reason are skipped with a printed message;
      * a failure to read the input or open the output is printed and ends
        the conversion.

    Args:
        input_json_file: Path of the JSON file to read.
        output_csv_file: Path of the CSV file to create or overwrite.

    Returns:
        None.
    """
    try:
        # Binary mode lets json.load detect UTF-8/16/32 (with or without a
        # BOM) instead of depending on the platform's default text encoding.
        with open(input_json_file, 'rb') as json_file:
            data = json.load(json_file)

        with open(output_csv_file, 'w', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(
                ['Timestamp'] + [header for header, _key, _default in _MESSAGE_COLUMNS]
            )

            for entry in data:
                try:
                    if not ('data' in entry and 'log' in entry['data']):
                        continue
                    log_data = entry['data']['log']

                    if _MESSAGE_MARKER not in log_data:
                        print("Skipping entry: 'Message Received' not found in log.")
                        continue

                    # Split on the FIRST marker only, so a payload that itself
                    # contains the marker text is not cut short.
                    payload = log_data.partition(_MESSAGE_MARKER)[2].strip()
                    try:
                        parsed_log = json.loads(payload)
                    except json.JSONDecodeError as e:
                        print(f"Skipping entry due to JSON decode error: {e}")
                        continue

                    csv_writer.writerow(_message_row(entry, parsed_log))
                except Exception as e:
                    # Best effort: one malformed entry must not stop the export.
                    print(f"Skipping invalid entry due to error: {e}")
    except Exception as e:
        print(f"An error occurred while processing the JSON file: {e}")

In [None]:
# Run the JSON -> CSV conversion with the paths configured in the cell above.
# parse_json_to_csv catches its own exceptions and prints them instead of raising.
parse_json_to_csv(input_json_file, output_csv_file)

## Data Processing

In [None]:
## Reads a CSV file, parses the timestamps in its first column into epoch milliseconds, and linearly rescales them to the range 0 - 380,436 ms

import pandas as pd
from datetime import datetime

# Upper bound of the normalized time axis, in milliseconds.
NORMALIZED_RANGE_MS = 380436

# Naive reference point; parsed timestamps are naive too, so subtracting the
# two measures elapsed time without any local-timezone or DST adjustment.
_EPOCH = datetime(1970, 1, 1)


def _parse_timestamp_ms(timestamp):
    """Parse a ``YYYY-MM-DDTHH:MM:SS[.fraction]Z`` string into epoch milliseconds.

    The trailing ``Z`` marks the value as UTC, so the result is computed
    against the UTC epoch rather than through the machine's local timezone.
    Fractions longer than six digits are truncated to microseconds, and a
    missing fraction is accepted.

    Raises:
        AttributeError, TypeError: ``timestamp`` is not a string (e.g. NaN/None).
        ValueError: the text does not match the expected layout.
    """
    trimmed = timestamp.split('Z')[0][:26]  # keep at most 6 fractional digits
    layout = "%Y-%m-%dT%H:%M:%S.%f" if '.' in trimmed else "%Y-%m-%dT%H:%M:%S"
    elapsed = datetime.strptime(trimmed, layout) - _EPOCH
    return elapsed.days * 86_400_000 + elapsed.seconds * 1000 + elapsed.microseconds / 1000


def convert_timestamp_to_ms(timestamp, min_time, max_time):
    """Parse one timestamp string and rescale it onto the normalized time axis.

    Args:
        timestamp: Text such as ``2024-01-01T00:00:05.123456Z``.
        min_time: Epoch milliseconds mapped to 0.
        max_time: Epoch milliseconds mapped to ``NORMALIZED_RANGE_MS``.

    Returns:
        The normalized integer position, or ``None`` (after printing a
        message) when the timestamp cannot be parsed.
    """
    try:
        ms_value = int(_parse_timestamp_ms(timestamp))
    except (AttributeError, TypeError, ValueError):
        print(f"Error parsing timestamp: {timestamp}")
        return None
    return normalize_timestamp(ms_value, min_time, max_time)


def normalize_timestamp(ms_value, min_time, max_time, target_range=NORMALIZED_RANGE_MS):
    """Linearly map ``ms_value`` from ``[min_time, max_time]`` to ``[0, target_range]``.

    Args:
        ms_value: Value to rescale, in the same unit as the bounds.
        min_time: Lower bound, mapped to 0.
        max_time: Upper bound, mapped to ``target_range``.
        target_range: Width of the output range (defaults to 380436).

    Returns:
        The rescaled value truncated to ``int``. When the bounds coincide
        there is no span to scale over, so 0 is returned instead of dividing
        by zero.
    """
    span = max_time - min_time
    if span == 0:
        return 0
    return int((ms_value - min_time) / span * target_range)


def update_csv_timestamps(input_file, output_file):
    """Rewrite a CSV so its first column holds normalized timestamps.

    The first column is parsed as UTC timestamps, rows whose timestamp cannot
    be parsed (including blank cells) are dropped, and the remaining values
    are rescaled so the earliest becomes 0 and the latest becomes
    ``NORMALIZED_RANGE_MS``. All other columns are carried through as text.

    Args:
        input_file: Path of the CSV to read.
        output_file: Path of the CSV to create or overwrite.
    """
    df = pd.read_csv(input_file, dtype=str)  # read every column as text
    ts_column = df.columns[0]

    parsed = []
    for raw in df[ts_column]:
        try:
            parsed.append(_parse_timestamp_ms(raw))
        except (AttributeError, TypeError, ValueError):
            parsed.append(float('nan'))  # marks the row for removal below
    parsed_ms = pd.Series(parsed, index=df.index, dtype='float64')

    valid = parsed_ms.notna()
    dropped = int((~valid).sum())
    if dropped:
        print(f"Dropped {dropped} row(s) with invalid timestamps")
    df = df.loc[valid].copy()
    parsed_ms = parsed_ms.loc[valid]

    if not parsed_ms.empty:
        min_time, max_time = parsed_ms.min(), parsed_ms.max()
        # Assign a plain list so the text column is replaced by an integer one
        # without relying on index alignment.
        df[ts_column] = [normalize_timestamp(value, min_time, max_time) for value in parsed_ms]
    df.to_csv(output_file, index=False)

if __name__ == "__main__":
    # Placeholder paths: set both to real CSV file paths before running.
    input_csv = ""
    output_csv = ""
    # With the placeholders left empty, pd.read_csv("") would raise and break
    # a Restart & Run All, so the step is skipped with a message instead.
    if input_csv and output_csv:
        update_csv_timestamps(input_csv, output_csv)
        print(f"Updated CSV saved as {output_csv}")
    else:
        print("Skipped timestamp normalization: set input_csv and output_csv first.")

In [None]:
# Updates the 'Device Name' column, replacing the first two '-' characters in each device name with '/'

import csv

def update_device_names(input_csv, output_csv):
    """Copy a CSV, replacing the first two '-' in each 'Device Name' with '/'.

    All rows are read before the output is opened, so ``input_csv`` and
    ``output_csv`` may be the same path (the file is then updated in place
    rather than truncated before it is read). This holds the whole input in
    memory.

    Rows that are shorter than the header have no 'Device Name' value; they
    are written through unchanged (the missing cells become empty). A CSV
    without a 'Device Name' column is copied as is. An input with no header
    row produces an empty output file.

    Args:
        input_csv: Path of the CSV to read.
        output_csv: Path of the CSV to create or overwrite.
    """
    with open(input_csv, mode='r', newline='') as infile:
        reader = csv.DictReader(infile)
        fieldnames = reader.fieldnames
        rows = list(reader)

    with open(output_csv, mode='w', newline='') as outfile:
        if fieldnames is None:
            return  # no header row: nothing to write beyond the empty file

        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        for row in rows:
            device_name = row.get('Device Name')
            # DictReader fills missing trailing cells with None; only real
            # text can be rewritten.
            if isinstance(device_name, str):
                row['Device Name'] = device_name.replace('-', '/', 2)
            writer.writerow(row)

# Placeholder paths: set both to real CSV file paths before running.
input_csv = ""
output_csv = ""

# With the placeholders left empty, open("") would raise and break a
# Restart & Run All, so the step is skipped with a message instead.
if input_csv and output_csv:
    update_device_names(input_csv, output_csv)
    print(f"Updated CSV saved as {output_csv}")
else:
    print("Skipped device-name update: set input_csv and output_csv first.")

In [None]:
import pandas as pd

# Calculate the delta between consecutive timestamps in milliseconds
def convert_to_time_deltas(df):
    """Add per-row and cumulative elapsed-time columns, both in milliseconds.

    The frame is modified in place and also returned:
      * 'Timestamp' is parsed from ``YYYY-MM-DDTHH:MM:SS.ffffffZ`` text into
        datetimes;
      * 'Time (ms)' holds the gap to the previous row (NaN on the first row,
        which has no predecessor);
      * 'Cumulative Time (ms)' holds the running total of those gaps, with
        the first row set to 0.
    """
    parsed = pd.to_datetime(df['Timestamp'], format='%Y-%m-%dT%H:%M:%S.%fZ')
    df['Timestamp'] = parsed

    # Gap to the previous row: seconds -> milliseconds.
    step_ms = parsed.diff().dt.total_seconds() * 1000
    df['Time (ms)'] = step_ms

    # Running total of the gaps; the leading NaN (from diff) becomes the 0 ms origin.
    running_ms = step_ms.cumsum()
    df['Cumulative Time (ms)'] = running_ms.fillna(0)

    return df

def process_csv(input_csv, output_csv):
    """Read ``input_csv``, add the time-delta columns, and save to ``output_csv``.

    The output is written without the DataFrame index.
    """
    with_deltas = convert_to_time_deltas(pd.read_csv(input_csv))
    with_deltas.to_csv(output_csv, index=False)

# Placeholder paths: set both to real CSV file paths before running.
input_csv = ""
output_csv = ""

# With the placeholders left empty, pd.read_csv("") would raise and break a
# Restart & Run All, so the step is skipped with a message instead.
if input_csv and output_csv:
    process_csv(input_csv, output_csv)
    print(f"Updated CSV saved as {output_csv}")
else:
    print("Skipped time-delta calculation: set input_csv and output_csv first.")