In [17]:
#This cell is for parsing the data from the raw files and formatting it to a csv format in a single master file

import pandas as pd
import glob
import time
from tqdm import tqdm

# Source files of raw data
source_files = '/home/charlie/Desktop/Original_Data/RAW_Subset/*.labeled'

# Output master CSV file
output_file = '/home/charlie/Desktop/Original_Data/Master_FIles/Draft_Master.csv'

# Data types being parsed
parsed_data = {
    'ts': 'object',
    'uid': 'object',
    'id.orig_h': 'object',
    'id.orig_p': 'object',
    'id.resp_h': 'object',
    'id.resp_p': 'object',
    'proto': 'object',
    'service': 'object',
    'duration': 'object',
    'orig_bytes': 'object',
    'resp_bytes': 'object',
    'conn_state': 'object',
    'local_orig': 'object',
    'local_resp': 'object',
    'missed_bytes': 'object',
    'history': 'object',
    'orig_pkts': 'object',
    'orig_ip_bytes': 'object',
    'resp_pkts': 'object',
    'resp_ip_bytes': 'object',
    'tunnel_parents': 'object',
    'label': 'object',
    'detailed_label': 'object'
}


# List all files
files = glob.glob(source_files)

# Start time for tracking
start_time = time.time()
try:
    # Read loop and setup progress bar using tqdm
    with tqdm(total=len(files), desc='Processing Files') as pbar:
        first_file = True  # Write headers once to file
        for file in files:
            try:
                # Extract header from metadata
                with open(file, 'r') as f:
                    header = None
                    for line in f:
                        if line.startswith("#fields"):
                            header = [f"'{col.strip()}'" for col in line.strip().split('\t')[1:]]  # Encapsulate in quotes
                            break
                
                # Ensure header is found
                if header is None:
                    raise ValueError(f"No header found in file: {file}")

                # Reads files in chunks to prevent memory errors
                for chunk in pd.read_csv(file, 
                                         sep='\t', #seperator for tabs
                                         #sep='\s+',  # Use regex-based separator for whitespace
                                         na_values=['-'], 
                                         chunksize=10000, 
                                         comment='#',  # Skip lines starting with '#'
                                         header=None,  # No automatic header
                                         names=header):  # Set custom header with encapsulated names
                    # Clean column names (optional additional processing)
                    chunk.columns = chunk.columns.str.strip()
                    
                    # Write to the master CSV
                    chunk.to_csv(output_file, mode='a', header=first_file, index=False)
                    first_file = False  # Stops headers being written for the remainder of the loop
            except Exception as e:
                print(f"ERROR Processing: {file}: {e}")
            pbar.update(1)
except Exception as e:
    print(f"CRITICAL ERROR: {e}")
print("CELL COMPLETE")
print(f"OUTPUT: {output_file}")


Processing Files: 0it [00:00, ?it/s]

CELL COMPLETE
OUTPUT: /home/charlie/Desktop/Original_Data/Master_FIles/Draft_Master.csv





In [16]:
#This script cleans the formatted cvs master file. The raw data is inconsistent in its format. The raw data from devices is seperated by tabs
#But the labels they added after are seperated by white spaces. The cell above formats the seperator as tabs 
#This cell goes over the master file and seperates the label data by white spaces, otherwise the all merge into one column.

# File paths
input_file = '/home/charlie/Desktop/Original_Data/Master_FIles/Draft_Master.csv'
output_file = '/home/charlie/Desktop/Original_Data/Master_FIles/Master_Dataset.csv'

# Preprocess the file to fix spaces in the last columns
try:
    with tqdm(unit='line', desc='Processing File') as pbar:
        with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
            for line in infile:
                # Skip comment lines
                if line.startswith("#"):
                    outfile.write(line)
                    continue
                
                # Replace spaces between `tunnel_parents`, `label`, and `detailed_label` with commas
                # We assume the problematic spaces occur as "   " (triple spaces)
                parts = line.rsplit("   ", 2)  # Split the last two occurrences of spaces
                outfile.write(','.join(parts))  # Join them with commas
            pbar.update(1)
except Exception as e:
    print(f"CRITICAL ERROR: {e}")

print("CELL COMPLETE")
print(f"OUTPUT: {output_file}")


Processing File: 1line [00:00, 384.90line/s]

CELL COMPLETE
OUTPUT: /home/charlie/Desktop/Test/master/master_dataset_cleaned.csv



