In [1]:
import pandas as pd
import gzip
import glob
import os
import zipfile
import json
from pathlib import Path
import pandas as pd
import io
from tqdm import tqdm
import pytz

log_dir = '../dataset/raw_zeek_logs/'
output_dir = '../dataset/processed_logs/'  

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)




In [2]:
# !zcat /home/bishal/projects/pentest_anomaly/sept_week_1/2024-09-01/broker.00:00:00-01:00:00.log.gz | less
# !zcat /home/bishal/projects/pentest_anomaly/sept_week_1/2024-09-01/dns.00:00:00-01:00:00.log.gz | less
# !zcat /home/bishal/projects/pentest_anomaly/sept_week_1/2024-09-01/ecat_arp_info.00:00:00-01:00:00.log.gz | less
# !zcat /home/bishal/projects/pentest_anomaly/sept_week_1/2024-09-01/enip.00:00:00-01:00:00.log.gz | less
# !zcat /home/bishal/projects/pentest_anomaly/sept_week_1/2024-09-01/weird.00:00:00-01:00:00.log.gz | less
# !zcat /home/bishal/projects/pentest_anomaly/sept_week_1/2024-09-01/notice.00:00:00-01:00:00.log.gz | less


In [3]:
log_types_dict = {
    'broker': {
        'columns': ['ts', 'ty', 'message', 'peer.address', 'peer.bound_port'],
        'rename': {}
    },
    'conn': {
        'columns': ['ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p', 'proto', 'duration', 'orig_bytes', 'resp_bytes',"conn_state", "local_orig"],
        'rename': {'id.orig_h': 'src_ip', 'id.orig_p': 'src_port', 'id.resp_h': 'dst_ip', 'id.resp_p': 'dst_port'}
    },
    'dns': {
        'columns': ['ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p', 'query', 'qclass_name', 'qtype_name', 'rcode_name'],
        'rename': {'id.orig_h': 'src_ip', 'id.orig_p': 'src_port', 'id.resp_h': 'dst_ip', 'id.resp_p': 'dst_port'}
    },
    'ecat_arp_info': {
        'columns': ['ts', 'mac_src', 'mac_dst', 'SAP', 'TPA', 'arp_type'],
        'rename': {'SPA':'src_ip','TPA':'dst_ip'}
    },
    'enip': {
         'columns': ['ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p'],
        'rename': {'id.orig_h': 'src_ip', 'id.orig_p': 'src_port', 'id.resp_h': 'dst_ip', 'id.resp_p': 'dst_port'}
    },
   
    'weird': {
        'columns': ['ts', 'uid', 'id.orig_h', 'id.resp_h', 'name', 'addl', 'notice'],
        'rename': {'id.orig_h': 'src_ip', 'id.resp_h': 'dst_ip'}
    },
    'dhcp': {
        'columns': ['ts', 'uid', 'id.orig_h', 'id.resp_h', 'client_addr', 'server_addr', 'mac'],
        'rename': {'id.orig_h': 'src_ip', 'id.resp_h': 'dst_ip', 'client_addr': 'client_ip', 'server_addr': 'server_ip'}
    }
}

In [4]:
def read_log(file_path, log_type):
    with gzip.open(file_path, 'rt') as f:
        # Skip Zeek log headers (starting with #)
        lines = [line for line in f if not line.startswith('#')]
        
    data = [line.strip().split('\t') for line in lines]
    
    data = [json.loads(item)for sublist in data for item in sublist]
    
    columns = log_types_dict[log_type]["columns"]
    
    df = pd.DataFrame(data, columns=columns)
    
    return df

### Read the log files and convert them to csv

In [8]:
data_types = ["conn","dns","ecat_arp_info","enip","weird","dhcp"]

csv_path = os.path.join(output_dir,"csvs")
os.makedirs(csv_path, exist_ok=True)


for data_type in data_types:

    pattern = f"{data_type}.*.log.gz"
    for root, dirs, files in os.walk(log_dir):
        all_df = []

        if dirs:
            for subdir in dirs:
                output_path = os.path.join(output_dir,"csvs",subdir)
                if not os.path.exists(output_path):
                    os.makedirs(output_path)
                    
                subdir_path = os.path.join(log_dir,subdir)
                files = glob.glob(os.path.join(subdir_path, pattern))
                for file in tqdm(files):
                    df = read_log(file,data_type)
                    all_df.append(df)
                    df_day = pd.concat(all_df, axis=0, ignore_index=True)
                    
                    
                file_name = pattern.split(".")[0] + ".csv"
                file_path = os.path.join(output_path,file_name)
                
                
                df_day = df_day.rename(columns=log_types_dict[data_type]["rename"])
                    
                df_day.to_csv(file_path, index=False)



  0%|          | 0/24 [00:00<?, ?it/s]

100%|██████████| 24/24 [00:00<00:00, 126.10it/s]
100%|██████████| 24/24 [00:00<00:00, 108.04it/s]
100%|██████████| 24/24 [00:00<00:00, 45.64it/s]
100%|██████████| 24/24 [00:00<00:00, 53.54it/s]
100%|██████████| 25/25 [00:00<00:00, 32.08it/s]
100%|██████████| 24/24 [00:00<00:00, 29.81it/s]
100%|██████████| 24/24 [00:01<00:00, 23.00it/s]
100%|██████████| 24/24 [00:00<00:00, 187.62it/s]
100%|██████████| 24/24 [00:00<00:00, 113.08it/s]
100%|██████████| 24/24 [00:01<00:00, 17.62it/s]
100%|██████████| 24/24 [00:01<00:00, 13.90it/s]
100%|██████████| 25/25 [00:01<00:00, 13.50it/s]
100%|██████████| 24/24 [00:01<00:00, 12.38it/s]
100%|██████████| 24/24 [00:02<00:00, 11.31it/s]
100%|██████████| 24/24 [00:00<00:00, 271.85it/s]
100%|██████████| 24/24 [00:00<00:00, 258.88it/s]
100%|██████████| 24/24 [00:02<00:00,  9.58it/s]
100%|██████████| 24/24 [00:05<00:00,  4.79it/s]
100%|██████████| 24/24 [00:03<00:00,  6.18it/s]
100%|██████████| 24/24 [00:04<00:00,  5.67it/s]
100%|██████████| 24/24 [00:05<00:0

### Create Benign and Malacious Data Files

In [14]:

benign_dates = ['2024-09-01', '2024-09-02', '2024-09-03']
columns_to_keep = ['ts', 'src_ip', 'dst_ip']

benign_df = pd.DataFrame(columns=columns_to_keep)
malicious_df = pd.DataFrame(columns=columns_to_keep)

for date_folder in os.listdir(csv_path):
    date_folder_path = os.path.join(csv_path, date_folder)
    
    if os.path.isdir(date_folder_path) and date_folder.startswith('2024-09'):
        for file_name in os.listdir(date_folder_path):
            file_path = os.path.join(date_folder_path, file_name)
            
            if file_name.endswith('.csv'):
                try:
                    df = pd.read_csv(file_path, usecols=columns_to_keep)
                except ValueError:
                    continue
                
                # Append to benign or malicious DataFrame based on the date
                if date_folder in benign_dates:
                    benign_df = pd.concat([benign_df, df], ignore_index=True)
                else:
                    malicious_df = pd.concat([malicious_df, df], ignore_index=True)

In [15]:
benign_df.dropna(inplace=True)
malicious_df.dropna(inplace=True)

In [16]:
benign_df.drop_duplicates(inplace=True)
malicious_df.drop_duplicates(inplace=True)

In [17]:
benign_df.head()

Unnamed: 0,ts,src_ip,dst_ip
0,1725347000.0,192.168.57.9,224.0.0.251
1,1725347000.0,fe80::33b2:2f9:4983:bf8e,ff02::fb
2,1725347000.0,192.168.57.9,224.0.0.251
3,1725347000.0,fe80::33b2:2f9:4983:bf8e,ff02::fb
4,1725347000.0,192.168.57.9,224.0.0.251


In [18]:
benign_df = benign_df.sort_values(by='ts')
malicious_df = malicious_df.sort_values(by='ts')

In [19]:
# Define MST timezone
mst = pytz.timezone('US/Mountain')

# Convert 'ts' column from Unix timestamp to MST date
benign_df['ts'] = pd.to_datetime(benign_df['ts'], unit='s').dt.tz_localize('UTC').dt.tz_convert(mst)


In [20]:
malicious_df['ts'] = pd.to_datetime(malicious_df['ts'], unit='s').dt.tz_localize('UTC').dt.tz_convert(mst)
malicious_df

Unnamed: 0,ts,src_ip,dst_ip
87290,2024-08-31 17:59:46.462265856-06:00,172.28.1.7,172.28.0.1
87293,2024-08-31 18:00:55.787246848-06:00,192.168.57.10,192.168.255.255
87294,2024-08-31 18:01:26.501101056-06:00,fe80::74ba:e5ff:fe79:a4cd,ff02::2
87291,2024-08-31 18:01:35.797463808-06:00,192.168.57.9,224.0.0.251
87292,2024-08-31 18:01:35.799544064-06:00,fe80::33b2:2f9:4983:bf8e,ff02::fb
...,...,...,...
963869,2024-09-07 17:59:33.521572096-06:00,fe80::33b2:2f9:4983:bf8e,ff02::fb
963870,2024-09-07 17:59:33.787298048-06:00,192.168.57.9,224.0.0.251
963871,2024-09-07 17:59:33.787736832-06:00,fe80::33b2:2f9:4983:bf8e,ff02::fb
963872,2024-09-07 17:59:33.788310016-06:00,192.168.57.9,224.0.0.251


In [21]:
# Define the cutoff date
cutoff_date = pd.Timestamp('2024-09-04', tz=mst)

# Filter the DataFrames
malicious_df = malicious_df[malicious_df['ts'] >= cutoff_date]

In [22]:
malicious_df

Unnamed: 0,ts,src_ip,dst_ip
324269,2024-09-04 00:00:21.241051904-06:00,192.168.57.10,192.168.255.255
324267,2024-09-04 00:00:29.579170816-06:00,192.168.57.9,224.0.0.251
324268,2024-09-04 00:00:29.581031168-06:00,fe80::33b2:2f9:4983:bf8e,ff02::fb
310360,2024-09-04 00:00:29.581422080-06:00,192.168.57.9,224.0.0.251
310362,2024-09-04 00:00:29.581625088-06:00,fe80::33b2:2f9:4983:bf8e,ff02::fb
...,...,...,...
963869,2024-09-07 17:59:33.521572096-06:00,fe80::33b2:2f9:4983:bf8e,ff02::fb
963870,2024-09-07 17:59:33.787298048-06:00,192.168.57.9,224.0.0.251
963871,2024-09-07 17:59:33.787736832-06:00,fe80::33b2:2f9:4983:bf8e,ff02::fb
963872,2024-09-07 17:59:33.788310016-06:00,192.168.57.9,224.0.0.251


In [23]:
# Save the DataFrames to CSV

combined_path = os.path.join(output_dir,"combined")
os.makedirs(combined_path, exist_ok=True)

benign_df.to_csv(f'{combined_path}/benign.csv', index=False)
malicious_df.to_csv(f'{combined_path}/malicious.csv', index=False)