In [1]:
import pandas as pd
import gzip
import glob
import os
import zipfile
import json
from pathlib import Path
import pandas as pd
import io
from tqdm import tqdm
import pytz

# Specify the main directory for your logs (replace '/path/to/nsm/folder' with your actual path)
log_dir = '../dataset/raw_zeek_logs/'
output_dir = '../dataset/processed_logs/csvs/'  # Directory to save the daily CSV files
output_dir_all = '../dataset/processed_logs/combined/'

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_dir_all, exist_ok=True)





#### Check logs 

In [2]:
# !zcat /home/bishal/projects/pentest_anomaly/sept_week_1/2024-09-01/broker.00:00:00-01:00:00.log.gz | less
# !zcat /home/bishal/projects/pentest_anomaly/sept_week_1/2024-09-01/dns.00:00:00-01:00:00.log.gz | less
# !zcat /home/bishal/projects/pentest_anomaly/sept_week_1/2024-09-01/ecat_arp_info.00:00:00-01:00:00.log.gz | less
# # !zcat /home/bishal/projects/pentest_anomaly/sept_week_1/2024-09-01/enip.00:00:00-01:00:00.log.gz | less
# !zcat /home/bishal/projects/pentest_anomaly/sept_week_1/2024-09-01/weird.00:00:00-01:00:00.log.gz | less
# !zcat /home/bishal/projects/pentest_anomaly/sept_week_1/2024-09-01/notice.00:00:00-01:00:00.log.gz | less
# # !zcat /home/bishal/projects/pentest_anomaly/sept_week_1/2024-09-01/conn.00:00:00-01:00:00.log.gz | less

# !zcat /home/bishal/projects/pentest_anomaly/sept_week_1/2024-09-04/capture_loss.20:00:00-21:00:00.log.gz | less

# !zcat /home/bishal/projects/pentest_anomaly/sept_week_1/2024-09-04/conn-summary.19:00:00-20:00:00.log.gz | less

# !zcat /home/bishal/projects/pentest_anomaly/sept_week_1/2024-09-04/stats.20:00:00-21:00:00.log.gz | less

# !zcat /home/bishal/projects/pentest_anomaly/sept_week_1/2024-09-04/known_services.19:06:34-20:00:00.log.gz | less




# !zcat /home/bishal/projects/pentest_anomaly/sept_week_1/2024-09-04/dhcp.19:00:00-20:00:00.log.gz | less





#### Extract log info

In [3]:
log_types_dict = {
    'broker': {
        'columns': ['ts', 'ty', 'message', 'peer.address', 'peer.bound_port'],
        'rename': {}
    },
    'conn': {
        'columns':["ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p", "proto", "conn_state", "local_orig", "local_resp", "missed_bytes", "history", "orig_pkts", "orig_ip_bytes", "resp_pkts", "resp_ip_bytes", "orig_mac_oui", "community_id"],
        'rename': {'id.orig_h': 'src_ip', 'id.orig_p': 'src_port', 'id.resp_h': 'dst_ip', 'id.resp_p': 'dst_port'}
    },
    'dns': {
        'columns': ["ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p", "proto", "trans_id", "query", "qclass", "qclass_name", "qtype", "qtype_name", "rcode", "rcode_name", "AA", "TC", "RD", "RA", "Z", "answers", "TTLs", "rejected"],
        'rename': {'id.orig_h': 'src_ip', 'id.orig_p': 'src_port', 'id.resp_h': 'dst_ip', 'id.resp_p': 'dst_port'}
    },
    'ecat_arp_info': {
        'columns': ['ts', 'mac_src', 'mac_dst', 'SPA', 'TPA', 'arp_type'],
        'rename': {'SPA':'src_ip','TPA':'dst_ip'}
    },
    'enip': {
         'columns': ["ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p", "is_orig", "enip_command_code", "enip_command", "length", "session_handle", "enip_status", "sender_context", "options"],
        'rename': {'id.orig_h': 'src_ip', 'id.orig_p': 'src_port', 'id.resp_h': 'dst_ip', 'id.resp_p': 'dst_port'}
    },
   
    'weird': {
        'columns': ['ts', 'uid', 'id.orig_h', 'id.resp_h', 'name', 'addl', 'notice'],
        'rename': {'id.orig_h': 'src_ip', 'id.resp_h': 'dst_ip'}
    },
    'dhcp': {
        'columns': ["ts", "uids", "mac", "host_name", "requested_addr", "msg_types", "duration", "orig_mac_oui"],
        'rename': {'id.orig_h': 'src_ip', 'id.resp_h': 'dst_ip', 'client_addr': 'client_ip', 'server_addr': 'server_ip'}
    }
}

In [4]:
def read_log(file_path, log_type):
    with gzip.open(file_path, 'rt') as f:
        # Skip Zeek log headers (starting with #)
        lines = [line for line in f if not line.startswith('#')]
        
    data = [line.strip().split('\t') for line in lines]
    
    data = [json.loads(item)for sublist in data for item in sublist]
    
    columns = log_types_dict[log_type]["columns"]
    
    df = pd.DataFrame(data, columns=columns)
    
    return df

### Read the log files and convert them to csv

In [5]:
data_types = ["conn","dns","ecat_arp_info","enip","weird","dhcp"]

for data_type in data_types:

    pattern = f"{data_type}.*.log.gz"
    for root, dirs, files in os.walk(log_dir):
        all_df = []

        if dirs:
            for subdir in dirs:
                output_path = os.path.join(output_dir,subdir)
                if not os.path.exists(output_path):
                    os.makedirs(output_path)
                    
                subdir_path = os.path.join(log_dir,subdir)
                files = glob.glob(os.path.join(subdir_path, pattern))
                for file in tqdm(files):
                    df = read_log(file,data_type)
                    all_df.append(df)
                    df_day = pd.concat(all_df, axis=0, ignore_index=True)
                    file_name = pattern.split(".")[0] + ".csv"
                    file_path = os.path.join(output_path,file_name)
                
                
                df_day = df_day.rename(columns=log_types_dict[data_type]["rename"])
                    
                df_day.to_csv(file_path, index=False)



  0%|          | 0/24 [00:00<?, ?it/s]

100%|██████████| 24/24 [00:00<00:00, 167.82it/s]
100%|██████████| 24/24 [00:00<00:00, 80.01it/s]
100%|██████████| 24/24 [00:00<00:00, 55.18it/s]
100%|██████████| 24/24 [00:00<00:00, 41.06it/s]
100%|██████████| 25/25 [00:00<00:00, 30.77it/s]
100%|██████████| 24/24 [00:00<00:00, 24.17it/s]
100%|██████████| 24/24 [00:01<00:00, 20.13it/s]
100%|██████████| 24/24 [00:00<00:00, 105.48it/s]
100%|██████████| 24/24 [00:00<00:00, 45.63it/s]
100%|██████████| 24/24 [00:02<00:00, 11.77it/s]
100%|██████████| 24/24 [00:02<00:00,  8.76it/s]
100%|██████████| 25/25 [00:03<00:00,  7.84it/s]
100%|██████████| 24/24 [00:03<00:00,  6.86it/s]
100%|██████████| 24/24 [00:03<00:00,  6.06it/s]
100%|██████████| 24/24 [00:00<00:00, 291.37it/s]
100%|██████████| 24/24 [00:00<00:00, 637.64it/s]
100%|██████████| 24/24 [00:01<00:00, 12.97it/s]
100%|██████████| 24/24 [00:03<00:00,  6.92it/s]
100%|██████████| 24/24 [00:02<00:00,  9.28it/s]
100%|██████████| 24/24 [00:02<00:00,  8.56it/s]
100%|██████████| 24/24 [00:03<00:00,

### Create Benign and Malacious Data Files

In [6]:

benign_dates = ['2024-09-01', '2024-09-02']
columns_to_keep = ['ts', 'src_ip', 'dst_ip']

all_df = pd.DataFrame(columns=columns_to_keep)

for date_folder in os.listdir(output_dir):
    date_folder_path = os.path.join(output_dir, date_folder)
    
    if os.path.isdir(date_folder_path) and date_folder.startswith('2024-09'):
        for file_name in os.listdir(date_folder_path):
            file_path = os.path.join(date_folder_path, file_name)
            
            if file_name.endswith('.csv'):
                try:
                    df = pd.read_csv(file_path, usecols=columns_to_keep)
                except ValueError:
                    continue
                
                all_df = pd.concat([all_df, df], ignore_index=True)
                
                

In [7]:
all_df.dropna(inplace=True)

In [8]:
all_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3736358 entries, 0 to 3736357
Data columns (total 3 columns):
 #   Column  Dtype  
---  ------  -----  
 0   ts      float64
 1   src_ip  object 
 2   dst_ip  object 
dtypes: float64(1), object(2)
memory usage: 85.5+ MB


In [9]:
all_df.drop_duplicates(inplace=True)

In [10]:
all_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 793952 entries, 0 to 3736356
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   ts      793952 non-null  float64
 1   src_ip  793952 non-null  object 
 2   dst_ip  793952 non-null  object 
dtypes: float64(1), object(2)
memory usage: 24.2+ MB


In [11]:
all_df = all_df.sort_values(by='ts')


In [12]:
# Define MST timezone
mst = pytz.timezone('US/Mountain')

# Convert 'ts' column from Unix timestamp to MST date
all_df['ts'] = pd.to_datetime(all_df['ts'], unit='s').dt.tz_localize('UTC').dt.tz_convert(mst)


In [13]:
# Define the cutoff date
start_date = pd.Timestamp('2024-09-01', tz=mst)
end_date = pd.Timestamp('2024-09-04', tz=mst)

# Filter the DataFrames
benign_df = all_df[(all_df['ts'] >= start_date) & (all_df['ts'] <= end_date)]
malicious_df = all_df[all_df['ts'] >= end_date]

In [14]:
benign_df.head()

Unnamed: 0,ts,src_ip,dst_ip
50049,2024-09-01 00:00:21.112210944-06:00,192.168.57.9,224.0.0.251
50050,2024-09-01 00:00:21.114290944-06:00,fe80::33b2:2f9:4983:bf8e,ff02::fb
43179,2024-09-01 00:00:21.114968064-06:00,fe80::33b2:2f9:4983:bf8e,ff02::fb
43177,2024-09-01 00:00:21.114968064-06:00,192.168.57.9,224.0.0.251
43181,2024-09-01 00:00:21.377451008-06:00,192.168.57.9,224.0.0.251


In [15]:
benign_df.tail()

Unnamed: 0,ts,src_ip,dst_ip
1061093,2024-09-03 23:58:51.216803072-06:00,192.168.57.4,192.168.255.255
1046900,2024-09-03 23:59:31.021046016-06:00,192.168.57.9,224.0.0.251
1046901,2024-09-03 23:59:31.021765120-06:00,fe80::33b2:2f9:4983:bf8e,ff02::fb
1046902,2024-09-03 23:59:32.031281920-06:00,192.168.57.9,224.0.0.251
1046903,2024-09-03 23:59:32.031625984-06:00,fe80::33b2:2f9:4983:bf8e,ff02::fb


In [16]:
malicious_df

Unnamed: 0,ts,src_ip,dst_ip
1061245,2024-09-04 00:00:21.241051904-06:00,192.168.57.10,192.168.255.255
1061243,2024-09-04 00:00:29.579170816-06:00,192.168.57.9,224.0.0.251
1061244,2024-09-04 00:00:29.581031168-06:00,fe80::33b2:2f9:4983:bf8e,ff02::fb
1047336,2024-09-04 00:00:29.581422080-06:00,192.168.57.9,224.0.0.251
1047338,2024-09-04 00:00:29.581625088-06:00,fe80::33b2:2f9:4983:bf8e,ff02::fb
...,...,...,...
3558148,2024-09-07 17:59:33.521572096-06:00,fe80::33b2:2f9:4983:bf8e,ff02::fb
3558149,2024-09-07 17:59:33.787298048-06:00,192.168.57.9,224.0.0.251
3558150,2024-09-07 17:59:33.787736832-06:00,fe80::33b2:2f9:4983:bf8e,ff02::fb
3558151,2024-09-07 17:59:33.788310016-06:00,192.168.57.9,224.0.0.251


In [17]:
# Save the DataFrames to CSV
benign_df.to_csv(f'{output_dir_all}/benign.csv', index=False)
malicious_df.to_csv(f'{output_dir_all}/malicious.csv', index=False)