In [2]:
import pandas as pd
import glob

# Path to the folder containing .labeled files
file_pattern = '/home/charlie/Desktop/Test/*.labeled'

# List to store data from all files
dataframes = []

# Loop through each .labeled file
for file_path in glob.glob(file_pattern):
    print(f"Processing file: {file_path}")

    # Step 1: Read the file as lines and extract metadata
    with open(file_path, 'r') as f:
        raw_lines = f.readlines()

    # Step 2: Extract the #fields line for headers
    headers_line = next(line for line in raw_lines if line.startswith('#fields'))
    columns = headers_line.strip().split('\t')[1:]  # Extract column names

    # Step 3: Filter out metadata lines (starting with '#') and keep data rows
    data_lines = [line for line in raw_lines if not line.startswith('#')]

    # Step 4: Parse the data into a DataFrame
    data = pd.DataFrame([line.strip().split('\t') for line in data_lines], columns=columns)

    # Step 5: Clean up the combined column
    # Split the last combined column into three separate columns
    data[['tunnel_parents', 'label', 'detailed_label']] = data['tunnel_parents   label   detailed-label'].str.split('   ', expand=True)

    # Drop the original combined column
    data.drop(columns=['tunnel_parents   label   detailed-label'], inplace=True)

    # Append the cleaned DataFrame to the list
    dataframes.append(data)

# Step 6: Combine all data into a single DataFrame
combined_data = pd.concat(dataframes, ignore_index=True)

# Step 7: Save the combined DataFrame to a CSV file
output_path = "/home/charlie/Desktop/cleaned_master_data.csv"
combined_data.to_csv(output_path, index=False)

# Display a message to confirm completion
print(f"Cleaned data saved to {output_path}")

# Optional: Display the first few rows of the combined data
print(combined_data.head())


Processing file: /home/charlie/Desktop/Test/fec8f513f0f141ceafeb8a70add2be9e_conn.log.labeled
Processing file: /home/charlie/Desktop/Test/8e52afc9b42645659a0b961df6992a0d_conn.log.labeled
Processing file: /home/charlie/Desktop/Test/57fcf1602b8445bfad2c2555e99a0ca3_conn.log.labeled
Cleaned data saved to /home/charlie/Desktop/cleaned_master_data.csv
                  ts                 uid                  id.orig_h id.orig_p  \
0  1562159819.172195  CUg3u41D8SwgQw1Job                    0.0.0.0        68   
1  1562159849.173340  CUpk9Y381SJuOHyK2d            255.255.255.255        68   
2  1562159953.959669  CC5CQA3Ptzma7a0by4   fe80::5bcc:698e:39d5:cdf      5353   
3  1562159998.302954   C3sgcmfvNzjNoY0Kd   fe80::5bcc:698e:39d5:cdf      5353   
4  1562160024.472592   CJVaCgoBTEsu0jjDi  fe80::4eef:c0ff:fe27:561e      5353   

         id.resp_h id.resp_p proto service   duration orig_bytes  ...  \
0  255.255.255.255        67   udp    dhcp  30.004642       8768  ...   
1      192.168.1.

In [3]:
for i, df in enumerate(dataframes):
    print(f"DataFrame {i+1}:")
    display(df.head())

DataFrame 1:


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,label,detailed_label
0,1562159819.172195,CUg3u41D8SwgQw1Job,0.0.0.0,68,255.255.255.255,67,udp,dhcp,30.004642,8768,...,-,0,D,16,9216,0,0,-,benign,-
1,1562159849.17334,CUpk9Y381SJuOHyK2d,255.255.255.255,68,192.168.1.1,67,udp,dhcp,0.004564,0,...,-,0,^d,0,0,13,4264,-,benign,-
2,1562159953.959669,CC5CQA3Ptzma7a0by4,fe80::5bcc:698e:39d5:cdf,5353,ff02::fb,5353,udp,dns,3.948539,876,...,-,0,D,6,1164,0,0,-,benign,-
3,1562159998.302954,C3sgcmfvNzjNoY0Kd,fe80::5bcc:698e:39d5:cdf,5353,ff02::fb,5353,udp,dns,3.768179,876,...,-,0,D,6,1164,0,0,-,benign,-
4,1562160024.472592,CJVaCgoBTEsu0jjDi,fe80::4eef:c0ff:fe27:561e,5353,ff02::fb,5353,udp,dns,0.000114,451,...,-,0,D,11,979,0,0,-,benign,-


DataFrame 2:


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,label,detailed_label
0,1547150789.067208,CzsY0D4B96NTr8m7ld,192.168.1.199,59222,46.101.251.172,80,tcp,http,1.686784,149,...,-,11584,ShADadttfF,122,7741,122,178102,-,Malicious,C&C-FileDownload
1,1547150790.793723,CGEJbl3RNkmXzmkEd4,192.168.1.199,59224,46.101.251.172,80,tcp,-,3.081233,0,...,-,0,S,3,180,0,0,-,Malicious,C&C
2,1547150797.954878,CVMYDw4wnZfqWrOfd,192.168.1.199,59224,46.101.251.172,80,tcp,-,-,-,...,-,0,S,1,60,0,0,-,Malicious,C&C
3,1547150806.194937,CXKZmpx40udvITEN2,192.168.1.199,59224,46.101.251.172,80,tcp,-,-,-,...,-,0,S,1,60,0,0,-,Malicious,C&C
4,1547150822.835187,CBjcNy4pOh8Xg1H4S,192.168.1.199,59224,46.101.251.172,80,tcp,http,1.847943,149,...,-,4344,ShADadttfF,102,6485,121,178178,-,Malicious,C&C-FileDownload


DataFrame 3:


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,label,detailed_label
0,1540469302.53864,CGm6jB4dXK71ZDWUDh,192.168.1.132,58687,216.239.35.4,123,udp,-,0.114184,48,...,-,0,Dd,1,76,1,76,-,benign,-
1,1540469197.400159,CnaDAG3n5r8eiG4su2,192.168.1.132,1900,239.255.255.250,1900,udp,-,160.367579,7536,...,-,0,D,24,8208,0,0,-,benign,-
2,1540469385.734089,CUrxU238nt0m6yTgKf,192.168.1.132,32893,216.239.35.8,123,udp,-,0.016986,48,...,-,0,Dd,1,76,1,76,-,benign,-
3,1540469831.302625,CGQf8t1kjdxB5PHXL4,192.168.1.132,53395,2.16.60.82,443,tcp,-,0.003497,0,...,-,0,ShAFf,5,212,3,144,-,benign,-
4,1540469831.265405,CUo9DH2QDnCaBIGjkg,192.168.1.132,52801,192.168.1.1,53,udp,dns,0.036724,34,...,-,0,Dd,1,62,1,339,-,benign,-
