In [1]:
import csv
import os
from collections import defaultdict

In [2]:
# Directory holding every input file used by this notebook.
data_path = "data"

# Full paths of the flow log, the tag lookup table and the protocol-number
# table, each resolved against data_path.
log_file_path, lookup_file_path, protocol_numbers_file = (
    os.path.join(data_path, file_name)
    for file_name in ("flow_log.txt", "lookup.csv", "protocol-numbers.csv")
)

In [3]:
def load_log_list(log_file):
    """Read a whitespace-delimited log file into a list of records.

    Args:
        log_file: Path of the text file to read.

    Returns:
        A list with one entry per non-blank line of the file, in file order.
        Each entry is the list of whitespace-separated fields (strings) found
        on that line.
    """
    with open(log_file, "r") as f:
        # str.split() with no separator ignores leading/trailing whitespace
        # and returns [] for a blank line.
        records = (line.split() for line in f)
        # Drop the empty records produced by blank lines so that callers can
        # index fields by position without hitting an empty list.
        return [fields for fields in records if fields]

In [22]:
def load_lookup(lookup_file):
    """Load the (port, protocol) -> tag lookup table from a CSV file.

    The first row is treated as a header and skipped. Every later row is
    expected to hold at least three columns: port, protocol, tag.

    Args:
        lookup_file: Path of the CSV file to read.

    Returns:
        A dict mapping "<port>_<protocol>" to the tag, where the protocol is
        lower-cased and all three values have surrounding whitespace removed.
        If the same port/protocol pair appears more than once, the last row
        wins. An empty file yields an empty dict.
    """
    lookup = {}
    # newline="" is what the csv module asks for so that it can handle line
    # endings (including ones inside quoted fields) itself.
    with open(lookup_file, "r", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip the header row (tolerates an empty file)
        for row in reader:
            # csv.reader yields [] for blank lines; skip those and any other
            # row too short to carry port, protocol and tag.
            if len(row) < 3:
                continue
            port, pro, tag = (field.strip() for field in row[:3])
            # Lower-case the protocol so keys are case-insensitive with
            # respect to how the protocol is spelled in the CSV.
            lookup[f"{port}_{pro.lower()}"] = tag
    return lookup

In [21]:
def protocol_numbers_conversion(protocol_numbers):
    """Build a mapping from the first to the second column of a CSV file.

    The first row is treated as a header and skipped. Presumably the first
    column is the protocol number and the second its keyword (e.g. "6" ->
    "TCP") -- confirm against the protocol-numbers file in use.

    Args:
        protocol_numbers: Path of the CSV file to read.

    Returns:
        A dict mapping each row's first column to its second column, both as
        strings with surrounding whitespace removed. Rows with fewer than two
        columns (including blank lines) are ignored. An empty file yields an
        empty dict.
    """
    # newline="" lets the csv module parse quoted fields that span several
    # lines instead of having them split by universal-newline translation.
    with open(protocol_numbers, "r", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip the header row (tolerates an empty file)
        return {
            row[0].strip(): row[1].strip()
            for row in reader
            if len(row) >= 2
        }

In [23]:
# Parse the flow log: one list of whitespace-separated string fields per line.
flow_log = load_log_list(log_file_path)

In [24]:
# Despite its name, lookup_file is a dict keyed by "<port>_<protocol>" whose
# values are tags, built from the lookup CSV.
lookup_file = load_lookup(lookup_file_path)

In [25]:
lookup_file

{'25_tcp': 'sv_P1',
 '68_udp': 'sv_P2',
 '23_tcp': 'sv_P1',
 '31_udp': 'SV_P3',
 '443_tcp': 'sv_P2',
 '22_tcp': 'sv_P4',
 '3389_tcp': 'sv_P5',
 '0_icmp': 'sv_P5',
 '110_tcp': 'email',
 '993_tcp': 'email',
 '143_tcp': 'email'}

In [26]:
# Dict from the first CSV column to the second of the protocol-numbers file
# (presumably protocol number -> keyword; verify against the data file).
protocol_dict = protocol_numbers_conversion(protocol_numbers_file)

In [None]:
protocol_dict

In [9]:
# # dstport: 6th index
# # protocol: 7th index
# tag_counts = defaultdict(int)
# port_protocol_counts = defaultdict(int)
# port_protocol_output = []
# for log_record in flow_log:
#     dstport = log_record[6]
#     protocol_num = log_record[7]
#     protocol = protocol_dict[protocol_num]
#     flag = False
#     port_protocol_counts[str(dstport)+"_"+str(protocol)]+=1
#     for dstp,pro,tag in lookup_file:
#         if dstport == dstp and protocol.lower() == pro.lower():
#             tag_counts[tag] +=1
#             flag = True
#     if not flag:
#         tag_counts["Untagged"] +=1

# for key,value in port_protocol_counts.items():
#     port,protocol = key.split("_")
#     port_protocol_output.append([int(port),protocol,value])

In [33]:
# Count, over every flow-log record, (a) how many records map to each tag and
# (b) how many records were seen for each destination-port/protocol pair.
#
# Field positions within a record:
#   dstport:  index 6
#   protocol: index 7 (a protocol number, translated through protocol_dict)
tag_counts = defaultdict(int)
port_protocol_counts = defaultdict(int)
for log_record in flow_log:
    # Skip records that cannot be interpreted: too few fields to index, or a
    # destination port that is not a plain number (such as a placeholder
    # value), which int() below could not convert.
    if len(log_record) < 8 or not log_record[6].isdecimal():
        continue
    dstport = log_record[6]
    protocol_num = log_record[7]
    # A protocol number missing from the table falls back to the raw number,
    # so the record is still counted instead of raising KeyError.
    protocol = protocol_dict.get(protocol_num, protocol_num).lower()
    key = f"{dstport}_{protocol}"
    port_protocol_counts[key] += 1
    # Records with no lookup entry are accumulated under "Untagged".
    tag_counts[lookup_file.get(key, "Untagged")] += 1

# Flatten the counts into [port, protocol, count] rows for output.
port_protocol_output = []
for key, value in port_protocol_counts.items():
    # Split on the first underscore only: the port is purely numeric, so any
    # further underscores belong to the protocol name.
    port, protocol = key.split("_", 1)
    port_protocol_output.append([int(port), protocol, value])

In [31]:
tag_counts

defaultdict(int, {'Untagged': 8, 'sv_P2': 1, 'sv_P1': 2, 'email': 3})

In [34]:
port_protocol_output

[[49153, 'tcp', 1],
 [49154, 'tcp', 1],
 [49155, 'tcp', 1],
 [49156, 'tcp', 1],
 [49157, 'tcp', 1],
 [49158, 'tcp', 1],
 [80, 'tcp', 1],
 [1024, 'tcp', 1],
 [443, 'tcp', 1],
 [23, 'tcp', 1],
 [25, 'tcp', 1],
 [110, 'tcp', 1],
 [993, 'tcp', 1],
 [143, 'tcp', 1]]

In [35]:
def data_to_output(filename,tag_output,port_protocol_output):
    """Write both count reports, one after the other, to a single text file.

    The file contains a "Tag Counts" section followed by a blank line and a
    "Port/Protocol Combination Counts" section. Each section has a title
    line, a CSV header line and one CSV row per entry.

    Args:
        filename: Path of the file to create or overwrite.
        tag_output: Mapping of tag -> count; written in iteration order.
        port_protocol_output: Iterable of rows (e.g. [port, protocol, count]);
            each row's values are converted with str() and written in order.

    Returns:
        None.
    """
    with open(filename, 'w') as file:
        # csv.writer quotes any value containing a comma, quote or newline,
        # so such values cannot corrupt the column layout. lineterminator is
        # "\n" to match the plain write() calls used for the title lines.
        writer = csv.writer(file, lineterminator="\n")

        # Section 1: tags and their counts
        file.write("Tag Counts: \n")
        writer.writerow(["Tag", "Count"])
        for tag, count in tag_output.items():
            writer.writerow([str(tag), str(count)])

        file.write("\n")

        # Section 2: port, protocol and their counts
        file.write("Port/Protocol Combination Counts:\n")
        writer.writerow(["Port", "Protocol", "Count"])
        for row in port_protocol_output:
            writer.writerow([str(value) for value in row])

In [36]:
data_to_output("output.txt",tag_counts,port_protocol_output)