In [9]:
import logging
import editdistance
from collections import defaultdict
from tqdm import tqdm

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define paths
log_file_path = "C:/Users/dcave/Documents/DIS_project/DIS_project/data/server_log.txt"
part1_output_path = 'C:/Users/dcave/Documents/DIS_project/part1Output.txt'
part1_observations_path = 'C:/Users/dcave/Documents/DIS_project/part1Observations.txt'
part2_observations_path = 'C:/Users/dcave/Documents/DIS_project/part2Observations.txt'

logging.info("Starting to read the log file")

# Read log file into a list of dictionaries (first 1000 lines)
log_entries = []
with open(log_file_path, 'r') as file:
    for i, line in enumerate(file):
        if i >= 200000:
            break
        from_server, to_server, time, action, process_id = line.strip().split(',')
        log_entries.append({
            "from_server": from_server,
            "to_server": to_server,
            "time": int(time),
            "action": action.strip(),
            "process_id": int(process_id)
        })

logging.info("Completed reading and parsing the log file")

# Group by process_id and collect log entries
processes = defaultdict(list)
for entry in log_entries:
    processes[entry["process_id"]].append((entry["from_server"], entry["to_server"], entry["time"], entry["action"]))

logging.info("Grouped by process_id and collected log entries")

# Convert events to server sequences ignoring timestamps
def concat_events(events):
    return ["{}_{}".format(event[0], event[1]) for event in events]

process_events_dict = {pid: (concat_events(events), events) for pid, events in processes.items()}

# Function to calculate edit distance using the editdistance package
def calculate_edit_distance(seq1, seq2):
    return editdistance.eval(seq1, seq2)

# Generate candidate pairs for part 2 (edit distance < 4)
candidate_pairs_part2 = []
for pid1 in tqdm(process_events_dict):
    for pid2 in process_events_dict:
        if pid1 < pid2:
            seq1 = process_events_dict[pid1][0]
            seq2 = process_events_dict[pid2][0]
            if calculate_edit_distance(seq1, seq2) < 4:
                candidate_pairs_part2.append((pid1, pid2))

logging.info("Filtered candidates for part 2 based on edit distance < 4")

# Further filter the part 2 candidates for part 1 (edit distance < 2)
candidate_pairs_part1 = [pair for pair in candidate_pairs_part2 if calculate_edit_distance(process_events_dict[pair[0]][0], process_events_dict[pair[1]][0]) < 2]

logging.info("Filtered candidates for part 1 based on edit distance < 2")

# Group similar pairs to form clusters
def merge_clusters(clusters):
    merged_groups = []
    seen = set()
    for key, group in clusters.items():
        if key not in seen:
            merged_group = {key} | set(group)
            to_merge = [g for g in merged_groups if g & merged_group]
            for g in to_merge:
                merged_group |= g
                merged_groups.remove(g)
            merged_groups.append(merged_group)
            seen.update(merged_group)
    return merged_groups

def form_clusters(candidate_pairs):
    clusters_dict = defaultdict(set)
    for k, v in candidate_pairs:
        clusters_dict[k].add(v)
        clusters_dict[v].add(k)
    return merge_clusters(clusters_dict)

merged_groups_part1 = form_clusters(candidate_pairs_part1)
merged_groups_part2 = form_clusters(candidate_pairs_part2)

# Log number of clusters
num_clusters_part1 = len(merged_groups_part1)
num_clusters_part2 = len(merged_groups_part2)
logging.info(f"Number of clusters for part 1: {num_clusters_part1}")
logging.info(f"Number of clusters for part 2: {num_clusters_part2}")

# Calculate and log average cluster size
average_cluster_size_part1 = sum(len(group) for group in merged_groups_part1) / num_clusters_part1
average_cluster_size_part2 = sum(len(group) for group in merged_groups_part2) / num_clusters_part2
logging.info(f"Average cluster size for part 1: {average_cluster_size_part1}")
logging.info(f"Average cluster size for part 2: {average_cluster_size_part2}")

logging.info("Merged similar groups into clusters")

# Function to get the representative process
def get_representative_process(group):
    min_total_distance = float('inf')
    representative_process = None
    for pid1 in group:
        total_distance = 0
        for pid2 in group:
            if pid1 != pid2:
                total_distance += calculate_edit_distance(process_events_dict[pid1][0], process_events_dict[pid2][0])
        if total_distance < min_total_distance:
            min_total_distance = total_distance
            representative_process = pid1
    return representative_process

# Generate part1Output.txt
def generate_part1_output(merged_groups, output_file_path):
    with open(output_file_path, 'w') as file:
        new_id = 1
        for group in merged_groups:
            representative_process = get_representative_process(group)
            merged_pids = ",".join(map(str, sorted(group)))
            file.write(f"{new_id}:{merged_pids}\n")
            _, events = process_events_dict[representative_process]
            for event in events:
                file.write(f"<{event[0]}, {event[1]}, {event[2]}, {event[3]}, {representative_process}>\n")
            new_id += 1
    logging.info(f"Generated part1Output.txt: {output_file_path}")

generate_part1_output(merged_groups_part1, part1_output_path)

# Generate part1Observations.txt
def generate_part1_observations(merged_groups, output_file_path):
    with open(output_file_path, 'w') as file:
        for group in merged_groups:
            file.write(f"Group: {sorted(group)}\n")
            for pid in sorted(group):
                file.write(f"{pid}:\n")
                _, events = process_events_dict[pid]
                for event in events:
                    file.write(f"<{event[0]}, {event[1]}, {event[2]}, {event[3]}, {pid}>\n")
            file.write("\n")
    logging.info(f"Generated part1Observations.txt: {output_file_path}")

generate_part1_observations(merged_groups_part1, part1_observations_path)

# Generate part2Observations.txt
def generate_part2_observations(merged_groups, output_file_path):
    with open(output_file_path, 'w') as file:
        for group in merged_groups:
            file.write(f"Group: {sorted(group)}\n")
            for pid in sorted(group):
                file.write(f"{pid}:\n")
                _, events = process_events_dict[pid]
                for event in events:
                    file.write(f"<{event[0]}, {event[1]}, {event[2]}, {event[3]}, {pid}>\n")
            file.write("\n")
    logging.info(f"Generated part2Observations.txt: {output_file_path}")

generate_part2_observations(merged_groups_part2, part2_observations_path)

logging.info("Process completed.")



2024-06-29 11:45:05,292 - INFO - Starting to read the log file
2024-06-29 11:45:05,482 - INFO - Completed reading and parsing the log file
2024-06-29 11:45:05,535 - INFO - Grouped by process_id and collected log entries
100%|████████████████████████████████████████████████████████████████████████████| 20931/20931 [04:05<00:00, 85.19it/s]
2024-06-29 11:49:11,530 - INFO - Filtered candidates for part 2 based on edit distance < 4
2024-06-29 11:49:32,768 - INFO - Filtered candidates for part 1 based on edit distance < 2
2024-06-29 11:49:42,041 - INFO - Number of clusters for part 1: 80
2024-06-29 11:49:42,042 - INFO - Number of clusters for part 2: 7
2024-06-29 11:49:42,042 - INFO - Average cluster size for part 1: 261.4875
2024-06-29 11:49:42,043 - INFO - Average cluster size for part 2: 2990.0
2024-06-29 11:49:42,043 - INFO - Merged similar groups into clusters
2024-06-29 11:49:54,393 - INFO - Generated part1Output.txt: C:/Users/dcave/Documents/DIS_project/part1Output.txt
2024-06-29 11:4