### Import necessary libraries

In [None]:
import os
import re
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from collections import defaultdict
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

### Understanding data file

In [None]:
# Peek at the raw log: it is tab-delimited and carries no header row, so the
# columns come back numbered 0..N-1 until they are named later on.
df = pd.read_csv("PaceIT_1.txt", delimiter="\t", header=None)
print(df.shape)
df.head()

### Analysis

In [None]:
# Define LogEntryAnalysis class

class LogEntryAnalysis:
    """Load a tab-separated log file and summarize it with plots and listings.

    The file is expected to have no header row and exactly eight columns,
    which are named as in ``_COLUMNS``. Call ``parse_file()`` once after
    construction; the analysis methods rely on the columns it derives.

    Attributes set on the instance:
        filename: path of the log file that was loaded.
        dataframe: the pandas DataFrame holding the log entries.
    """

    # Names given to the eight header-less columns of the log file.
    _COLUMNS = ["ID", "Timestamp", "Sender_Details", "Receiver_Details",
                "Priority", "Category", "MessageCode", "Info"]

    # Layout of the Timestamp column, e.g. "Mon Jan 01 10:00:00 2024".
    _TIMESTAMP_FORMAT = '%a %b %d %H:%M:%S %Y'

    # Shared layout of the sender / receiver detail fields.
    _ENDPOINT_PATTERN = r"CID: (\w+) IP: (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) SITE: (\w+)"

    # A message code is corrupted when it contains anything but 0 and 1.
    _CORRUPTED_PATTERN = r'[^01]'

    # Fixed x-axis order so weekday plots read chronologically.
    _WEEKDAYS = ["Monday", "Tuesday", "Wednesday", "Thursday",
                 "Friday", "Saturday", "Sunday"]

    def __init__(self, filename):
        """Read ``filename`` into ``self.dataframe`` and name its columns.

        Raises:
            ValueError: if ``filename`` does not exist.
        """
        self.filename = filename
        if not os.path.exists(self.filename):
            raise ValueError(f"Error: {self.filename} does not exist.")

        # Column 6 (MessageCode) is forced to text: with dtype inference a file
        # whose codes are all binary digits would be parsed as integers, which
        # strips leading zeros and breaks the later ``.str`` based checks.
        self.dataframe = pd.read_csv(filename, sep="\t", header=None, dtype={6: str})
        self.dataframe.columns = self._COLUMNS

    def parse_file(self):
        """Derive date, time and endpoint columns, then drop the raw detail columns.

        Adds ``Parsed_Date`` (timestamp truncated to midnight), ``Parsed_Time``
        and, for both Sender and Receiver, ``*_CID``, ``*_IP`` and ``*_SITE``.
        Detail fields that do not match the expected layout yield missing values.
        """
        # Parse the timestamp once and derive both parts from the same result.
        timestamps = pd.to_datetime(self.dataframe["Timestamp"], format=self._TIMESTAMP_FORMAT)
        self.dataframe["Parsed_Date"] = timestamps.dt.normalize()
        self.dataframe["Parsed_Time"] = timestamps.dt.time

        for role in ("Sender", "Receiver"):
            targets = [f"{role}_CID", f"{role}_IP", f"{role}_SITE"]
            self.dataframe[targets] = self.dataframe[f"{role}_Details"].str.extract(self._ENDPOINT_PATTERN)

        # Dropping original sender and receiver details columns
        self.dataframe.drop(columns=["Sender_Details", "Receiver_Details"], inplace=True)

    def _show_countplot(self, data, x, palette, title, *, xlabel=None,
                        ylabel='Number of Logs', order=None,
                        rotate_ticks=True, grid=False):
        """Draw and show one count plot with the styling shared by all charts."""
        plt.figure(figsize=(10, 6))
        sns.countplot(data=data, x=x, palette=palette, order=order)
        plt.title(title)
        if xlabel is not None:
            plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        if rotate_ticks:
            plt.xticks(rotation=45)
        if grid:
            plt.grid(axis='y')
        plt.tight_layout()
        plt.show()

    def _show_weekday_countplot(self, data, palette, title, ylabel):
        """Plot how many rows of ``data`` fall on each day of the week."""
        # day_name() yields English names regardless of locale, so the values
        # always line up with the fixed Monday..Sunday order.
        self._show_countplot(
            data, data["Parsed_Date"].dt.day_name(), palette, title,
            xlabel="Day of the Week", ylabel=ylabel,
            order=self._WEEKDAYS, rotate_ticks=False, grid=True,
        )

    def _corrupted_messages(self):
        """Return the rows whose MessageCode contains a character other than 0 or 1.

        Rows with a missing MessageCode are not treated as corrupted.
        """
        is_corrupted = self.dataframe['MessageCode'].str.contains(self._CORRUPTED_PATTERN, na=False)
        return self.dataframe[is_corrupted]

    def _cids_by_site(self, role):
        """Map each ``<role>_SITE`` to the sorted, distinct ``<role>_CID`` values seen there.

        ``role`` is "Sender" or "Receiver". Rows where the site or CID is
        missing are skipped; sites keep their order of first appearance.
        """
        site_col, cid_col = f"{role}_SITE", f"{role}_CID"
        known = self.dataframe[[site_col, cid_col]].dropna()
        grouped = defaultdict(set)
        for site, cid in zip(known[site_col], known[cid_col]):
            grouped[site].add(cid)
        return {site: sorted(cids) for site, cids in grouped.items()}

    def DTGAnalysis(self):
        """Plot the number of log entries per day of the week."""
        self._show_weekday_countplot(
            self.dataframe, "Blues",
            "Frequency distribution of logs based on day",
            "Number of Logs",
        )

    def ComputerAnalysis(self):
        """Plot log counts by sender site, receiver site and sender IP prefix.

        Adds an ``IP_Prefix`` column (first octet of ``Sender_IP``) to the
        dataframe as a side effect.
        """
        # Sender SITE visualization
        self._show_countplot(self.dataframe, "Sender_SITE", "Greens",
                             'Distribution by Sender Computer Location (SITE)')

        # Receiver SITE visualization
        self._show_countplot(self.dataframe, "Receiver_SITE", "Reds",
                             'Distribution by Receiver Computer Location')

        # IP Prefix Visualization
        self.dataframe['IP_Prefix'] = self.dataframe['Sender_IP'].str.split('.').str[0]
        self._show_countplot(self.dataframe, "IP_Prefix", "Oranges",
                             'Frequency of Transmission by Computer IP Prefix')

    def MessageAnalysis(self):
        """Plot message statistics and print CID listings and corrupted log IDs."""
        corrupted = self._corrupted_messages()

        # Message Priority
        self._show_countplot(self.dataframe, "Priority", "Greys",
                             'Frequency of Messages by Message Priority')

        # Corrupted Messages
        self._show_weekday_countplot(
            corrupted, "Purples",
            'Frequency distribution of Corrupted Messages based on day',
            "Number of Corrupted Messages",
        )

        # Frequency of Message Sender or Receiver, most frequent site first
        self._show_countplot(self.dataframe, "Sender_SITE", "Blues",
                             'Frequency of Message Sender Location',
                             order=self.dataframe['Sender_SITE'].value_counts().index)
        self._show_countplot(self.dataframe, "Receiver_SITE", "Greens",
                             'Frequency of Message Receiver Location',
                             order=self.dataframe['Receiver_SITE'].value_counts().index)

        # Frequency of Message Category
        self._show_countplot(self.dataframe, "Category", "Reds",
                             'Frequency of Messages by Message Category')

        # List of Computer IDs by Computer Location
        print("List of Sender Computer IDs by Computer Location:")
        for site, cids in self._cids_by_site("Sender").items():
            print(f"{site}: {', '.join(cids)}")

        print("\nList of Receiver Computer IDs by Computer Location:")
        for site, cids in self._cids_by_site("Receiver").items():
            print(f"{site}: {', '.join(cids)}")

        # Identification of corrupted messages by LogEntry Number
        corrupted_log_ids = corrupted['ID'].tolist()
        print("\nCorrupted Message Log IDs:", corrupted_log_ids)


In [None]:
def main():
    """Load the log file, parse it, and run each analysis stage in turn."""
    # The file path
    log_path = "PaceIT_1.txt"

    # Build the analyzer and structure the raw data before any analysis.
    analysis = LogEntryAnalysis(log_path)
    analysis.parse_file()

    # Each stage is announced with its banner and then executed, in order.
    stages = (
        ("Performing DTG Analysis...\n", analysis.DTGAnalysis),
        ("Performing Computer Analysis...\n", analysis.ComputerAnalysis),
        ("Performing Message Analysis...\n", analysis.MessageAnalysis),
    )
    for banner, run_stage in stages:
        print(banner)
        run_stage()

if __name__ == "__main__":
    main()
