
# Lab 9: Build a Log Aggregator

In this lab, you will create your own log generator, build a command-line utility that scans log files, summarizes their contents, and provides insight into system behavior. Data structures to track log message levels such as `INFO`, `WARNING`, `ERROR`, and `CRITICAL`.

This lab reinforces:
- File I/O
- Pattern recognition (regex)
- Dictionaries and counters
- Functions and modularity
- CLI arguments, logging



## Part 1: Create Log files (20%)
Using the the following example log format below create a **python file** that will log errors In a structured tree format 

You will find examples in the folder called Logs that you can use to build your program.

Remember set of logs should have a varied levels of log entries (`INFO`, `WARNING`, `ERROR`, `CRITICAL`) and tailored message types for different service components.
You must create 5 structured logs here are some examples:

    sqldb
    ui
    frontend.js
    backend.js
    frontend.flask
    backend.flask

You may use chat GPT to create sample outputs NOT THE LOGS. IE:

    System failure
    Database corruption
    Disk failure detected
    Database corruption


In [7]:
import sys
import os

def log_level_count(log_file):

    """Counts occurences"""

    level_count = {}

    try:
        with open(log_file, 'r') as opened_file:

            for line in opened_file:
                top_line = line.upper()
                
                if "INFO:" in top_line:
                    if "INFO" not in level_count:
                        level_count["INFO"] = 0
                    level_count["INFO"] += 1

                elif "WARNING:" in top_line:
                    if "WARNING" not in level_count:
                        level_count["WARNING"] = 0
                    level_count["WARNING", 0] += 1

                elif "ERROR:" in top_line:
                    if "ERROR" not in level_count:
                        level_count["ERROR"] = 0
                    level_count["ERROR", 0] += 1

                elif "CRITICAL:" in top_line:
                    if "CRITICAL" not in level_count:
                        level_count["CRITICAL"] = 0
                    level_count["CRITICAL", 0] += 1

    except FileNotFoundError:
        print(f"File not found: {log_file}")
        return {}
    
    except Exception as e:
        print(f"An error occurred when trying to the read the log files: {e}")
        return {}
    
    return level_count

def extract_messages(log_file):

    """Extracts messages"""

    messages = []

    try:
        with open(log_file, 'r') as opened_file:
            for each_line in opened_file:

                log_levels = ["INFO:", "WARNING:", "ERROR:", "CRITICAL:"]

                if "INFO:" in each_line or "WARNING:" in each_line or "ERROR:" in each_line or "CRITICAL:" in each_line:
                    
                    colon_position = each_line.find(":")

                    if colon_position != -1:
                        message = each_line[colon_position + 1:].strip()
                        messages.append(message)

    except FileNotFoundError:
        print(f"File not found: {log_file}")
        return []

    except Exception as e:
        print(f"An error occurred when trying to the read the log files: {e}")
        return []
    
    return messages

def summarize_logs(log_file):

    """Got help for the messages output format"""

    levels = log_level_count(log_file)
    messages = extract_messages(log_file)

    if not levels and not messages:
        print("No log data found or error reading file.")
        return

    print("Log Summary:")
    print("-------------")
    if levels:
        print("Log Level Counts:")
        for level_name, count_value in levels.items():
            print(f"{level_name}: {count_value}")
    else:
        print("No log levels found.")

    if messages:
        print("Sample Log Messages:")
        print("---------------")

        for message_index, the_message in enumerate(messages[:5]):
            print(f"  [{message_index+1}] {the_message}")

        if len(messages) > 5:
            print("  ...")
        print(f"Total Messages: {len(messages)}")

    else:
        print("No log messages found.")

def main():

    script_directory = os.getcwd()
    log_directory = os.path.join(script_directory, "Logs")

    if not os.path.exists(log_directory) or not os.path.isdir(log_directory):
        print(f"Error: Directory '{log_directory}' not found.")
        return

    log_files_to_analyze = [
        r"C:\Users\Shanto\Documents\GitHub\homeworkfolder-shant-bekverdyan\Labs\Lab9\Logs\my_app_utils_db.log",
        r"C:\Users\Shanto\Documents\GitHub\homeworkfolder-shant-bekverdyan\Labs\Lab9\Logs\my_app_utils.log",
        r"C:\Users\Shanto\Documents\GitHub\homeworkfolder-shant-bekverdyan\Labs\Lab9\Logs\my_app.log",
        r"C:\Users\Shanto\Documents\GitHub\homeworkfolder-shant-bekverdyan\Labs\Lab9\Logs\RSVP_Agent_processing.log",
        r"C:\Users\Shanto\Documents\GitHub\homeworkfolder-shant-bekverdyan\Labs\Lab9\Logs\systemd_core_performance.log",
        r"C:\Users\Shanto\Documents\GitHub\homeworkfolder-shant-bekverdyan\Labs\Lab9\Logs\systemd_core.log",
        r"C:\Users\Shanto\Documents\GitHub\homeworkfolder-shant-bekverdyan\Labs\Lab9\Logs\systemd.log",
    ]

    for log_file_name in log_files_to_analyze:

        log_file_path = os.path.join(log_directory, log_file_name)
        print(f"Debugging: Trying to access file at path: '{log_file_path}'")

        if os.path.exists(log_file_path):
            summarize_logs(log_file_path)

        else:
            print(f"Error: Log file '{log_file_path}' not found.")

if __name__ == "__main__":
    main()


Debugging: Trying to access file at path: 'C:\Users\Shanto\Documents\GitHub\homeworkfolder-shant-bekverdyan\Labs\Lab9\Logs\my_app_utils_db.log'
Error: Log file 'C:\Users\Shanto\Documents\GitHub\homeworkfolder-shant-bekverdyan\Labs\Lab9\Logs\my_app_utils_db.log' not found.
Debugging: Trying to access file at path: 'C:\Users\Shanto\Documents\GitHub\homeworkfolder-shant-bekverdyan\Labs\Lab9\Logs\my_app_utils.log'
Error: Log file 'C:\Users\Shanto\Documents\GitHub\homeworkfolder-shant-bekverdyan\Labs\Lab9\Logs\my_app_utils.log' not found.
Debugging: Trying to access file at path: 'C:\Users\Shanto\Documents\GitHub\homeworkfolder-shant-bekverdyan\Labs\Lab9\Logs\my_app.log'
Error: Log file 'C:\Users\Shanto\Documents\GitHub\homeworkfolder-shant-bekverdyan\Labs\Lab9\Logs\my_app.log' not found.
Debugging: Trying to access file at path: 'C:\Users\Shanto\Documents\GitHub\homeworkfolder-shant-bekverdyan\Labs\Lab9\Logs\RSVP_Agent_processing.log'
Error: Log file 'C:\Users\Shanto\Documents\GitHub\homew


### Example Log Format

You will work with logs that follow this simplified structure:

```
2025-04-11 23:20:36,913 | my_app | INFO | Request completed
2025-04-11 23:20:36,914 | my_app.utils | ERROR | Unhandled exception
2025-04-11 23:20:36,914 | my_app.utils.db | CRITICAL | Disk failure detected
```


In [8]:
def summarize_logs(log_file):

    """Got help for the messages output format"""

    levels = log_level_count(log_file)
    messages = extract_messages(log_file)

    if not levels and not messages:
        print("No log data found or error reading file.")
        return

    print("\nLog Summary:")
    print("-------------")
    if levels:
        print("\nLog Level Counts:")
        for level_name, count_value in levels.items():
            print(f"{level_name}: {count_value}")
    else:
        print("\nNo log levels found.")

    if messages:
        print("\nSample Log Messages:")
        print("---------------")

        for message_index, the_message in enumerate(messages[:5]):
            print(f"  [{message_index+1}] {the_message}")

        if len(messages) > 5:
            print("  ...")
        print(f"\nTotal Messages: {len(messages)}")

    else:
        print("\nNo log messages found.")

def main():

    if len(sys.argv) != 2:
        print("Usage: python log_analyzer.py <log_file>")
        return

    log_file_name = sys.argv[1]

    if not os.path.exists(log_file_name):
        print(f"Error: The file '{log_file_name}' does not exist.")
        return

    summarize_logs(log_file_name)

if __name__ == "__main__":
    main()


Error: The file '--f=c:\Users\Shanto\AppData\Roaming\jupyter\runtime\kernel-v3cd7d6d5abe9afbcb8b375ced9527e9bdeb0b1ad4.json' does not exist.


## Part 2: Logging the Log File (40%)
    New File
### Part 2a: Read the Log File (see lab 7) (10%)


Write a function to read the contents of a log file into a list of lines. Handle file errors gracefully.

### Part 2b: Parse Log Lines (see code below if you get stuck) (10%)

Use a regular expression to extract:
- Timestamp
- Log name
- Log level
- Message

### Part 2c: Count Log Levels (20%)

Create a function to count how many times each log level appears. Store the results in a dictionary. Then output it as a Json File
You may pick your own format but here is an example. 
```python
{
    "INFO": 
    {
        "Request completed": 42, 
        "Heartbeat OK": 7
    }

    "WARNING":
    {
        ...
    }
}

```


In [9]:
import sys
import os
import json

def process_log(log_file_path, output_file="log_counts.json"):

    level_counts = {}

    try:
        with open(log_file_path, 'r') as f:

            for line in f:
                parts = line.split(" | ")

                if len(parts) >= 4:
                    log_level = parts[2].strip().upper()
                    message = parts[3].strip()

                    if log_level in ["INFO", "WARNING", "ERROR", "CRITICAL", "DEBUG"]:
                        if log_level not in level_counts:
                            level_counts[log_level] = {}

                        if message not in level_counts[log_level]:
                            level_counts[log_level][message] = 0

                        level_counts[log_level][message] += 1

    except FileNotFoundError:
        print(f"Error: Log file not found at '{log_file_path}'")
        return
    
    except Exception as e:
        print(f"Error reading log file '{log_file_path}': {e}")
        return

    try:
        with open(output_file, 'w') as outfile:
            json.dump(level_counts, outfile, indent=4)
        print(f"Log level counts have been written to '{output_file}'")

    except Exception as e:
        print(f"Error writing to JSON file '{output_file}': {e}")

def count_log_levels(log_file):

    level_counts = {}

    try:
        with open(log_file, 'r') as opened_file:

            for line in opened_file:
                parts = line.split(" | ")

                if len(parts) >= 4:
                    log_level = parts[2].strip().upper()

                    if log_level in ["INFO", "WARNING", "ERROR", "CRITICAL", "DEBUG"]:

                        if log_level not in level_counts:
                            level_counts[log_level] = 0
                        level_counts[log_level] += 1

    except FileNotFoundError:
        print(f"Error: File not found - {log_file}")
        return {}
    
    except Exception as e:
        print(f"An error occurred while reading the log file: {e}")
        return {}
    
    return level_counts

def extract_messages(log_file):

    the_messages = []

    try:
        with open(log_file, 'r') as opened_file:

            for each_line in opened_file:
                parts = each_line.split(" | ")

                if len(parts) >= 4:
                    the_message = parts[3].strip()
                    the_messages.append(the_message)

    except FileNotFoundError:
        print(f"Error: File not found --> {log_file}")
        return []
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return []
    
    return the_messages

def summarize_logs(log_file):

    print(f"Analyzing log file: {log_file}")

    levels = count_log_levels(log_file)
    messages = extract_messages(log_file)

    if not levels and not messages:
        print("No log data found or error reading file.")
        return

    print("Log Summary:")
    print("-------------")

    if levels:
        print("Log Level Counts:")
        for level_name, count_value in levels.items():
            print(f"{level_name}: {count_value}")

    else:
        print("No log levels found.")

    if messages:

        print("Sample Log Messages:")

        for message_index, the_message in enumerate(messages[:5]):
            print(f"  [{message_index+1}] {the_message}")

        if len(messages) > 5:
            print("  ...")
        print(f"Total Messages: {len(messages)}")

    else:
        print("No log messages found.")

def main():

    """Got help for this part"""

    if len(sys.argv) != 2:
        print("Usage: python log_analyzer.py <log_file>")
        return

    log_file_name = sys.argv[1]

    if not os.path.exists(log_file_name):
        print(f"Error: The file '{log_file_name}' does not exist.")
        return

    process_log(log_file_name)
    summarize_logs(log_file_name)

if __name__ == "__main__":
    main()


Error: The file '--f=c:\Users\Shanto\AppData\Roaming\jupyter\runtime\kernel-v3cd7d6d5abe9afbcb8b375ced9527e9bdeb0b1ad4.json' does not exist.



## Step 3: Generate Summary Report (40%)
    New File
### Step 3a (20%):
 Develop a function that continuously monitors your JSON file(s) and will print a real-time summary of log activity. It should keep count of the messages grouped by log level (INFO, WARNING, ERROR, CRITICAL) and display only the critical messages. (I.e. If new data comes in the summary will change and a new critical message will be printed)
 - note: do not reprocess the entire file on each update.  

### Step 3a: Use a Matplotlib (Lecture 10) (20%)
Develop a function that continuously monitors your JSON file(s) and will graph in real-time a bar or pie plot of each of the errors.  (a graph for each log level). 
- The graph should show the distribution of log messages by level  (INFO, WARNING, ERROR, CRITICAL)  


### Critical notes:
- Your code mus use Daemon Threads (Lecture 14)
- 3a and 3b do not need to run at the same time. 


In [None]:
import time
import json
import threading

json_file_path = "log_counts.json"

def monitor_log_activity():
    previous_data = {}
    critical_messages = []

    while True:
        try:
            with open(json_file_path, 'r') as log_file:

                try:
                    current_data = json.load(log_file)

                except json.JSONDecodeError:
                    print("Error: Invalid JSON format in log file. Waiting for valid data...")
                    time.sleep(1)
                    continue

            if current_data != previous_data:
                print("Log Activity Update:")
                print("-" * 20)

                total_messages = 0

                for level, messages in current_data.items():

                    count = sum(messages.values())
                    print(f"{level}: {count} messages")
                    total_messages += count

                print(f"Total Messages: {total_messages}")

                new_critical_messages = []

                if "CRITICAL" in current_data:

                    current_critical_messages = list(current_data["CRITICAL"].keys())

                    new_critical_messages = [
                        msg for msg in current_critical_messages if msg not in critical_messages
                    ]

                    critical_messages.extend(new_critical_messages)

                    if new_critical_messages:

                        print("New Critical Messages:")
                        for msg in new_critical_messages:
                            print(f"  - {msg}")

                previous_data = current_data

        except FileNotFoundError:
            print(f"Error: JSON file not found at {json_file_path}.")
            time.sleep(1)
            continue

        except Exception as e:
            print(f"Error monitoring log activity: {e}")
            time.sleep(1)

        time.sleep(1)


def main():

    print("Starting Log Activity Monitor...")
    activity_thread = threading.Thread(target=monitor_log_activity, daemon=True)
    activity_thread.start()

    while True:
        time.sleep(1)

    print("Done")


if __name__ == "__main__":
    main()


Starting Log Activity Monitor...
Error: JSON file not found at log_counts.json.
Error: JSON file not found at log_counts.json.
Error: JSON file not found at log_counts.json.
Error: JSON file not found at log_counts.json.
Error: JSON file not found at log_counts.json.
Error: JSON file not found at log_counts.json.
Error: JSON file not found at log_counts.json.
Error: JSON file not found at log_counts.json.
Error: JSON file not found at log_counts.json.
Error: JSON file not found at log_counts.json.
Error: JSON file not found at log_counts.json.
Error: JSON file not found at log_counts.json.
Error: JSON file not found at log_counts.json.
Error: JSON file not found at log_counts.json.
Error: JSON file not found at log_counts.json.
Error: JSON file not found at log_counts.json.
Error: JSON file not found at log_counts.json.
Error: JSON file not found at log_counts.json.


In [None]:
# Here is a sample regex that parses a log file and extracts relevant information. 
# you will need to modify it. Review Lecture 11
import re

def parse_log_line(line):
    pattern = r"^(.*?)\s\|\s(\w+)\s\|\s(\w+)\s\|\s(.*)$"
    match = re.match(pattern, line)
   
