In [1]:
import re
import json
import base64
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def parse_logs(log_file):
    """
    Parses logs with mixed formats, including error messages, Base64-encoded, and text-based logs.
    Returns a structured DataFrame.
    """
    data = []
    skipped_lines = []

    with open(log_file, 'r') as f:
        for line in f:
            line = line.strip()
            entry = {}

            # Handle JSON logs
            try:
                if line.startswith("{") and line.endswith("}"):
                    json_data = json.loads(line)
                    entry.update(json_data)

                    # Decode nested 'details' if present
                    if "details" in entry and isinstance(entry["details"], dict):
                        entry.update(entry["details"])
                        del entry["details"]

                    data.append(entry)
                    continue
            except json.JSONDecodeError:
                pass

            # Handle Base64-encoded logs
            if line.startswith("BASE64:"):
                try:
                    encoded_data = line[7:].strip()
                    decoded_json = json.loads(base64.b64decode(encoded_data).decode("utf-8"))
                    entry.update(decoded_json)

                    # Decode 'details' if it exists
                    if "details" in entry and isinstance(entry["details"], dict):
                        entry.update(entry["details"])
                        del entry["details"]

                    data.append(entry)
                    continue
                except (base64.binascii.Error, json.JSONDecodeError):
                    skipped_lines.append(f"Invalid Base64 log: {line}")
                    continue

            # Handle error logs with patterns like '<Error> at <timestamp>'
            error_match = re.search(r'(?P<Error>.+) at (?P<Timestamp>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+)', line)
            if error_match:
                entry["Error"] = error_match.group("Error")
                entry["Timestamp"] = error_match.group("Timestamp")
                data.append(entry)
                continue

            # Handle logs with errors and timestamps in reverse order
            reverse_error_match = re.search(r'(?P<Timestamp>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+)\s(?P<Error>.+)', line)
            if reverse_error_match:
                entry["Timestamp"] = reverse_error_match.group("Timestamp")
                entry["Error"] = reverse_error_match.group("Error")
                data.append(entry)
                continue

            # Handle action-based logs (e.g., user_423 performed purchase)
            action_match = re.search(r'(\w+)\sperformed\s(\w+)\sat\s(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+)', line)
            if action_match:
                entry["User"] = action_match.group(1)
                entry["Action"] = action_match.group(2)
                entry["Timestamp"] = action_match.group(3)
                data.append(entry)
                continue

            # If no format matches, skip the line
            skipped_lines.append(f"Malformed log: {line}")

    # Convert parsed data to DataFrame
    df = pd.DataFrame(data)

    # Log skipped lines summary
    if skipped_lines:
        print(f"Skipped lines due to parsing issues: {len(skipped_lines)} skipped lines.")
        print("Sample skipped lines:")
        for skipped_line in skipped_lines[:10]:  # Show the first 10 skipped lines only
            print(skipped_line)

    # Ensure 'Timestamp' column exists and is properly formatted
    if "Timestamp" in df.columns:
        df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")
    else:
        print("Warning: No valid 'Timestamp' column found. Data may be incomplete.")

    return df

In [3]:
def create_visualizations(data):
    """
    Generates visualizations for the parsed data.
    Handles non-numeric columns by plotting their counts over time.
    """
    if 'Timestamp' not in data.columns:
        print("Error: 'Timestamp' column is missing. Cannot create time-based visualizations.")
        return

    # Combine 'Timestamp' and 'timestamp' into a single column (if needed)
    data['Final_Timestamp'] = data['Timestamp'].fillna(data.get('timestamp', pd.NaT))
    data.drop(columns=['Timestamp', 'timestamp'], inplace=True)

    # Drop rows without timestamps
    data = data.dropna(subset=['Final_Timestamp'])

    # Convert 'Final_Timestamp' to datetime
    data['Final_Timestamp'] = pd.to_datetime(data['Final_Timestamp'], errors='coerce')

    # Categorical Metrics (non-numeric columns)
    non_numeric_metrics = ['Error', 'user', 'ip', 'event']

    # Plot non-numeric metrics
    for metric in non_numeric_metrics:
        if metric in data.columns:
            plt.figure(figsize=(12, 8))

            # Count occurrences of each category over time
            counts = data.groupby(['Final_Timestamp', metric]).size().reset_index(name='Count')
            counts_pivot = counts.pivot(index='Final_Timestamp', columns=metric, values='Count').fillna(0)

            # Plot as a stacked bar chart
            counts_pivot.plot(kind='bar', stacked=True, figsize=(12, 8))
            plt.title(f'{metric} Counts Over Time')
            plt.xlabel('Time')
            plt.ylabel('Count')
            plt.legend(title=metric)
            plt.grid(axis='y')
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.show()

    # Numeric Metrics (e.g., quantity, price)
    numeric_metrics = ['quantity', 'price']
    for metric in numeric_metrics:
        if metric in data.columns:
            plt.figure(figsize=(10, 6))
            plt.plot(data['Final_Timestamp'], data[metric], label=f'{metric} over Time', color='blue')
            plt.title(f'{metric} Over Time')
            plt.xlabel('Time')
            plt.ylabel(metric)
            plt.legend()
            plt.grid(True)
            plt.show()

In [4]:
# Provide the path to your log file
log_file_path = 'assignment_prod.log'

# Parse the logs
parsed_data = parse_logs(log_file_path)

# Inspect the parsed data
print("Parsed Data:")
print(parsed_data.head())
print("Columns in DataFrame:", parsed_data.columns)


Skipped lines due to parsing issues: 1572 skipped lines.
Sample skipped lines:
Malformed log: NullPointerException at line 42 2024-11-18T18:18:45.994516
Malformed log: NullPointerException at line 42 2024-11-18T10:35:07.994582
Malformed log: Malformed JSON object 2024-11-19T14:58:37.994586
Malformed log: InvalidBase64: Data cannot be decoded 2024-11-18T11:57:39.994588
Malformed log: TimeoutError: Connection to DB failed 2024-11-18T20:38:31.994617
Malformed log: TimeoutError: Connection to DB failed 2024-11-19T08:46:04.994686
Invalid Base64 log: BASE64:eyJ1c2VyIjogInVzZXJfNDE2IiwgInRpbWVzdGFtcCI6ICIyMDI0LTExLTE4VDE0OjM5OjAxLjk5NDY4OCIsICJpcCI6ICIyMTcuMjMxLjE0MS42MyIsICJldmVudCI6ICJsb2dvdXQiLCAiZGV0YWlscyI6IHsiaXRlbV9pZCI6IDgzOTIsICJxdWFudGl0eSI6IDUsICJwcmljZSI6IDYzNC45
Malformed log: IndexOutOfBoundsException in module user_activity 2024-11-19T14:25:45.994726
Malformed log: KeyError: 'action_type' 2024-11-19T13:15:04.994728
Malformed log: TimeoutError: Connection to DB failed 2024-11-18

In [None]:
# Create visualizations
create_visualizations(parsed_data)