In [2]:
# Install dependencies (uncomment and run once if pandas is missing, then restart the kernel)
# %pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [3]:
import re
import os
import pandas as pd

In [4]:
# Extract structured fields from the raw OpenStack log files.

# Raw log files to parse (relative paths, resolved against the current working directory).
log_files = [
    'raw/openstack_test.log',
    'raw/openstack_train.log',
    'raw/openstack_predict.log'
]

# One regex is applied to every log line. Only the leading
# timestamp / process ID / log level / source part is mandatory; every later
# fragment is wrapped in an optional group, so a field that is absent from a
# line comes back as None from groupdict(). The fragments are matched strictly
# left to right, so fields are only captured when they appear in this order.
log_pattern = re.compile(
    r"(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+)\s+"  # timestamp: YYYY-MM-DD HH:MM:SS.<fraction>
    r"\d+\s+"                                                      # process ID: digits, matched but not captured
    r"(?P<log_level>[A-Z]+)\s+"                                    # log level: run of uppercase letters
    r"(?P<source>[^\s]+)\s+"                                       # source: run of non-whitespace characters
    r"(?:.*?req-(?P<request_id>[a-f0-9\-]+)\s+(?P<user_id>[a-f0-9\-]+)\s+(?P<project_id>[a-f0-9\-]+)\s+)?" # optional: "req-" + request_id, then user_id and project_id (hex digits/dashes, whitespace-separated)
    r"(?:.*?\s+(?P<internal_ip>\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b))?" # optional: dotted-quad address preceded by whitespace (octet range is not validated)
    r"(?:.*?\"(?P<request>[^\"]+)\")?\s*"                          # optional: contents of the next double-quoted string
    r"(?:status:\s+(?P<status_code>\d+))?"                         # optional: "status: N"; no leading .*?, so it must directly follow the previous fragment
    r"(?:.*?len:\s+(?P<response_length>\d+))?\s*"                  # optional: "len: N" response length
    r"(?:.*?time:\s+(?P<response_time_api>[0-9\.]+))?"             # optional: "time: X" (API response time)
    r"(?:.*?Took\s+(?P<response_time_compute>[0-9\.]+)\s+seconds)?"  # optional: "Took X seconds" (compute response time)
)

# Function to process each line of the log
def process_line(line):
    """Extract the structured fields from a single raw log line.

    Args:
        line: One line of log text.

    Returns:
        dict: Always the same keys in the same order: ``timestamp``,
        ``log_level``, ``source``, ``request_id``, ``user_id``,
        ``project_id``, ``internal_ip``, ``request``, ``status_code``,
        ``response_length``, ``response_time``. Values are the matched
        substrings, or None for fields that are absent from the line. A line
        that ``log_pattern`` does not match at all yields a dict whose values
        are all None (the function never returns None itself).
    """
    match = log_pattern.search(line)
    if match is None:
        # Keep the key order identical to the matched case below. A DataFrame
        # built from a list of dicts takes its column order from the first
        # record, so a differing order here would change the CSV layout
        # whenever the first line of a file fails to match.
        field_order = (
            "timestamp", "log_level", "source",
            "request_id", "user_id", "project_id",
            "internal_ip",
            "request", "status_code", "response_length",
            "response_time",
        )
        return dict.fromkeys(field_order)

    data = match.groupdict()

    # The API ("time: X") and compute ("Took X seconds") durations are captured
    # by separate groups; collapse them into one field, preferring the API value.
    api_time = data.pop("response_time_api")
    compute_time = data.pop("response_time_compute")
    data["response_time"] = api_time or compute_time
    return data


# Parse logs to DataFrame
def parse_logs_to_dataframe(file_path, output_dir = 'extracted'):
    """Parse one log file and write the extracted fields to a CSV file.

    Every line of ``file_path`` is passed through ``process_line``; the
    resulting records are written to
    ``<output_dir>/<input file name without extension>_extracted.csv``.

    Args:
        file_path: Path of the log file to read.
        output_dir: Directory that receives the CSV. It is created if it does
            not exist yet.

    Returns:
        pandas.DataFrame: The parsed records, one row per input line (the same
        data that was written to the CSV).

    Raises:
        OSError: If the input file cannot be read or the output cannot be
            written.
    """
    # Decode explicitly so the result does not depend on the platform default
    # encoding; undecodable bytes are replaced rather than aborting the file.
    with open(file_path, "r", encoding="utf-8", errors="replace") as file:
        # Iterate the handle directly instead of materialising readlines().
        results = [process_line(line) for line in file]

    file_name = os.path.basename(file_path)
    output_file = os.path.join(output_dir, f"{os.path.splitext(file_name)[0]}_extracted.csv")

    # to_csv does not create missing directories. An empty output_dir means
    # "current directory", which os.makedirs would reject.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df = pd.DataFrame(results)

    # Save to CSV
    df.to_csv(output_file, index=False)
    print(f"Processed log saved to: {output_file}")
    return df
        
# Entry point: convert each configured log file into its extracted CSV.
if __name__ == "__main__":
    for log_file in log_files:
        parse_logs_to_dataframe(file_path=log_file)

Processed log saved to: extracted/openstack_test_extracted.csv
Processed log saved to: extracted/openstack_train_extracted.csv
Processed log saved to: extracted/openstack_predict_extracted.csv
