In [6]:
import re
import os
import pandas as pd
from typing import List

# Matches one complete log line of the form
#   TID: [<int>] [<text>] [<YYYY-MM-DD HH:MM:SS,mmm>] <LEVEL> {<text>} - <rest>
# Capture groups, in order (unpacked in extract_logs_from_file as
# tid, logger, ts, level, context, message):
#   1. optionally negative integer inside the first brackets
#   2. content of the second brackets (may be empty)
#   3. date/time stamp with comma-separated milliseconds
#   4. level: one of DEBUG, INFO, WARN, ERROR (other levels do not match)
#   5. non-empty content of the braces
#   6. everything after the " - " separator up to the end of the line
LOG_PATTERN = re.compile(
    r'^TID: \[(-?\d+)\] \[([^\]]*)\] \[([\d-]+ \d{2}:\d{2}:\d{2},\d{3})\]\s+'
    r'(DEBUG|INFO|WARN|ERROR)\s+'
    r'\{([^}]+)\}\s+-\s+(.*)$'
)

def extract_logs_from_file(path: str) -> List[dict]:
    """Parse one log file into a list of structured entries.

    Every line is stripped of surrounding whitespace and matched against
    ``LOG_PATTERN``. Lines that do not match the pattern are skipped.

    Args:
        path: Path of the file to read.

    Returns:
        One dict per matching line, in file order, with the keys ``TID``
        (int), ``logger``, ``timestamp`` (parsed by ``pd.to_datetime``),
        ``level``, ``context`` and ``message``.

    Raises:
        OSError: If the file cannot be opened.
    """
    entries = []
    # errors='replace' keeps a stray non-UTF-8 byte (or a binary file sitting
    # in the same directory) from raising UnicodeDecodeError and aborting the
    # whole scan; affected lines simply fail to match or carry U+FFFD.
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            m = LOG_PATTERN.match(line.strip())
            if m is None:
                continue
            tid, logger, ts, level, context, message = m.groups()
            entries.append({
                'TID': int(tid),
                'logger': logger,
                # The log uses a comma before the milliseconds; %f accepts
                # the three-digit fraction.
                'timestamp': pd.to_datetime(ts, format='%Y-%m-%d %H:%M:%S,%f'),
                'level': level,
                'context': context,
                'message': message,
            })
    return entries

def extract_logs_from_dir(dirpath: str) -> pd.DataFrame:
    """Parse every regular file directly inside *dirpath* into one DataFrame.

    Sub-directories are not descended into. Files are read in sorted name
    order and the combined entries are sorted by ``timestamp`` with a stable
    sort, so entries sharing a timestamp keep a deterministic order (file
    name, then position within the file).

    Args:
        dirpath: Directory containing the log files.

    Returns:
        A DataFrame with the columns ``TID``, ``logger``, ``timestamp``,
        ``level``, ``context`` and ``message`` and a fresh 0..n-1 index.
        When no entry is found the frame is empty but still carries these
        columns.

    Raises:
        OSError: If *dirpath* cannot be listed.
    """
    columns = ['TID', 'logger', 'timestamp', 'level', 'context', 'message']
    all_entries = []
    # os.listdir returns names in arbitrary order; sorting makes the result
    # reproducible across runs and platforms.
    for fname in sorted(os.listdir(dirpath)):
        full = os.path.join(dirpath, fname)
        if os.path.isfile(full):
            all_entries.extend(extract_logs_from_file(full))
    # Explicit columns keep the schema (and the CSV header) even when nothing
    # was parsed.
    df = pd.DataFrame(all_entries, columns=columns)
    if not df.empty:
        # mergesort is stable: rows with identical timestamps stay in read order.
        df = df.sort_values('timestamp', kind='mergesort').reset_index(drop=True)
    return df

# Usage example: parse every log file under the raw-data directory, write the
# combined table to CSV (without the index column) and preview the first rows.
if __name__ == '__main__':
    df = extract_logs_from_dir('../../data/raw')
    df.to_csv('../../data/structured/carbon.csv', index=False)
    # NOTE(review): `display` is not imported in this cell; presumably it is
    # the IPython/Jupyter built-in, so this line needs a notebook session.
    display(df.head())


Unnamed: 0,TID,logger,timestamp,level,context,message
0,-1234,,2024-11-15 15:44:30.720,INFO,org.wso2.carbon.core.internal.CarbonCoreActivator,Starting WSO2 Carbon...
1,-1234,,2024-11-15 15:44:30.739,INFO,org.wso2.carbon.core.internal.CarbonCoreActivator,"Operating System : Linux 5.4.0-198-generic, amd64"
2,-1234,,2024-11-15 15:44:30.739,INFO,org.wso2.carbon.core.internal.CarbonCoreActivator,Java Home : /opt/java/openjdk
3,-1234,,2024-11-15 15:44:30.740,INFO,org.wso2.carbon.core.internal.CarbonCoreActivator,Java Version : 11.0.10
4,-1234,,2024-11-15 15:44:30.740,INFO,org.wso2.carbon.core.internal.CarbonCoreActivator,Java VM : OpenJDK 64-Bit Server VM 11...
