In [None]:

# Extractor v2.0 (Regex + SVM Edition)

import os
import zipfile
import joblib
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer
from google.colab import files
from tqdm import tqdm
import re

# Setup
DATA_DIR = "LogHub"
MODEL_FILE = "timestamp_svm_model.pkl"

# Ask user whether to download LogHub dataset
print("Extractor v2.0 (Regex + SVM Edition) =====")
print("This tool first tries Regex and then uses SVM to extract timestamps from log files.")
print("Running in Google Colab environment. File upload enabled.")
download = input("Do you want to download the LogHub dataset for enhanced training? (y/n, default: n): ").lower().strip() or 'n'

# Download and extract LogHub if needed
if download == 'y':
    print("Downloading LogHub dataset...")
    !git clone https://github.com/logpai/loghub.git
    os.rename("loghub", DATA_DIR)

# Show available datasets
# DATA_DIR only exists when the dataset was downloaded (now or in an earlier
# run). Without it there is simply nothing to list, which must not crash the
# default "n" path.
if os.path.isdir(DATA_DIR):
    datasets = [
        d for d in os.listdir(DATA_DIR)
        # Hidden directories (.git, .github) are repository metadata, not log
        # datasets; reading them would mix git internals into the log lines.
        if not d.startswith('.') and os.path.isdir(os.path.join(DATA_DIR, d))
    ]
else:
    datasets = []
print("Available log datasets:", datasets)

# Combine all logs into one file
combined_logs = []
for ds in tqdm(datasets, desc="Processing datasets"):
    # Every file below the dataset directory is read, whatever its extension.
    for root, _, files_list in os.walk(os.path.join(DATA_DIR, ds)):
        for file in files_list:
            file_path = os.path.join(root, file)
            # Undecodable bytes are dropped rather than aborting the whole run.
            with open(file_path, encoding="utf-8", errors='ignore') as f:
                combined_logs.extend(f)

with open("combined_logs.log", "w", encoding="utf-8") as f:
    f.writelines(combined_logs)

print(f"Created combined dataset with {len(combined_logs)} lines")

# Tokenize log lines
def extract_tokens(lines):
    """Return the unique whitespace-separated tokens found in *lines*.

    Args:
        lines: Iterable of strings (e.g. log lines); each one is stripped and
            split on runs of whitespace.

    Returns:
        A list with every distinct token exactly once, in sorted order so the
        result is identical from run to run (plain set iteration order for
        strings varies with hash randomisation).
    """
    unique_tokens = {token for line in lines for token in line.split()}
    return sorted(unique_tokens)

# Train or load SVM model
# Either branch leaves two names defined for the rest of the script:
#   model      - classifier exposing predict_proba (label 1 = timestamp-like)
#   vectorizer - the CountVectorizer that was fitted on the training samples
if os.path.exists(MODEL_FILE):
    print("Loading existing SVM model from", MODEL_FILE)
    # NOTE(security): joblib.load unpickles the file and can therefore execute
    # arbitrary code. Only load a MODEL_FILE that this notebook wrote itself.
    model, vectorizer = joblib.load(MODEL_FILE)
    print("Model loaded successfully")
else:
    print("Training new SVM model...")
    # Strings labelled 1: examples of timestamp / date / time notations.
    positive = [
        # ISO 8601 formats
        '2023-05-14T10:30:45Z',
        '2023-05-14T10:30:45.123Z',
        '2023-05-14T10:30:45+05:30',
        '2023-05-14T10:30:45-08:00',
        '2023-05-14 10:30:45',
        '2023-05-14 10:30:45.123',

        # Standard date formats
        '2023-05-14',
        '2023/05/14',
        '05/14/2023',
        '14/05/2023',
        '05-14-2023',
        '14-05-2023',
        '2023.05.14',
        '05.14.2023',
        '14.05.2023',

        # Time formats
        '10:30:45',
        '10:30:45.123',
        '10:30:45.123456',
        '22:45:30',
        '00:00:00',
        '23:59:59',
        '1:30:45',
        '01:30:45 AM',
        '01:30:45 PM',
        '1:30 AM',
        '1:30 PM',
        '13:30',
        '01:30',

        # Combined date-time formats
        '2023-05-14 10:30:45',
        '2023/05/14 10:30:45',
        '05/14/2023 10:30:45',
        '14/05/2023 10:30:45',
        '2023-05-14 10:30:45.123',
        '2023-05-14 22:45:30.456789',
        '2023-05-14T10:30:45',

        # Unix timestamp variations
        '1684056645',
        '1684056645.123',
        '1684056645123',

        # RFC formats
        'Mon, 14 May 2023 10:30:45 GMT',
        'Mon, 14 May 2023 10:30:45 +0000',
        'Monday, 14-May-23 10:30:45 GMT',
        'Mon May 14 10:30:45 2023',
        'Mon May 14 10:30:45 UTC 2023',

        # Month name formats
        'May 14, 2023',
        'May 14 2023',
        '14 May 2023',
        '14-May-2023',
        'May-14-2023',
        'May 14, 2023 10:30:45',
        '14 May 2023 10:30:45',

        # Syslog formats
        'May 14 10:30:45',
        'May  1 10:30:45',
        'Dec 25 23:59:59',
        'Jan  1 00:00:00',

        # Apache/Nginx log formats
        '[14/May/2023:10:30:45 +0000]',
        '[14/May/2023:10:30:45 -0800]',

        # Windows event log formats
        '5/14/2023 10:30:45 AM',
        '5/14/2023 10:30:45 PM',
        '12/25/2023 11:59:59 PM',

        # Different year formats
        '23-05-14',
        '14-05-23',
        '05/14/23',
        '14/05/23',
        '23/05/14',

        # With milliseconds/microseconds
        '2023-05-14 10:30:45.123456',
        '2023-05-14T10:30:45.123456Z',
        '10:30:45.123456',
        '10:30:45,123',
        '10:30:45,123456',

        # Special formats
        '2023-05-14T10:30:45.123+05:30',
        '2023-05-14T10:30:45.123-08:00',
        '2023-05-14 10:30:45 UTC',
        '2023-05-14 10:30:45 EST',
        '2023-05-14 10:30:45 PST',
        '2023-05-14 10:30:45 GMT',

        # Compact formats
        '20230514',
        '20230514103045',
        '230514',
        '230514103045',

        # Database formats
        '2023-05-14 10:30:45.123000',
        '2023-05-14 10:30:45.123000+00:00',

        # Custom application formats
        '2023_05_14_10_30_45',
        '2023.05.14.10.30.45',
        '14MAY2023:10:30:45',
        '14MAY23:10:30:45',

        # Edge cases
        '2000-01-01 00:00:00',
        '2099-12-31 23:59:59',
        '1970-01-01 00:00:00',
        '2038-01-19 03:14:07',
    ]

    # Strings labelled 0: tokens that commonly appear in logs but are not timestamps.
    negative = [
        # Log levels
        'ERROR',
        'WARN',
        'WARNING',
        'INFO',
        'DEBUG',
        'TRACE',
        'FATAL',
        'CRITICAL',
        'NOTICE',

        # Common log words
        'login',
        'logout',
        'user',
        'admin',
        'failed',
        'success',
        'connected',
        'disconnected',
        'timeout',
        'error',
        'exception',
        'warning',
        'information',

        # Server/Service names
        'apache',
        'nginx',
        'mysql',
        'postgresql',
        'redis',
        'mongodb',
        'elasticsearch',
        'kibana',
        'logstash',
        'docker',
        'kubernetes',
        'jenkins',

        # Network related
        'localhost',
        '127.0.0.1',
        '192.168.1.1',
        '10.0.0.1',
        'http',
        'https',
        'ftp',
        'ssh',
        'tcp',
        'udp',
        'dns',
        'dhcp',

        # File paths and extensions
        '/var/log/syslog',
        '/etc/passwd',
        '/home/user',
        '.log',
        '.txt',
        '.conf',
        '.json',
        '.xml',
        '.csv',

        # Process IDs and thread IDs
        'PID',
        'TID',
        'pid',
        'tid',
        'thread',
        'process',

        # HTTP status codes and methods
        '200',
        '404',
        '500',
        '403',
        '401',
        'GET',
        'POST',
        'PUT',
        'DELETE',
        'HEAD',
        'OPTIONS',

        # Database operations
        'SELECT',
        'INSERT',
        'UPDATE',
        'DELETE',
        'CREATE',
        'DROP',
        'ALTER',
        'COMMIT',
        'ROLLBACK',

        # System operations
        'start',
        'stop',
        'restart',
        'reload',
        'shutdown',
        'boot',
        'startup',
        'init',

        # Memory and disk
        'MB',
        'GB',
        'KB',
        'memory',
        'disk',
        'cpu',
        'load',
        'usage',

        # Application specific
        'session',
        'token',
        'auth',
        'authentication',
        'authorization',
        'permission',
        'access',
        'denied',
        'granted',

        # Numbers that look like timestamps but aren't
        '123456',
        '999999',
        '000000',
        '12345678',
        '87654321',

        # Version numbers
        'v1.0.0',
        'version',
        '2.4.1',
        '1.2.3',

        # Configuration values
        'true',
        'false',
        'null',
        'undefined',
        'none',
        'empty',

        # Common abbreviations
        'etc',
        'var',
        'tmp',
        'usr',
        'bin',
        'lib',
        'opt',
        'srv',

        # Protocol and encoding
        'utf-8',
        'ascii',
        'unicode',
        'base64',
        'json',
        'xml',
        'yaml',

        # Security related
        'ssl',
        'tls',
        'certificate',
        'key',
        'hash',
        'encrypt',
        'decrypt',

        # Performance metrics
        'latency',
        'throughput',
        'response_time',
        'duration',
        'elapsed',

        # Email and web
        'email',
        'mail',
        'smtp',
        'pop3',
        'imap',
        'html',
        'css',
        'javascript',

        # False positive patterns that might look like dates
        'log-2023',
        'file-2023',
        'backup-2023',
        'config-2023',
        'temp-2023',

        # Port numbers
        '8080',
        '3306',
        '5432',
        '6379',
        '9200',
        '22',
        '80',
        '443',

        # UUIDs (parts that might be mistaken)
        'uuid',
        'guid',
        'identifier',

        # Common file names
        'syslog',
        'messages',
        'secure',
        'auth.log',
        'error.log',
        'access.log',

        # Programming languages and frameworks
        'python',
        'java',
        'javascript',
        'php',
        'ruby',
        'perl',
        'shell',
        'bash',

        # Operating systems
        'linux',
        'windows',
        'macos',
        'ubuntu',
        'centos',
        'debian',
        'fedora',

        # Cloud providers and services
        'aws',
        'azure',
        'gcp',
        'docker',
        'kubernetes',
        'terraform',
    ]
    labels = [1]*len(positive) + [0]*len(negative)
    samples = positive + negative
    # Character n-grams (2 to 4 characters, padded at word boundaries) capture
    # the digit/separator patterns of timestamps instead of whole words.
    vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 4))
    X = vectorizer.fit_transform(samples)
    X, labels = shuffle(X, labels, random_state=42)
    # probability=True is required for predict_proba. Its calibration step uses
    # randomised internal cross-validation, so the seed is pinned (same value as
    # the shuffle above) to make the trained model reproducible across runs.
    model = SVC(probability=True, random_state=42)
    model.fit(X, labels)
    # Model and vectorizer are stored together because they must always be used as a pair.
    joblib.dump((model, vectorizer), MODEL_FILE)
    print("Model trained and saved.")

# Upload log file
print("\n===== Upload Your Log File =====")
uploaded = files.upload()
# The keys of `uploaded` are used as file names and the values as the uploaded
# content. When the upload dialog is cancelled there is no key to take, which
# previously surfaced as an opaque IndexError.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and choose a log file.")
filename = next(iter(uploaded))
print(f"Successfully uploaded: {filename} ({len(uploaded[filename])} bytes)")

# Read log lines
# The file is read back from disk under the uploaded name; bytes that cannot
# be decoded are dropped instead of aborting.
with open(filename, "r", errors='ignore') as f:
    lines = f.readlines()

# Regex extractor
# Building blocks for the timestamp pattern. They are compiled once at import
# time rather than being re-assembled on every call.
_LOG_LEVELS = r'INFO|ERROR|DEBUG|WARN|WARNING|TRACE|FATAL'
# Fractional seconds: both "10:30:45.123" and the comma form "10:30:45,123".
_FRACTION = r'(?:[.,]\d+)?'
_CLOCK = r'\d{1,2}:\d{2}(?::\d{2})?' + _FRACTION
_NUMERIC_OFFSET = r'[+-]\d{2}:?\d{2}'
# Upper-case zone abbreviation such as "UTC" or "PST". It must be a whole word
# and must not be a log level; otherwise "10:30:45 INFO" would be captured as
# "10:30:45 INFO" and "10:30:45 ERROR" as "10:30:45 ERRO".
_ZONE_ABBREV = r'[ ]?(?!(?:' + _LOG_LEVELS + r')\b)[A-Z]{2,4}\b'

_TIMESTAMP_RE = re.compile('(' + '|'.join([
    # Year-first dates, optionally followed by a time and a zone.
    r'\d{4}[-/.]\d{1,2}[-/.]\d{1,2}(?:[ T]' + _CLOCK
    + r'(?:Z|' + _NUMERIC_OFFSET + r'|' + _ZONE_ABBREV + r')?)?',
    # Day-first or month-first dates, optionally followed by a time, AM/PM or a zone.
    r'\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}(?:[ T]' + _CLOCK
    + r'(?:[ ]?[AP]M|Z|' + _NUMERIC_OFFSET + r'|' + _ZONE_ABBREV + r')?)?',
    # Bare clock time.
    r'\d{2}:\d{2}(?::\d{2})?' + _FRACTION + r'(?:[ ]?[AP]M)?',
    # Stand-alone run of 10 to 13 digits (epoch seconds / milliseconds).
    r'\b\d{10,13}\b',
    # Month-name formats such as "May 14 10:30:45"; up to two spaces before the
    # day so that space-padded syslog days ("May  1 10:30:45") keep their date.
    r'[A-Za-z]{3}[ ]{0,2}\d{1,2}(?:,? ?\d{2,4})?[ ]?\d{2}:\d{2}(?::\d{2})?',
    # Bracketed access-log format: [14/May/2023:10:30:45 +0000].
    r'\[\d{2}/[A-Za-z]{3}/\d{4}:\d{2}:\d{2}:\d{2}[ ][+-]\d{4}\]',
]) + ')')
_LOG_LEVEL_RE = re.compile(r'\b(' + _LOG_LEVELS + r')\b', flags=re.IGNORECASE)


def extract_with_regex(line):
    """Find the first timestamp and the first log level in *line*.

    Args:
        line: One log line as a string.

    Returns:
        A ``(timestamp, log_level)`` tuple. ``timestamp`` is the matched text
        exactly as it appears in the line, or ``None`` when nothing matches.
        ``log_level`` is the matched level upper-cased (matching itself is
        case-insensitive), or ``None`` when no level word is present.

    Note:
        A comma followed by digits directly after the seconds is treated as
        fractional seconds, so in comma-separated data a numeric field right
        after the time is included in the timestamp.
    """
    timestamp_match = _TIMESTAMP_RE.search(line)
    log_level_match = _LOG_LEVEL_RE.search(line)
    timestamp = timestamp_match.group(1) if timestamp_match else None
    log_level = log_level_match.group(1).upper() if log_level_match else None
    return timestamp, log_level

# Extract data using regex + SVM fallback
# One record is produced per line for which a timestamp could be determined;
# lines without any timestamp candidate are skipped.
extracted = []

for line in lines:
    stripped = line.strip()
    timestamp, log_level = extract_with_regex(line)
    if timestamp is None:
        # Regex found no timestamp: let the SVM pick the first whitespace token
        # it considers timestamp-like (probability of label 1 above 0.5).
        tokens = stripped.split()
        if not tokens:
            continue  # blank line: nothing to classify
        # Score all tokens of the line in one batch instead of one call per token.
        probs = model.predict_proba(vectorizer.transform(tokens))[:, 1]
        timestamp = next((tok for tok, p in zip(tokens, probs) if p > 0.5), None)
        if timestamp is None:
            continue
    # A timestamp found by the regex is kept even when the line has no log
    # level; previously it was discarded and replaced by an SVM token guess.
    extracted.append({'line': stripped, 'timestamp': timestamp, 'log_level': log_level or 'UNKNOWN'})

# Save and download
# Columns are fixed explicitly so that the CSV still gets a header row (and
# the preview still shows the schema) when no line produced a record.
df = pd.DataFrame(extracted, columns=['line', 'timestamp', 'log_level'])
# Output name: uploaded file name without its last extension + "_parsed_logs.csv".
output_csv = filename.rsplit(".", 1)[0] + "_parsed_logs.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)

# Preview
print("\nPreview of extracted logs (timestamp + log level):")
print(df.head(10))

Extractor v2.0 (Regex + SVM Edition) =====
This tool first tries Regex and then uses SVM to extract timestamps from log files.
Running in Google Colab environment. File upload enabled.
Do you want to download the LogHub dataset for enhanced training? (y/n, default: n): y
Downloading LogHub dataset...
Cloning into 'loghub'...
remote: Enumerating objects: 579, done.[K
remote: Counting objects: 100% (178/178), done.[K
remote: Compressing objects: 100% (43/43), done.[K
remote: Total 579 (delta 149), reused 138 (delta 135), pack-reused 401 (from 2)[K
Receiving objects: 100% (579/579), 7.27 MiB | 5.66 MiB/s, done.
Resolving deltas: 100% (270/270), done.
Available log datasets: ['HealthApp', 'Thunderbird', 'Spark', 'Hadoop', 'OpenSSH', 'Zookeeper', '.git', 'HDFS', 'Apache', 'Proxifier', 'Windows', 'Mac', 'BGL', 'Android', 'HPC', 'Linux', '.github', 'OpenStack']


Processing datasets: 100%|██████████| 18/18 [00:00<00:00, 19.77it/s]


Created combined dataset with 120960 lines
Training new SVM model...
Model trained and saved.

===== Upload Your Log File =====


Saving combined_logs.log to combined_logs (1).log
Successfully uploaded: combined_logs (1).log (112780 bytes)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Preview of extracted logs (timestamp + log level):
                                                line            timestamp  \
0  2015-10-18 18:09:42,479 INFO [RMCommunicator A...  2015-10-18 18:09:42   
1  2015-10-18 18:10:50,155 WARN [LeaseRenewer:msr...  2015-10-18 18:10:50   
2  2015-10-18 18:10:18,012 WARN [LeaseRenewer:msr...  2015-10-18 18:10:18   
3  2015-10-18 18:05:47,680 WARN [LeaseRenewer:msr...  2015-10-18 18:05:47   
4  2015-10-18 18:10:02,511 ERROR [RMCommunicator ...  2015-10-18 18:10:02   
5  2015-10-18 18:06:28,108 INFO [RMCommunicator A...  2015-10-18 18:06:28   
6  2015-10-18 18:02:15,949 INFO [RMCommunicator A...  2015-10-18 18:02:15   
7  2015-10-18 18:04:08,065 INFO [ContainerLaunche...  2015-10-18 18:04:08   
8  2015-10-18 18:01:58,963 INFO [AsyncDispatcher ...  2015-10-18 18:01:58   
9  2015-10-18 18:03:12,874 INFO [IPC Server handl...  2015-10-18 18:03:12   

  log_level  
0      INFO  
1      WARN  
2      WARN  
3      WARN  
4     ERROR  
5      INFO  
6 