In [16]:
# import re

# def parse_log_and_format(input_text):
#     # Regex patterns to extract numbers
#     patterns = {
#         "train_error": r"\[trn phase\]\n.*?error: ([\d.]+)",
#         "train_loss": r"\[trn phase\]\n.*?loss: ([\d.]+)",
#         "val_error": r"\[val phase\]\n.*?error: ([\d.]+)",
#         "val_loss": r"\[val phase\]\n.*?loss: ([\d.]+)",
#         "best_epoch": r"Best Epoch: (\d+)",
#         "train_mae": r"\[trn set\]\n.*?age \(mae\): ([\d.]+)",
#         "val_mae": r"\[val set\]\n.*?age \(mae\): ([\d.]+)",
#         "test_mae": r"\[tst set\]\n.*?age \(mae\): ([\d.]+)"
#     }

#     # Extract values using regex
#     results = {}
#     for key, pattern in patterns.items():
#         match = re.search(pattern, input_text, re.DOTALL)
#         results[key] = match.group(1) if match else "-"

#     # Format the output
#     output = (
#         f"{results['train_mae']}/{results['train_error']}, "
#         f"-/{results['train_loss']}, "
#         f"{results['val_mae']}/{results['val_error']}, "
#         f"-/{results['val_loss']}, "
#         f"{results['test_mae']}, "
#         f"{results['best_epoch']}/49"
#     )

#     return output

# # Input text
# log_text = """
# 2025-02-15 14:52:26,345 [INFO] Number of trn data 10954
# 2025-02-15 14:52:26,345 [INFO] Number of val data 3069
# 2025-02-15 14:52:26,737 [INFO] Epoch 49/49
# 2025-02-15 14:52:26,738 [INFO] ----------------------------------------
# 2025-02-15 14:52:45,733 [INFO] [trn phase]
# 2025-02-15 14:52:45,733 [INFO] error: 1.0416 age_error:1.0416
# 2025-02-15 14:52:45,733 [INFO] loss: 0.1545 age_loss:0.1545
# 2025-02-15 14:52:51,753 [INFO] [val phase]
# 2025-02-15 14:52:51,753 [INFO] error: 1.1720 age_error:1.1720
# 2025-02-15 14:52:51,753 [INFO] loss: 0.4443 age_loss:0.4443
# 2025-02-15 14:52:51,753 [INFO] Best Epoch: 46
# 2025-02-15 14:52:52,248 [INFO] Checkpoint saved to facebase/results/Adience_256x256_resnet50_imagenet_noisy_dldl_v2/split0/checkpoint_49.pth
# 2025-02-15 14:52:52,935 [INFO] Training complete in 0m 27s
# 2025-02-15 14:52:52,935 [INFO] Best epoch: 46
# 2025-02-15 14:53:21,847 [INFO] Model evalution:
# 2025-02-15 14:53:21,847 [INFO] [trn set]
# 2025-02-15 14:53:21,847 [INFO] age (mae): 0.1308
# 2025-02-15 14:53:21,847 [INFO] [val set]
# 2025-02-15 14:53:21,847 [INFO] age (mae): 0.1887
# 2025-02-15 14:53:21,847 [INFO] [tst set]
# 2025-02-15 14:53:21,847 [INFO] age (mae): 0.6852
# """

# # Parse and format the log
# formatted_output = parse_log_and_format(log_text)
# print(formatted_output)

In [17]:
import os
import re

def _float_from_line(lines, index, pattern, group=0):
    """Return the number matched by ``pattern`` on ``lines[index]`` as a float.

    Returns None instead of raising when ``index`` is past the end of
    ``lines``, when the pattern does not match, or when the matched text is
    not a valid float, so the caller can report the log as incomplete.
    """
    if index >= len(lines):
        return None
    match = re.search(pattern, lines[index])
    if match is None:
        return None
    try:
        return float(match.group(group))
    except ValueError:
        return None


def extract_metrics_from_log(root_folder, num_splits=5, tail_lines=21):
    """Collect the final metrics from each split's ``training.log``.

    For every ``i`` in ``range(num_splits)`` the file
    ``<root_folder>/split<i>/training.log`` is read and only its last
    ``tail_lines`` lines are scanned for the marker lines
    ``[trn set]``/``[val set]``/``[tst set]`` (MAE on the following line),
    ``[trn phase]``/``[val phase]`` (error and loss on the two following
    lines), ``Best Epoch:`` and ``Epoch <n>/<total>``.

    Args:
        root_folder: Directory that contains the ``split<i>`` sub-folders.
        num_splits: Number of splits to look for (``split0`` .. ``split<n-1>``).
        tail_lines: Size of the window at the end of each log that is
            scanned; must be at least 1.

    Returns:
        One line per successfully parsed split, joined with ``"\\n"``, each
        formatted as
        ``trn_mae/trn_err, -/trn_loss, val_mae/val_err, -/val_loss, tst_mae, best/total``.
        Splits whose log is missing, shorter than ``tail_lines`` lines, or
        lacks any of the values are skipped with a printed warning.

    Raises:
        ValueError: If ``tail_lines`` is smaller than 1.
    """
    if tail_lines < 1:
        raise ValueError("tail_lines must be at least 1")

    mae_pattern = r"[-+]?\d*\.\d+"
    error_pattern = r"error: ([\d.]+)"
    loss_pattern = r"loss: ([\d.]+)"

    results = []

    for i in range(num_splits):  # Iterate through split0 .. split<num_splits - 1>
        log_path = os.path.join(root_folder, f'split{i}', 'training.log')

        if not os.path.exists(log_path):
            print(f"Warning: {log_path} does not exist.")
            continue

        with open(log_path, 'r') as file:
            lines = file.readlines()

        if len(lines) < tail_lines:
            print(f"Warning: {log_path} has less than {tail_lines} lines.")
            continue

        last_lines = lines[-tail_lines:]

        # Every value starts out missing; a later occurrence of a marker
        # inside the window overwrites an earlier one.
        trn_mae = val_mae = tst_mae = None
        trn_err = val_err = None
        trn_loss = val_loss = None
        best_epoch = total_epochs = None

        for j, line in enumerate(last_lines):
            # MAE values are printed on the line after the set marker.
            if "[INFO] [trn set]" in line:
                trn_mae = _float_from_line(last_lines, j + 1, mae_pattern)
            elif "[INFO] [val set]" in line:
                val_mae = _float_from_line(last_lines, j + 1, mae_pattern)
            elif "[INFO] [tst set]" in line:
                tst_mae = _float_from_line(last_lines, j + 1, mae_pattern)

            # Error and loss follow the phase marker on the next two lines.
            elif "[INFO] [trn phase]" in line:
                trn_err = _float_from_line(last_lines, j + 1, error_pattern, 1)
                trn_loss = _float_from_line(last_lines, j + 2, loss_pattern, 1)
            elif "[INFO] [val phase]" in line:
                val_err = _float_from_line(last_lines, j + 1, error_pattern, 1)
                val_loss = _float_from_line(last_lines, j + 2, loss_pattern, 1)

            # Best epoch is the last whitespace-separated token of the line.
            elif "[INFO] Best Epoch:" in line:
                try:
                    best_epoch = int(line.split()[-1])
                except ValueError:
                    best_epoch = None

            # Total epochs is the denominator of "Epoch <n>/<total>".
            elif "[INFO] Epoch" in line:
                match = re.search(r"Epoch (\d+)/(\d+)", line)
                if match:
                    total_epochs = int(match.group(2))

        extracted = (trn_mae, val_mae, tst_mae, trn_err, val_err,
                     trn_loss, val_loss, best_epoch, total_epochs)
        if any(value is None for value in extracted):
            print(f"Warning: Missing data in {log_path}")
            continue

        result_line = (
            f"{trn_mae}/{trn_err}, -/{trn_loss}, "
            f"{val_mae}/{val_err}, -/{val_loss}, "
            f"{tst_mae}, {best_epoch}/{total_epochs}"
        )
        results.append(result_line)

    return "\n".join(results)

In [18]:
import numpy as np

def parse_and_calculate(input_text):
    """Summarise per-split metric rows as ``mean±std`` per column.

    Each non-blank line of ``input_text`` is one row. Fields are separated
    by tabs (commas are accepted as well), a field may hold two values
    joined by ``/``, and ``-`` placeholders are ignored. Every row must
    yield the same number of values, at least nine, in the order
    ``trn_mae, trn_err, trn_loss, val_mae, val_err, val_loss, tst_mae,
    best_epoch, total_epochs``.

    Args:
        input_text: Multi-line string of metric rows.

    Returns:
        A single formatted line with the mean and the population standard
        deviation (4 decimals) of the first eight columns; the ninth column
        is reported as the integer part of its mean.

    Raises:
        ValueError: If there are no rows, a value is not numeric, or the
            rows do not all contain the same number (>= 9) of values.
    """
    required_columns = 9

    data = []
    for row in input_text.strip().split("\n"):
        if not row.strip():
            # Tolerate blank lines between rows instead of failing on float('').
            continue
        parsed_row = []
        # Treat commas like tabs so ", "-separated rows can be passed directly.
        for item in row.replace(",", "\t").split("\t"):
            for part in item.split("/"):
                part = part.strip()
                if part and part != "-":
                    parsed_row.append(float(part))
        data.append(parsed_row)

    if not data:
        raise ValueError("input_text contains no metric rows")

    column_count = len(data[0])
    for row_number, values in enumerate(data, start=1):
        if len(values) != column_count or len(values) < required_columns:
            raise ValueError(
                f"row {row_number} has {len(values)} values; every row must have "
                f"the same number of values and at least {required_columns}"
            )

    # Convert to numpy array for easier calculations
    data = np.array(data)

    # Column-wise mean and population standard deviation (ddof=0).
    means = np.mean(data, axis=0)
    stds = np.std(data, axis=0)

    # Format the output with 4-digit precision
    output = (
        f"{means[0]:.4f}±{stds[0]:.4f}/{means[1]:.4f}±{stds[1]:.4f}, "
        f"-/{means[2]:.4f}±{stds[2]:.4f}, "
        f"{means[3]:.4f}±{stds[3]:.4f}/{means[4]:.4f}±{stds[4]:.4f}, "
        f"-/{means[5]:.4f}±{stds[5]:.4f}, "
        f"{means[6]:.4f}±{stds[6]:.4f}, "
        f"{means[7]:.4f}±{stds[7]:.4f}/{int(means[8])}"
    )
    return output

# # Input text
# input_text = """
# 2.2147/0.1539	-/0.3586	3.9127/3.7393	-/7.7739	0.7961	16/49
# 2.2468/0.1155	-/0.2987	2.1100/2.9188	-/6.7673	0.6471	14/49
# 2.8323/0.1059	-/0.2732	2.7364/3.1347	-/6.8584	0.7042	13/49
# 2.3817/0.1371	-/0.3173	2.6194/3.6016	-/8.6991	0.763	14/49
# 2.9028/0.1866	-/0.3942	1.3841/2.6257	-/7.1335	0.7087	14/49
# """
# print(input_text)
# # Calculate and print the result
# result = parse_and_calculate(input_text)
# print(result)

In [19]:
# Gather one metrics line per split for this experiment and display them.
root_path = '../facebase/results/Adience_256x256_resnet50_imagenet_noisy_dldl_v2_clean_corrected'
split_metrics = extract_metrics_from_log(root_path)
print(split_metrics)

0.0477/0.3701, -/0.0708, 0.288/0.4695, -/0.5984, 0.4202, 17/49
0.0445/0.3915, -/0.0716, 0.3097/0.4188, -/0.6323, 0.4246, 23/49
0.0481/0.3615, -/0.0695, 0.4087/0.5684, -/0.8321, 0.36, 16/49
0.0582/0.3783, -/0.0687, 0.2571/0.3728, -/0.496, 0.5095, 14/49
0.0659/0.4163, -/0.0689, 0.3069/0.5088, -/0.7122, 0.3443, 10/49


In [20]:
# Turn the ", " field separators into tabs, then print the mean±std summary.
split_metrics = '\t'.join(split_metrics.split(', '))
print(parse_and_calculate(split_metrics))

0.0529±0.0080/0.3835±0.0191, -/0.0699±0.0011, 0.3141±0.0509/0.4677±0.0682, -/0.6542±0.1128, 0.4117±0.0583, 16.0000±4.2426/49
