In [2]:
import re

import pandas as pd


# A number as Python / PyTorch print it: optional sign, optional exponent
# ("1.2e-05"), plus the non-finite spellings "nan" and "inf".  A bare
# ``[0-9.]+`` would silently read "1.2e-05" as 1.2.
_NUMBER = r"[-+]?(?:(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?|nan|inf)"
_NUMBER_RE = re.compile(_NUMBER)

# Compiled once at import time instead of once per log line.
_EPOCH_HEADER_RE = re.compile(r"epoch (\d+)")
_TEST_SUMMARY_RE = re.compile(
    r"Epoch (?P<epoch>\d+) on test dataset: "
    rf"MRR: (?P<mrr>{_NUMBER}); "
    # Tolerate trailing tensor arguments such as ", device='cuda:0'".
    r"Recalls: tensor\(\[(?P<recalls>[^\]]*)\][^)]*\) "
    # The log writes "Loss: <x>positive-loss:" with no separator; accept
    # that form as well as an optional "; " in between.
    rf"Loss: (?P<loss>{_NUMBER});?\s*"
    rf"positive-loss: {_NUMBER}; negative-loss: {_NUMBER}"
)
_BATCH_RE = re.compile(
    rf"Training batch (?P<batch>\d+)/\d+ loss: (?P<loss>{_NUMBER})"
)
_TRAIN_SUMMARY_RE = re.compile(
    r"Epoch (?P<epoch>\d+) on training dataset: "
    rf"loss: (?P<total>{_NUMBER}); "
    rf"positive-loss: (?P<pos>{_NUMBER}); "
    rf"negative-loss: (?P<neg>{_NUMBER})"
)

# Labels applied, in order, to the values printed in the "Recalls" tensor.
_RECALL_LABELS = ('Recall@1', 'Recall@5', 'Recall@10')


def _append_rows(log_data, epoch, kind, metrics):
    """Append one long-format row per ``(metric, raw_value)`` pair."""
    for metric, raw_value in metrics:
        log_data['epoch'].append(epoch)
        log_data['type'].append(kind)
        log_data['metric'].append(metric)
        log_data['value'].append(float(raw_value))


def parse_training_log(file):
    """Parse a training log into a long-format DataFrame.

    Four kinds of lines are recognised; everything else is ignored:

    * ``Training epoch <n>`` sets the epoch attributed to the rows
      that follow.
    * ``Epoch <n> on test dataset: MRR: ...; Recalls: tensor([...]) Loss: ...``
      yields ``type='test'`` rows for MRR, Recall@1/5/10 and Loss.
    * ``Training batch <i>/<total> loss: ...`` yields one
      ``type='batch'`` row named ``'Batch <i> loss'``.
    * ``Epoch <n> on training dataset: loss: ...; positive-loss: ...;
      negative-loss: ...`` yields ``type='train'`` rows for Total,
      Positive and Negative Loss.

    Args:
        file: Any iterable of text lines, e.g. an open text file.

    Returns:
        A ``pandas.DataFrame`` with columns ``epoch``, ``type``, ``metric``
        and ``value`` (one row per metric, in log order).

    Notes:
        The epoch comes from the most recent ``Training epoch`` header.
        If a summary line appears before any header, the epoch number in
        the summary line itself is used; a batch line in that position is
        recorded with epoch ``None`` rather than raising.
    """
    log_data = {
        'epoch': [],
        'type': [],
        'metric': [],
        'value': []
    }
    epoch = None  # no "Training epoch" header seen yet

    for line in file:
        if "Training epoch" in line:
            header = _EPOCH_HEADER_RE.search(line)
            if header:  # a header without a number leaves the epoch as is
                epoch = int(header.group(1))
        elif "Epoch" in line and "on test dataset" in line:
            found = _TEST_SUMMARY_RE.search(line)
            if found:
                row_epoch = epoch if epoch is not None else int(found.group('epoch'))
                recalls = _NUMBER_RE.findall(found.group('recalls'))
                # zip() stops at the shorter side, so a tensor with fewer
                # than three entries yields fewer rows instead of raising.
                metrics = [('MRR', found.group('mrr'))]
                metrics.extend(zip(_RECALL_LABELS, recalls))
                metrics.append(('Loss', found.group('loss')))
                _append_rows(log_data, row_epoch, 'test', metrics)
        elif "Training batch" in line:
            found = _BATCH_RE.search(line)
            if found:
                label = f"Batch {found.group('batch')} loss"
                _append_rows(log_data, epoch, 'batch', [(label, found.group('loss'))])
        elif "Epoch" in line and "on training dataset" in line:
            found = _TRAIN_SUMMARY_RE.search(line)
            if found:
                row_epoch = epoch if epoch is not None else int(found.group('epoch'))
                _append_rows(log_data, row_epoch, 'train', [
                    ('Total Loss', found.group('total')),
                    ('Positive Loss', found.group('pos')),
                    ('Negative Loss', found.group('neg')),
                ])

    return pd.DataFrame(log_data)


# Example usage: parse one run's log file and display the resulting table.
log_path = '../logs/log_2024-06-01_14-57-18.log'
with open(log_path) as f:  # text read mode is the default for open()
    df = parse_training_log(f)

# Bare expression so the notebook renders the DataFrame as the cell output.
df


Unnamed: 0,epoch,type,metric,value
0,0,test,MRR,0.450000
1,0,test,Recall@1,0.319700
2,0,test,Recall@5,0.594400
3,0,test,Recall@10,0.695300
4,0,test,Loss,50287.750000
...,...,...,...,...
685,29,batch,Batch 6500 loss,0.119775
686,29,batch,Batch 7000 loss,0.069609
687,29,train,Total Loss,3164.269338
688,29,train,Positive Loss,2799.360000
