In [2]:
import re
import pandas as pd

class ProteinTimeParser:
    def __init__(self, log_file):
        self.log_file = log_file
        self.parsed_data = []

    def parse(self):
        """Parses the log file and extracts only 'INFO' log entries for proteins and their execution times."""
        with open(self.log_file, 'r') as file:
            for line in file:
                parsed_entry = self._parse_line(line.strip())
                if parsed_entry:
                    self.parsed_data.append(parsed_entry)

    def _parse_line(self, line):
        """Parses a single log line and extracts protein name and time taken."""
        if self.method == 'diffdock':
            info_pattern = re.compile(
                r'(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) - INFO - '
                r'(?P<protein_id>\S+) took (?P<exec_time>\d+\.\d+) seconds'
            )
        elif self.method in ('gnina', 'vina'):
            info_pattern = re.compile(
                r'(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) \[INFO\] '
                r'(?P<protein_id>\S+),(?P<exec_time>\d+\.\d+)'
            )
        else:
            raise ValueError(f"Unsupported method: {self.method}")

        match = info_pattern.match(line)
        if match:
            return match.groupdict()
        return None

    def to_dataframe(self):
        """Converts extracted protein execution times into a Pandas DataFrame."""
        df = pd.DataFrame(self.parsed_data)
        df["exec_time"] = df["exec_time"].astype(float)  # Convert time to float
        return df


In [2]:
import os 
os.getcwd()

'/Users/aoxu/projects/DrugDiscovery/PoseBench/notebooks'

In [3]:

# Usage Example
log_file = "../analysis/diffdock_timing.log"  # Replace with the actual log file path
parser = ProteinTimeParser(log_file)
parser.parse()
df = parser.to_dataframe()
df 
# Display parsed data
# import ace_tools as tools
# tools.display_dataframe_to_user(name="Protein Execution Times", dataframe=df)

Unnamed: 0,timestamp,protein_id,exec_time
0,"2025-02-02 08:07:04,741",7ZZW_KKW,66.89
1,"2025-02-02 08:07:28,708",7A9E_R4W,16.01
2,"2025-02-02 08:08:16,486",7SFO_98L,39.97
3,"2025-02-02 08:09:08,780",7U3J_L6U,45.45
4,"2025-02-02 08:10:02,860",7OPG_06N,47.12
...,...,...,...
254,"2025-02-02 12:04:58,780",7OFF_VCB,44.69
255,"2025-02-02 12:06:10,401",6ZK5_IMH,64.17
256,"2025-02-02 12:07:25,956",7R3D_APR,68.18
257,"2025-02-02 12:08:11,371",8FO5_Y4U,38.80


In [5]:
df.describe()

Unnamed: 0,exec_time
count,259.0
mean,48.478726
std,16.387655
min,14.34
25%,37.5
50%,45.67
75%,58.59
max,97.06


In [3]:
log_file = "../analysis/gnina_timing.log"  # Replace with the actual log file path
parser = ProteinTimeParser(log_file)
parser.parse()
df = parser.to_dataframe()
df 

KeyError: 'exec_time'