In [14]:
import re
import pandas as pd

class ProteinTimeParser:
    """Extract per-protein execution times from a docking timing log.

    Only INFO lines that follow the layout of the selected ``method`` are
    kept; every other line is ignored.

    Supported methods and the line layout each one expects:

    * ``'diffdock'``: ``<timestamp> - INFO - <protein_id> took <seconds> seconds``
    * ``'gnina'`` / ``'vina'``: ``<timestamp> [INFO] <protein_id>,<seconds>``

    Attributes:
        log_file: Path of the log file read by :meth:`parse`.
        method: Name of the log layout to use (see above).
        parsed_data: List of dicts with the string-valued keys
            ``timestamp``, ``protein_id`` and ``exec_time``, filled by
            :meth:`parse`.
    """

    # Names of the regex groups, used as column names when there are no rows.
    _COLUMNS = ("timestamp", "protein_id", "exec_time")

    # Patterns are compiled once at class creation instead of once per log line.
    _DIFFDOCK_PATTERN = re.compile(
        r'(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) - INFO - '
        r'(?P<protein_id>\S+) took (?P<exec_time>\d+\.\d+) seconds'
    )
    _GNINA_VINA_PATTERN = re.compile(
        r'(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) \[INFO\] '
        r'(?P<protein_id>\S+),(?P<exec_time>\d+\.\d+)'
    )
    _PATTERNS = {
        'diffdock': _DIFFDOCK_PATTERN,
        'gnina': _GNINA_VINA_PATTERN,
        'vina': _GNINA_VINA_PATTERN,
    }

    def __init__(self, log_file, method):
        """Store the log path and the layout name; no file access happens here."""
        self.log_file = log_file
        self.method = method
        self.parsed_data = []

    def parse(self):
        """Read the log file and store every matching INFO entry in ``parsed_data``.

        Each call replaces the previous contents of ``parsed_data``, so calling
        it again (for example by re-running a notebook cell) does not duplicate
        records.

        Raises:
            ValueError: If ``method`` is unsupported and the file has at least
                one line.
            OSError: If the log file cannot be opened.
        """
        # Collect into a local list so a failure mid-file leaves the previous
        # results untouched.
        entries = []
        with open(self.log_file, 'r') as file:
            for line in file:
                parsed_entry = self._parse_line(line.strip())
                if parsed_entry:
                    entries.append(parsed_entry)
        self.parsed_data = entries

    def _parse_line(self, line):
        """Parse one log line.

        Args:
            line: A single log line, already stripped of surrounding whitespace.

        Returns:
            A dict with the string values ``timestamp``, ``protein_id`` and
            ``exec_time`` when the line matches the layout for ``method``,
            otherwise ``None``.

        Raises:
            ValueError: If ``method`` is not one of the supported names.
        """
        try:
            info_pattern = self._PATTERNS[self.method]
        except (KeyError, TypeError):
            # TypeError covers unhashable ``method`` values, which must still
            # be reported as unsupported.
            raise ValueError(f"Unsupported method: {self.method}") from None

        # ``match`` anchors at the start of the line only; trailing text is allowed.
        match = info_pattern.match(line)
        if match:
            return match.groupdict()
        return None

    def to_dataframe(self):
        """Return the parsed entries as a DataFrame with a float ``exec_time`` column.

        When nothing was parsed, an empty frame with the columns ``timestamp``,
        ``protein_id`` and ``exec_time`` is returned instead of raising
        ``KeyError``.
        """
        if self.parsed_data:
            df = pd.DataFrame(self.parsed_data)
        else:
            # A frame built from an empty list has no columns at all, which
            # would make the ``exec_time`` lookup below fail.
            df = pd.DataFrame(columns=list(self._COLUMNS))
        df["exec_time"] = df["exec_time"].astype(float)  # Convert time to float
        return df


In [12]:
import os 
# Show the kernel's working directory: the relative "../analysis/..." log
# paths used by the other cells are resolved against it.
os.getcwd()

'/Users/aoxu/projects/DrugDiscovery/PoseBench/notebooks'

In [20]:
# Print summary statistics of the per-protein execution time for each docking
# method. Every method has its own timing log under ../analysis/.
for method in ('diffdock', 'gnina', 'vina'):
    log_file = f"../analysis/{method}_timing.log"
    parser = ProteinTimeParser(log_file=log_file, method=method)
    parser.parse()
    df = parser.to_dataframe()
    # A single print call with a newline separator emits the same text as two
    # consecutive prints.
    print(f"Method: {method}", df.describe(), sep="\n")

Method: diffdock
        exec_time
count  259.000000
mean    48.478726
std     16.387655
min     14.340000
25%     37.500000
50%     45.670000
75%     58.590000
max     97.060000
Method: gnina
         exec_time
count  1284.000000
mean      6.203466
std       6.786108
min       0.250000
25%       3.070000
50%       4.510000
75%       6.790000
max      93.740000
Method: vina
        exec_time
count  428.000000
mean     6.128668
std      9.268926
min      0.250000
25%      1.390000
50%      3.055000
75%      6.315000
max     93.740000


In [18]:
# Inspect the individual gnina timing records.
# NOTE(review): the saved output of this cell shows consecutive identical rows
# (e.g. index 0 and 1), so the log itself appears to contain repeated
# entries. Confirm and de-duplicate before trusting the summary statistics.
log_file = "../analysis/gnina_timing.log"
parser = ProteinTimeParser(log_file=log_file, method='gnina')
parser.parse()
df = parser.to_dataframe()
df

Unnamed: 0,timestamp,protein_id,exec_time
0,"2025-02-03 08:28:31,214",7ZZW_KKW,6.26
1,"2025-02-03 08:28:31,214",7ZZW_KKW,6.26
2,"2025-02-03 08:28:52,382",7MAE_XUS,21.17
3,"2025-02-03 08:28:52,382",7MAE_XUS,21.17
4,"2025-02-03 08:29:00,173",7MEU_MGP,7.79
...,...,...,...
1279,"2025-02-03 09:55:51,920",7JHQ_VAJ,1.76
1280,"2025-02-03 09:56:04,717",8FO5_Y4U,12.80
1281,"2025-02-03 09:56:06,828",8CSD_C5P,2.11
1282,"2025-02-03 09:56:20,329",8EX2_Q2Q,13.50


In [21]:
# Inspect the individual vina timing records (one row per matched INFO line).
log_file = "../analysis/vina_timing.log"
parser = ProteinTimeParser(log_file=log_file, method='vina')
parser.parse()
df = parser.to_dataframe()
df

Unnamed: 0,timestamp,protein_id,exec_time
0,"2025-02-03 09:13:00,731",7ZZW_KKW,4.13
1,"2025-02-03 09:13:03,896",7MAE_XUS,3.16
2,"2025-02-03 09:13:15,365",7MEU_MGP,11.47
3,"2025-02-03 09:13:15,764",7A9E_R4W,0.40
4,"2025-02-03 09:13:17,533",7SFO_98L,1.77
...,...,...,...
423,"2025-02-03 09:55:51,920",7JHQ_VAJ,1.76
424,"2025-02-03 09:56:04,717",8FO5_Y4U,12.80
425,"2025-02-03 09:56:06,828",8CSD_C5P,2.11
426,"2025-02-03 09:56:20,329",8EX2_Q2Q,13.50
