In [1]:
# Sample protocol log used as input for the parsing cells below: one dashed
# section header line followed by one Heater-Shaker command per line.
# Whitespace inside the literal is significant to the parser (line breaks
# separate steps), so the content is kept exactly as written.
text = """
 ------------- Operating the heater shaker------------

Setting Target Temperature of Heater-Shaker to 37 °C
Waiting for Heater-Shaker to reach target temperature
Setting Heater-Shaker to Shake at 200 RPM and waiting until reached
Delaying for 60 minutes and 0.0 seconds
Deactivating Heater


"""

In [2]:
import re
from collections import defaultdict

# Regular-expression fragments intended to mark the start of a hardware-module
# section. For now only the Heater-Shaker temperature command is listed.
MODULE_START_PATTERNS = [r"Setting Target Temperature of Heater-Shaker"]

# Input: Multiline protocol text
# with open("/mnt/data/opentrons_protocol.txt", "r", encoding="utf-8") as file:
#     lines = file.readlines()

# Sub-steps sit on their own line, indented by eight spaces below their parent.
CHILD_INDENT = "\n        "
# Temporary marker that glues each sub-step onto its parent line while the text
# is split into lines. NUL is used instead of ";" so that semicolons occurring
# in the protocol text are not turned into line breaks when the marker is
# expanded again, and because str.strip() does not treat NUL as whitespace.
CHILD_PLACEHOLDER = "\x00"

text_ = text.replace(CHILD_INDENT, CHILD_PLACEHOLDER)
lines = text_.strip().split('\n')

# Keep only top-level steps: drop blank lines, indented lines, "~~" lines,
# dashed section headers and lines ending with ":". Sub-steps are restored
# underneath their parent before the final strip.
steps = []
for line in lines:
    if not line.strip():
        continue
    if line.startswith("        ") or line.startswith("~~"):
        continue
    if "--" in line or line.endswith(":"):
        continue
    step = line.replace(CHILD_PLACEHOLDER, CHILD_INDENT).strip()
    # A line made up solely of placeholders (whitespace-only indented lines in
    # the input) collapses to an empty string; it is not a step.
    if step:
        steps.append(step)

# Space-padded connector words at which each step is cut into fragments.
PREPOSITIONS = [' from ', ' to ', ' on ', ' of ', ' into ']


def _split_on_prepositions(step_text):
    """Cut `step_text` at every entry of PREPOSITIONS, one connector at a time.

    The connectors are applied sequentially in list order (each pass re-splits
    the fragments produced by the previous pass). Fragments are stripped and
    empty ones are discarded.
    """
    fragments = [step_text]
    for connector in PREPOSITIONS:
        fragments = [
            piece
            for fragment in fragments
            for piece in fragment.split(connector)
        ]
    return [piece.strip() for piece in fragments if piece.strip()]


# One record per step: the untouched step text plus its fragments.
parsed_steps = [
    {"raw": step_text, "tokens": _split_on_prepositions(step_text)}
    for step_text in steps
]

# Accumulators for grouping steps into phases. A new phase is opened by the
# grouping loop on "Aspirating", "Picking up tip" or "Moving to" steps.
grouped_phases, current_phase = [], []  # finished phases / phase being built
aspirating_seen = False  # set once the first phase boundary has been reached

In [None]:

# Grouping state is (re)initialised in this cell so that running the cell again
# starts from scratch instead of appending duplicate phases to earlier results.
grouped_phases = []
current_phase = []
aspirating_seen = False  # True once the first phase boundary has been reached


def _starts_new_phase(raw, previous):
    """Return True when the step text `raw` opens a new phase.

    `previous` is the text of the step immediately before it ("" for the first
    step). A step opens a phase when any of the following holds:

    * it starts with "Aspirating" and the previous step mentions none of
      "Picking up tip", "Moving to" or "Transferring";
    * it mentions "Picking up tip";
    * it mentions "Moving to" and the previous step does not mention
      "Picking up tip".
    """
    lead_in_markers = ("Picking up tip", "Moving to", "Transferring")
    fresh_aspirate = raw.startswith("Aspirating") and not any(
        marker in previous for marker in lead_in_markers
    )
    picks_up_tip = "Picking up tip" in raw
    moves_without_pickup = "Moving to" in raw and "Picking up tip" not in previous
    return fresh_aspirate or picks_up_tip or moves_without_pickup


last = ""
for step in parsed_steps:
    raw = step["raw"]
    if _starts_new_phase(raw, last):
        # The first boundary only marks the start: steps collected before it
        # stay in the first phase rather than forming a phase of their own.
        if aspirating_seen:
            grouped_phases.append(current_phase)
            current_phase = []
        aspirating_seen = True
    last = raw
    current_phase.append(raw)

# Append the last batch
if current_phase:
    grouped_phases.append(current_phase)

# Output the grouped phases
import pandas as pd

# Label the phases "Phase 1", "Phase 2", ... in the order they were collected.
grouped_dict = {
    "Phase {}".format(number): members
    for number, members in enumerate(grouped_phases, start=1)
}
# One column per phase; wrapping each list in a Series lets phases of different
# lengths share a frame (shorter columns are padded with NaN).
phase_columns = {label: pd.Series(members) for label, members in grouped_dict.items()}
df = pd.DataFrame(phase_columns)

# Collect the distinct actions, i.e. the first whitespace-separated word of
# each step. Steps whose text is empty or whitespace-only have no first word
# and are skipped instead of raising IndexError.
actions = {
    words[0]
    for words in (step["raw"].split() for step in parsed_steps)
    if words
}

sorted_actions = sorted(actions)
sorted_actions