In [1]:
import json
from collections import Counter
from pathlib import Path

import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
# Directory that holds the torsion-preference ``*.txt`` inputs. The path is
# relative, so it is resolved against the kernel's working directory.
smarts_dirpath = Path("../data/raw/torsion_smarts_data")

In [3]:
# Materialize the matches into a sorted list. ``Path.glob`` returns a one-shot
# generator, so re-running the loading cell on its own would otherwise iterate
# nothing; sorting also makes the file (and therefore row) order deterministic
# instead of depending on filesystem enumeration order.
datafiles = sorted(smarts_dirpath.glob("*.txt"))

In [4]:
def parse_angle_info(data: dict) -> dict:
    """
    Parse the angle information from the data dictionary.

    Each value is a bracketed, comma-separated list of integers serialized as
    a string (e.g. ``"[90, 270]"``).

    Args:
        data: Mapping of SMARTS pattern to its angle-list string.

    Returns:
        Mapping of SMARTS pattern to a list of ints. An empty list string
        (``"[]"``) yields an empty list.

    Raises:
        ValueError: If any entry between the brackets is not an integer.
    """
    angle_info = {}
    for smarts, angles_str in data.items():
        # Drop the surrounding brackets; strip so stray whitespace around or
        # inside the brackets does not leak into the tokens.
        inner = angles_str.strip()[1:-1].strip()
        # Split on bare commas (``int`` tolerates surrounding spaces) and
        # short-circuit the empty case, where splitting would yield [""].
        angle_info[smarts] = [int(token) for token in inner.split(",")] if inner else []
    return angle_info

In [5]:
# Read each preference file as JSON and store its parsed angles under the
# file stem with the "torsionPreferences[_v2]_" marker removed.
data = {}
for file in datafiles:
    with file.open("r") as handle:
        file_data = json.load(handle)
    filename = file.stem
    # Choose the longer, versioned marker when the stem starts with it.
    marker = (
        "torsionPreferences_v2_"
        if filename.startswith("torsionPreferences_v2_")
        else "torsionPreferences_"
    )
    filename = filename.replace(marker, "")
    data[filename] = parse_angle_info(file_data)

In [6]:
# Maps a data-file key (the file stem after its "torsionPreferences..." marker
# is removed) to the label stored in the table's "type" column.
typedict = {
    "smarts_bounds_macrocycles": "macrocycle",
    "smarts_bounds": "general",
    "fallback_smarts_bounds": "fallback",
    "smarts_bounds_smallrings": "smallring",
}

In [7]:
# Flatten the per-file mappings into one row dict per SMARTS pattern.
pylist = []

for filename, angles_dicts in data.items():
    rows = [
        {
            "smarts": smarts,
            "expected_angles": angles,
            # Label looked up from the file key the pattern came from.
            "type": typedict[filename],
            # Number of expected angles for this pattern.
            "multiplicity": len(angles) if angles else 0,
        }
        for smarts, angles in angles_dicts.items()
    ]
    pylist += rows

In [8]:
# Explicit Arrow schema for the torsion table, given as (name, type) pairs.
schema = pa.schema(
    [
        ("smarts", pa.string()),
        # Variable-length list of unsigned 16-bit values per pattern.
        ("expected_angles", pa.list_(pa.uint16())),
        ("type", pa.string()),
        # Count of entries in expected_angles.
        ("multiplicity", pa.uint8()),
    ]
)

print(schema)

smarts: string
expected_angles: list<item: uint16>
  child 0, item: uint16
type: string
multiplicity: uint8


In [9]:
# Build the Arrow table from the row dicts using the explicitly declared
# schema rather than inferred column types.
table = pa.Table.from_pylist(pylist, schema=schema)

In [10]:
# Persist the table as Parquet next to the input text files.
# NOTE(review): this writes a derived artifact into the "raw" data directory —
# confirm that location is intended.
pq.write_table(table, smarts_dirpath / "torsion_smarts.parquet")

In [11]:
# Convert the Arrow table to a pandas DataFrame for the analysis below.
df = table.to_pandas()

In [12]:
def _sorted_gaps(angles):
    """Return the differences between consecutive sorted angles, or [0] for fewer than two."""
    if len(angles) > 1:
        return np.diff(sorted(angles))
    return [0]


# One sequence of consecutive gaps per pattern.
# NOTE(review): only gaps between neighbouring sorted values are taken; if the
# angles are periodic, the wrap-around gap between the largest and smallest
# value is not included — confirm that is intended.
deltas = df["expected_angles"].apply(_sorted_gaps)

In [13]:
# Inspect the converted frame.
df

Unnamed: 0,smarts,expected_angles,type,multiplicity
0,[C:1][C;r{9-}:2](=O)@;-[NX3H0;r:3][CX4H1:4],"[90, 270]",macrocycle,2
1,[C:1][C;r{9-}:2](=O)@;-[NX3H1;r:3][CX4H1:4],"[90, 270]",macrocycle,2
2,[C:1][C;r{9-}:2](=O)@;-[NX3H0;r:3][CX4H2:4],"[90, 270]",macrocycle,2
3,[O:1]=[C;r{9-}:2]@;-[O;r{9-}:3]~[CH0:4],[180],macrocycle,1
4,[O:1]=[C;r{9-}:2]([N])@;-[O;r{9-}:3]~[C:4],[180],macrocycle,1
...,...,...,...,...
833,"[!#1;r{5-8}:1]@[P;r5:2]@;-[S,P;r5:3]@[!#1;r{5-...","[45, 135, 225, 315]",smallring,4
834,"[!#1;r{5-8}:1]@[P;r{5-6}:2]@;-[S,P;r{5-8}:3]@[...","[0, 60, 120, 180, 240, 299]",smallring,6
835,"[!#1;r{5-8}:1]@[P;r{5-8}:2]@;-[S,P;r{5-8}:3]@[...","[0, 60, 120, 180, 240, 299]",smallring,6
836,[!#1;r{5-8}:1]@[P;r{5-8}:2]@;-[a;r{5-8}:3]@[!#...,"[30, 90, 149, 209, 270, 330]",smallring,6


In [14]:
# Collapse the per-pattern gap sequences into one flat list.
flattened_deltas = []
for sublist in deltas:
    flattened_deltas.extend(sublist)

In [15]:
# Tally how often each gap value occurs across all patterns.
delta_counts = Counter(flattened_deltas)

In [16]:
# Smallest observed gap, skipping zeros (which include the [0] placeholder
# recorded for patterns with fewer than two angles).
min_angle = min(delta for delta in delta_counts if delta != 0)

In [17]:
# Show the smallest non-zero gap found.
min_angle

23