In [26]:
import json
from pathlib import Path

import pyarrow as pa
import pyarrow.parquet as pq

In [11]:
# Directory (relative to the notebook's working directory) holding the torsion SMARTS text files.
smarts_dirpath = Path("..") / "data" / "raw" / "torsion_smarts_data"

In [12]:
# Materialise the matches as a sorted list: Path.glob() returns a one-shot
# generator, so iterating it a second time (e.g. when a later cell is re-run)
# would silently yield nothing. Sorting also makes the processing order
# independent of the order in which the filesystem lists the files.
datafiles = sorted(smarts_dirpath.glob("*.txt"))

In [13]:
def parse_angle_info(data: dict) -> dict:
    """
    Parse the angle information from the data dictionary.

    Each value is expected to be the textual form of a bracketed,
    comma-separated list of integers, e.g. ``"[90, 270]"``. Whitespace around
    the brackets and around individual entries is ignored, and an empty list
    (``"[]"``) yields an empty Python list.

    Args:
        data: Mapping from a key (a SMARTS pattern, judging by the caller's
            naming) to its bracketed angle string.

    Returns:
        A new dict with the same keys, each mapped to a list of ``int``.

    Raises:
        ValueError: If a non-empty entry is not a valid integer literal.
    """
    angle_info = {}
    for smarts, angles_str in data.items():
        # Remove the surrounding "[" and "]" after trimming outer whitespace.
        inner = angles_str.strip()[1:-1]
        # Split on the bare comma (int() tolerates surrounding spaces) and skip
        # blank pieces so that "[]" produces [] instead of failing on int("").
        angle_info[smarts] = [int(token) for token in inner.split(",") if token.strip()]
    return angle_info

In [None]:
# Build {short file key: {smarts: [angles]}} from every file in `datafiles`.
# The short key is the file stem without its "torsionPreferences_v2_" or
# "torsionPreferences_" prefix.
data = {}
for file in datafiles:
    # Explicit encoding so the result does not depend on the platform's locale default.
    with file.open("r", encoding="utf-8") as f:
        file_data = json.load(f)
    filename = file.stem
    # Strip only a leading prefix; the longer prefix is tried first because the
    # shorter one is not a prefix of it but both start with "torsionPreferences_".
    for prefix in ("torsionPreferences_v2_", "torsionPreferences_"):
        if filename.startswith(prefix):
            filename = filename[len(prefix):]
            break
    data[filename] = parse_angle_info(file_data)

In [15]:
# Inspect the parsed result: one entry per loaded file, each mapping a SMARTS string to its list of angles.
data

{'smarts_bounds_macrocycles': {'[C:1][C;r{9-}:2](=O)@;-[NX3H0;r:3][CX4H1:4]': [90,
   270],
  '[C:1][C;r{9-}:2](=O)@;-[NX3H1;r:3][CX4H1:4]': [90, 270],
  '[C:1][C;r{9-}:2](=O)@;-[NX3H0;r:3][CX4H2:4]': [90, 270],
  '[O:1]=[C;r{9-}:2]@;-[O;r{9-}:3]~[CH0:4]': [180],
  '[O:1]=[C;r{9-}:2]([N])@;-[O;r{9-}:3]~[C:4]': [180],
  '[O:1]=[C;r{9-}:2]@;-[O;r{9-}:3]~[C:4]': [180],
  '[O:1]=[C;r{9-}:2]@;-[O;r{9-}:3]~[!#1:4]': [180],
  '[$(C=O):1][O;r{9-}:2]@;-[c;r{9-}:3]~[*:4]': [0, 180],
  '[$(C=O):1][O;r{9-}:2]@;-[CX3;r{9-}:3]~[*:4]': [0, 120, 240],
  '[$(C=O):1][O;r{9-}:2]@;-[CH1;r{9-}:3][H:4]': [180],
  '[$(C=O):1][O;r{9-}:2]@;-[CH2;r{9-}:3]~[C:4]': [0, 120, 240],
  '[H:1][CX4H1;r{9-}:2]@;-[O;r{9-}:3][CX4:4]': [180],
  '[C:1][CH2;r{9-}:2]@;-[O;r{9-}:3][CX4:4]': [0, 120, 240],
  '[*:1][CX4;r{9-}:2]@;-[O;r{9-}:3][$([CX3](=[!O])):4]': [0],
  '[O:1][CX4;r{9-}:2]@;-[O;r{9-}:3][CX4:4]': [0, 120, 240],
  '[*:1][CX4;r{9-}:2]@;-[O;r{9-}:3][CX4:4]': [0, 120, 240],
  '[cH1:1][c;r{9-}:2]([cH1])@;-[O;r{9-}:3][

In [16]:
# Show the short file keys that were loaded; these are the keys looked up in `typedict` below.
data.keys()

dict_keys(['smarts_bounds_macrocycles', 'smarts_bounds', 'fallback_smarts_bounds', 'smarts_bounds_smallrings'])

In [17]:
# Label stored in the "type" column for each short file key.
typedict = dict(
    smarts_bounds_macrocycles="macrocycle",
    smarts_bounds="general",
    fallback_smarts_bounds="fallback",
    smarts_bounds_smallrings="smallring",
)

In [22]:
# Flatten the nested {file key: {smarts: angles}} mapping into one row dict per SMARTS pattern.
# NOTE(review): "multipicity" looks like a misspelling of "multiplicity"; it is kept
# unchanged here because the schema declares the column under the same name.
pylist = [
    {
        "smarts": pattern,
        "angles": angle_list,
        "type": typedict[source_key],
        "multipicity": len(angle_list) if angle_list else 0,
    }
    for source_key, patterns in data.items()
    for pattern, angle_list in patterns.items()
]

In [23]:
# Explicit Arrow schema for the output table, given as (column name, type) pairs.
column_types = [
    ("smarts", pa.string()),
    ("angles", pa.list_(pa.uint16())),
    ("type", pa.string()),
    ("multipicity", pa.uint8()),
]
schema = pa.schema(column_types)

print(schema)

smarts: string
angles: list<item: uint16>
  child 0, item: uint16
type: string
multipicity: uint8


In [24]:
# Convert the row dictionaries into an Arrow table using the explicit schema.
table = pa.Table.from_pylist(
    pylist,
    schema=schema,
)

In [25]:
# Write the table as a Parquet file inside the same directory the text files were read from.
parquet_path = smarts_dirpath / "torsion_smarts.parquet"
pq.write_table(table, parquet_path)