In [1]:
!pip install --user pubchempy

Collecting pubchempy
  Using cached pubchempy-1.0.5-py3-none-any.whl.metadata (4.3 kB)
Using cached pubchempy-1.0.5-py3-none-any.whl (21 kB)
Installing collected packages: pubchempy
Successfully installed pubchempy-1.0.5


In [2]:
!pip install tqdm



In [3]:
import sys
# Run pip through the interpreter that backs this kernel (sys.executable), so the
# package is installed for the Python that actually executes the notebook.
!{sys.executable} -m pip install --user pubchempy

Collecting pubchempy
  Using cached pubchempy-1.0.5-py3-none-any.whl.metadata (4.3 kB)
Using cached pubchempy-1.0.5-py3-none-any.whl (21 kB)
Installing collected packages: pubchempy
Successfully installed pubchempy-1.0.5


In [4]:
import site

# Show the per-user site-packages directory reported for this interpreter.
user_site_dir = site.getusersitepackages()
print(user_site_dir)

/u/ahernandez9/.local/lib/python3.11/site-packages


In [5]:
import sys, site

# Add the per-user site-packages directory to the import path of this kernel.
# Guarded so that re-running the cell does not pile up duplicate sys.path entries.
user_site = site.getusersitepackages()
if user_site not in sys.path:
    sys.path.append(user_site)

In [6]:
!pip install --user nistchempy

Collecting nistchempy
  Using cached nistchempy-1.0.5-py3-none-any.whl.metadata (5.2 kB)
Collecting bs4 (from nistchempy)
  Using cached bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Using cached nistchempy-1.0.5-py3-none-any.whl (11.4 MB)
Using cached bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4, nistchempy
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [nistchempy]2[0m [nistchempy]
[1A[2KSuccessfully installed bs4-0.0.2 nistchempy-1.0.5


In [7]:
import pubchempy as pcp

In [8]:
import random
import pandas as pd
import pubchempy as pcp
from tqdm import tqdm

In [9]:
!pip install --user nistchempy

Collecting nistchempy
  Using cached nistchempy-1.0.5-py3-none-any.whl.metadata (5.2 kB)
Collecting bs4 (from nistchempy)
  Using cached bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Using cached nistchempy-1.0.5-py3-none-any.whl (11.4 MB)
Using cached bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4, nistchempy
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [nistchempy]2[0m [nistchempy]
[1A[2KSuccessfully installed bs4-0.0.2 nistchempy-1.0.5


In [13]:
import re
import time
import random
import numpy as np
import pandas as pd
from scipy.interpolate import interp1d
import nistchempy as nist

# --- Scrape configuration -------------------------------------------------
N_MOLECULES = 500      # upper bound on how many compounds are sampled
N_POINTS = 800         # number of points on the common resampling grid
WAVENUMBER_MIN = 350   # lower edge of the resampling window (presumably cm^-1 -- confirm)
WAVENUMBER_MAX = 4050  # upper edge of the resampling window (presumably cm^-1 -- confirm)
DELAY = 0.5            # seconds to sleep after each NIST request
# Derived from N_MOLECULES so the file name cannot drift from the sample size.
OUTPUT_FILE = f"{N_MOLECULES}mol.csv"

def parse_jdx(jdx_text):
    """Parse the text of a JCAMP-DX record into ascending x/y NumPy arrays.

    Two data layouts are recognised:

    * ``##XYDATA=(X++(Y..Y))`` -- every data line holds one abscissa followed
      by several ordinates.  When both ``##FIRSTX`` and ``##LASTX`` are
      present the x axis is rebuilt as an evenly spaced grid between them;
      otherwise each ordinate keeps the abscissa that starts its line.
    * ``##PEAK...=`` / ``##XYPOINTS=`` tables -- ``x, y`` groups, optionally
      several per line separated by ``;``.

    Parameters
    ----------
    jdx_text : str
        Full text of the JCAMP-DX record.

    Returns
    -------
    tuple
        ``(x, y)`` as 1-D arrays ordered so that x ascends, with NaN
        ordinates replaced by 0.0, or ``(None, None)`` when fewer than two
        points could be read (including empty input).
    """
    if not jdx_text:
        return None, None
    lines = jdx_text.splitlines()

    def get_header(key):
        """Return the value of the ``##KEY=`` header (case-insensitive) or None."""
        prefix = f"##{key}="
        for line in lines:
            if line.upper().startswith(prefix):
                # JCAMP-DX allows a trailing ``$$`` inline comment after the value.
                return line.split("=", 1)[1].split("$$", 1)[0].strip()
        return None

    def header_float(key, default):
        """Return a numeric header, or ``default`` if it is missing or malformed."""
        raw = get_header(key)
        try:
            return float(raw) if raw else default
        except ValueError:
            return default

    xfactor = header_float("XFACTOR", 1.0)
    yfactor = header_float("YFACTOR", 1.0)
    firstx = header_float("FIRSTX", None)
    lastx = header_float("LASTX", None)
    xydata_type = get_header("XYDATA") or ""

    x_vals, y_vals = [], []
    if "X++" in xydata_type.upper():
        in_data = False
        for line in lines:
            if "##XYDATA=" in line.upper():
                in_data = True
                continue
            if not in_data:
                continue
            if line.startswith("##"):
                break  # next labelled record ends the data table
            tokens = re.split(r"[\s,]+", line.strip())
            if len(tokens) < 2:
                continue
            try:
                x0 = float(tokens[0]) * xfactor
            except ValueError:
                continue
            for tok in tokens[1:]:
                try:
                    y_val = float(tok) * yfactor
                except ValueError:
                    continue
                y_vals.append(y_val)
                x_vals.append(x0)
        if y_vals and firstx is not None and lastx is not None and firstx != lastx:
            # FIRSTX/LASTX are actual abscissa values (tabulated X times XFACTOR),
            # so they must not be scaled by XFACTOR a second time.
            x_vals = np.linspace(firstx, lastx, len(y_vals))
        else:
            # No usable axis limits: fall back to the per-line abscissa.
            x_vals = np.array(x_vals)
        y_vals = np.array(y_vals)
    else:
        in_data = False
        for line in lines:
            upper = line.upper()
            if upper.startswith("##PEAK") or upper.startswith("##XYPOINTS"):
                in_data = True
                continue
            if not in_data:
                continue
            if line.startswith("##"):
                break
            # A line may carry several ";"-separated groups; x and y are the
            # first two numbers of each group.
            for group in line.split(";"):
                parts = [p for p in re.split(r"[\s,]+", group.strip()) if p]
                if len(parts) < 2:
                    continue
                try:
                    # Convert both before appending so x and y stay the same length.
                    x_val = float(parts[0]) * xfactor
                    y_val = float(parts[1]) * yfactor
                except ValueError:
                    continue
                x_vals.append(x_val)
                y_vals.append(y_val)
        x_vals = np.array(x_vals)
        y_vals = np.array(y_vals)

    if len(x_vals) < 2:
        return None, None
    if x_vals[0] > x_vals[-1]:
        # Stored high-to-low: flip so x ascends.
        x_vals, y_vals = x_vals[::-1], y_vals[::-1]
    y_vals = np.nan_to_num(y_vals, nan=0.0)
    return x_vals, y_vals

def get_ir_spectrum(nist_id):
    """Fetch a compound from NIST and return its first usable IR spectrum.

    The compound's IR spectra are tried in order; the first one whose JCAMP-DX
    text parses into at least five points is returned.

    Parameters
    ----------
    nist_id
        Identifier passed to ``nist.get_compound``.

    Returns
    -------
    tuple
        ``(x, y, smiles)`` on success, ``(None, None, None)`` when the lookup
        fails or no spectrum can be parsed.  ``smiles`` is whatever the
        compound object exposes as ``.smiles`` (``None`` if it has no such
        attribute).
    """
    failure = (None, None, None)
    try:
        compound = nist.get_compound(nist_id)
        compound.get_ir_spectra()
    except Exception:
        # Best-effort scrape: a failed lookup skips this compound only.
        return failure

    spectra = getattr(compound, "ir_specs", None)
    if not spectra:
        return failure

    # NOTE(review): whether the compound object exposes `.smiles` is not visible
    # here; the getattr default covers its absence.
    smiles = getattr(compound, "smiles", None)

    for spec in spectra:
        jdx_text = getattr(spec, "jdx_text", None)
        if not jdx_text:
            continue
        try:
            x, y = parse_jdx(jdx_text)
        except Exception:
            # A malformed record must not abort the whole run; try the next one.
            continue
        if x is None or len(x) < 5:
            continue
        return x, y, smiles
    return failure

def interpolate(x, y, x_min=None, x_max=None, n_points=None):
    """Resample a spectrum onto an evenly spaced grid by linear interpolation.

    Parameters
    ----------
    x, y : array-like
        Sample positions and values, of equal length.  ``x`` need not be
        sorted; for repeated ``x`` values the first occurrence is kept.
    x_min, x_max : float, optional
        Window limits.  Default to ``WAVENUMBER_MIN`` / ``WAVENUMBER_MAX``.
    n_points : int, optional
        Number of grid points.  Defaults to ``N_POINTS``.

    Returns
    -------
    tuple
        ``(grid, values)`` or ``(None, None)`` when fewer than two distinct
        samples fall inside the window.  Grid points outside the sampled range
        take the value of the nearest end sample.
    """
    x_min = WAVENUMBER_MIN if x_min is None else x_min
    x_max = WAVENUMBER_MAX if x_max is None else x_max
    n_points = N_POINTS if n_points is None else n_points

    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    # Non-finite positions can never lie inside the window; drop them so the
    # interpolation axis is strictly increasing.
    finite = np.isfinite(x)
    x, y = x[finite], y[finite]

    x, first_idx = np.unique(x, return_index=True)  # sorts and de-duplicates
    y = y[first_idx]

    in_window = (x >= x_min) & (x <= x_max)
    if np.count_nonzero(in_window) < 2:
        return None, None

    grid = np.linspace(x_min, x_max, n_points)
    # Interpolate against the full data set rather than the clipped one, so
    # grid points next to the window edges use neighbouring samples just
    # outside the window instead of being clamped.
    return grid, np.interp(grid, x, y)

def main():
    """Sample NIST compounds that have an IR spectrum and write them to a CSV.

    Steps:
      1. Load the compound table with ``nist.get_all_data()`` and keep rows
         whose "IR Spectrum" column is not null.
      2. Draw up to ``N_MOLECULES`` rows (fixed ``random_state`` so the sample
         is reproducible).
      3. For each row, fetch and parse a spectrum, resample it with
         ``interpolate`` and collect one output record; ``DELAY`` seconds are
         slept after every request.
      4. Write the records to ``OUTPUT_FILE`` with the columns
         ``compound_name``, ``casrn``, ``smiles``, ``point_1`` ... ``point_N``.

    Returns None.  Failures are reported with ``print`` rather than raised, and
    nothing is written when no spectrum could be collected.
    """
    try:
        from tqdm import tqdm
    except ImportError:
        # Progress display is optional; fall back to plain iteration.
        def tqdm(iterable, **_kwargs):
            return iterable

    try:
        df_all = nist.get_all_data()
    except Exception as exc:
        print(f"Could not load the NIST compound table: {exc!r}")
        return

    df_ir = df_all.loc[~df_all["IR Spectrum"].isna()].reset_index(drop=True)
    if df_ir.empty:
        print("No compounds with an IR spectrum were found; nothing to do.")
        return
    df_sample = df_ir.sample(n=min(N_MOLECULES, len(df_ir)), random_state=42)

    rows = []
    # Iterate over plain dicts: itertuples() renames columns that are not valid
    # Python identifiers, so a column such as "CAS Registry Number" could never
    # be read back by name from its rows.
    for record in tqdm(df_sample.to_dict("records"), desc="IR spectra"):
        nist_id, name = record["ID"], record["name"]

        # NOTE(review): the CAS column name is not visible in this file;
        # "cas_rn" is tried as a fallback -- confirm against df_all.columns.
        casrn = ""
        for key in ("CAS Registry Number", "cas_rn"):
            value = record.get(key)
            if value is not None and not pd.isna(value):
                casrn = value
                break

        x_raw, y_raw, smiles = get_ir_spectrum(nist_id)
        time.sleep(DELAY)  # pause after every request, successful or not
        if x_raw is None:
            continue
        xi, yi = interpolate(x_raw, y_raw)
        if xi is None:
            continue

        row_data = {"compound_name": name, "casrn": casrn, "smiles": smiles or ""}
        row_data.update(
            (f"point_{j}", round(float(value), 6)) for j, value in enumerate(yi, 1)
        )
        rows.append(row_data)

    if not rows:
        # Do not replace an existing output file with an empty one.
        print("No spectra could be collected; output file was not written.")
        return

    df_out = pd.DataFrame(rows)
    df_out.to_csv(OUTPUT_FILE, index=False)
    print(f"Saved {len(df_out)} of {len(df_sample)} sampled spectra to {OUTPUT_FILE}")

# Start the scrape only when this code runs as the top-level program
# (``__name__`` equals "__main__"), not when the definitions are imported.
if __name__ == "__main__":
    main()

In [None]:
import rdkit 
import time 
import pandas as pd
import pubchempy as pcp

In [None]:
# input_csv_file = "200mol.csv"
# output_csv_file = "200mol_names.csv"

# df = pd.read_csv(input_csv_file)

# smiles_string_list = []

# for names in df["compound_name"]:
#     try:
#         compound_naming = pcp.get_compounds(names, "name")
#         if compound_naming:
#             smiles = compound_naming[0].isomeric_smiles
#         else:
#             smiles = ""
#     except Exception:
#         smiles = ""

#     smiles_string_list.append(smiles)
#     time.sleep(0.1)

# df["smiles"] = smiles_string_list 
# df.to_csv(output_csv_file, index=False)
# print("CSV naming file completed - proceed to the comp data")
    

In [52]:
# import pandas as pd
# import pubchempy as pcp
# import time
# import re

# INPUT_FILE = "200mol.csv"
# OUTPUT_FILE = "200mol_names.csv"

# df = pd.read_csv(INPUT_FILE)

# def clean_name(name):
#     name = str(name)

#     # Fix "Butane, 1-ol" → "1-ol Butane"
#     if "," in name:
#         parts = [p.strip() for p in name.split(",")]
#         if len(parts) == 2:
#             name = parts[1] + " " + parts[0]

#     # Collapse any run of whitespace into a single space
#     name = re.sub(r"\s+", " ", name)

#     return name.strip()


# smiles_list = []

# for name in df["compound_name"]:
#     cleaned = clean_name(name)

#     try:
#         compounds = pcp.get_compounds(cleaned, "name")
#         if compounds:
#             smiles = compounds[0].smiles
#         else:
#             smiles = ""
#     except Exception:
#         smiles = ""

#     smiles_list.append(smiles)
#     time.sleep(0.2)

# df["smiles"] = smiles_list
# df.to_csv(OUTPUT_FILE, index=False)

# print("CSV file created - proceed to the comp data")

CSV file created - proceed to the comp data


In [None]:
# import pandas as pd
# import nistchempy as nist
# import time

# INPUT_FILE = "200mol.csv"
# OUTPUT_FILE = "names_with_smiles.csv"

# df = pd.read_csv(INPUT_FILE)

# smiles_list = []

# for name in df["compound_name"]:
#     print(f"Fetching SMILES for: {name}")
#     smiles = ""

#     try:
#         # Search compounds by name
#         results = nist.search(name=name)
#         if results:
#             # Loop over results until we find a SMILES
#             for r in results:
#                 try:
#                     # Fetch full compound data
#                     compound = nist.get_compound(r.ID)
#                     if hasattr(compound, "smiles") and compound.smiles:
#                         smiles = compound.smiles
#                         break  # Stop at first valid SMILES
#                 except Exception:
#                     continue
#     except Exception:
#         pass

#     smiles_list.append(smiles)
#     time.sleep(0.2)  # polite delay

# df["smiles"] = smiles_list
# df.to_csv(OUTPUT_FILE, index=False)
# print(f"Done! Saved {len(df)} rows → {OUTPUT_FILE}")

In [None]:
# n_points = 250
# point_cols = [c for c in df.columns if re.match(r'pt\d+', c, re.IGNORECASE)]
# if len(point_cols) != n_points:
#     raise ValueError(f"Expected {n_points} point columns, found {len(point_cols)}. "
#                      f"Columns detected: {point_cols}")