In [88]:
import pandas as pd
import ilthermopy as ilt
from rdkit import Chem

In [37]:
def collect_data(start_year: int, end_year: int) -> tuple:
    collector_df = pd.DataFrame(columns=["V1", "dV1"])
    smiles_list = []
    for year in range(start_year, end_year+1, 1):
        df = ilt.Search(n_compounds = 1, year = year, prop="Normal melting temperature")
        smiles_list.extend(df["cmp1_smiles"])
        data = [ilt.GetEntry(idx) for idx in df.id.iloc[:]]
        print(f"Collected data from {year}.")  # noqa: T201
        for entry in data:
            collector_df = pd.concat([collector_df, entry.data])
    return collector_df, smiles_list


In [None]:
def canonize_smiles(smiles:str) -> str:
    canon_smiles = Chem.CanonSmiles(smiles)
    return canon_smiles

In [39]:
collector_df, smiles_list = collect_data(2017, 2023)

Collected data from 2017.


  collector_df = pd.concat([collector_df, entry.data])


Collected data from 2018.
Collected data from 2019.
Collected data from 2020.
Collected data from 2021.
Collected data from 2022.
Collected data from 2023.


In [56]:
collector_df["smiles"] = smiles_list

In [79]:
collector_df = collector_df.dropna()

In [83]:
collector_df = collector_df.reset_index()

In [96]:
collector_df["smiles"] = collector_df["smiles"].apply(canonize_smiles)

In [100]:
collector_df = collector_df.drop(['index'], axis=1)

In [102]:
collector_df["V1"] = collector_df["V1"] - 273.15

In [105]:
collector_df.head()

Unnamed: 0,V1,dV1,smiles
0,11.15,0.4,CCCCCCCC[P+](CCCCCCCC)(CCCCCCCC)CCCCCCCC.O=S(=...
1,-10.95,0.4,CCCC[N+]1(C)CCCC1.O=S(=O)([N-]S(=O)(=O)C(F)(F)...
2,4.05,0.4,CCCCCC[N+]1(C)CCCC1.O=S(=O)([N-]S(=O)(=O)C(F)(...
3,4.2,0.41,CCCCCC[N+]1(C)CCCC1.O=S(=O)([N-]S(=O)(=O)C(F)(...
4,-13.15,0.4,CCCC[N+]1(C)CCCC1.O=S(=O)([N-]S(=O)(=O)C(F)(F)...


In [108]:
collector_df.rename(columns={"V1": "MP", "dV1": "error"}, inplace=True)

In [None]:
collector_df=collector_df.drop["error", axis=1]

In [111]:
collector_df.to_csv("ilthermo_mp_database.csv")