In [2]:
from time import sleep
from urllib.parse import quote

import pandas as pd
import requests

# CRC data

In [4]:
# Load the semicolon-separated CRC alkane table.
crc_csv_path = "../data/alkanes_CRC.csv"
df = pd.read_csv(crc_csv_path, sep=";")

In [5]:
# Preview the first five rows of the loaded table.
df.head(5)

Unnamed: 0,name,boiling_point,SMILES,IUPAC_name,CID,molecular_formula
0,Butane,-0.5,CCCC,butane,7843,C4H10
1,Dodecane,216.3,CCCCCCCCCCCC,dodecane,8182,C12H26
2,Propane,-42.1,CCC,propane,6334,C3H8
3,Ethane,-88.6,CC,ethane,6324,C2H6
4,5-Butyldocosane,244.0,CCCCCCCCCCCCCCCCCC(CCCC)CCCC,5-butyldocosane,143267,C26H54


In [6]:
# Query PubChem's PUG REST service once per compound name in `df` and collect
# the returned SMILES string, IUPAC name, molecular formula and CID.
# The request URL is assembled as <prolog>/<input>/<operation>/<output>.
pugrest_prolog = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
pugrest_operation = "property/CanonicalSMILES,MolecularFormula,IUPACName"
pugrest_output = "json"

smiles = []
IUPACs = []
formulas = []
CIDs = []

for compound_name in df["name"]:
    # Percent-encode the name so characters such as "/", "?", "#" or "%" cannot
    # alter the URL structure. Commas are left as-is, so locant lists such as
    # "2,2-" are sent unchanged.
    pugrest_input = "compound/name/" + quote(str(compound_name), safe=",")
    pugrest_url = "/".join((pugrest_prolog, pugrest_input, pugrest_operation, pugrest_output))

    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() reports HTTP failures (unknown name, throttling)
    # directly instead of as a KeyError on the parsed payload further down.
    response = requests.get(pugrest_url, timeout=30)
    response.raise_for_status()
    res = response.json()

    # Only the first returned property record is used for each name.
    properties = res["PropertyTable"]["Properties"][0]
    smiles.append(properties["CanonicalSMILES"])
    IUPACs.append(properties["IUPACName"])
    formulas.append(properties["MolecularFormula"])
    CIDs.append(properties["CID"])

    # Pause between requests to avoid hammering the service.
    sleep(1)

print(smiles)
print(IUPACs)
print(formulas)
print(CIDs)
    

['CCCC', 'CCCCCCCCCCCC', 'CCC', 'CC', 'CCCCCCCCCCCCCCCCCC(CCCC)CCCC', 'CCCCCCCCCCCC(CCCC)CCCCCCCCCC', 'CCCCCCCCCCC(CCCCCCCCCC)CCCCCCCCCC', 'CCC(CC)C(CC)CC', 'CCC(CC)(CC)CC', 'CC(C)C(C)C', 'CCC(C)(C)C', 'CCCCCC(C)(C)C', 'CCCCC(C)C(C)C', 'CCCC(C)CC(C)C', 'CCC(C)CCC(C)C', 'CC(C)CCCC(C)C', 'CCCCC(C)(C)CC', 'CCCC(C)C(C)CC', 'CCC(C)CC(C)CC', 'CCCC(C)(C)CCC', 'CCCCC(C)(C)C', 'CCCC(C)C(C)C', 'CCC(C)CC(C)C', 'CC(C)CCC(C)C', 'CCCC(C)(C)CC', 'CCC(C)C(C)CC', 'CC(C)C(C(C)C)C(C)C', 'CCCCCCC(C)(C)C', 'CCCCCC(C)C(C)C', 'CCCCC(C)CC(C)C', 'CCCC(C)CCC(C)C', 'CCC(C)CCCC(C)C', 'CC(C)CCCCC(C)C', 'CCCCC(C)C(C)CC', 'CCC(C)CCC(C)CC', 'CCCC(C)(C)C', 'CCC(C)C(C)C', 'CC(C)CC(C)C', 'CCC(C)(C)CC', 'CCCCCCCCCCCCCCCCCCCCCC', 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC', 'CCCCCCCCCCCCCCCCCCCC', 'CCC(CC)C(C)(C)C', 'CCC(C)(CC)C(C)C', 'CCC(C(C)C)C(C)C', 'CCCCC(CC)CC', 'CCCC(CC)CCC', 'CCCC(CC)CC', 'CCCC(CC)C(C)C', 'CCCC(C)(CC)CC', 'CCC(C)C(CC)CC', 'CCC(CC)CC(C)C', 'CCC(CC)C(C)C', 'CCC(C)(CC)CC', 'CCCCCC(CC)CC', 'CCCCC(CC)CCC', 'CCC

In [7]:
# Store the values gathered in the smiles / IUPACs / CIDs / formulas lists as
# columns of the table, then preview the result.
df = df.assign(
    SMILES=smiles,
    IUPAC_name=IUPACs,
    CID=CIDs,
    molecular_formula=formulas,
)

df.head()

Unnamed: 0,name,boiling_point,SMILES,IUPAC_name,CID,molecular_formula
0,Butane,-0.5,CCCC,butane,7843,C4H10
1,Dodecane,216.3,CCCCCCCCCCCC,dodecane,8182,C12H26
2,Propane,-42.1,CCC,propane,6334,C3H8
3,Ethane,-88.6,CC,ethane,6324,C2H6
4,5-Butyldocosane,244.0,CCCCCCCCCCCCCCCCCC(CCCC)CCCC,5-butyldocosane,143267,C26H54


In [8]:
# Write the table back to the CRC CSV file, keeping only the listed columns
# (in this order) and omitting the row index.
crc_output_columns = [
    "name",
    "boiling_point",
    "SMILES",
    "IUPAC_name",
    "CID",
    "molecular_formula",
]
df.to_csv(
    "../data/alkanes_CRC.csv",
    sep=";",
    columns=crc_output_columns,
    index=False,
)

# Wiener's data

In [10]:
# Read Wiener's Table II as whitespace-separated fields. Numbers written with a
# bare leading decimal point (" .5", " -.4") get an explicit zero inserted
# before the line is split.
with open("../data/wiener_table_ii.txt", "r") as file:
    data = [
        line.strip().replace(' .', ' 0.').replace(' -.', ' -0.').split()
        for line in file
    ]

# Column 6 is discarded; columns 1-5 are converted from text to floats.
df_w_ii = pd.DataFrame(data).drop(columns=[6])
df_w_ii[[1, 2, 3, 4, 5]] = df_w_ii[[1, 2, 3, 4, 5]].astype(float)

# Swap the positional column labels 0-5 for descriptive names.
df_w_ii = df_w_ii.rename(
    columns=dict(enumerate(
        ['name', 'delta_bp', 'delta_w', 'delta_p', 'delta_bp_calc', 'dev']
    ))
)

print(df_w_ii.head())


                  name  delta_bp  delta_w  delta_p  delta_bp_calc  dev
0             n-Butane       0.0      0.0      0.0            0.0  0.0
1      2-Methylpropane      11.2      1.0      1.0           11.6 -0.4
2            n-Pentane       0.0      0.0      0.0            0.0  0.0
3       2-Methylbutane       8.2      2.0      0.0            7.9  0.3
4  2,2-Dimethylpropane      26.6      4.0      2.0           26.7 -0.1


In [12]:
# Read Wiener's Table III as whitespace-separated fields. Numbers written with
# a bare leading decimal point (" .5", " -.4") get an explicit zero inserted
# before the line is split.
with open("../data/wiener_table_iii.txt", "r") as file:
    data = [
        line.strip().replace(' .', ' 0.').replace(' -.', ' -0.').split()
        for line in file
    ]

# Columns 6 and 7 are discarded; columns 1-5 are converted from text to floats.
df_w_iii = pd.DataFrame(data).drop(columns=[6, 7])
df_w_iii[[1, 2, 3, 4, 5]] = df_w_iii[[1, 2, 3, 4, 5]].astype(float)

# Swap the positional column labels 0-5 for descriptive names.
df_w_iii = df_w_iii.rename(
    columns=dict(enumerate(
        ['name', 'bp', 'delta_w', 'delta_p', 'bp_calc', 'dev']
    ))
)

print(df_w_iii.head())


             name     bp  delta_w  delta_p  bp_calc  dev
0        n-Nonane  150.8      0.0      0.0    150.8  0.0
1  2-Methyloctane  143.3      6.0      0.0    143.5  0.2
2  3-Methyloctane  144.2     10.0     -1.0    144.2  0.0
3  4-MethyIoctane  142.5     12.0     -1.0    141.8 -0.7
4  3-Ethylheptane  143.0     16.0     -2.0    142.5 -0.5


In [13]:
# Read Wiener's Table IV as whitespace-separated fields. Numbers written with a
# bare leading decimal point (" .5", " -.4") get an explicit zero inserted
# before the line is split.
with open("../data/wiener_table_iv.txt", "r") as file:
    data = [
        line.strip().replace(' .', ' 0.').replace(' -.', ' -0.').split()
        for line in file
    ]

# Columns 1-3 are converted from text to floats; no columns are dropped here.
df_w_iv = pd.DataFrame(data)
df_w_iv[[1, 2, 3]] = df_w_iv[[1, 2, 3]].astype(float)

# Swap the positional column labels 0-3 for descriptive names.
df_w_iv = df_w_iv.rename(
    columns=dict(enumerate(
        ['name', 'delta_w', 'delta_p', 'bp_calc']
    ))
)

print(df_w_iv.head())

                 name  delta_w  delta_p  bp_calc
0       3-Ethyloctane     20.0     -2.0    165.4
1       4-Ethyioctane     24.0     -2.0    161.5
2  2,2-Dimethyloctane     19.0      0.0    155.4
3  2,3-Dimethyloctane     22.0     -2.0    163.4
4  3,4-Dimethyloctane     28.0     -3.0    163.1


In [14]:
# Persist each parsed Wiener table as a semicolon-separated CSV without the
# row index.
wiener_csv_options = {"sep": ";", "index": False}
df_w_ii.to_csv("../data/wieners_data_ii.csv", **wiener_csv_options)
df_w_iii.to_csv("../data/wieners_data_iii.csv", **wiener_csv_options)
df_w_iv.to_csv("../data/wieners_data_iv.csv", **wiener_csv_options)