In [1]:
import json
from ecnet.datasets import load_cp

def get_property(compounds: list, property: str, lim: int = None) -> tuple:
    """Collect one property's values, with matching SMILES, from compound records.

    Every record is read as ``record['properties'][property]['value']`` and
    ``record['canonical_smiles']``. Records that do not report the requested
    property, or whose value is null / not convertible to ``float``, are
    skipped rather than aborting the whole extraction.

    Args:
        compounds (list): compound records (mappings) to scan, in order.
        property (str): key looked up under each record's ``'properties'``.
        lim (int, optional): upper bound on the value; records whose value is
            strictly greater than ``lim`` are dropped. ``None`` (the default)
            disables the filter.

    Returns:
        tuple: ``(prop_vals, smiles)``. ``prop_vals`` is a list of
        single-element lists ``[value]`` (one per kept record) and ``smiles``
        holds the corresponding ``'canonical_smiles'`` entries, both in input
        order.

    Raises:
        KeyError: if a record that passes the filters has no
            ``'canonical_smiles'`` entry.
    """
    prop_vals = []
    smiles = []
    for c in compounds:
        try:
            value = float(c['properties'][property]['value'])
        except (KeyError, TypeError, ValueError):
            # KeyError: the property (or its 'value') is not reported.
            # TypeError / ValueError: the stored value is null or non-numeric.
            continue
        if lim is not None and value > lim:
            continue
        # Values are wrapped in a one-element list, one row per compound.
        prop_vals.append([value])
        smiles.append(c['canonical_smiles'])
    return (prop_vals, smiles)


# Load the compound records. The `with` block closes the file on exit, so no
# explicit close() is needed. The encoding is pinned so the read does not
# depend on the platform default (the CSV export below also uses utf8).
with open('compounds.json', 'r', encoding='utf8') as jfile:
    compounds = json.load(jfile)

# Extract (values, SMILES) for each property of interest. Cetane number is the
# only property filtered by an upper limit (values above 100 are dropped).
cn, smiles_cn = get_property(compounds, 'cetane_number', 100)
ysi, smiles_ysi = get_property(compounds, 'ysi_unified')
lhv, smiles_lhv = get_property(compounds, 'lower_heating_value')
kv, smiles_kv = get_property(compounds, 'kinematic_viscosity')
# load_cp() is unpacked SMILES-first, i.e. the reverse of get_property's
# (values, SMILES) order - NOTE(review): confirm against ecnet.datasets.load_cp.
smiles_cp, cp = load_cp()
fp, smiles_fp = get_property(compounds, 'flash_point')

# Number of samples available per property.
print(len(cn), len(ysi), len(lhv), len(kv), len(cp), len(fp))

391 545 385 205 43 255


In [2]:
from ecnet.datasets import QSPRDataset

# Wrap each property's SMILES strings and target values in a QSPRDataset,
# using the 'alvadesc' backend for all six properties. Datasets are built in
# the order listed: CN, YSI, KV, CP, LHV, FP.
ds_cn, ds_ysi, ds_kv, ds_cp, ds_lhv, ds_fp = (
    QSPRDataset(smi, targets, backend='alvadesc')
    for smi, targets in (
        (smiles_cn, cn),
        (smiles_ysi, ysi),
        (smiles_kv, kv),
        (smiles_cp, cp),
        (smiles_lhv, lhv),
        (smiles_fp, fp),
    )
)

In [3]:
from ecnet.tasks import select_rfr

# Run select_rfr on every dataset with n_estimators=100; each call is unpacked
# into (selected descriptor indices, their importances).
# NOTE(review): no seed is passed here, so if select_rfr is stochastic the
# selected descriptors may differ between runs - confirm and pin a seed if so.
(
    (idx_cn, imp_cn),
    (idx_ysi, imp_ysi),
    (idx_kv, imp_kv),
    (idx_cp, imp_cp),
    (idx_lhv, imp_lhv),
    (idx_fp, imp_fp),
) = (
    select_rfr(dataset, n_estimators=100)
    for dataset in (ds_cn, ds_ysi, ds_kv, ds_cp, ds_lhv, ds_fp)
)

In [4]:
# Translate the selected descriptor indices into descriptor names, keeping the
# order in which select_rfr returned them.
names_cn, names_ysi, names_kv, names_cp, names_lhv, names_fp = (
    [dataset.desc_names[pos] for pos in selected]
    for dataset, selected in (
        (ds_cn, idx_cn),
        (ds_ysi, idx_ysi),
        (ds_kv, idx_kv),
        (ds_cp, idx_cp),
        (ds_lhv, idx_lhv),
        (ds_fp, idx_fp),
    )
)

In [5]:
from csv import DictWriter

headers = ['Property', 'Desc. Rank', 'Desc. Imp.', 'Desc. Name']


def _top_rows(label, importances, names, n_top=10):
    """Build CSV rows for the first ``n_top`` descriptors of one property.

    List position is used as the rank (index 0 -> rank 1), so the inputs are
    expected to already be ordered from most to least important.

    Args:
        label (str): value written to the 'Property' column.
        importances: descriptor importances, indexable by rank position.
        names: descriptor names, aligned with ``importances``.
        n_top (int): maximum number of rows to emit.

    Returns:
        list[dict]: one row per descriptor, keyed by the CSV headers. Fewer
        than ``n_top`` rows are returned when fewer descriptors are available,
        instead of raising IndexError.
    """
    n_rows = min(n_top, len(importances), len(names))
    return [
        {
            'Property': label,
            'Desc. Rank': rank + 1,
            'Desc. Imp.': importances[rank],
            'Desc. Name': names[rank]
        }
        for rank in range(n_rows)
    ]


rows_cn = _top_rows('CN', imp_cn, names_cn)
rows_ysi = _top_rows('YSI', imp_ysi, names_ysi)
rows_kv = _top_rows('KV', imp_kv, names_kv)
rows_cp = _top_rows('CP', imp_cp, names_cp)
rows_lhv = _top_rows('LHV', imp_lhv, names_lhv)
rows_fp = _top_rows('FP', imp_fp, names_fp)

# newline='' hands line-ending control to the csv writer, so the explicit
# lineterminator='\n' is what ends up in the file on every platform.
with open('descriptor_ranks.csv', 'w', encoding='utf8', newline='') as csv_file:
    writer = DictWriter(csv_file, headers, delimiter=',', lineterminator='\n')
    writer.writeheader()
    # Properties are written in a fixed order: CN, YSI, KV, CP, LHV, FP.
    for property_rows in (rows_cn, rows_ysi, rows_kv, rows_cp, rows_lhv, rows_fp):
        writer.writerows(property_rows)