# Parameters optimisation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import typing
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import Descriptors
from typing import Union, List, Tuple
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors3D
from rdkit.Chem import GraphDescriptors
from tqdm import tqdm

In [None]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neural_network import MLPRegressor

### Helper functions

In [None]:
def find_pattern_count(molecule: Chem.Mol, pattern: Chem.Mol) -> int:
    """Return the number of substructure matches of ``pattern`` in ``molecule``.

    The count is the length of ``molecule.GetSubstructMatches(pattern)``,
    called with its default arguments.
    """
    return len(molecule.GetSubstructMatches(pattern))

def drawSmils(smils: str):
    """Draw the molecule described by a SMILES string with matplotlib.

    Args:
        smils: SMILES string of the molecule to draw.

    Raises:
        ValueError: if RDKit cannot parse ``smils``.
    """
    molecule = Chem.MolFromSmiles(smils)
    # MolFromSmiles signals a parse failure by returning None; fail here with
    # a clear message instead of deep inside the drawing code.
    if molecule is None:
        raise ValueError(f"Could not parse SMILES string: {smils!r}")
    Draw.MolToMPL(molecule, size=(450, 450))


# Read data

In [None]:
# Load the QM9 table and the two chromophore feature tables (the latter with a
# fresh 0..n-1 index), then attach an RDKit molecule parsed from each SMILES.
qm9_database = pd.read_csv("qm9.csv")
only_solids = pd.read_json("only_solids_features.json").reset_index(drop=True)
only_solids_conf = pd.read_json("only_solids_conf_features.json").reset_index(drop=True)

for _frame, _smiles_col in (
    (qm9_database, "smiles"),
    (only_solids, "Chromophore"),
    (only_solids_conf, "Chromophore"),
):
    _frame["RDKit_Mol_Class"] = _frame[_smiles_col].apply(Chem.MolFromSmiles)

In [None]:
# Identifier / target columns that must come first in both tables; every
# remaining column keeps its relative order behind them.
first_4_cols = ["Chromophore", 'qm9_pattern_indexes', 'RDKit_Mol_Class', "Emission max (nm)"]


def _lead_with(frame, leading):
    """Return ``frame`` with the ``leading`` columns moved to the front."""
    head = frame[leading]
    tail = frame.drop(columns=leading)
    return pd.concat([head, tail], axis=1)


only_solids = _lead_with(only_solids, first_4_cols)
only_solids_conf = _lead_with(only_solids_conf, first_4_cols)

# The cell below is important

#### toggle between dataset 1 and 2

In [None]:
# only_solids = only_solids_conf

In [None]:
# Names of the entries in RDKit's descriptor list (first element of each entry).
descriptors_names = [entry[0] for entry in Descriptors._descList]
len(descriptors_names)

In [None]:
# Standardise the RDKit descriptor columns and put them back behind the
# untouched columns.
# NOTE(review): the scaler is fitted on every row, i.e. before any
# cross-validation split is made.
_raw_descriptors = only_solids[descriptors_names]
scaler = StandardScaler()
scaler.fit(_raw_descriptors)
_scaled_descriptors = pd.DataFrame(
    scaler.transform(_raw_descriptors),
    columns=descriptors_names,
)
_other_columns = only_solids[only_solids.columns.drop(descriptors_names)].reset_index(drop=True)
only_solids = pd.concat([_other_columns, _scaled_descriptors], axis=1)

In [None]:
# Drop columns whose standard deviation is exactly 0 (they carry no signal).
# numeric_only=True is required because the frame also holds SMILES strings,
# index lists and RDKit Mol objects; without it pandas >= 2.0 raises instead
# of silently skipping those columns. The std is computed once and reused.
_column_std = only_solids.std(numeric_only=True)
only_solids.drop(columns=_column_std[_column_std == 0].index, inplace=True)
only_solids.shape

In [None]:
# Columns that contain at least one missing value.
_has_nan = only_solids.isna().any()
nans_cols = _has_nan.index[_has_nan.to_numpy()]
nans_cols

In [None]:
# Remove, in place, every row that has a missing value in any of the columns
# collected in `nans_cols`; the trailing expression displays the new shape.
only_solids.dropna(subset=nans_cols, axis=0, inplace=True)
only_solids.shape

In [None]:
# Everything after the four leading identifier/target columns is a model input.
universal_features = only_solids.columns[4:].tolist()
universal_features_conf = only_solids_conf.columns[4:].tolist()

In [None]:
# Model 1: descriptor features only; the target is the emission maximum.
model1_data_conf_x = only_solids.loc[:, universal_features]
model1_data_conf_y = only_solids.loc[:, "Emission max (nm)"]

In [None]:
qm9_database["chemical_potential"] = (qm9_database["homo"]+qm9_database["lumo"])/2
qm9_database["electrophilicity"] = qm9_database["chemical_potential"]**2/(2*qm9_database["gap"])

In [None]:
# Every QM9 column except the identifiers and the parsed molecule objects.
_non_feature_columns = ["RDKit_Mol_Class", "mol_id", "smiles"]
all_quantum_features = qm9_database.columns.drop(_non_feature_columns).tolist()

In [None]:
cols = ["n_features", "max_depth", "n_est", "mae"]
n_estimators_range = list(range(100, 1050, 50))
min_samples_range = list(range(1, 40))
features_range = np.linspace(0.05, 1, 20)
depth_range = list(range(3, 7))

In [None]:
# Two independent, empty result tables (one per model) with the `cols` layout.
scores_model1, scores_model2 = (pd.DataFrame(columns=cols) for _ in range(2))

## GBR

### Model 1

In [None]:
# Grid search for model 1 (gradient boosting): every
# (max_depth, max_features, n_estimators) combination is scored with
# shuffled 10-fold cross-validation.
data_x = model1_data_conf_x
data_y = model1_data_conf_y
kf = KFold(n_splits=10, shuffle=True, random_state=1)
scores = ["neg_mean_absolute_error"]
# Spelled out here instead of reading the global `cols`, which other cells
# rebind to a different metric set.
mae_cols = ["n_features", "max_depth", "n_est", "mae"]
rows = []
for depth in depth_range:
    # Named `max_feat` so the `features` list used for model 2 is not overwritten.
    for max_feat in features_range:
        for est in n_estimators_range:
            gbr = GradientBoostingRegressor(learning_rate=0.05,
                                            max_depth=depth,
                                            max_features=max_feat,
                                            n_estimators=est,
                                            random_state=1)
            for scor_type in scores:
                # The scorer is the negated MAE, so "mae" holds a value <= 0.
                mae = cross_val_score(gbr, data_x, data_y, cv=kf, scoring=scor_type).mean()
                # The score lives under "mae"; looking it up by scorer name raised KeyError.
                print(f"GBR max_depth: {depth} max_features: {max_feat} n_est: {est} {scor_type}: {mae}")
            rows.append({"n_features": max_feat, "max_depth": depth, "n_est": est, "mae": mae})
# Build the table once: DataFrame.append no longer exists in pandas 2.0 and
# re-copied the whole frame on every call. Partial results stay in `rows` if
# the search is interrupted.
new_rows = pd.DataFrame(rows, columns=mae_cols)
scores_model1 = new_rows if scores_model1.empty else pd.concat([scores_model1, new_rows], ignore_index=True)

In [None]:
# Persist the model-1 GBR results gathered so far, then rebind
# `scores_model1` to an empty frame laid out by the current `cols`.
scores_model1.to_json("gbr_scores1_model1.json")
scores_model1 = pd.DataFrame(columns=cols)

In [None]:
# Model 1 (gradient boosting) at a fixed tree depth: score every
# (max_features, n_estimators) pair on three additional metrics.
data_x = model1_data_conf_x
data_y = model1_data_conf_y
kf = KFold(n_splits=10, shuffle=True, random_state=1)
scores = [
    "neg_mean_squared_error",
    "r2",
    "max_error",
]
cols = ["n_features", "max_depth", "n_est"] + scores
# Single source of truth for the depth: it is recorded, printed AND used to
# fit the model.
# NOTE(review): this cell previously recorded depth 4 while fitting with
# max_depth=5 -- confirm that 4 is the intended value.
depth = 4
rows = []
# Named `max_feat` so the `features` list used for model 2 is not overwritten.
for max_feat in features_range:
    for est in n_estimators_range:
        gbr = GradientBoostingRegressor(learning_rate=0.05,
                                        max_depth=depth,
                                        max_features=max_feat,
                                        n_estimators=est,
                                        random_state=1)
        row = {"n_features": max_feat, "max_depth": depth, "n_est": est}
        for scor_type in scores:
            row[scor_type] = cross_val_score(gbr, data_x, data_y, cv=kf, scoring=scor_type).mean()
            print(f"GBR max_depth: {depth} max_features: {max_feat} n_est: {est} {scor_type}: {row[scor_type]}")
        rows.append(row)
# Build the table once (DataFrame.append no longer exists in pandas 2.0). An
# empty left-over frame is replaced rather than extended so its stale column
# layout does not leak into the saved results.
new_rows = pd.DataFrame(rows, columns=cols)
scores_model1 = new_rows if scores_model1.empty else pd.concat([scores_model1, new_rows], ignore_index=True)

In [None]:
# Persist the second batch of model-1 GBR results, then rebind
# `scores_model1` to an empty frame laid out by the current `cols`.
scores_model1.to_json("gbr_scores2_model1.json")
scores_model1 = pd.DataFrame(columns=cols)

### Model 2

In [None]:
def get_features(row: pd.Series, database: pd.DataFrame, features: Union[List[str], str]) -> pd.Series:
    """Accumulate pattern-weighted database features onto ``row``.

    For every label in ``row["qm9_pattern_indexes"]`` the matching database
    molecule is counted as a substructure of ``row["RDKit_Mol_Class"]``, and
    ``count * database[feature][label]`` is added to ``row[feature]``.

    Args:
        row: record providing ``"qm9_pattern_indexes"``, ``"RDKit_Mol_Class"``
            and an already initialised entry for each feature.
        database: table providing ``"RDKit_Mol_Class"`` and the feature columns.
        features: one feature name or a list of feature names.

    Returns:
        The same ``row`` object, updated in place.
    """
    # A bare string would otherwise be iterated character by character.
    feature_names = [features] if isinstance(features, str) else list(features)
    for index in row["qm9_pattern_indexes"]:
        count = find_pattern_count(row["RDKit_Mol_Class"], database["RDKit_Mol_Class"][index])
        for feature in feature_names:
            row[feature] += count * database[feature][index]
    return row

In [None]:
# Feature columns to aggregate for model 2: the QM9 columns collected in
# `all_quantum_features`.
features = all_quantum_features

In [None]:
# Create one zero-initialised accumulator column per selected feature.
only_solids[features] = 0

In [None]:
# Row by row (axis=1), let get_features fill the accumulator columns from the QM9 table.
only_solids = only_solids.apply(get_features, database=qm9_database, features=features, axis=1)

In [None]:
# Model 2: descriptor features plus the aggregated QM9 features; same target.
_model2_columns = universal_features + features
model2_data_conf_x = only_solids.loc[:, _model2_columns]
model2_data_conf_y = only_solids.loc[:, "Emission max (nm)"]

In [None]:
# Make the model-2 inputs and target the active cross-validation data.
data_x = model2_data_conf_x
data_y = model2_data_conf_y

In [None]:
# Grid search for model 2 (gradient boosting): every
# (max_depth, max_features, n_estimators) combination is scored with
# shuffled 10-fold cross-validation.
data_x = model2_data_conf_x
data_y = model2_data_conf_y
kf = KFold(n_splits=10, shuffle=True, random_state=1)
scores = ["neg_mean_absolute_error"]
# Spelled out here: by this point the global `cols` has been rebound to the
# MSE / r2 / max_error layout, which would add zero-filled columns.
mae_cols = ["n_features", "max_depth", "n_est", "mae"]
rows = []
for depth in depth_range:
    # Named `max_feat` so the `features` list of QM9 columns is not overwritten.
    for max_feat in features_range:
        for est in n_estimators_range:
            gbr = GradientBoostingRegressor(learning_rate=0.05,
                                            max_depth=depth,
                                            max_features=max_feat,
                                            n_estimators=est,
                                            random_state=1)
            for scor_type in scores:
                # The scorer is the negated MAE, so "mae" holds a value <= 0.
                mae = cross_val_score(gbr, data_x, data_y, cv=kf, scoring=scor_type).mean()
                # The score lives under "mae"; looking it up by scorer name raised KeyError.
                print(f"GBR max_depth: {depth} max_features: {max_feat} n_est: {est} {scor_type}: {mae}")
            rows.append({"n_features": max_feat, "max_depth": depth, "n_est": est, "mae": mae})
# Build the table once: DataFrame.append no longer exists in pandas 2.0.
new_rows = pd.DataFrame(rows, columns=mae_cols)
scores_model2 = new_rows if scores_model2.empty else pd.concat([scores_model2, new_rows], ignore_index=True)
# Save and reset here, as is done after every other search: otherwise these
# rows are carried into the next model-2 search and end up mixed into its
# output file.
scores_model2.to_json("gbr_scores1_model2.json")
scores_model2 = pd.DataFrame()

In [None]:
# Model 2 (gradient boosting) at a fixed tree depth: score every
# (max_features, n_estimators) pair on three additional metrics.
data_x = model2_data_conf_x
data_y = model2_data_conf_y
kf = KFold(n_splits=10, shuffle=True, random_state=1)
scores = [
    "neg_mean_squared_error",
    "r2",
    "max_error",
]
cols = ["n_features", "max_depth", "n_est"] + scores
depth = 3
rows = []
# Named `max_feat` so the `features` list of QM9 columns is not overwritten.
for max_feat in features_range:
    for est in n_estimators_range:
        gbr = GradientBoostingRegressor(learning_rate=0.05,
                                        max_depth=depth,
                                        max_features=max_feat,
                                        n_estimators=est,
                                        random_state=1)
        row = {"n_features": max_feat, "max_depth": depth, "n_est": est}
        for scor_type in scores:
            row[scor_type] = cross_val_score(gbr, data_x, data_y, cv=kf, scoring=scor_type).mean()
            print(f"GBR max_depth: {depth} max_features: {max_feat} n_est: {est} {scor_type}: {row[scor_type]}")
        rows.append(row)
# Build the table once (DataFrame.append no longer exists in pandas 2.0).
# Rows already held in `scores_model2` are kept; an empty frame is simply
# replaced so its column layout does not leak into the result.
new_rows = pd.DataFrame(rows, columns=cols)
scores_model2 = new_rows if scores_model2.empty else pd.concat([scores_model2, new_rows], ignore_index=True)

In [None]:
# Persist the model-2 GBR results gathered so far, then rebind
# `scores_model2` to an empty frame laid out by the current `cols`.
scores_model2.to_json("gbr_scores2_model2.json")
scores_model2 = pd.DataFrame(columns=cols)

## RFR

### Model 1

In [None]:
# Grid search for model 1 (random forest): every
# (max_depth, max_features, n_estimators) combination is scored with
# shuffled 10-fold cross-validation.
data_x = model1_data_conf_x
data_y = model1_data_conf_y
kf = KFold(n_splits=10, shuffle=True, random_state=1)
scores = ["neg_mean_absolute_error"]
# Spelled out here: by this point the global `cols` has been rebound to the
# MSE / r2 / max_error layout, which would add zero-filled columns.
mae_cols = ["n_features", "max_depth", "n_est", "mae"]
rows = []
for depth in depth_range:
    # Named `max_feat` so the `features` list of QM9 columns is not overwritten.
    for max_feat in features_range:
        for est in n_estimators_range:
            rfr = RandomForestRegressor(max_depth=depth,
                                        max_features=max_feat,
                                        n_estimators=est,
                                        random_state=1)
            for scor_type in scores:
                # The scorer is the negated MAE, so "mae" holds a value <= 0.
                mae = cross_val_score(rfr, data_x, data_y, cv=kf, scoring=scor_type).mean()
                # The score lives under "mae"; looking it up by scorer name raised KeyError.
                print(f"RFR max_depth: {depth} max_features: {max_feat} n_est: {est} {scor_type}: {mae}")
            rows.append({"n_features": max_feat, "max_depth": depth, "n_est": est, "mae": mae})
# Build the table once: DataFrame.append no longer exists in pandas 2.0.
new_rows = pd.DataFrame(rows, columns=mae_cols)
scores_model1 = new_rows if scores_model1.empty else pd.concat([scores_model1, new_rows], ignore_index=True)

In [None]:
# Persist the model-1 RFR results gathered so far, then rebind
# `scores_model1` to an empty frame laid out by the current `cols`.
scores_model1.to_json("rfr_scores1_model1.json")
scores_model1 = pd.DataFrame(columns=cols)

### Model 2

In [None]:
# Grid search for model 2 (random forest): every
# (max_depth, max_features, n_estimators) combination is scored with
# shuffled 10-fold cross-validation.
data_x = model2_data_conf_x
data_y = model2_data_conf_y
kf = KFold(n_splits=10, shuffle=True, random_state=1)
scores = ["neg_mean_absolute_error"]
# Spelled out here: by this point the global `cols` has been rebound to the
# MSE / r2 / max_error layout, which would add zero-filled columns.
mae_cols = ["n_features", "max_depth", "n_est", "mae"]
rows = []
for depth in depth_range:
    # Named `max_feat` so the `features` list of QM9 columns is not overwritten.
    for max_feat in features_range:
        for est in n_estimators_range:
            rfr = RandomForestRegressor(max_depth=depth,
                                        max_features=max_feat,
                                        n_estimators=est,
                                        random_state=1)
            for scor_type in scores:
                # The scorer is the negated MAE, so "mae" holds a value <= 0.
                mae = cross_val_score(rfr, data_x, data_y, cv=kf, scoring=scor_type).mean()
                # Fixes the unbalanced ")" that made this cell a SyntaxError, and
                # the KeyError from looking the score up by scorer name.
                print(f"RFR max_depth: {depth} max_features: {max_feat} n_est: {est} {scor_type}: {mae}")
            rows.append({"n_features": max_feat, "max_depth": depth, "n_est": est, "mae": mae})
# Build the table once: DataFrame.append no longer exists in pandas 2.0.
new_rows = pd.DataFrame(rows, columns=mae_cols)
scores_model2 = new_rows if scores_model2.empty else pd.concat([scores_model2, new_rows], ignore_index=True)

In [None]:
# Persist the model-2 RFR results gathered so far, then start from an empty frame.
scores_model2.to_json("rfr_scores1_model2.json")
# Reset explicitly, as the other save cells do. Copying `scores_model1` only
# gave an empty frame when that table happened to be empty at this point.
scores_model2 = pd.DataFrame(columns=cols)

In [None]:
# Wider depth grid (3..9) for the searches that follow.
depth_range = [*range(3, 10)]

In [None]:
# Model 1 (random forest) with a fixed number of trees: score every
# (max_depth, max_features) pair on four metrics.
data_x = model1_data_conf_x
data_y = model1_data_conf_y
kf = KFold(n_splits=10, shuffle=True, random_state=1)
scores = [
    "neg_mean_squared_error",
    "r2",
    "max_error",
    "neg_mean_absolute_error",
]
# Layout derived from the metrics actually computed here; the global `cols`
# is left over from an earlier cell and would add zero-filled columns.
grid_cols = ["n_features", "max_depth", "n_est"] + scores
# Single source of truth for the tree count (recorded, printed and fitted).
est = 600
rows = []
for depth in depth_range:
    # Named `max_feat` so the `features` list of QM9 columns is not overwritten.
    for max_feat in features_range:
        rfr = RandomForestRegressor(max_depth=depth,
                                    max_features=max_feat,
                                    n_estimators=est,
                                    random_state=1)
        row = {"n_features": max_feat, "max_depth": depth, "n_est": est}
        for scor_type in scores:
            row[scor_type] = cross_val_score(rfr, data_x, data_y, cv=kf, scoring=scor_type).mean()
            print(f"RFR max_depth: {depth} max_features: {max_feat} n_est: {est} {scor_type}: {row[scor_type]}")
        rows.append(row)
# Build the table once: DataFrame.append no longer exists in pandas 2.0.
new_rows = pd.DataFrame(rows, columns=grid_cols)
scores_model1 = new_rows if scores_model1.empty else pd.concat([scores_model1, new_rows], ignore_index=True)

In [None]:
# Persist the second batch of model-1 RFR results, then rebind
# `scores_model1` to an empty frame laid out by the current `cols`.
scores_model1.to_json("rfr_scores2_model1.json")
scores_model1 = pd.DataFrame(columns=cols)

In [None]:
# Model 2 (random forest) with a fixed number of trees: score every
# (max_depth, max_features) pair on four metrics.
data_x = model2_data_conf_x
data_y = model2_data_conf_y
kf = KFold(n_splits=10, shuffle=True, random_state=1)
scores = [
    "neg_mean_squared_error",
    "r2",
    "max_error",
    "neg_mean_absolute_error",
]
# Layout derived from the metrics actually computed here; the global `cols`
# is left over from an earlier cell and would add zero-filled columns.
grid_cols = ["n_features", "max_depth", "n_est"] + scores
# Single source of truth for the tree count (recorded, printed and fitted).
est = 600
rows = []
for depth in depth_range:
    # Named `max_feat` so the `features` list of QM9 columns is not overwritten.
    for max_feat in features_range:
        rfr = RandomForestRegressor(max_depth=depth,
                                    max_features=max_feat,
                                    n_estimators=est,
                                    random_state=1)
        row = {"n_features": max_feat, "max_depth": depth, "n_est": est}
        for scor_type in scores:
            row[scor_type] = cross_val_score(rfr, data_x, data_y, cv=kf, scoring=scor_type).mean()
            print(f"RFR max_depth: {depth} max_features: {max_feat} n_est: {est} {scor_type}: {row[scor_type]}")
        rows.append(row)
# Build the table once: DataFrame.append no longer exists in pandas 2.0.
new_rows = pd.DataFrame(rows, columns=grid_cols)
scores_model2 = new_rows if scores_model2.empty else pd.concat([scores_model2, new_rows], ignore_index=True)

In [None]:
# Persist the second batch of model-2 RFR results, then rebind
# `scores_model2` to an empty frame laid out by the current `cols`.
scores_model2.to_json("rfr_scores2_model2.json")
scores_model2 = pd.DataFrame(columns=cols)