# Train SISSO

In [None]:
import json
import os
import pickle
import warnings
from pathlib import Path

import mlflow
import numpy as np
import pandas as pd
import psutil
from sklearn.model_selection import KFold

from mlproject.data.preprocessing import get_dataset
from mlproject.training.feature_selection import get_relevant_features
from mlproject.training.fold_trainer import train_eval_fold
# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

In [None]:
# Model identifier; used below to name the mlflow experiments and the
# per-run output folders.
model_type = "ga_sisso"

In [None]:
# Location of the compiled SISSO++ executable (placeholder, edit before use).
sissopp_binary_path = "/path/to/compiled/sisso++"

# SISSO++ settings forwarded to `train_eval_fold` in the GA cell below.
sissopp_inputs = {
    # Data source; "property_key" is filled in per target inside the GA loop.
    "data_file": "data.csv",
    "property_key": "",
    # Descriptor / feature-space settings.
    "desc_dim": 3,
    "n_sis_select": 10,
    "max_leaves": 4,
    "max_rung": 2,
    "calc_type": "regression",
    "min_abs_feat_val": 1e-5,
    "max_abs_feat_val": 1e8,
    "n_residual": 1,
    "n_models_store": 1,
    "n_rung_generate": 1,
    "n_rung_store": 0,
    # No hold-out inside SISSO++ itself; the splits come from the outer KFold.
    "leave_out_frac": 0.0,
    "leave_out_inds": [],
    "verbose": False,
    # Operators allowed when building candidate features.
    "opset": [
        "add",
        "sub",
        "abs_diff",
        "mult",
        "div",
        "inv",
        "abs",
        "exp",
        "log",
        "sq",
        "cb",
        "sqrt",
        "cbrt",
        "neg_exp",
    ],
}

In [None]:
# Target properties to model; each one gets its own mlflow experiment.
target_names = [
    "last_phdos_peak",
    "max_pfc",
    "log_g_vrh",
    "log_k_vrh",
    "log_klat_300",
    "log_kp_300",
    "log_tdisp_all_300",
    "log_tdisp_all_600",
]

In [None]:
# Remember the launch directory: later cells chdir into per-run folders and
# return here afterwards.
parent_dir = os.getcwd()

In [None]:
# Environment tweaks applied before any training starts: an empty
# CUDA_VISIBLE_DEVICES exposes no CUDA device to this process, and
# TF_CPP_MIN_LOG_LEVEL is raised to "2" (presumably to quieten TensorFlow's
# native logging -- confirm it is actually used downstream).
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "",
        "TF_CPP_MIN_LOG_LEVEL": "2",
    }
)

**Clone the repository locally, then set the `data_parent_dir` variable below to the absolute path of its `data` directory (https://github.com/DigiMatChem/paper-ml-with-lobster-descriptors/tree/main/data).**

In [None]:
# Placeholder: replace with the absolute path of the cloned repository's
# data/ directory. The value is passed to `get_dataset` in the cells below.
data_parent_dir = "absolute/path/to/paper-ml-with-lobster-descriptors/data/"

**Provide a valid absolute path in the `mlflow.set_tracking_uri` call below; mlflow will log the model metrics there.**

In [None]:
# Point mlflow at a local file store; replace the placeholder URI with a
# real path on your system before running.
mlflow.set_tracking_uri('file:///some/valid/path/on/your/system')

In [None]:
# Number of parallel workers, used below for feature selection (`n_jobs`) and
# for the SISSO++ MPI tasks. Defaults to all physical cores; reduce as needed.
# `psutil.cpu_count(logical=False)` returns None when the physical core count
# cannot be determined, so fall back to the logical count and finally to 1.
num_jobs = psutil.cpu_count(logical=False) or os.cpu_count() or 1

## GA-based optimal feature selection for SISSO

In [None]:
# 5-fold cross-validation for every (target, feature set) pair. On each fold:
# select relevant features on the training split, pickle the selection
# pipeline, train/evaluate the GA-driven SISSO model, pickle the fold result.
# Aggregated fold metrics are logged to one mlflow run per pair.
for target_name in target_names:
    # Experiment and SISSO property key depend on the target only, so set
    # them once per target rather than once per feature set.
    experiment_name = f"{model_type}_experiment_{target_name}"
    mlflow.set_experiment(experiment_name)
    experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id
    sissopp_inputs["property_key"] = target_name

    for feat_type in ["matminer", "matminer_lob"]:
        target, feat = get_dataset(
            feat_type=feat_type,
            target_name=target_name,
            data_parent_dir=data_parent_dir,
        )

        # Drop every feature column that contains a missing value.
        feat.dropna(axis=1, inplace=True)

        cv_outer = KFold(n_splits=5, shuffle=True, random_state=18012019)

        # Fold artifacts are written with relative file names, so work from
        # inside a dedicated folder anchored at the launch directory.
        run_dir = os.path.join(parent_dir, f"{model_type}_{target_name}_{feat_type}")
        os.makedirs(run_dir, exist_ok=True)
        os.chdir(run_dir)
        try:
            with mlflow.start_run(
                run_name=f"{target_name}_{feat_type}",
                experiment_id=experiment_id,
            ):
                all_results = {
                    "train_mae": [], "test_mae": [],
                    "train_rmse": [], "test_rmse": [],
                    "train_r2": [], "test_r2": [],
                    "train_mape": [], "test_mape": [],
                }
                for fold_ind, (train_ix, test_ix) in enumerate(cv_outer.split(feat)):
                    X_train, X_test = feat.iloc[train_ix], feat.iloc[test_ix]
                    y_train, y_test = target.iloc[train_ix, 0], target.iloc[test_ix, 0]

                    # Feature selection is fitted on the training split only.
                    pipe, X_train_fil = get_relevant_features(
                        X_train=X_train,
                        y_train=y_train.values.flatten(),
                        grootcv_n_iter=50,
                        all_rel_feats__n_jobs=num_jobs,
                    )

                    with open(f"{fold_ind+1}_pipeline.pkl", "wb") as f:
                        pickle.dump(pipe, f)

                    # Keep the same columns in the test split.
                    X_test_fil = X_test.loc[:, X_train_fil.columns]

                    result = train_eval_fold(
                        fold_ind=fold_ind,
                        X_train=X_train_fil,
                        y_train=y_train,
                        X_test=X_test_fil,
                        y_test=y_test,
                        model_type=model_type,
                        num_features=10,
                        sissopp_binary_path=sissopp_binary_path,
                        sissopp_inputs=sissopp_inputs,
                        mpi_tasks=num_jobs,  # please set this as per your system
                        population_size=20,
                        generations=50,
                    )

                    for metric, value in result.items():
                        if isinstance(value, float):
                            all_results[metric].append(value)
                        # Non-float entries are reduced with .mean() and stored
                        # as MAE (presumably per-sample absolute errors --
                        # confirm against train_eval_fold).
                        elif "train" in metric:
                            all_results["train_mae"].append(value.mean())
                        else:
                            all_results["test_mae"].append(value.mean())

                    with open(f"{fold_ind+1}_results.pkl", "wb") as f:
                        pickle.dump(result, f)

                for metric, values in all_results.items():
                    # A metric that no fold reported has nothing to summarise;
                    # min()/max() on an empty array would raise.
                    if not values:
                        continue
                    values = np.asarray(values)
                    mlflow.log_metric(f"{metric}_mean", values.mean())
                    mlflow.log_metric(f"{metric}_min", values.min())
                    mlflow.log_metric(f"{metric}_max", values.max())
                    mlflow.log_metric(f"{metric}_std", values.std())
        finally:
            # Always return to the launch directory, even if a fold failed, so
            # a rerun does not create nested output folders.
            os.chdir(parent_dir)
            

## Final Model training on complete dataset to arrive at equations

In [None]:
from sissopp import Inputs
from sissopp.postprocess.load_models import load_model
from sissopp import FeatureSpace, SISSORegressor

### Train Rung 1 or 2 SISSO model with optimal descriptors from GA-runs on complete dataset

In [None]:
# SISSO++ settings for the final fit on the complete dataset. The training
# loop below copies this dict, overrides "property_key" per target and writes
# the copy to sisso.json.
sissopp_inputs = {
    "data_file": "data.csv",
    "property_key": "log_klat_300",
    "desc_dim": 3,
    "n_sis_select": 100,
    "max_leaves": 4,
    # simply change this value to 1 for rung 1 models
    "max_rung": 2,
    "calc_type": "regression",
    "min_abs_feat_val": 1e-5,
    "max_abs_feat_val": 1e8,
    "n_residual": 1,
    "n_models_store": 1,
    "n_rung_generate": 1,
    "n_rung_store": 0,
    "leave_out_frac": 0.0,
    "leave_out_inds": [],
    "verbose": False,
    # Operators allowed when building candidate features.
    "opset": [
        "add",
        "sub",
        "abs_diff",
        "mult",
        "div",
        "inv",
        "abs",
        "exp",
        "log",
        "sq",
        "cb",
        "sqrt",
        "cbrt",
        "neg_exp",
    ],
}

In [None]:
# Fit one SISSO model per target on the complete dataset, restricted to the
# descriptors that the GA runs selected in any of the 5 folds.
# Requires `json`, `pandas as pd` and `pathlib.Path` to be imported.
for target in target_names:

    # Aggregate all optimal descriptors identified in all 5 folds of the GA run.
    # Only the CSV header is needed; the last column is excluded (presumably
    # the target property -- confirm against the fold data files).
    # NOTE(review): this reads from "sisso_<target>_matminer_lob", whereas the
    # GA cell creates folders named f"{model_type}_<target>_<feat_type>"
    # ("ga_sisso_..."). Confirm which folder holds fold_<i>/data.csv.
    feats = []
    for fold in range(1, 6):
        fold_csv = f"{parent_dir}/sisso_{target}_matminer_lob/fold_{fold}/data.csv"
        feats.extend(pd.read_csv(fold_csv, index_col=0, nrows=0).columns[:-1])

    # Order-preserving de-duplication: unlike set(), this keeps the column
    # order of the written data.csv identical from one run to the next.
    feats = list(dict.fromkeys(feats))

    target_df, feat_df = get_dataset(
        feat_type="matminer_lob",
        target_name=target,
        data_parent_dir=data_parent_dir,
    )

    model_input = sissopp_inputs.copy()
    model_input["property_key"] = target

    # Name the output folder after the configured rung so that a rung-1 run
    # and a rung-2 run do not overwrite each other.
    work_dir = Path(parent_dir) / target / f"rung_{model_input['max_rung']}"
    work_dir.mkdir(parents=True, exist_ok=True)

    # data.csv and sisso.json are referenced by relative name, so run from
    # inside the work directory and always return to the launch directory.
    os.chdir(work_dir)
    try:
        # Save the input data file (selected descriptors + target) for SISSO++.
        pd.concat([feat_df.loc[:, feats], target_df], axis=1).to_csv("data.csv")

        with open("sisso.json", "w") as f:
            json.dump(model_input, f, indent=4)

        inputs_base = Inputs("sisso.json")
        feature_space = FeatureSpace(inputs_base)
        sisso = SISSORegressor(inputs_base, feature_space)
        sisso.fit()
    finally:
        os.chdir(parent_dir)

### Load saved models to get the equations

In [None]:
# Print the LaTeX equation of a stored SISSO model for every target.
# The path is fixed to the rung-1 folder and the 2-dimensional model
# (train_dim_2_model_0.dat); a training run whose output was stored under
# "<target>/rung_1" must exist first.
for target in target_names:

    print(target)

    m = load_model(train_file=f"{target}/rung_1/models/train_dim_2_model_0.dat")
    # Raw string: "\_" in a normal literal is an invalid escape sequence.
    # The replacement text is still backslash + underscore.
    print(m.latex_str.replace(",", r"\_"))
    print("")