In [1]:
import pandas as pd
import numpy as np
from pycaret.regression import *
from scipy.stats import gaussian_kde
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from utils import *
from descriptor import *
import warnings

# Silence library warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Feature tables; the first CSV column is used as the row index.
train_file = "../data/features/train_data.csv"
test_file = "../data/features/test_data.csv"
train_data, test_data = (
    pd.read_csv(csv_path, index_col=0) for csv_path in (train_file, test_file)
)

# Train stacked on top of test, used only to decide which columns to keep.
data = pd.concat([train_data, test_data], axis=0)

property_ = ["KVRH"]
# Every column from position 198 onward is taken as a SOAP column
# (presumably the SOAP descriptors -- TODO confirm the offset against the CSV layout).
soap_columns = data.columns[198:].tolist()

# `geometric` and `RAC` are column-name lists provided by the star imports above.
new_columns = geometric + RAC + soap_columns
columns = new_columns + property_
descriptors = data[new_columns]

# Drop whatever remove_constant_value_features() flags on the combined table.
drop_col = remove_constant_value_features(descriptors)
new_df_columns = [name for name in descriptors.columns if name not in drop_col]

X_train, X_test = train_data[new_df_columns], test_data[new_df_columns]
y_train, y_test = train_data[property_], test_data[property_]

In [None]:
import optuna
from sklearn import datasets

def objective(trial,X_train = X_train, y_train = y_train):
    """Optuna objective: mean 10-fold cross-validated R^2 of a CatBoostRegressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``learning_rate``, ``depth``, ``l2_leaf_reg``,
        ``min_child_samples`` and ``grow_policy``.
    X_train, y_train : pandas.DataFrame / pandas.Series or numpy.ndarray
        Training features and target. The defaults are bound to the
        notebook-level ``X_train`` / ``y_train`` when this cell is executed.

    Returns
    -------
    float
        Mean R^2 over the 10 validation folds (the study maximises it).

    Side effects
    ------------
    * Stores the mean validation RMSE on the trial as user attribute
      ``"cv_rmse"``.
    * Prints the R^2 of the last fold's model on the notebook-level
      ``X_test`` / ``y_test``. This is informational only and does not enter
      the returned score.
    """

    def _take_rows(data, positions):
        # pandas objects need positional .iloc; plain data[positions] would be
        # interpreted as a column-label lookup and raise KeyError.
        return data.iloc[positions] if hasattr(data, "iloc") else data[positions]

    learning_rate = trial.suggest_discrete_uniform("learning_rate", 0.001, 0.1, 0.005)
    depth = trial.suggest_int('depth', 5, 10)
    l2_leaf_reg = trial.suggest_discrete_uniform('l2_leaf_reg', 2.0, 30.0, 0.5)
    # NOTE(review): CatBoost documents min_data_in_leaf as applying to the
    # Depthwise / Lossguide policies -- confirm that sampling it together with
    # SymmetricTree is intended.
    min_child_samples = trial.suggest_categorical('min_child_samples', [1, 4, 8, 16, 32])
    grow_policy = trial.suggest_categorical("grow_policy",["SymmetricTree", "Depthwise", "Lossguide"])

    accuracy = []
    loss = []

    kf = KFold(n_splits = 10, shuffle = True, random_state = 42)

    for train_index2, valid_index2 in kf.split(X_train):

        X_train2, X_valid2 = _take_rows(X_train, train_index2), _take_rows(X_train, valid_index2)
        y_train2, y_valid2 = _take_rows(y_train, train_index2), _take_rows(y_train, valid_index2)

        model = CatBoostRegressor(random_state=42,
                                  learning_rate=learning_rate,
                                  depth = depth,
                                  l2_leaf_reg = l2_leaf_reg,
                                  min_child_samples = min_child_samples,
                                  grow_policy = grow_policy,
                                  silent = True)

        model.fit(X_train2, y_train2)

        y_pred2 = model.predict(X_valid2)

        accuracy.append(r2_score(y_valid2, y_pred2))
        # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
        # `squared` keyword is deprecated/removed in recent scikit-learn.
        loss.append(np.sqrt(mean_squared_error(y_valid2, y_pred2)))

    average_loss = np.mean(loss)
    average_score = np.mean(accuracy)
    trial.set_user_attr("cv_rmse", float(average_loss))

    # Informational: test-set R^2 of the model fitted on the last fold only.
    y_pred = model.predict(X_test)
    print(r2_score(y_test,y_pred))

    return average_score
    
# Search for the hyper-parameters that maximise the cross-validated R^2
# returned by objective().
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1000)

# Summary of the finished search.
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print("best r2 is :", study.best_trial.values)

In [262]:
# Hyper-parameters used for the final CatBoost model on the full descriptor set.
params = dict(
    learning_rate=0.08600000000000001,
    depth=10,
    l2_leaf_reg=5.0,
    min_child_samples=8,
    grow_policy='Depthwise',
)

In [None]:
# Fit on the whole training set with the hyper-parameters in `params`,
# then report R^2 on the held-out test set.
model = CatBoostRegressor(random_state=42, **params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(r2_score(y_test, y_pred))

# Topology model: geometric descriptors + one-hot encoded topology

In [283]:
import pandas as pd
import numpy as np
from pycaret.regression import *
from scipy.stats import gaussian_kde
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from utils import *
from descriptor import *
import warnings

from sklearn.preprocessing import OneHotEncoder

warnings.filterwarnings("ignore")

# Reload the feature tables; the first CSV column is used as the row index.
train_file = "../data/features/train_data.csv"
test_file = "../data/features/test_data.csv"
train_data, test_data = (
    pd.read_csv(csv_path, index_col=0) for csv_path in (train_file, test_file)
)

# Train stacked on top of test, so the encoder is fitted on the topology
# values of both splits.
data = pd.concat([train_data, test_data], axis=0)

# OneHotEncoder expects a 2-D input: one column holding the topology value.
topology = np.array(data["topology"]).reshape(-1, 1)
topology_encoder = OneHotEncoder()
# fit_transform returns a sparse matrix; densify it for later concatenation.
topology_en_data = topology_encoder.fit_transform(topology).toarray()

In [284]:
# `topology_en_data` was encoded from `data`, i.e. train_data stacked on top of
# test_data, so its first len(train_data) rows are the training rows. Deriving
# the boundary from the frame keeps the split correct if the CSVs change
# (it was previously hard-coded as 9261).
train_topology = topology_en_data[:len(train_data)]
test_topology = topology_en_data[len(train_data):]
assert len(test_topology) == len(test_data), "topology rows do not match test_data"

In [285]:
# Feature matrices for the topology model: geometric descriptor columns
# followed by the one-hot topology columns.
train_geometric = np.array(train_data[geometric])
test_geometric = np.array(test_data[geometric])
X_train = np.hstack([train_geometric, train_topology])
X_test = np.hstack([test_geometric, test_topology])

# Targets as 2-D arrays (one column per entry of `property_`).
y_train = np.array(train_data[property_])
y_test = np.array(test_data[property_])

# Untuned baseline, currently disabled:
# model = CatBoostRegressor(random_state=42)
# model.fit(X_train,y_train)
# y_pred_top = model.predict(X_test)
# print(r2_score(y_test,y_pred_top))

In [294]:
import optuna
from sklearn import datasets

def objective(trial,X_train = X_train, y_train = y_train):
    """Optuna objective: MAE of an 8000-iteration CatBoostRegressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``learning_rate``, ``depth``, ``l2_leaf_reg``,
        ``min_child_samples`` and ``grow_policy``.
    X_train, y_train : array-like
        Training features and target. The defaults are bound to the
        notebook-level ``X_train`` / ``y_train`` when this cell is executed.

    Returns
    -------
    float
        Mean absolute error on the notebook-level ``X_test`` / ``y_test``
        (the study minimises it). The value is also printed.

    NOTE(review): the score is computed on the test split, so the selected
    hyper-parameters are tuned against the test data and any test metric
    reported for them afterwards is optimistically biased. Consider scoring
    on a validation split or cross-validation of the training data instead.
    """
    # Imported here because no visible notebook cell imports it explicitly.
    from sklearn.metrics import mean_absolute_error

    learning_rate = trial.suggest_discrete_uniform("learning_rate", 0.001, 0.1, 0.005)
    depth = trial.suggest_int('depth', 5, 10)
    l2_leaf_reg = trial.suggest_discrete_uniform('l2_leaf_reg', 2.0, 30.0, 0.5)
    min_child_samples = trial.suggest_categorical('min_child_samples', [1, 4, 8, 16, 32])
    grow_policy = trial.suggest_categorical("grow_policy",["SymmetricTree", "Depthwise", "Lossguide"])

    model = CatBoostRegressor(random_state=42,
                              learning_rate=learning_rate,
                              depth = depth,
                              l2_leaf_reg = l2_leaf_reg,
                              min_child_samples = min_child_samples,
                              grow_policy = grow_policy,
                              silent = True,
                              iterations = 8000)

    model.fit(X_train, y_train)

    # Predict and score once (the prediction/MAE pair used to be computed twice).
    y_pred = model.predict(X_test)
    MAE = mean_absolute_error(y_test,y_pred)
    print(MAE)

    return MAE
    
# Search for the hyper-parameters that minimise the MAE returned by objective().
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials = 100)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
# The objective value of this study is MAE (minimised), not R^2.
print("best MAE is :",study.best_trial.values)

sklearn.ensemble._forest.RandomForestRegressor

In [None]:
# Hyper-parameters used for the final topology model.
params = dict(
    learning_rate=0.076,
    depth=9,
    l2_leaf_reg=3.0,
    min_child_samples=1,
    grow_policy='SymmetricTree',
)

# Fit with 8000 boosting iterations and report R^2 on the test arrays.
model = CatBoostRegressor(random_state=42, iterations=8000, **params)
model.fit(X_train, y_train)
y_pred_top = model.predict(X_test)
print(r2_score(y_test, y_pred_top))