In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, KFold
import warnings
from numpy import savetxt
from numpy import loadtxt
warnings.filterwarnings("ignore")
import pickle

In [None]:
from sklearn.metrics import r2_score
import optuna
from sklearn.ensemble import BaggingRegressor,AdaBoostRegressor,GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from sklearn.svm import SVR
from catboost import CatBoostRegressor, Pool
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
import math
import matplotlib.pyplot as plt
import shap

In [None]:
# Read the feature matrix and the target vector from comma-separated text files.
train = np.loadtxt('Data_Set/train.csv', delimiter=',')
label = np.loadtxt('Data_Set/label.csv', delimiter=',')

In [None]:
# GPU selection for CUDA-aware libraries (original note: see issue #152).
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "2",
})

In [None]:
# Hold out 15% of the samples as the test split; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    label,
    test_size=0.15,
    random_state=42,
)

In [None]:
def trainingWithKfolds():
    """Fit one CatBoost regressor per fold of a 6-fold split of ``train`` / ``label``.

    For every fold the held-out part is handed to CatBoost as ``eval_set`` and
    ``use_best_model=True`` is requested.

    Returns:
        list: the fitted ``CatBoostRegressor`` of each fold, in fold order.
        Earlier versions discarded every model and returned ``None``, so the
        cross-validation produced nothing a caller could inspect.
    """
    params = {'iterations': 16937, 'learning_rate': 0.0802044556274633, 'depth': 7, 'random_seed': 397,
              'metric_period': 407, 'od_wait': 148}
    k_fold = KFold(n_splits=6)
    fold_models = []
    for tr, tst in k_fold.split(train, label):
        # Fold-local names, so these are not mistaken for the notebook-level
        # X_train / X_test hold-out split.
        fold_X_train, fold_y_train = train[tr], label[tr]
        fold_X_val, fold_y_val = train[tst], label[tst]

        model_regressor = CatBoostRegressor(**params)
        model_regressor.fit(
            fold_X_train, fold_y_train,
            eval_set=(fold_X_val, fold_y_val),
            use_best_model=True,
            plot=False,
        )
        fold_models.append(model_regressor)

    return fold_models

In [None]:
def CatBoostRegressorTraining():
    """Fit a CatBoostRegressor on the notebook-level X_train / y_train and return it."""
    catboost_model = CatBoostRegressor(
        iterations=16937,
        learning_rate=0.0802044556274633,
        depth=7,
        random_seed=397,
        metric_period=407,
        od_wait=148,
    )
    catboost_model.fit(X_train, y_train)
    return catboost_model

In [None]:
def AdaBoostRegressorTraining():
    """Fit an AdaBoostRegressor on the notebook-level X_train / y_train and return it."""
    adaboost_model = AdaBoostRegressor(
        n_estimators=6,
        learning_rate=0.038071344583599565,
        random_state=47,
    )
    adaboost_model.fit(X_train, y_train)
    return adaboost_model

In [None]:
def KNeighborsRegressorTraining():
    """Fit a KNeighborsRegressor on the notebook-level X_train / y_train and return it."""
    knn_model = KNeighborsRegressor(
        n_neighbors=27,
        leaf_size=16,
        p=1,
        n_jobs=2,
    )
    knn_model.fit(X_train, y_train)
    return knn_model

In [None]:
def RandomForestRegressorTraining():
    """Fit a RandomForestRegressor on the notebook-level X_train / y_train and return it.

    No ``random_state`` is passed to the estimator here.
    """
    forest_model = RandomForestRegressor(
        criterion='friedman_mse',
        n_estimators=65,
        min_samples_leaf=0.3106657191461476,
        min_samples_split=0.11843070888391496,
        min_weight_fraction_leaf=0.19561280836043243,
        max_depth=49,
        n_jobs=4,
        ccp_alpha=0.7168576832158675,
    )
    forest_model.fit(X_train, y_train)
    return forest_model

In [None]:
def SVRTraining():
    """Fit an SVR on the notebook-level X_train / y_train and return it."""
    # NOTE(review): no `kernel` is passed, so the library default applies;
    # confirm that `coef0` and `degree` have any effect with that kernel.
    svr_model = SVR(
        coef0=5.427321055348509,
        tol=0.16431922654212472,
        epsilon=0.10050992331193838,
        C=2.8667365671378473,
        degree=7,
        max_iter=93,
        cache_size=337,
    )
    svr_model.fit(X_train, y_train)
    return svr_model

In [None]:
def GradientBoostingRegressorTraining():
    """Fit a GradientBoostingRegressor on the notebook-level X_train / y_train and return it.

    No ``random_state`` is passed to the estimator here.
    """
    gbr_model = GradientBoostingRegressor(
        learning_rate=0.053695566279521385,
        alpha=0.8265623491581545,
        loss='huber',
        criterion='friedman_mse',
        n_estimators=165,
        min_samples_leaf=0.0010995785640797642,
        min_samples_split=0.020181427769510703,
        min_weight_fraction_leaf=0.009433744467751696,
        max_depth=40,
        min_impurity_decrease=0.011243612527949207,
    )
    gbr_model.fit(X_train, y_train)
    return gbr_model

In [None]:
def showResults(model, modelName):
    """Score ``model`` on the notebook-level test split and persist the outcome.

    Computes R2, MSE, MAE and RMSE of ``model.predict(X_test)`` against ``y_test``,
    pickles the model to ``Saved_Models/<modelName>.sav``, writes the metrics
    dictionary to ``Results/<modelName>.txt`` and prints it.

    Args:
        model: fitted estimator exposing ``predict``.
        modelName: base file name (without extension) used for both output files.
    """
    # Predict once and reuse the result for every metric.
    y_ped = model.predict(X_test)
    MSE = mean_squared_error(y_test, y_ped)
    RMSE = math.sqrt(MSE)
    resultsDict = {'R2: ': r2_score(y_test, y_ped), 'MSE: ': MSE,
                   'MAE: ': mean_absolute_error(y_test, y_ped), 'RMSE: ': RMSE}

    filename = "Saved_Models/" + modelName + ".sav"
    name = "Results/" + modelName + ".txt"

    # Create the output folders on demand so a fresh checkout does not fail
    # with FileNotFoundError.
    for target in (filename, name):
        os.makedirs(os.path.dirname(target), exist_ok=True)

    # save the model to disk; the context manager closes the handle even if
    # pickling raises
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

    # write the metrics next to the saved model
    with open(name, "w") as f:
        f.write(str(resultsDict))

    print(resultsDict)

In [None]:
def drawPlots(model,modelName):
    """Scatter the model's test-set predictions against the measured values and save the figure.

    The test targets (``y_test``) go on the x axis and ``model.predict(X_test)``
    on the y axis, together with the ``y = x`` reference line. The figure is
    written to ``Visualization/<modelName>.png`` and then shown.

    Args:
        model: fitted estimator exposing ``predict``.
        modelName: used in the plot title and as the output file name.
    """
    y_ped = model.predict(X_test)

    # A dedicated figure avoids drawing onto whatever pyplot figure is current.
    fig, ax = plt.subplots()
    # x = measured, y = predicted, so the data agrees with the axis labels
    # (the two arrays used to be passed the other way round).
    ax.scatter(y_test, y_ped, s=10, marker="s", c=y_test, cmap=plt.get_cmap('plasma'))
    ax.plot(y_test, y_test)  # identity line: perfect predictions fall on it
    ax.set_xlabel('Experimental Bandgap \n a')
    ax.set_ylabel('Predicted Bandgap')
    ax.set_title(modelName + ' Bandgap')
    ax.grid(False)

    name = "Visualization/" + modelName + ".png"
    # Make sure the output folder exists before saving.
    os.makedirs(os.path.dirname(name), exist_ok=True)
    fig.savefig(name, dpi=400, transparent=True, bbox_inches="tight")
    plt.show()
    plt.close(fig)

In [None]:
def drawShapAnanlysis(model,modelName,features):
    """Draw a SHAP beeswarm plot for ``model`` on ``X_train`` and save it as a PDF.

    Args:
        model: fitted model passed to ``shap.Explainer``.
        modelName: base name of the output file, written to
            ``Visualization/<modelName>_shap.pdf`` (the argument used to be
            ignored, so every model targeted the same ``shap.pdf``).
        features: feature names forwarded to ``shap.Explainer``.
    """
    plt.clf()
    explainer = shap.Explainer(model, feature_names=features)
    shap_values = explainer(X_train)

    # show=False leaves the figure open for saving. With the default show=True
    # the call returns None, so chaining `.figure.savefig(...)` on it raised
    # AttributeError.
    shap.plots.beeswarm(shap_values, show=False)

    name = "Visualization/" + modelName + "_shap.pdf"
    os.makedirs(os.path.dirname(name), exist_ok=True)
    # plt.gcf() is used instead of the return value of beeswarm().
    plt.gcf().savefig(name, bbox_inches="tight")
    plt.show()


In [None]:
# drawShapAnanlysis(model,"CatBoostRegressor",features)

In [None]:
# Feature names handed to the SHAP explainer.
# NOTE(review): presumably in the same order as the columns of train.csv — confirm.
features = [
    'Cs', 'FA', 'MA',
    'Pb', 'Sn',
    'Br', 'I',
    'Thickness',
]

In [None]:
drawShapAnanlysis(model,"CatBoostRegressor",features)

In [None]:
modelCat = CatBoostRegressorTraining()
showResults(modelCat,"CatBoostRegressor")
drawPlots(modelCat,"CatBoostRegressor")

In [None]:
# drawShapAnanlysis(model,"CatBoostRegressor",features)

In [None]:
# Per-sample comparison of the test targets with the CatBoost predictions.
y_ped = modelCat.predict(X_test)
pred_y_df = pd.DataFrame({'Actual Value': y_test, 'Predicated Value': y_ped})
pred_y_df['Difference'] = y_test - y_ped

In [None]:
# Display the actual / predicted / difference table built in the previous cell.
pred_y_df

In [None]:
# AdaBoost: fit, then write metrics + pickled model and save the scatter plot.
model_name = "AdaBoostRegressor"
model = AdaBoostRegressorTraining()
showResults(model, model_name)
drawPlots(model, model_name)

In [None]:
# k-nearest neighbours: fit, then write metrics + pickled model and save the scatter plot.
model_name = "KNeighborsRegressor"
model = KNeighborsRegressorTraining()
showResults(model, model_name)
drawPlots(model, model_name)

In [None]:
# Random forest: fit, then write metrics + pickled model and save the scatter plot.
model_name = "RandomForestRegressor"
model = RandomForestRegressorTraining()
showResults(model, model_name)
drawPlots(model, model_name)

In [None]:
# Support vector regression: fit, then write metrics + pickled model and save the scatter plot.
model_name = "SVR"
model = SVRTraining()
showResults(model, model_name)
drawPlots(model, model_name)

In [None]:
# Gradient boosting: fit, then write metrics + pickled model and save the scatter plot.
model_name = "GradientBoostingRegressor"
model = GradientBoostingRegressorTraining()
showResults(model, model_name)
drawPlots(model, model_name)

In [None]:
# load the model from disk
# filename = "Saved_Models/" + "CatBoostRegressor" + ".sav"
# loaded_model = pickle.load(open(filename, 'rb'))