In [None]:
# For data generation
import sys
# !{sys.executable} -m pip install matplotlib --upgrade
import shutil
import os
import numpy as np
from glob import glob
import re
import pandas as pd
import pickle
#random
from time import perf_counter

# sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score,root_mean_squared_error,mean_absolute_error,mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV


#Plotting
import seaborn as sns
sns.set_style()
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams['figure.dpi'] = 200

import warnings
# warnings.filterwarnings("ignore", category=np.DeprecationWarning) 


from tqdm.auto import tqdm
# from tqdm.notebook import tqdm_notebook as tqdm

In [None]:
# Names of the feature columns to discard, one per line of ~/DDCASPT2/drop.txt.
# Newline characters are removed from every entry.
drop_list_path = os.path.join(os.path.expanduser('~'), 'DDCASPT2/drop.txt')
with open(drop_list_path, 'r') as d:
    dropfeat = [entry.replace('\n', '') for entry in d]

In [None]:
# with open('test_ind.pickle', 'rb') as handle:
#     test_ind = pickle.load(handle)

# with open('train_ind.pickle', 'rb') as handle:
#     train_ind = pickle.load(handle)

# NOTE: pickle.load can execute arbitrary code -- only use index files you trust.
with open('big_test_ind.pickle', 'rb') as handle:
    test_ind = [float(value) for value in pickle.load(handle)]

with open('big_train_ind.pickle', 'rb') as handle:
    train_ind = [float(value) for value in pickle.load(handle)]
# train_ind = train_ind[train_ind>=110]
# test_ind = test_ind[test_ind>=110]

# The stored split is discarded: both index lists are pooled and re-split 90/10
# with a fixed seed.
pooled_ind = train_ind + test_ind
train_ind, test_ind = train_test_split(pooled_ind, test_size=0.1, random_state=42)
print(len(train_ind), len(test_ind))


In [None]:
len(train_ind+test_ind)

In [None]:
# Maps the raw feature-column labels to the shorter labels used downstream.
# Every backslash is written as "\\" so that no literal relies on an invalid
# escape sequence (the original "\l" in the <qq|qq> entry triggered a
# SyntaxWarning on recent Python versions; the runtime value is unchanged).
rename = {'h$_{qq}^{0}$':'h$_{q}$',
'$(F_{q})_{0}$':'$F_{q}$',
'$(F_{q}^{\\text{SCF}})_{0}$':'$F_{q}^{\\text{SCF}}$',
'$(\\eta_{q})_{0}$':'$\\eta_{q}$',
'$(\\omega_{q})_{0}$':'$\\omega_{q}$',
'$(\\eta_{s})_{0}$':'$\\eta_{s}$',
'h$_{ss}^{0}$':'h$_{s}$',
'$(F_{s}^{\\text{SCF}})_{0}$':'$F_{s}^{\\text{SCF}}$',
'$(F_{s})_{0}$':'$F_{s}$',
'$(\\omega_{s})_{0}$':'$\\omega_{s}$',
'$(\\langle ss \\vert ss \\rangle)_{0}$':"$\\langle ss \\vert ss \\rangle$",
'$(\\langle qq \\vert qq \\rangle)_{0}$':"$\\langle qq \\vert qq \\rangle$",
'h$_{pp}^{0}$': '(h$_{p}$)$_{0}$',
'h$_{pp}^{1}$': '(h$_{p}$)$_{1}$',
'h$_{pp}^{2}$': '(h$_{p}$)$_{2}$',
'h$_{pp}^{3}$': '(h$_{p}$)$_{3}$',
'h$_{rr}^{0}$': '(h$_{r}$)$_{0}$',
'h$_{rr}^{1}$': '(h$_{r}$)$_{1}$',
'h$_{rr}^{2}$': '(h$_{r}$)$_{2}$',
'h$_{rr}^{3}$': '(h$_{r}$)$_{3}$',
'h$_{pq}^{0}$':'(h$_{pq}$)$_{0}$',
'h$_{pq}^{1}$':'(h$_{pq}$)$_{1}$',
'h$_{pq}^{2}$':'(h$_{pq}$)$_{2}$',
'h$_{pq}^{3}$':'(h$_{pq}$)$_{3}$',
'h$_{pr}^{0}$':'(h$_{pr}$)$_{0}$',
'h$_{pr}^{1}$':'(h$_{pr}$)$_{1}$',
'h$_{pr}^{2}$':'(h$_{pr}$)$_{2}$',
'h$_{pr}^{3}$':'(h$_{pr}$)$_{3}$',
'h$_{rs}^{0}$':'(h$_{rs}$)$_{0}$',
'h$_{rs}^{1}$':'(h$_{rs}$)$_{1}$',
'h$_{rs}^{2}$':'(h$_{rs}$)$_{2}$',
'h$_{rs}^{3}$':'(h$_{rs}$)$_{3}$',
'typ_0':'$type_0$',
'typ_1':'$type_1$',
'typ_2':'$type_2$',
'typ_3':'$type_3$'}

In [None]:
def grab_state_data(state,dropfeat):
    """Collect the per-geometry feature tables of one electronic state.

    Walks ``ANO-RCC-VTZP/ozone_<angle>/`` and reads
    ``ozone_<angle>_<state>.csv`` for every angle found in the module-level
    ``train_ind`` / ``test_ind`` lists.

    Parameters
    ----------
    state : int or str
        State label; interpolated into the CSV file name and into the
        bookkeeping tag ``state_<state>``.
    dropfeat : list of str
        Column names to drop from the concatenated tables. The entries
        'D$_{qs}$', 'F$_{qs}$', 'FA$_{qs}$' and 'FI$_{qs}$' are never dropped.
        The caller's list is not modified.

    Returns
    -------
    train_df, test_df : pandas.DataFrame
        Row-wise concatenation of the matching tables, with `dropfeat` removed
        and the columns renamed through the module-level ``rename`` mapping.
    recover_train, recover_test : list of tuple
        One ``(f"state_{state}", "<angle>", position, n_rows)`` entry per
        table, in concatenation order, so the flat rows can later be cut back
        into per-geometry chunks.

    Raises
    ------
    ValueError
        If no geometry matched the train angles or the test angles.
    """
    frames = {'train': [], 'test': []}
    recover = {'train': [], 'test': []}
    splits = (('train', train_ind), ('test', test_ind))

    # TZP: 1e-5
    # DZP: 1e-5
    # mb: 1e-3
    for basis_dir in sorted(glob('ANO-RCC-VTZP/')):
        # glob() returns entries in arbitrary order; sorting makes the row
        # order of the returned tables reproducible across machines.
        for geom_dir in sorted(glob(os.path.join(basis_dir,'ozone_*'))):
            geom_name = os.path.basename(geom_dir)
            # Directory names are "ozone_<angle>"; compare the suffix exactly
            # instead of searching the whole path for a substring, so "16.75"
            # can never match "ozone_116.75".
            angle_tag = geom_name.split('_', 1)[1]
            csv_path = os.path.join(geom_dir, geom_name+f'_{state}.csv')
            for split_name, angles in splits:
                for angle in angles:
                    if f"{angle:.2f}" != angle_tag:
                        continue
                    table = pd.read_csv(csv_path,index_col=0)
                    # table = table[table['Pair_Energies'].abs()>=1e-5]
                    recover[split_name].append(
                        (f"state_{state}", f"{angle:.2f}", len(frames[split_name]), len(table))
                    )
                    frames[split_name].append(table)

    for split_name in ('train', 'test'):
        if not frames[split_name]:
            raise ValueError(
                f"No 'ANO-RCC-VTZP/ozone_<angle>' directory matched any {split_name} angle for state {state}"
            )

    always_keep = ['D$_{qs}$', 'F$_{qs}$', 'FA$_{qs}$', 'FI$_{qs}$']
    dropfeat = [a for a in dropfeat if a not in always_keep]
    train_df = pd.concat(frames['train']).drop(columns=dropfeat).rename(columns=rename)
    test_df = pd.concat(frames['test']).drop(columns=dropfeat).rename(columns=rename)

    return train_df, test_df, recover['train'], recover['test']

In [None]:
# train_df, test_df, recover_train, recover_test = grab_state_data(1,dropfeat)
# train_df, test_df, recover_train, recover_test = grab_state_data(2,dropfeat)
# train_df, test_df, recover_train, recover_test = grab_state_data(3,dropfeat)

# Load states 1-3 in order; each entry is (train_df, test_df, recover_train, recover_test).
state_data = {state: grab_state_data(state, dropfeat) for state in (1, 2, 3)}

train_df_1, test_df_1, recover_train_1, recover_test_1 = state_data[1]
train_df_2, test_df_2, recover_train_2, recover_test_2 = state_data[2]
train_df_3, test_df_3, recover_train_3, recover_test_3 = state_data[3]

# Stack the three states; the recover lists are chained in the same state order
# so they keep describing the row layout of the stacked frames.
train_df = pd.concat([train_df_1, train_df_2, train_df_3])
test_df = pd.concat([test_df_1, test_df_2, test_df_3])

recover_train = recover_train_1 + recover_train_2 + recover_train_3
recover_test = recover_test_1 + recover_test_2 + recover_test_3

In [None]:
# 'Pair_Energies' is the regression target; every other column is a feature.
target_col = 'Pair_Energies'

X_train, y_train = train_df.drop(columns=[target_col]).values, train_df[target_col].values
X_test, y_test = test_df.drop(columns=[target_col]).values, test_df[target_col].values

In [None]:
y_train.shape[0] / (y_train.shape[0]+y_test.shape[0])

In [None]:
X_train.shape

In [None]:
# The scaling ranges are learned from the training features only and then
# applied unchanged to the test features.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from xgboost import XGBRegressor

# Each row of the sheet is read as a (parameter name, value) pair. The same file
# name is written from model.get_params() in a later cell, so the sheet must
# already exist when this cell runs.
param_sheet = pd.read_excel("ozone_VTZP_MS_ALL_90_10_split_params.xlsx")
kwargs = dict(param_sheet.values)

model = XGBRegressor(**kwargs)
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Train value first, test value second; RMSE is scaled by 1e3 for display.
r2_train, r2_test = r2_score(y_train, y_pred_train), r2_score(y_test, y_pred_test)
rmse_train = root_mean_squared_error(y_train, y_pred_train) * 1e3
rmse_test = root_mean_squared_error(y_test, y_pred_test) * 1e3
print(f"R2: {r2_train:.4f},{r2_test:.4f}")
print(f"RMSE (mEh): {rmse_train:.4f},{rmse_test:.4f}")

# params = {'max_depth': [1, 10, 100],
#           'n_estimators': [100, 500, 1000],
#           'reg_lambda': [1e-6, 1e-3,1e-1],
#           'reg_alpha': [1e-6, 1e-3,1e-1]}

# model = XGBRegressor()
# grid = GridSearchCV(estimator=model, 
#                    param_grid=params,
#                    scoring='r2', 
#                    verbose=100,n_jobs=12).fit(X_train,y_train)


# model=grid.best_estimator_
# y_pred_train=model.predict(X_train)
# y_pred_test=model.predict(X_test)
# print(f"R2: {r2_score(y_train,y_pred_train):.4f},{r2_score(y_test,y_pred_test):.4f}")
# print(f"RMSE (mEh): {root_mean_squared_error(y_train,y_pred_train)*1e3:.4f},{root_mean_squared_error(y_test,y_pred_test)*1e3:.4f}")
# pd.DataFrame.from_dict(model.get_params(),orient='index').dropna().to_excel("ozone_VTZP_MS_ALL_90_10_split_params.xlsx")

In [None]:
# Persist reference values and predictions for both splits as .npy files.
arrays_to_save = (
    ("y_train_ALL.npy", y_train),
    ("y_pred_train_ALL.npy", y_pred_train),
    ("y_test_ALL.npy", y_test),
    ("y_pred_test_ALL.npy", y_pred_test),
)
for npy_name, values in arrays_to_save:
    with open(npy_name, 'wb') as f:
        np.save(f, values)
# y_train,y_pred_train

In [None]:
# Write the fitted model's parameters (one row per parameter, rows containing
# NaN dropped) to the Excel file that the model-fitting cell reads its
# keyword arguments from.
pd.DataFrame.from_dict(model.get_params(),orient='index').dropna().to_excel("ozone_VTZP_MS_ALL_90_10_split_params.xlsx")

In [None]:
# Make sure the figure output folder exists. An already existing folder is
# accepted silently; any other failure (e.g. missing permissions) is raised
# here instead of being hidden by a bare `except` and resurfacing at savefig.
os.makedirs('images', exist_ok=True)

In [None]:
y_train.min()*1e3,y_train.max()*1e3

In [None]:
pairedcp = sns.color_palette("Paired")
font=12
# Use one point size for every text element of the figure.
for rc_group, rc_key in (
    ('font', 'size'),          # default text
    ('axes', 'titlesize'),     # axes title
    ('axes', 'labelsize'),     # x and y labels
    ('xtick', 'labelsize'),    # x tick labels
    ('ytick', 'labelsize'),    # y tick labels
    ('legend', 'fontsize'),    # legend
    ('figure', 'titlesize'),   # figure title
):
    plt.rc(rc_group, **{rc_key: font})

fig,(ax1,ax2)=plt.subplots(1,2,figsize=(13,6))
scale=1e-1
minL,maxL = -70, 30
diagonal = np.arange(-100,100)

# One parity panel per split: values are multiplied by 1e3 for plotting and the
# legend carries R^2 and MAE of that split.
parity_panels = (
    (ax1, 'Train', y_train, y_pred_train, pairedcp[0]),
    (ax2, 'Test', y_test, y_pred_test, pairedcp[1]),
)
for panel_ax, panel_title, y_ref, y_hat, face_color in parity_panels:
    panel_label = "R$^{2}$="+f'{r2_score(y_ref,y_hat):.4f}\nMAE={1e3*mean_absolute_error(y_ref,y_hat):.4f}'+" mE$_{h}$"
    panel_ax.set_title(panel_title)
    panel_ax.scatter(1e3*y_ref,1e3*y_hat,label=panel_label,color=face_color,edgecolors='k')
    panel_ax.plot(diagonal,diagonal,'k-')
    panel_ax.set_xlim(minL,maxL)
    panel_ax.set_ylim(minL,maxL)
    panel_ax.set_xlabel('Calculated Pair-Energies (mE$_{h}$)')
    panel_ax.set_ylabel('Predicted Pair-Energies (mE$_{h}$)')
    panel_ax.legend()

plt.tight_layout()
plt.savefig('images/ozone_VTZP_MS_ALL_90_10_split_parity.png',dpi=300,bbox_inches='tight')
plt.show()

In [None]:
def _regroup_by_geometry(recover, y_true, y_hat, verbose=False):
    """Cut flat target/prediction vectors back into per-(state, angle) chunks.

    Parameters
    ----------
    recover : list of tuple
        ``(tag, angle, position, n_rows)`` entries in the same order in which
        the rows were concatenated; ``tag`` has the form ``state_<label>``.
    y_true, y_hat : array-like
        Reference values and predictions, aligned row by row.
    verbose : bool
        Print every bookkeeping entry while iterating.

    Returns
    -------
    chunks : list of (state, angle, true_chunk, pred_chunk)
    sums : list of (state, angle, sum(true_chunk), sum(pred_chunk))
    n_rows_total : int
        Number of rows consumed.

    Raises
    ------
    ValueError
        If the chunk lengths do not add up to the vector length, or the two
        vectors differ in length (the bookkeeping and the data are out of sync).
    """
    chunks, sums = [], []
    stop = 0
    for tag, angle, position, n_rows in recover:
        if verbose:
            print(tag, angle, position, n_rows)
        start, stop = stop, stop + n_rows
        state = tag.split("_")[1]
        true_chunk, pred_chunk = y_true[start:stop], y_hat[start:stop]
        chunks.append((state, angle, true_chunk, pred_chunk))
        sums.append((state, angle, np.sum(true_chunk), np.sum(pred_chunk)))
    if len(y_true) != len(y_hat) or stop != len(y_true):
        raise ValueError(
            f"recover entries cover {stop} rows but the vectors hold {len(y_true)} / {len(y_hat)} values"
        )
    return chunks, sums, stop


recover_test_list, recover_test_E2, testcntrcvr = _regroup_by_geometry(
    recover_test, y_test, y_pred_test, verbose=True
)
recover_train_list, recover_train_E2, traincntrcvr = _regroup_by_geometry(
    recover_train, y_train, y_pred_train
)

In [None]:
def _stack_state(records, wanted_state):
    """Join all chunks of one state into a 2-row array (row 0: reference, row 1: prediction)."""
    stacked = []
    for state, angle, true, pred in records:
        if state == wanted_state:
            stacked.append(np.vstack((true, pred)))
    return np.hstack(stacked)


y_train_1, y_pred_train_1 = _stack_state(recover_train_list, '1')
y_test_1, y_pred_test_1 = _stack_state(recover_test_list, '1')

y_train_2, y_pred_train_2 = _stack_state(recover_train_list, '2')
y_test_2, y_pred_test_2 = _stack_state(recover_test_list, '2')

y_train_3, y_pred_train_3 = _stack_state(recover_train_list, '3')
y_test_3, y_pred_test_3 = _stack_state(recover_test_list, '3')

In [None]:
# Column labels shared with the downstream cells.
# NOTE: in these non-raw literals "\t" is a TAB character, so the stored labels
# contain TAB + "ext{...}" rather than the LaTeX command "\text{...}". They are
# kept exactly as written elsewhere in this notebook, because the later cells
# look the columns up with the same literals.
ANGLE_COL = 'Angle ($^{\\circ}$)'  # "\\c" avoids the invalid "\c" escape; same runtime value
E2_CALC_COL = 'E$_{2}^{\text{Calculated}}$'
E2_PRED_COL = 'E$_{2}^{\text{Predicted}}$'
CASPT2_CALC_COL = 'E$_{\text{CASPT2}}^{\text{Calculated}}$'
CASPT2_PRED_COL = 'E$_{\text{CASPT2}}^{\text{Predicted}}$'
CASSCF_CALC_COL = 'E$_{\text{CASSCF}}^{\text{Calculated}}$'


def _build_energy_table(records, set_label, energy_sheets):
    """Turn per-geometry E2 sums into a table with reference and predicted energies.

    Parameters
    ----------
    records : list of (state, angle, e2_calculated, e2_predicted)
    set_label : str
        Value written to the 'Set' column ('Train' or 'Test').
    energy_sheets : dict
        Cache mapping an angle label to its already loaded energies workbook,
        so each workbook is read once even though several states share it.

    Returns
    -------
    pandas.DataFrame
        Indexed by "<state>_<angle>". The predicted CASPT2 energy is the
        reference CASSCF energy plus the predicted E2 sum. The 'E2' column is
        only a zero placeholder (nothing fills it).
    """
    table = pd.DataFrame(records).rename(
        columns={0:"state",1:ANGLE_COL,2:E2_CALC_COL,3:E2_PRED_COL}
    )
    for placeholder in ('CASPT2_E', 'CASSCF_E', 'E2'):
        table[placeholder] = np.zeros(len(table))

    table.index = ["_".join((u,v)) for u,v in table[["state",ANGLE_COL]].values]

    for key in table.index:
        hn, r = key.split('_')
        if r not in energy_sheets:
            energy_sheets[r] = pd.read_excel(f"ANO-RCC-VTZP/ozone_{r}/ozone_{r}_energies.xlsx",index_col=0)
        root_row = energy_sheets[r].loc[f"root_{hn}"]
        table.loc[key,'CASPT2_E'] = root_row.loc['CASPT2_E']
        table.loc[key,'CASSCF_E'] = root_row.loc['CASSCF_E']

    table[CASPT2_PRED_COL] = table['CASSCF_E'] + table[E2_PRED_COL]
    table = table.rename(columns={'CASPT2_E':CASPT2_CALC_COL,'CASSCF_E':CASSCF_CALC_COL})
    table['Set'] = len(table)*[set_label]
    return table


_energy_sheets = {}
traincorrE2 = _build_energy_table(recover_train_E2, 'Train', _energy_sheets)
testcorrE2 = _build_energy_table(recover_test_E2, 'Test', _energy_sheets)

In [None]:
# Long format: one row per (state, angle, calculated-or-predicted CASPT2 energy),
# with angle and value cast to float.
melt_options = dict(
    id_vars=["state",'Angle ($^{\circ}$)'],
    value_vars=['E$_{\text{CASPT2}}^{\text{Calculated}}$','E$_{\text{CASPT2}}^{\text{Predicted}}$'],
)
melt_dtypes = {'Angle ($^{\circ}$)':float,'value':float}

traincaspt2melt = traincorrE2.melt(**melt_options).astype(melt_dtypes)
testcaspt2melt = testcorrE2.melt(**melt_options).astype(melt_dtypes)

In [None]:
traincaspt2melt

In [None]:
# Predicted CASPT2 energy of state 2 minus state 1 for the training rows whose
# angle label is "116.75", multiplied by 27.211407953 (presumably a
# Hartree -> eV factor; TODO confirm) and rounded to two decimals.
rows_at_angle = traincorrE2[traincorrE2['Angle ($^{\circ}$)'] == "116.75"]
predicted_col = 'E$_{\text{CASPT2}}^{\text{Predicted}}$'
upper_state = rows_at_angle.query("state == '2'")[predicted_col].values
lower_state = rows_at_angle.query("state == '1'")[predicted_col].values
np.round((upper_state - lower_state) * 27.211407953, 2)

In [None]:
# Predicted CASPT2 energy of state 3 minus state 1 for the training rows whose
# angle label is "116.75", multiplied by 27.211407953 (presumably a
# Hartree -> eV factor; TODO confirm) and rounded to two decimals.
rows_at_angle = traincorrE2[traincorrE2['Angle ($^{\circ}$)'] == "116.75"]
predicted_col = 'E$_{\text{CASPT2}}^{\text{Predicted}}$'
upper_state = rows_at_angle.query("state == '3'")[predicted_col].values
lower_state = rows_at_angle.query("state == '1'")[predicted_col].values
np.round((upper_state - lower_state) * 27.211407953, 2)

In [None]:
pairedcp=sns.color_palette('Paired')

In [None]:
# The angle labels are strings such as "116.75". Cast them to float *before*
# sorting so the rows are ordered numerically; sorting the strings first only
# gives the same order while every label has the same width.
# A stable sort keeps rows that share an angle in their concatenation order.
dfstacked = (
    pd.concat([traincorrE2,testcorrE2])
    .astype({'Angle ($^{\\circ}$)': float})
    .sort_values(by='Angle ($^{\\circ}$)', kind='mergesort')
)

In [None]:
# dfstacked.dtypes

In [None]:
# One sub-table per (state, split) combination.
train_MS1 = dfstacked.query("state == '1' and Set == 'Train'")
train_MS2 = dfstacked.query("state == '2' and Set == 'Train'")
train_MS3 = dfstacked.query("state == '3' and Set == 'Train'")

test_MS1 = dfstacked.query("state == '1' and Set == 'Test'")
test_MS2 = dfstacked.query("state == '2' and Set == 'Test'")
test_MS3 = dfstacked.query("state == '3' and Set == 'Test'")

In [None]:
e2_calc_key, e2_pred_key = 'E$_{2}^{\text{Calculated}}$', 'E$_{2}^{\text{Predicted}}$'


def _root_mae_label(root, calculated, predicted):
    """Legend text 'Root <n> MAE=<value> mE$_{h}$' with the MAE multiplied by 1e3."""
    return f'Root {root} '+f'MAE={1e3 * mean_absolute_error(calculated, predicted):.4f}'+" mE$_{h}$"


traincorrcalc1, traincorrpred1 = train_MS1[e2_calc_key], train_MS1[e2_pred_key]
traincorrcalc2, traincorrpred2 = train_MS2[e2_calc_key], train_MS2[e2_pred_key]
traincorrcalc3, traincorrpred3 = train_MS3[e2_calc_key], train_MS3[e2_pred_key]

traincorrlabel1 = _root_mae_label(1, traincorrcalc1, traincorrpred1)
traincorrlabel2 = _root_mae_label(2, traincorrcalc2, traincorrpred2)
traincorrlabel3 = _root_mae_label(3, traincorrcalc3, traincorrpred3)

testcorrcalc1, testcorrpred1 = test_MS1[e2_calc_key], test_MS1[e2_pred_key]
testcorrcalc2, testcorrpred2 = test_MS2[e2_calc_key], test_MS2[e2_pred_key]
testcorrcalc3, testcorrpred3 = test_MS3[e2_calc_key], test_MS3[e2_pred_key]

testcorrlabel1 = _root_mae_label(1, testcorrcalc1, testcorrpred1)
testcorrlabel2 = _root_mae_label(2, testcorrcalc2, testcorrpred2)
testcorrlabel3 = _root_mae_label(3, testcorrcalc3, testcorrpred3)

In [None]:
# Smallest and largest value of the predicted and calculated E2 columns,
# taken from the 'min' and 'max' rows of describe().
min_max = dfstacked[['E$_{2}^{\text{Predicted}}$','E$_{2}^{\text{Calculated}}$']].describe().loc[['min','max']]

In [None]:
# Reduce over the whole min/max table explicitly. np.min/np.max on a DataFrame
# defer to DataFrame.min/max(axis=None), whose result (one value per column vs.
# a single scalar) depends on the installed pandas version.
overall_min = min_max.to_numpy().min()
overall_max = min_max.to_numpy().max()

# Shift each bound by 1 % of its own value.
mincorr = overall_min + overall_min*0.01
maxcorr = overall_max - overall_max*0.01

In [None]:
traincorrE2.loc[:,"E$_{\text{CASPT2}}^{\text{Calculated}}$"].idxmin()

# Vertical Excitation Energies from state 1 to state 2

In [None]:
# Calculated CASPT2 energy of row '2_116.75' minus row '1_116.75' of the training
# table (raises KeyError if that geometry is not part of the training split),
# multiplied by 27.211407953 -- presumably a Hartree -> eV factor; TODO confirm.
((traincorrE2.loc['2_116.75',"E$_{\text{CASPT2}}^{\text{Calculated}}$"] - traincorrE2.loc['1_116.75',"E$_{\text{CASPT2}}^{\text{Calculated}}$"]))*27.211407953

In [None]:
# Predicted CASPT2 energy of row '2_116.75' minus row '1_116.75' of the training
# table (raises KeyError if that geometry is not part of the training split),
# multiplied by 27.211407953 -- presumably a Hartree -> eV factor; TODO confirm.
((traincorrE2.loc['2_116.75',"E$_{\text{CASPT2}}^{\text{Predicted}}$"] - traincorrE2.loc['1_116.75',"E$_{\text{CASPT2}}^{\text{Predicted}}$"]))*27.211407953

# Vertical Excitation Energies from state 1 to state 3

In [None]:
# Calculated CASPT2 energy of row '3_116.75' minus row '1_116.75' of the training
# table (raises KeyError if that geometry is not part of the training split),
# multiplied by 27.211407953 -- presumably a Hartree -> eV factor; TODO confirm.
((traincorrE2.loc['3_116.75',"E$_{\text{CASPT2}}^{\text{Calculated}}$"] - traincorrE2.loc['1_116.75',"E$_{\text{CASPT2}}^{\text{Calculated}}$"]))*27.211407953

In [None]:
# Predicted CASPT2 energy of row '3_116.75' minus row '1_116.75' of the training
# table (raises KeyError if that geometry is not part of the training split),
# multiplied by 27.211407953 -- presumably a Hartree -> eV factor; TODO confirm.
((traincorrE2.loc['3_116.75',"E$_{\text{CASPT2}}^{\text{Predicted}}$"] - traincorrE2.loc['1_116.75',"E$_{\text{CASPT2}}^{\text{Predicted}}$"]))*27.211407953

In [None]:
pairedcp = sns.color_palette("Paired")
# Even-indexed palette entries colour the training data, odd-indexed entries the test data.
train_colors, test_colors = pairedcp[::2], pairedcp[1::2]

In [None]:
font=12
plt.rc('font', size=font)          # controls default text sizes
plt.rc('axes', titlesize=font)     # fontsize of the axes title
plt.rc('axes', labelsize=font)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=font)    # fontsize of the tick labels
plt.rc('ytick', labelsize=font)    # fontsize of the tick labels
plt.rc('legend', fontsize=font)    # legend fontsize
plt.rc('figure', titlesize=font)  # fontsize of the figure title

# Column labels exactly as stored in dfstacked. In these non-raw literals "\t"
# is a TAB character, matching the literals used by the cells that created the
# columns; "\\c" only replaces the invalid "\c" escape (same runtime value).
angle_col = 'Angle ($^{\\circ}$)'
e2_calc_col = 'E$_{2}^{\text{Calculated}}$'
e2_pred_col = 'E$_{2}^{\text{Predicted}}$'
caspt2_calc_col = 'E$_{\text{CASPT2}}^{\text{Calculated}}$'
caspt2_pred_col = 'E$_{\text{CASPT2}}^{\text{Predicted}}$'
casscf_calc_col = 'E$_{\text{CASSCF}}^{\text{Calculated}}$'

# Pin the hue order: for string hue values seaborn otherwise follows the order
# of first appearance in the data, which would detach the hand-written
# "Root 1/2/3" legend labels from the colours they describe.
state_order = ['1', '2', '3']

train_rows = dfstacked.query("Set == 'Train'")
test_rows = dfstacked.query("Set == 'Test'")


def _pair_energy_panel(ax, pairs, colors, title):
    """Parity scatter of pair energies (values x 1e3), one series per root."""
    for root, ((y_ref, y_hat), color) in enumerate(zip(pairs, colors), start=1):
        label = f'Root {root}\n'+'R$^{2}$='+f'{r2_score(y_ref, y_hat):.4f}\nMAE={1e3 * mean_absolute_error(y_ref, y_hat):.4f}'+" mE$_{h}$"
        ax.scatter(1e3*y_ref,1e3*y_hat,color=color,edgecolors='k',label=label)
    ax.plot(np.arange(-100,100),np.arange(-100,100),'k')
    ax.set_xlabel('Calculated CASPT2 Pair-Energies (mE$_{h}$)')
    ax.set_ylabel('Predicted CASPT2 Pair-Energies (mE$_{h}$)')
    ax.set_title(title)
    ax.set_xlim(-35,35)
    ax.set_ylim(-35,35)
    ax.legend(loc=4)


def _correlation_panel(ax, rows, colors, legend_labels, limits):
    """Parity scatter of the summed E2 per geometry, coloured by state."""
    low, high = limits
    sns.scatterplot(data=rows,x=e2_calc_col,y=e2_pred_col,hue='state',hue_order=state_order,
                    palette=colors[:3],edgecolors='k',ax=ax)
    ax.plot(np.arange(-5,5),np.arange(-5,5),'k')
    ax.set_xlabel('Calculated CASPT2 Correlation Energy (E$_{h}$)')
    ax.set_ylabel('Predicted CASPT2 Correlation Energy (E$_{h}$)')
    handles, labels = ax.get_legend_handles_labels()
    # Replace seaborn's state legend by the per-root MAE labels.
    ax.legend(handles=handles, labels=legend_labels, loc=4)
    ticks = np.linspace(low,high,4)
    tick_text = [f"{i:.4f}" for i in ticks]
    ax.set_xlim(low, high)
    ax.set_ylim(low, high)
    ax.set_xticks(ticks,tick_text)
    ax.set_yticks(ticks,tick_text)
    return handles, labels


def _energy_curve_panel(ax, rows, colors, title):
    """Predicted CASPT2 points, calculated CASPT2 lines and CASSCF dashed lines vs. angle."""
    shared = dict(data=rows,x=angle_col,hue='state',hue_order=state_order,palette=colors[:3],ax=ax)
    sns.scatterplot(y=caspt2_pred_col,edgecolors='k',**shared)
    sns.lineplot(y=caspt2_calc_col,**shared)
    sns.lineplot(y=casscf_calc_col,linestyle='--',**shared)
    ax.set_xlabel('Angle ($^{\\circ}$)')
    ax.set_ylabel('CASPT2 Energy (E$_{h}$)')
    ax.set_title(title)
    handles, labels = ax.get_legend_handles_labels()
    # NOTE(review): the [:3] / [6:] slices assume the handles come back as
    # scatter, CASPT2 lines, CASSCF lines -- confirm against the rendered legend.
    # The calculated CASPT2 lines (entries 3:6) are deliberately left out of the legend.
    labels = [f"Predicted MS-CASPT2 (root {i})" for i in labels[:3]]+[f"SA-CASSCF (root {i})" for i in labels[6:]]
    handles = handles[:3] + handles[6:]
    ax.legend(handles=handles, labels=labels)
    return handles, labels


fig,((ax1,ax2),(ax3,ax4),(ax5,ax6))=plt.subplots(3,2,figsize=(15,20),sharey=False)

pred_train,true_train = train_rows[e2_pred_col],train_rows[e2_calc_col]
pred_test,true_test = test_rows[e2_pred_col],test_rows[e2_calc_col]
testrad = test_rows[angle_col]
trainrad = train_rows[angle_col]

# Row 1: pair-energy parity, train (left) and test (right).
_pair_energy_panel(
    ax1,
    ((y_train_1, y_pred_train_1), (y_train_2, y_pred_train_2), (y_train_3, y_pred_train_3)),
    train_colors,
    'Train',
)
_pair_energy_panel(
    ax2,
    ((y_test_1, y_pred_test_1), (y_test_2, y_pred_test_2), (y_test_3, y_pred_test_3)),
    test_colors,
    'Test',
)

# Row 2: correlation-energy parity. The right-hand panel shows the TEST rows
# (it previously re-plotted the training rows under the test MAE labels).
bottom, top = -0.65,-0.5
_correlation_panel(ax3, train_rows, train_colors,
                   [traincorrlabel1, traincorrlabel2, traincorrlabel3], (bottom, top))
handles, labels = _correlation_panel(ax4, test_rows, test_colors,
                                     [testcorrlabel1, testcorrlabel2, testcorrlabel3], (bottom, top))

# Row 3: energies along the angle coordinate.
handles3, labels3 = _energy_curve_panel(ax5, train_rows, train_colors, 'Train')
handles4, labels4 = _energy_curve_panel(ax6, test_rows, test_colors, 'Test')

minpt2, maxpt2= pd.concat([pred_train,true_train,pred_test,true_test]).min(), pd.concat([pred_train,true_train,pred_test,true_test]).max()
print(minpt2, maxpt2)
pct=0.004
bufferbot, buffertop = minpt2*pct, maxpt2*pct

#mb
# bottom, top = -224.5, -224.1984
#maxpt2-(buffertop)

# vdzp: -224.40
# bottom, top = -225.2, -224.4

# vtzp
bottom, top = -225.4, -224.4
print(bottom, top)
for curve_ax in (ax5, ax6):
    curve_ax.set_ylim(bottom,top)
    curve_ax.set_xlim(100,190)
    curve_ax.set_yticks(np.linspace(bottom,top,4),[f"{i:.4f}" for i in np.linspace(bottom,top,4)])
    curve_ax.set_xticks(np.linspace(100,190,4),[f"{i:.0f}" for i in np.linspace(100,190,4)])


fig.suptitle("O$_{3}$ MS-CASPT2/ANO-RCC-VTZP")

plt.subplots_adjust(wspace=0.05)
plt.tight_layout()
plt.savefig('images/ALL_MSCASPTCombinedozone.png',dpi=300,bbox_inches='tight')
plt.show()