In [None]:
# For data generation
import sys
# !{sys.executable} -m pip install matplotlib --upgrade
import shutil
import os
import numpy as np
from glob import glob
import re
import pandas as pd
import pickle
#random
from time import perf_counter

# sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score,root_mean_squared_error,mean_absolute_error,mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV


#Plotting
import seaborn as sns
# Called without arguments, so no explicit seaborn style name is selected here.
sns.set_style()
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default resolution (dots per inch) for figures created after this point.
mpl.rcParams['figure.dpi'] = 200

import warnings
# warnings.filterwarnings("ignore", category=np.DeprecationWarning) 


from tqdm.auto import tqdm
# from tqdm.notebook import tqdm_notebook as tqdm

In [None]:
# Interactive reference lookup: prints the documentation of pd.melt, the reshaping
# operation applied to the per-file energy tables in the loading cell below.
help(pd.melt)

In [None]:
# Collect the CASSCF/CASPT2 energies of every matching spreadsheet as long-format frames.

# Raw string: '\c' is not a valid escape sequence, so the non-raw spelling emits a
# SyntaxWarning on Python >= 3.12. The resulting column name is unchanged.
angle_col = r'Angle ($^{\circ}$)'

energies = []
# glob() returns paths in arbitrary order; sorting makes the row order reproducible.
for path in sorted(glob("ANO-RCC-VTZP/ozone_*/*_energies.xlsx")):
    df = pd.read_excel(path).rename(columns={'Unnamed: 0':'Root','CASSCF_E':'CASSCF', 'CASPT2_E':'CASPT2'})
    # The first spreadsheet column holds underscore-separated labels; the integer
    # after the first underscore is kept as the root number.
    df['Root'] = [int(label.split('_')[1]) for label in df['Root']]
    # The angle is the second underscore-separated token of the file name.
    df[angle_col] = float(os.path.basename(path).split('_')[1])
    # Long format: one row per (angle, root, method); the method name goes to
    # 'variable' and the energy to 'value'.
    df = df.melt(id_vars=[angle_col,'Root'],value_vars=['CASSCF', 'CASPT2'])
    energies.append(df)

In [None]:
# Stack the per-file frames into a single table. Each frame in `energies` carries its
# own 0-based index, so ignore_index=True is used to give the combined table one
# unique 0..N-1 index instead of labels that repeat once per file.
df_energies = pd.concat(energies, ignore_index=True)

In [None]:
# Load the pickled test/train index collections and store every entry as a float.
# NOTE: pickle.load can execute arbitrary code; only use it on files you trust.
with open('big_test_ind.pickle', 'rb') as test_file:
    test_ind = pickle.load(test_file)

with open('big_train_ind.pickle', 'rb') as train_file:
    train_ind = pickle.load(train_file)

# Previously tried and currently disabled: keeping only indices >= 110, and
# re-splitting the pooled indices with train_test_split(test_size=0.1, random_state=42).
train_ind = [float(value) for value in train_ind]
test_ind = [float(value) for value in test_ind]

# Report the size of each split.
print(len(train_ind),len(test_ind))   

In [None]:
def _read_labelled_energies(set_name, root):
    """Read one '<set>_ozone_VTZP_MS_<root>_energies.xlsx' table and label it.

    Parameters
    ----------
    set_name : str
        'Train' or 'Test'. Its lower-case form is the file-name prefix and the
        value itself is stored in the 'Set' column.
    root : int
        Root number; used in the file name and stored in the 'Root' column.

    Returns
    -------
    pandas.DataFrame
        The spreadsheet contents with constant 'Set' and 'Root' columns appended.
    """
    frame = pd.read_excel(f'{set_name.lower()}_ozone_VTZP_MS_{root}_energies.xlsx')
    # Scalars are broadcast to every row, so no per-row list has to be built.
    return frame.assign(Set=set_name, Root=root)


train_MS1 = _read_labelled_energies('Train', 1)
test_MS1 = _read_labelled_energies('Test', 1)

train_MS2 = _read_labelled_energies('Train', 2)
test_MS2 = _read_labelled_energies('Test', 2)

train_MS3 = _read_labelled_energies('Train', 3)
test_MS3 = _read_labelled_energies('Test', 3)

In [None]:
# Combine the train and test tables of all three roots into one frame.
# The last entry is test_MS3: listing train_MS3 twice duplicated those rows and left
# the root-3 test rows out entirely.
# ignore_index=True replaces the repeated per-table row labels with one unique index.
dfstacked = pd.concat(
    [train_MS1, test_MS1, train_MS2, test_MS2, train_MS3, test_MS3],
    ignore_index=True,
)

In [None]:
# stackeddf = pd.concat([pd.read_excel(i).rename(columns={'Unnamed: 0':"root"}) for i in glob('ANO-RCC-VTZP/ozone_*/*_energies.xlsx')])

In [None]:
# Inspect only the rows whose 'variable' column equals 'CASPT2'; the same filter is
# used for the line plot at the end of the notebook.
df_energies.query("variable == 'CASPT2'")

In [None]:
# Preview six colours of the 'Paired' palette; the plots below request three of them.
sns.color_palette('Paired',6)

In [None]:
# Show the exact column names of dfstacked; the plotting cells below select columns
# by these names.
dfstacked.columns

In [None]:
# Joint plot of the calculated vs. predicted CASPT2 columns of dfstacked, coloured by Root.
# NOTE(review): the column names are NOT raw strings, so each '\t' of '\text' is a TAB
# character at runtime. This lookup therefore only succeeds if the spreadsheet headers
# contain the same TABs - confirm against dfstacked.columns before changing the literals.
sns.jointplot(dfstacked,x='E$_{\text{CASPT2}}^{\text{Calculated}}$',y='E$_{\text{CASPT2}}^{\text{Predicted}}$',hue='Root',palette=sns.color_palette('Paired',3))


In [None]:
# Overlay the predicted CASPT2 energies from dfstacked (markers; marker style encodes
# the Train/Test set) on the CASPT2 energies from df_energies (lines), both vs. angle.

# Raw string: '\c' is not a valid escape sequence, so the non-raw spelling emits a
# SyntaxWarning on Python >= 3.12. The resulting column name is unchanged.
angle_col = r'Angle ($^{\circ}$)'
# NOTE(review): intentionally left as a non-raw literal so the column lookup is unchanged.
# Each '\t' of '\text' is a TAB here - confirm against dfstacked.columns before
# converting this to a raw string.
predicted_col = 'E$_{\text{CASPT2}}^{\text{Predicted}}$'
root_palette = sns.color_palette('Paired',3)

# Keep the Axes returned by the first plot and pass it on explicitly, instead of
# relying on both calls happening to draw on the current axes.
ax = sns.scatterplot(dfstacked,x=angle_col,y=predicted_col,hue='Root',style='Set',palette=root_palette)
sns.lineplot(df_energies.query("variable == 'CASPT2'"),x=angle_col,y='value',hue='Root',palette=root_palette,ax=ax)
# The axis label is set by hand with a raw string so that '\text' reaches matplotlib intact.
ax.set_ylabel(r'E$_{\text{CASPT2}}^{\text{Predicted}}$')