In [44]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable
import random
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics

from pandas import set_option
pd.options.mode.chained_assignment = None

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from scipy.stats import truncnorm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

In [45]:
def ML_train(X_tr, y_tr, X_te, y_te):
    rstate = random.randrange(1,999999999,1)
    estimator = RandomForestRegressor(random_state = rstate, n_estimators = 10, n_jobs = 12)
    train = estimator.fit(X_tr, y_tr)
    pred = estimator.predict(X_te)
    
    # Cross validation 
    cvscore = cross_val_score(estimator, X_tr, y_tr, cv=5)
    score = cvscore.mean()
    print("Score with the entire dataset = %.2f" % score)
    
    print(mean_absolute_error(y_te, pred))
    print(mean_squared_error(y_te,pred))
    print(r2_score(y_te, pred))

In [2]:
charge_csv = pd.read_csv("cal.iRESP_good.dat", header=None, sep=' ')
names = np.asarray(charge_csv[0])
y = np.asarray(charge_csv[2])

In [3]:
import glob
from ase.io import read

loops = glob.glob("structures/*.pdb")
structures = list()
for loop in loops:
    name = loop.split('/')[1].split('.pdb')[0]
    if name in names:
        structure = read(loop)
        hvy_atoms = structure[structure.get_atomic_numbers()>1]
        structures.append(hvy_atoms)

In [4]:
j = list()
for i in structures:
    j.append(i.get_global_number_of_atoms())
n_max=sorted(j)[-1]

In [5]:
from dscribe.descriptors import CoulombMatrix
cm = CoulombMatrix(n_atoms_max=n_max)
X = cm.create(structures)

In [6]:
print(X.shape, y.shape)

(7817, 11664) (7817,)


In [7]:
from sklearn.ensemble import RandomForestRegressor
testsize = 0.3
rstate = random.randrange(1,999999999,1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = testsize, random_state = rstate)

In [48]:
X_train_total_size = X_train.shape[0]
chunk = 10
chunk_size = int(X_train_total_size / chunk)
for i in range(chunk):
    row = 1 + i*chunk_size + chunk_size
    sub_X_train = X_train[0:row,:]
    sub_y_train = y_train[0:row]
    ML_train(sub_X_train,sub_y_train,X_test,y_test)

Score with the entire dataset = -0.34
0.09109542536942314
0.01915143166274482
-0.11619572242558873
Score with the entire dataset = -0.25
0.09249389693703568
0.019394609305074348
-0.1303687539220162
Score with the entire dataset = -0.18
0.09287427930689184
0.019456138289563895
-0.1339548246921729
Score with the entire dataset = -0.21
0.0933280816147715
0.019497479931578493
-0.13636432413782051
Score with the entire dataset = -0.14
0.09347206263954858
0.019568876557791985
-0.14052550697670707
Score with the entire dataset = -0.15
0.0942183119815559
0.020233642831138918
-0.17926983083663384
Score with the entire dataset = -0.17
0.0947201664604577
0.020312354597867992
-0.18385735927183
Score with the entire dataset = -0.21
0.09482898691626408
0.02037337718548215
-0.1874139159024637
Score with the entire dataset = -0.16
0.09423634565420372
0.019701417909284022
-0.1482503649499991
Score with the entire dataset = -0.19
0.09302383329458265
0.019769009904341397
-0.1521898038954448


In [None]:
rstate = random.randrange(1,999999999,1)
estimator = RandomForestRegressor(random_state = rstate, n_estimators = 10, n_jobs = 12)
train = estimator.fit(X_train, y_train)
pred = estimator.predict(X_test)

In [None]:
# Cross validation 
from sklearn.model_selection import cross_val_score
cvscore = cross_val_score(estimator, X_train, y_train, cv=5)
score = cvscore.mean()
print("Score with the entire dataset = %.2f" % score)

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
print(mean_absolute_error(y_test, pred))
print(mean_squared_error(y_test,pred))
print(r2_score(y_test, pred))

In [None]:
# Plot outputs
import pylab as pl
fig = pl.figure()
ax = fig.add_subplot(111)
ax.set_aspect('equal', adjustable='box')
pl.plot(y_test, pred,'ro',fillstyle='none')
pl.xlim(1,2.3)
pl.ylim(1,2.3)
pl.plot([1,3],[1,3], '--b')
pl.xlabel('i-RESP charge')
pl.ylabel('ML charge')
pl.show()

In [None]:
data_size = len(y_test)
fig = pl.figure()
ax = fig.add_subplot(111)
ax.set_xlim(0, int(data_size*testsize +3))
ax.set_ylim(1, 3)
pl.plot(y_test, color='red', label = 'i-RESP charge')
pl.plot(pred, color='blue', label = 'ML charge')
pl.legend(loc='best')
pl.show()

In [None]:
data_size = len(y_train)
fig = pl.figure()
ax = fig.add_subplot(111)
ax.set_xlim(0, int(data_size))
ax.set_ylim(1, 3)
pl.plot(y_train, color='red', label = 'i-RESP charge')
pl.plot(estimator.predict(X_train), color='blue', label = 'ML charge')
pl.legend(loc='best')
pl.show()