In [14]:
from __future__ import print_function

%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [23]:
# Load the combined entropy/descriptor table.
df = pd.read_csv("combined-entropy.csv")
# Drop every row that has a missing value in any column.
df.dropna(inplace=True)
# 93543 molecules left
# A molecule listed twice would be counted twice in any fit on df, so report
# (without removing) rows whose SMILES string repeats an earlier row.
num_duplicate_smiles = int(df['SMILES'].duplicated().sum())
if num_duplicate_smiles:
    print("Warning: {} rows repeat an earlier SMILES string".format(num_duplicate_smiles))
# log(count + 1) rather than log(count): the counts can be zero.
df['logRotors'] = np.log(df['NumRotors'] + 1)
df['logMethyl'] = np.log(df['NumMethyl'] + 1)

In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of df.
df.describe()

Unnamed: 0,Entropy,NumRotors,NumMethyl,NumAmine,NumHydroxyl,HDonors,HAcceptors,RingCount,NumAromaticRings,ExactMolWt,...,Eccentricity,InertialShapeFactor,RadiusOfGyration,SpherocityIndex,ConfUnder1,ConfUnder2,ConfUnder3,ConfUnder4,ConfUnder5,ConfUnder6
count,93543.0,93543.0,93543.0,93543.0,93543.0,93543.0,93543.0,93543.0,93543.0,93543.0,...,93543.0,93543.0,93543.0,93543.0,93543.0,93543.0,93543.0,93543.0,93543.0,93543.0
mean,34.453231,4.574859,2.32322,0.137178,0.842457,1.531574,5.129416,3.950921,2.308682,408.313212,...,0.947388,0.000902,4.294947,0.172407,7.395775,19.505575,36.974995,58.744599,83.798104,111.117967
std,11.851837,2.636964,2.04187,0.437735,1.725989,1.756476,2.880881,1.74043,1.580702,113.309995,...,0.058016,0.001575,0.946011,0.132832,11.475761,31.780216,60.248477,95.155245,135.344965,177.385966
min,0.004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,56.0626,...,0.169475,1.3e-05,1.016678,0.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,27.086,3.0,1.0,0.0,0.0,0.0,3.0,3.0,1.0,333.157623,...,0.924252,0.00033,3.642106,0.081008,2.0,4.0,6.0,8.0,10.0,11.0
50%,36.077,4.0,2.0,0.0,0.0,1.0,5.0,4.0,2.0,420.178418,...,0.967986,0.000567,4.279916,0.140211,4.0,10.0,17.0,26.0,36.0,46.0
75%,42.9925,6.0,3.0,0.0,1.0,2.0,7.0,5.0,3.0,474.095082,...,0.988329,0.000973,4.919895,0.229202,9.0,23.0,44.0,71.0,102.0,138.0
max,68.287,20.0,26.0,9.0,14.0,15.0,30.0,64.0,58.0,2039.240573,...,0.999999,0.16228,12.0509,0.989008,495.0,981.0,1445.0,1957.0,3171.0,4437.0


In [24]:
# Show every column label in df, including the derived logRotors / logMethyl columns.
df.columns

Index(['Category', 'File', 'SMILES', 'Entropy', 'NumRotors', 'NumMethyl',
       'NumAmine', 'NumHydroxyl', 'HDonors', 'HAcceptors', 'RingCount',
       'NumAromaticRings', 'ExactMolWt', 'MaxAbsPartialChg',
       'MinAbsPartialChg', 'MaxPartialChg', 'MinPartialChg', 'TPSA',
       'LabuteASA', 'MolMR', 'MolLogP', 'EState_VSA1', 'EState_VSA2',
       'EState_VSA3', 'EState_VSA4', 'EState_VSA5', 'HallKierAlpha', 'BertzCT',
       'BalabanJ', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'FractionCSP3',
       'Asphericity', 'Eccentricity', 'InertialShapeFactor',
       'RadiusOfGyration', 'SpherocityIndex', 'ConfUnder1', 'ConfUnder2',
       'ConfUnder3', 'ConfUnder4', 'ConfUnder5', 'ConfUnder6', 'logRotors',
       'logMethyl'],
      dtype='object')

In [25]:
# Build a formula string with Entropy on the left-hand side and every other
# descriptor column as an additive term on the right-hand side.
# The identifier columns and the response are excluded by name instead of by
# position, so a reordered CSV cannot put Entropy or an ID column among the
# predictors; Index.drop raises KeyError if any of these labels is missing.
non_predictors = ['Category', 'File', 'SMILES', 'Entropy']
formula = 'Entropy~' + '+'.join(df.columns.drop(non_predictors))
print(formula)

Entropy~NumRotors+NumMethyl+NumAmine+NumHydroxyl+HDonors+HAcceptors+RingCount+NumAromaticRings+ExactMolWt+MaxAbsPartialChg+MinAbsPartialChg+MaxPartialChg+MinPartialChg+TPSA+LabuteASA+MolMR+MolLogP+EState_VSA1+EState_VSA2+EState_VSA3+EState_VSA4+EState_VSA5+HallKierAlpha+BertzCT+BalabanJ+Ipc+Kappa1+Kappa2+Kappa3+FractionCSP3+Asphericity+Eccentricity+InertialShapeFactor+RadiusOfGyration+SpherocityIndex+ConfUnder1+ConfUnder2+ConfUnder3+ConfUnder4+ConfUnder5+ConfUnder6+logRotors+logMethyl


In [26]:
# Fit Entropy against the raw rotor and methyl counts, then print the fit report.
raw_count_model = ols(formula='Entropy~NumRotors+NumMethyl', data=df)
lm = raw_count_model.fit()
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:                Entropy   R-squared:                       0.589
Model:                            OLS   Adj. R-squared:                  0.589
Method:                 Least Squares   F-statistic:                 6.690e+04
Date:                Tue, 01 Oct 2019   Prob (F-statistic):               0.00
Time:                        16:48:15   Log-Likelihood:            -3.2248e+05
No. Observations:               93543   AIC:                         6.450e+05
Df Residuals:                   93540   BIC:                         6.450e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     19.5442      0.055    358.530      0.0

In [27]:
# Same two predictors as log(count + 1) columns; lm is rebound to this fit.
log_count_formula = 'Entropy~logRotors+logMethyl'
lm = ols(log_count_formula, df).fit()
log_count_report = lm.summary()
print(log_count_report)

                            OLS Regression Results                            
Dep. Variable:                Entropy   R-squared:                       0.671
Model:                            OLS   Adj. R-squared:                  0.671
Method:                 Least Squares   F-statistic:                 9.556e+04
Date:                Tue, 01 Oct 2019   Prob (F-statistic):               0.00
Time:                        16:48:39   Log-Likelihood:            -3.1196e+05
No. Observations:               93543   AIC:                         6.239e+05
Df Residuals:                   93540   BIC:                         6.240e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     10.9512      0.074    148.634      0.0