# Predicting Melting Points of Molecules

### Import

In [1]:
from smdt import datasets
from smdt import data_processing
from smdt import molecular_descriptors
import pandas as pd

### Data Reading

In [2]:
data = datasets.melting_point()[:20]
print('n_rows: %d'%(data.shape[0]))
print('n_columns: %d'%(data.shape[1]))
print('column names: %s' %list(data.columns))
print(data.head())

n_rows: 20
n_columns: 2
column names: ['SMILES', 'Target']
                                              SMILES  Target
0                                     O=C1Cc2ccccc21    14.0
1  Clc1ccc(cc1)C1c2c(OC(N)=C1C#N)[nH][nH0]c2C(F)(F)F    20.5
2                      O=C(OC)C(=Cc1ccccc1)Cc1ccccc1    27.5
3                         FC(F)(F)c1[nH0]cc2ccccc2c1    30.5
4                                O=C(OC1Cc2ccccc21)C    31.0


### Calculate Molecular Descriptors

In [4]:
data = molecular_descriptors.getAllDescriptors(data)
data.to_csv('Data_with_Molecular_Descriptors.csv', index=False)


Calculating Molecular Descriptors...
Row 20 out of 20
Calculating Molecular Descriptors Completed.


In [4]:
data = pd.read_csv('Data_with_Molecular_Descriptors.csv')
print('n_rows: %d'%(data.shape[0]))
print('n_columns: %d'%(data.shape[1]))
print(data.head())

n_rows: 20
n_columns: 760
        W        AW         J         Xu      GMTI   Pol         DZ       Ipc  \
0    82.0  2.277778  2.634238   8.652965  2.568202   9.0  19.000000  2.157035   
1  1046.0  4.134387  2.257916  21.532542  3.640283  40.0  53.833333  5.213381   
2   742.0  4.339181  2.267054  19.046127  3.490801  24.0  40.000000  4.438837   
3   288.0  3.164835  2.627008  13.842646  3.076640  20.0  33.000000  3.208180   
4   203.0  3.075758  2.146253  12.143010  2.928908  13.0  26.000000  2.748027   

    BertzCT      Thara   ...    ATSe8  ATSp1  ATSp2  ATSp3  ATSp4  ATSp5  \
0  2.419997  20.200000   ...    0.000  2.347  2.477  2.187  1.363  0.375   
1  2.920743  86.101190   ...    2.652  3.044  3.387  3.444  3.413  3.332   
2  2.745333  58.915873   ...    2.398  2.963  3.183  3.103  2.987  3.027   
3  2.664754  39.761905   ...    0.000  2.581  2.841  2.760  2.318  1.877   
4  2.501565  30.352381   ...    0.000  2.515  2.641  2.515  2.177  1.851   

   ATSp6  ATSp7  ATSp8  Target

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from smdt import utils
from sklearn.svm import SVR
import numpy as np

def svr_model(data, standardize = True, feature_selection = 'univariate', n_features = 10, cv_metric = 'r2'):
    """
    Train a Support Vector Regression model
    Papameters:
        data: pandas.DataFrame
            Descriptor and Target data
        standardize: boolean, default True
            Scales features to zero mean and unit variance
        feature_selection: str, default 'univariate feature selection'
            Specify strategy for feature selection. Choose between 'remove_low_variance_features','univariate feature selection',
            'tree_based_feature_selection'
        n_features: int, default 10
            Used only in univariate feature selection
        cv_metric: str, default 'r2'
    """
    print('Found dataset of shape:  %s' %str(data.shape))
    print('\nData Split started...')
    train, test = utils.test_train_split(data)
    print('Train data shape: %s' %str(train.shape))
    print('Test data shape: %s' %str(test.shape))
    print('Data Split completed.')

    if standardize == True:
        print('\nData Scaling started...')    
        train, test = data_processing.data_standardization(train, test)
        print('Data Scaling completed.')    

    print('\nSelecting Features...')
    if feature_selection == 'low variance':
        train = data_processing.remove_low_variance_features(train)
        test = test[train.columns]
    elif feature_selection == 'univariate':
        train = data_processing.univariate_feature_selection(train,n_features)
        test = test[train.columns]
    elif feature_selection == 'tree based':
        train = data_processing.tree_based_feature_selection(train)
        test = test[train.columns]
    print('New train data shape: %s' %str(train.shape))
    print('New test data shape: %s' %str(test.shape))
    print('Feature selection completed.')

    train_descriptors, train_target = utils.descriptor_target_split(train)
    test_descriptors, test_target = utils.descriptor_target_split(test)
    
    parameters = {'kernel':('linear', 'poly', 'rbf', 'sigmoid'), 'epsilon' : [0.1, 1], 'C': [1e0, 1e1, 1e2, 1e3],
                               'gamma': np.logspace(-2, 2, 5)}
    model = SVR()
    print('\nGridSearchCV Parameter Grid:')
    print(parameters)
    print('\nStarted GridSearchCV on Training data...')
    clf = GridSearchCV(model, parameters,scoring=cv_metric,cv=10,refit=True).fit(np.array(train_descriptors),train_target.values.ravel())
    print('GridSearchCV completed.')

    print('\nBest Estimator:')
    print('parameters: %s'%clf.best_estimator_)
    print('Mean cross-validated %s score of the best estimator: %.3f'%(cv_metric,clf.best_score_))
    print('\nModel Validation on Test data:')
    y_pred = clf.predict(test_descriptors)
    
    metric = {}
    metric['mean squared error'] = round(metrics.regression.mean_squared_error(test_target, y_pred),3)
    metric['r2'] = round(metrics.regression.r2_score(test_target, y_pred))
    metric['mean absolute error'] = round(metrics.regression.mean_absolute_error(test_target, y_pred))
    metric['explained r2'] = round(metrics.regression.explained_variance_score(test_target, y_pred))
    metric['mean squared log error'] = round(metrics.regression.mean_squared_log_error(test_target, y_pred))
    metric['median absolute error'] = round(metrics.regression.median_absolute_error(test_target, y_pred))

    print(metric)
    return clf, metric, list(train_descriptors.columns)

In [19]:
a = svr_model(data, cv_metric='r2', feature_selection='tree based')

Found dataset of shape:  (20, 760)

Data Split started...
Train data shape: (15, 760)
Test data shape: (5, 760)
Data Split completed.

Data Scaling started...
Data Scaling completed.

Selecting Features...
New train data shape: (15, 40)
New test data shape: (5, 40)
Feature selection completed.

GridSearchCV Parameter Grid:
{'kernel': ('linear', 'poly', 'rbf', 'sigmoid'), 'epsilon': [0.1, 1], 'C': [1.0, 10.0, 100.0, 1000.0], 'gamma': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])}

Started GridSearchCV on Training data...
GridSearchCV completed.

Best Estimator:
parameters: SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=100.0,
  kernel='sigmoid', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
Mean cross-validated r2 score of the best estimator: -9.009

Model Validation on Test data:
{'mean squared error': 18.533, 'r2': -0.0, 'mean absolute error': 3.0, 'explained r2': -0.0, 'mean squared log error': 0.0, 'median absolute error': 3.0}
