# Predicting Melting Points of Molecules

### Import

In [1]:
from smdt import datasets
from smdt import molecular_descriptors
from smdt import models
import pandas as pd

### Data Reading

In [26]:
# Load the bundled melting-point dataset and print a quick summary of it.
data = datasets.melting_point()
n_rows, n_columns = data.shape  # a DataFrame shape is always (rows, columns)
print('n_rows: %d' % n_rows)
print('n_columns: %d' % n_columns)
print('column names: %s' % list(data.columns))
print(data.head())

n_rows: 4450
n_columns: 2
column names: ['SMILES', 'Target']
                                              SMILES  Target
0                                     O=C1Cc2ccccc21    14.0
1  Clc1ccc(cc1)C1c2c(OC(N)=C1C#N)[nH][nH0]c2C(F)(F)F    20.5
2                      O=C(OC)C(=Cc1ccccc1)Cc1ccccc1    27.5
3                         FC(F)(F)c1[nH0]cc2ccccc2c1    30.5
4                                O=C(OC1Cc2ccccc21)C    31.0


In [37]:
from rdkit import Chem
def smiles_validation(data):
    """Remove rows whose SMILES string RDKit cannot turn into a usable molecule.

    A row is kept only when ``Chem.MolFromSmiles`` yields a molecule and
    ``Chem.GetDistanceMatrix`` succeeds on that molecule.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'SMILES'`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the invalid rows removed and a fresh
        0..n-1 index. The previous index is discarded instead of being
        inserted as an extra ``'index'`` column, so the column set is
        unchanged and calling the function again is harmless.

    Raises
    ------
    KeyError
        If ``data`` has no ``'SMILES'`` column.
    """
    def _is_valid(smiles):
        # One-line purpose: True when RDKit can parse `smiles` and compute
        # its topological distance matrix.
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                # RDKit reports an unparsable string by returning None
                # rather than raising.
                return False
            Chem.GetDistanceMatrix(mol)
        except Exception:
            # `Exception`, not a bare `except`, so Ctrl-C / SystemExit still
            # stop a long validation run.
            return False
        return True

    # Collect every offending label first and drop them in a single call,
    # instead of rebuilding the frame once per bad row.
    invalid_labels = [
        label for label, smiles in data['SMILES'].items()
        if not _is_valid(smiles)
    ]
    data.drop(invalid_labels, inplace=True)
    data.reset_index(drop=True, inplace=True)
    return data

In [38]:
# Filter out rows whose SMILES cannot be processed by RDKit. smiles_validation
# edits the frame in place and returns that same object, so this rebinding
# keeps `data` pointing at the filtered frame.
data = smiles_validation(data)

### Calculate Molecular Descriptors

In [39]:
# Replace the SMILES/Target frame with the descriptor table returned by smdt.
# NOTE(review): `data` is rebound here, so the validated SMILES frame is no
# longer reachable without re-running the earlier cells.
data = molecular_descriptors.getAllDescriptors(data)
# Persist the result; the following cell reloads it from this same file.
data.to_csv('Data_with_Molecular_Descriptors.csv', index=False)


Calculating Molecular Descriptors...
Row 4333 out of 4333
Calculating Molecular Descriptors Completed.


In [40]:
# Reload the descriptor table from the CSV written earlier and summarise it.
data = pd.read_csv('Data_with_Molecular_Descriptors.csv')
n_rows, n_columns = data.shape  # a DataFrame shape is always (rows, columns)
print('n_rows: %d' % n_rows)
print('n_columns: %d' % n_columns)
print(data.head())

n_rows: 4333
n_columns: 760
        W        AW         J         Xu      GMTI   Pol         DZ       Ipc  \
0    82.0  2.277778  2.634238   8.652965  2.568202   9.0  19.000000  2.157035   
1  1046.0  4.134387  2.257916  21.532542  3.640283  40.0  53.833333  5.213381   
2   742.0  4.339181  2.267054  19.046127  3.490801  24.0  40.000000  4.438837   
3   288.0  3.164835  2.627008  13.842646  3.076640  20.0  33.000000  3.208180   
4   203.0  3.075758  2.146253  12.143010  2.928908  13.0  26.000000  2.748027   

    BertzCT      Thara   ...    ATSe8  ATSp1  ATSp2  ATSp3  ATSp4  ATSp5  \
0  2.419997  20.200000   ...    0.000  2.347  2.477  2.187  1.363  0.375   
1  2.920743  86.101190   ...    2.652  3.044  3.387  3.444  3.413  3.332   
2  2.745333  58.915873   ...    2.398  2.963  3.183  3.103  2.987  3.027   
3  2.664754  39.761905   ...    0.000  2.581  2.841  2.760  2.318  1.877   
4  2.501565  30.352381   ...    0.000  2.515  2.641  2.515  2.177  1.851   

   ATSp6  ATSp7  ATSp8  Targ

In [43]:
from smdt import data_processing

In [44]:
# Fill gaps in the descriptor table before modelling.
# NOTE(review): the imputation strategy is chosen inside smdt and is not
# visible from this notebook — confirm it suits these descriptors.
data = data_processing.missing_value_imputation(data)

### Models

#### Linear Regressor

In [45]:
# Baseline: linear regression with univariate feature selection. Per the saved
# log, smdt performs the train/test split, scaling and selection itself.
# NOTE(review): the saved run reports a test r2 of -0.0, i.e. no better than
# predicting the mean — check the pipeline before relying on this model.
linear = models.linear_model(data, feature_selection='univariate')


Linear Regression

Found dataset of shape:  (4333, 760)

Data Split started...
Train data shape: (3249, 760)
Test data shape: (1084, 760)
Data Split completed.

Data Scaling started...
Data Scaling completed.

Selecting Features...
New train data shape: (3249, 11)
New test data shape: (1084, 11)
Feature selection completed.

Training Linear Model...
Training completed.

Model Validation on Test data:
{'mean squared error': 5222.975, 'r2': -0.0, 'mean absolute error': 58.0, 'explained r2': -0.0, 'mean squared log error': 0.0, 'median absolute error': 49.0}


#### Ridge Regressor

In [46]:
# Ridge regression with 'low variance' feature selection; the saved log shows
# alpha being picked by a grid search run inside smdt.
# NOTE(review): the saved run reports a test r2 of -1.0, which is worse than a
# constant predictor.
ridge = models.ridge_model(data, feature_selection='low variance')

Ridge Regression

Found dataset of shape:  (4333, 760)

Data Split started...
Train data shape: (3249, 760)
Test data shape: (1084, 760)
Data Split completed.

Data Scaling started...
Data Scaling completed.

Selecting Features...
New train data shape: (3249, 626)
New test data shape: (1084, 626)
Feature selection completed.

Started GridSearchCV on Training data...
GridSearchCV completed.

Best Estimator:
alpha: 10.0

Model Validation on Test data:
{'mean squared error': 6274.301, 'r2': -1.0, 'mean absolute error': 64.0, 'explained r2': -1.0, 'mean squared log error': 0.0, 'median absolute error': 56.0}


#### Lasso Regressor

In [47]:
# Lasso regression with 'low variance' feature selection; the saved log shows
# alpha being picked by a grid search run inside smdt.
# NOTE(review): the saved run reports a test r2 of -1.0, which is worse than a
# constant predictor.
lasso = models.lasso_model(data, feature_selection='low variance')

Lasso Regression

Found dataset of shape:  (4333, 760)

Data Split started...
Train data shape: (3249, 760)
Test data shape: (1084, 760)
Data Split completed.

Data Scaling started...
Data Scaling completed.

Selecting Features...
New train data shape: (3249, 626)
New test data shape: (1084, 626)
Feature selection completed.

Started GridSearchCV on Training data...
GridSearchCV completed.

Best Estimator:
alpha: 0.3563146315305976

Model Validation on Test data:
{'mean squared error': 6520.675, 'r2': -1.0, 'mean absolute error': 65.0, 'explained r2': -1.0, 'mean squared log error': 0.0, 'median absolute error': 54.0}


#### Random Forest Regressor

In [48]:
# Random forest with univariate feature selection.
# n_features=50 presumably sets how many features the selection keeps; the
# saved log shows 51 columns afterwards (likely 50 features + target) —
# TODO confirm against smdt.
random_forest = models.random_forest_model(data, feature_selection='univariate', n_features=50)


Random Forest Regression

Found dataset of shape:  (4333, 760)

Data Split started...
Train data shape: (3249, 760)
Test data shape: (1084, 760)
Data Split completed.

Data Scaling started...
Data Scaling completed.

Selecting Features...
New train data shape: (3249, 51)
New test data shape: (1084, 51)
Feature selection completed.

GridSearchCV Parameter Grid:
{'n_estimators': [10, 50], 'criterion': ('mse', 'mae'), 'max_features': ('auto', 'sqrt', 'log2')}

Started GridSearchCV on Training data...
GridSearchCV completed.

Best Estimator:
parameters: RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
Cross-validation r2 score of the best estimator: 0.466

Mode

#### Extra Trees Regressor

In [None]:
# Extra-trees model, called with the same feature-selection arguments as the
# random forest cell (univariate, n_features=50).
extra_trees = models.extra_trees_model(data,feature_selection='univariate', n_features=50)

#### Gradient Boosting Regressor

In [None]:
# Gradient-boosting model, called with the same feature-selection arguments as
# the random forest cell (univariate, n_features=50).
gradient_boosting = models.gradient_boosting_model(data, feature_selection='univariate', n_features=50)

#### SGD Regressor

In [50]:
# SGD regressor with univariate feature selection.
# n_features=10 presumably sets how many features the selection keeps; the
# saved log shows 11 columns afterwards (likely 10 features + target) —
# TODO confirm against smdt.
sgd = models.sgd_model(data, feature_selection='univariate', n_features=10)


SGD Regression

Found dataset of shape:  (4333, 760)

Data Split started...
Train data shape: (3249, 760)
Test data shape: (1084, 760)
Data Split completed.

Data Scaling started...
Data Scaling completed.

Selecting Features...
New train data shape: (3249, 11)
New test data shape: (1084, 11)
Feature selection completed.

GridSearchCV Parameter Grid:
{'loss': ('squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'), 'penalty': ('l1', 'l2', 'elasticnet', 'none')}

Started GridSearchCV on Training data...
GridSearchCV completed.

Best Estimator:
parameters: SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', max_iter=None, n_iter=None,
       penalty='elasticnet', power_t=0.25, random_state=None, shuffle=True,
       tol=None, verbose=0, warm_start=False)
Cross-validation r2 score of the best estimator: 0.280

Model Validation on Test data:
{'mean square

#### Support Vector Regressor

In [None]:
# Support vector regressor using smdt's 'tree based' feature-selection option
# (how that option selects features is not visible here — see smdt).
svr = models.svr_model(data, feature_selection='tree based')