# Predicting Melting Points of Molecules

### Imports

In [1]:
from smdt import datasets
from smdt import molecular_descriptors
from smdt import regression
import pandas as pd

### Data

#### Data Reading

In [2]:
# Load the melting-point dataset through smdt's dataset loader (the loader takes
# no arguments here, so the data presumably ships with the package - TODO confirm).
# The publication the data comes from is cited in the text right after this cell.
MeltingPoint = datasets.load_MeltingPoint()

Reference: Karthikeyan, M.; Glen, R. C.; Bender, A. General melting point prediction based on a diverse compound dataset and artificial neural networks. J. Chem. Inf. Model. 2005, 45(3), 581–590.


#### Calculating Descriptors

In [8]:
# Compute molecular descriptors for every row of the loaded dataset.
# The helper prints its own row-by-row progress (4333 rows in the recorded run).
# The returned frame carries the regression target in a 'Target' column, which
# the data-cleaning step splits off from the descriptor columns.
# NOTE(review): this is the slow step of the notebook - consider caching the
# result to disk so a full re-run stays cheap.
data = molecular_descriptors.getAllDescriptors(MeltingPoint)


Calculating Molecular Descriptors...
Row 4333 out of 4333
Calculating Molecular Descriptors Completed.


#### Data Cleaning

In [7]:
# Feature matrix: every column of the descriptor table except the target.
X = data.drop(labels=['Target'], axis='columns')
# Regression target: the melting-point values stored under 'Target'.
y = data.loc[:, 'Target']

In [8]:
# Replace missing descriptor values (NaN) with the mean of their column.
#
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and removed
# in 0.22, so importing it fails on any current install. `sklearn.impute.SimpleImputer`
# is the replacement; it always works column-wise (the old `axis=0`) and treats
# np.nan as the missing marker by default. The legacy class is kept as a fallback
# so the cell still runs on old environments.
try:
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy='mean')
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer
    imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)

# Backward-compatible alias: the original cell exposed the imputer as `a`.
a = imputer

# NOTE(review): the imputer is fitted on the full dataset before the train/test
# split, so test rows contribute to the column means (mild data leakage).
# Behaviour is kept as-is to preserve the reported scores; consider fitting on
# the training portion only and calling `transform` on the test portion.
# The result is a NumPy array (column labels are dropped).
X = imputer.fit_transform(X)

#### Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across kernel restarts.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

### Models

#### Random Forest

In [10]:
# Fit a random-forest regressor through smdt's helper. Judging by the recorded
# output, the helper runs a GridSearchCV on the training data and reports the
# best cross-validated r2 plus the r2 on the held-out test data.
# NOTE(review): the meaning of the trailing 10 is not visible from this notebook
# (presumably a search/estimator setting) - confirm against smdt.regression and
# promote it to a named constant.
model1 = regression.fit_RandomForestRegressor(X_train, X_test, y_train, y_test, 10)

Training data GridSearchCV best r2 score: 0.41508
Testing Data Regression r2 score: 0.44528


#### Lasso

In [13]:
# Fit a Lasso regressor through smdt's helper; per the recorded output it reports
# the best GridSearchCV r2 on the training data and the r2 on the test data.
# NOTE(review): the printed test label says "Classification" although this is a
# regression score - the wording comes from the smdt helper, not this notebook.
model2 = regression.fit_Lasso(X_train, X_test, y_train, y_test)

Training data GridSearchCV best r2 score: 0.56873
Testing Data Classification r2 score: 0.55357


#### Ridge

In [14]:
# Fit a Ridge regressor through smdt's helper; per the recorded output it reports
# the best GridSearchCV r2 on the training data and the r2 on the test data.
# NOTE(review): the recorded training score (0.63) is well above the test score
# (0.53), which may indicate some overfitting - worth a sentence of interpretation.
model3 = regression.fit_Ridge(X_train, X_test, y_train, y_test)

Training data GridSearchCV best r2 score: 0.63118
Testing Data Classification r2 score: 0.52908


#### ElasticNet

In [15]:
# Fit an ElasticNet regressor through smdt's helper; per the recorded output it
# reports the best GridSearchCV r2 on the training data and the r2 on the test data.
model4 = regression.fit_ElasticNet(X_train, X_test, y_train, y_test)

Training data GridSearchCV best r2 score: 0.57624
Testing Data Classification r2 score: 0.55558


#### Linear SVR

In [16]:
# Fit a linear support-vector regressor through smdt's helper; per the recorded
# output it reports the best GridSearchCV r2 on the training data and the r2 on
# the test data.
# NOTE(review): the meaning of the trailing 100 is not visible from this notebook
# (presumably a search/iteration setting) - confirm against smdt.regression and
# promote it to a named constant.
model5 = regression.fit_LinearSVR(X_train, X_test, y_train, y_test, 100)

Training data GridSearchCV best r2 score: 0.43898
Testing Data Classification r2 score: 0.50004
