# S2RSLDB: a comprehensive manually curated, internet-accessible database of the sigma-2 receptor selective ligands

### Imports

In [1]:
from smdt import datasets
from smdt import data_processing
from smdt import molecular_descriptors
import pandas as pd

### Data

In [2]:
Sigma2 = datasets.load_Sigma2ReceptorLigands()

Reference: 
G. Nastasi, C. Miceli, V. Pittala, M.N. Modica, O. Prezzavento, G. Romeo, A. Rescifina, A. Marrazzo, E. AmataS2RSLDB: a comprehensive manually curated, internet-accessible database of the sigma-2 receptor selective ligandsJ. Cheminform., 9 (2017), p. 3


### Descriptors

In [4]:
data = molecular_descriptors.getAllDescriptors(Sigma2)


Calculating Molecular Descriptors...
Row 651 out of 651
Calculating Molecular Descriptors Completed.


In [5]:
y = data['Target']
X = data.drop(['Target'], axis=1)

In [6]:
from sklearn.preprocessing import Imputer
a = Imputer(missing_values='NaN', strategy='mean', axis=0)
X = a.fit_transform(X)

### Models

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder, Imputer
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from smdt import data_processing
from smdt import molecular_descriptors
from sklearn import metrics
import numpy as np
import pandas as pd

In [243]:
def fit_RandomForestRegressor(X, y, n_features):
    # Train Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12)

    # Standardization
    a = StandardScaler()
    X_train = a.fit_transform(X_train)
    X_test = a.transform(X_test)
    
    # Feature Selection
    b = PCA(n_components=n_features)
    X_train = b.fit_transform(X_train, y_train)
    X_test = b.transform(X_test)

    # Grid Search CV
    clf = RandomForestRegressor()
    parameters = {'n_estimators': [10, 100], 'criterion': ['mse', 'mae'],
                  'max_features': ['auto', 'sqrt', 'log2'], 'oob_score': [True, False], 'verbose': [0]}
    grid = GridSearchCV(clf, parameters)
    grid.fit(X_train, y_train)

    # Metrics
    print('Training data GridSearchCV best r2 score: %.5f' % grid.best_score_)
    print('Testing Data Regression r2 score: %.5f' % grid.score(X_test, y_test))

    return grid

In [244]:
fit_RandomForestRegressor(X,y,10)

Training data GridSearchCV best r2 score: -0.39882
Testing Data Regression r2 score: 0.19376


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 100], 'criterion': ['mse', 'mae'], 'max_features': ['auto', 'sqrt', 'log2'], 'oob_score': [True, False], 'verbose': [0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [14]:
from sklearn.linear_model import LassoCV

In [210]:
def fit_Lasso(X, y, n_features):
    # Train Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=26)

    # Standardization
    a = StandardScaler()
    X_train = a.fit_transform(X_train)
    X_test = a.transform(X_test)

    # Feature Selection
    b = PCA(n_components=n_features)
    X_train = b.fit_transform(X_train, y_train)
    X_test = b.transform(X_test)

    # Grid Search CV
    clf = LassoCV(cv=10)
    clf.fit(X_train, y_train)

    # Metrics
    print('Training data GridSearchCV best r2 score: %.5f' % clf.score(X_train, y_train))
    print('Testing Data Classification r2 score: %.5f' % clf.score(X_test, y_test))

    return clf


In [211]:
fit_Lasso(X,y,50)

Training data GridSearchCV best r2 score: 0.22391
Testing Data Classification r2 score: 0.25915


LassoCV(alphas=None, copy_X=True, cv=10, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False)