# USP Inhibition

### Imports

In [1]:
from smdt import datasets
from smdt import data_processing
from smdt import molecular_descriptors
from smdt import regression
import pandas as pd
import numpy as np

### Data

#### Reading Data

In [2]:
# Load the USP inhibition dataset through smdt's `datasets` module.
# A later cell reads its 'Target' column as the regression target.
USP = datasets.load_UspInhibition()

#### Calculating Descriptors

- The descriptors were calculated separately on a remote Linux machine and saved as a CSV file.
- That file is imported below (note: it is not on GitHub because it is 2.02 GB).

In [3]:
# Precomputed descriptor table; the relative path means the CSV must sit in
# the notebook's working directory.
descriptors_csv = 'USP_All_Descriptors.csv'
data = pd.read_csv(descriptors_csv)

#### Data Cleaning

In [4]:
# Treat +/- infinity as missing, then zero-fill every missing entry.
# Both steps modify `data` in place.
non_finite = [np.inf, -np.inf]
data.replace(to_replace=non_finite, value=np.nan, inplace=True)
data.fillna(value=0, inplace=True)

In [5]:
# Build the regression target from the 'Target' column without mutating `USP`:
# infinities become NaN and every NaN is then replaced by 0. Chaining returns a
# new Series, so re-running this cell always starts from the untouched column
# and does not depend on pandas' view-vs-copy behaviour for `USP['Target']`.
# NOTE(review): zero-filling missing targets assigns them a real label value;
# confirm this is intended rather than dropping those rows.
y = USP['Target'].replace([np.inf, -np.inf], np.nan).fillna(0)

# Feature matrix: the descriptor table itself (same object as `data`, not a copy).
X = data

In [6]:
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22, so importing it fails on current installs. `SimpleImputer`
# is its replacement: it always imputes column-wise (the old `axis=0`) and
# takes `np.nan`, not the string 'NaN', as the missing-value marker.
from sklearn.impute import SimpleImputer

# Mean-impute any remaining NaNs per column; `fit_transform` returns a NumPy array.
# NOTE(review): `data` was already zero-filled in the cleaning cell, so this step
# presumably only converts the frame to an array. It is also fitted before the
# train/test split; confirm that is acceptable.
a = SimpleImputer(missing_values=np.nan, strategy='mean')
X = a.fit_transform(X)

#### Train Test Split

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Models

#### Random Forest

In [None]:
# Run smdt's random-forest helper on the train/test split.
# NOTE(review): the meaning of the trailing positional argument (10) is not
# visible in this notebook; confirm against smdt.regression.
model1 = regression.fit_RandomForestRegressor(
    X_train,
    X_test,
    y_train,
    y_test,
    10,
)

#### Lasso

In [8]:
# Run smdt's Lasso helper on the train/test split. The recorded output reports
# a GridSearchCV best r2 on the training data and an r2 on the test data.
model2 = regression.fit_Lasso(
    X_train,
    X_test,
    y_train,
    y_test,
)

Training data GridSearchCV best r2 score: 0.03330
Testing Data Classification r2 score: 0.02838


#### Ridge

In [10]:
# Run smdt's Ridge helper on the train/test split. The recorded output reports
# a GridSearchCV best r2 on the training data and an r2 on the test data.
model3 = regression.fit_Ridge(
    X_train,
    X_test,
    y_train,
    y_test,
)

Training data GridSearchCV best r2 score: 0.03481
Testing Data Classification r2 score: 0.02824


#### ElasticNet

In [None]:
# Run smdt's ElasticNet helper on the train/test split. The recorded output
# reports a GridSearchCV best r2 on the training data and an r2 on the test data.
model4 = regression.fit_ElasticNet(
    X_train,
    X_test,
    y_train,
    y_test,
)

Training data GridSearchCV best r2 score: 0.03321
Testing Data Classification r2 score: 0.02841


#### Linear SVR

In [None]:
# Run smdt's linear SVR helper on the train/test split.
# NOTE(review): the meaning of the trailing positional argument (10) is not
# visible in this notebook; confirm against smdt.regression.
model5 = regression.fit_LinearSVR(
    X_train,
    X_test,
    y_train,
    y_test,
    10,
)