## Predicting molecular properties using biological data

### Import the CoEPrA.csv

The dataset is obtained from the [CoEPrA Repository](http://CoEPrA.org)

In [1]:
#Import the libraries
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
# Load the comma-separated numeric matrix into a NumPy array.
# The file is opened in a context manager so the handle is released as soon
# as parsing finishes instead of staying open for the life of the kernel.
filename = "CoEPrA.csv"
with open(filename, 'rt') as raw_data:
    data = np.loadtxt(raw_data, delimiter=",")

In [3]:
# Sanity-check the dimensions of the loaded array (rows, columns)
data.shape

(89, 5788)

In [4]:
#We separate out the independent variables into X (every column except the last)
#and the dependent variable into y (the last column).
#Negative indexing is used so the column count is not hard-coded.
X = data[:, :-1]
y = data[:, -1]

In [5]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Show the shape of each split in order:
# train features, train target, test features, test target
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(71, 5787)
(71,)
(18, 5787)
(18,)


### Linear Regression without Regularization

In [7]:
# Create linear regression object (no regularization term)
LR = LinearRegression()
# Train the model using the training sets.
# NOTE: the training split has far more features (5787) than rows (71),
# which is why the unregularized fit reproduces the training targets exactly.
LR.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [8]:
# Make predictions on the training set (used to measure the training error)
y_pred = LR.predict(X_train)

In [9]:
# Mean squared error between the training targets and the training predictions
train_mse = mean_squared_error(y_train, y_pred)
print("Mean squared error: %.2f" % train_mse)

Mean squared error: 0.00


#### We get a zero training error

In [10]:
# 5-fold cross-validation on the training split.
# The scorer is the negated MSE, so scores closer to zero are better.
scores = cross_val_score(
    LR,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [11]:
# Average of the per-fold (negated) MSE scores
print(scores.mean())

-1.2942806730857016e+17


#### We get a really high mean squared error

In [12]:
# Score the current model on the held-out test set
y_pred_test = LR.predict(X_test)
# Mean squared error on the test set
test_mse = mean_squared_error(y_test, y_pred_test)
print("Mean squared error: %.2f" % test_mse)

Mean squared error: 115262556605687904.00


#### We get a really high test error

### L1 / Lasso Regularization 

In [13]:
# Fit an L1-penalised (Lasso) model on the same training split.
# alpha is the penalty strength; max_iter caps the solver's iterations.
# NOTE: this rebinds LR, so later cells that use LR refer to the Lasso model,
# not the unregularized LinearRegression fitted earlier.
LR = linear_model.Lasso(alpha=0.3, max_iter=1000000)
LR.fit(X_train, y_train)

Lasso(alpha=0.3, copy_X=True, fit_intercept=True, max_iter=1000000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

#### Checking the weights

In [14]:
# Inspect the fitted coefficient vector of the Lasso model
print(LR.coef_)

[-0.  0. -0. ... -0. -0.  0.]


#### Many coefficients become zero

In [15]:
#Indices of all non-zero coefficients, i.e. the features the Lasso fit kept.
#np.nonzero returns a tuple with one index array per dimension, so index[0]
#holds the feature (column) indices.
index=np.nonzero(LR.coef_)
print(index[0])

[  64  136  445  451  653  715  760  787  858 1236 1358 1422 1430 1732
 1737 1874 1879 2065 2247 2374 2380 2581 2644 2689 2708 2890 3224 3351
 3666 3931 3994 4002 4221 4303 4510 4573 4574 4637 4645 4819 4952 5153
 5154 5280 5589 5595 5648 5732]


In [16]:
#New training feature matrix containing only the selected features
X_train_filter=X_train[:,index[0]]

In [17]:
#Shape after filtering: same rows, one column per selected feature
X_train_filter.shape

(71, 48)

In [18]:
# Make predictions on the training set with the Lasso model
y_pred = LR.predict(X_train)

In [19]:
# Mean squared error between the training targets and the training predictions
train_mse = mean_squared_error(y_train, y_pred)
print("Mean squared error: %.2f" % train_mse)

Mean squared error: 0.05


In [20]:
# 5-fold cross-validation of the current model on the full training features.
# The scorer is the negated MSE, so scores closer to zero are better.
scores = cross_val_score(
    LR,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [21]:
# Average of the per-fold (negated) MSE scores
print(scores.mean())

-1.1615211159922432


In [22]:
# Score the current model on the held-out test set
y_pred_test = LR.predict(X_test)
# Mean squared error on the test set
test_mse = mean_squared_error(y_test, y_pred_test)
print("Mean squared error: %.2f" % test_mse)

Mean squared error: 0.69


#### Overfitting has been reduced

### L2 Ridge Regularization

In [23]:
#Using the filtered features we obtained from L1.
#NOTE: this rebinds LR to a Ridge (L2-penalised) model trained only on the
#selected columns, so it must be given filtered inputs from here on.
LR = linear_model.Ridge(alpha=0.8,max_iter=1000000)
LR.fit(X_train_filter, y_train)
# Make predictions on the (filtered) training set
y_pred = LR.predict(X_train_filter)

In [24]:
# Mean squared error between the training targets and the training predictions
train_mse = mean_squared_error(y_train, y_pred)
print("Mean squared error: %.2f" % train_mse)

Mean squared error: 0.03


In [25]:
# 5-fold cross-validation of the Ridge model on the filtered training features.
# NOTE(review): the columns of X_train_filter were chosen from a Lasso fit on
# the whole of X_train, so every validation fold here already influenced the
# feature selection and this score is likely optimistic. Consider performing
# the selection inside each fold (e.g. via a Pipeline) -- TODO confirm.
scores = cross_val_score(LR, X_train_filter, y_train, scoring='neg_mean_squared_error', cv=5)
print(np.mean(scores))

-1.2017669016770958


#### Cross-validation values do not change much

In [26]:
#Apply the same column selection to the test features
X_test_filter=X_test[:,index[0]]

In [27]:
# Score the current model on the filtered held-out test set
y_pred_test = LR.predict(X_test_filter)
# Mean squared error on the test set
test_mse = mean_squared_error(y_test, y_pred_test)
print("Mean squared error: %.2f" % test_mse)

Mean squared error: 1.80
