In [2]:
# import libraries

import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

Scikit-learn is a free software machine learning library for the Python programming language.

In [3]:
# read data

data = pd.read_csv("Dataset.csv", index_col=0)

data.head()

Unnamed: 0,TPSA(Tot),SAacc,H-050,ML0GP,RDCHI,GATS1p,nN,C-040,Target-LC50
0,0.0,0.0,0,2.419,1.225,0.667,0,0,3.74
1,0.0,0.0,0,2.638,1.401,0.632,0,0,4.33
2,9.23,11.0,0,5.799,2.93,0.486,0,0,7.019
3,9.23,11.0,0,5.453,2.887,0.495,0,0,6.723
4,9.23,11.0,0,4.068,2.758,0.695,0,0,5.979


The dataset is split into two parts:

X - which represents the feature variables, and

y - which represents the target variable.

In [4]:
y = data['Target-LC50'].copy()

y.head()

0    3.740
1    4.330
2    7.019
3    6.723
4    5.979
Name: Target-LC50, dtype: float64

In [6]:
X = data.drop('Target-LC50', axis=1)

X.head()

Unnamed: 0,TPSA(Tot),SAacc,H-050,ML0GP,RDCHI,GATS1p,nN,C-040
0,0.0,0.0,0,2.419,1.225,0.667,0,0
1,0.0,0.0,0,2.638,1.401,0.632,0,0
2,9.23,11.0,0,5.799,2.93,0.486,0,0
3,9.23,11.0,0,5.453,2.887,0.495,0,0
4,9.23,11.0,0,4.068,2.758,0.695,0,0


# Scaling

StandardScaler() standardizes features by removing the mean and scaling to unit variance

The standard score of a sample x is calculated as:

    z = (x - u) / s

where u is the mean of the training samples and s is the standard deviation of the training samples.

In [7]:
scaler = StandardScaler()

X = scaler.fit_transform(X)

# Splitting the dataset

into random train and test subsets.

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [10]:
len(X_train)

382

In [11]:
len(X_test)

164

In [12]:
type(X_train)

numpy.ndarray

# Linear Regression


In [13]:
from sklearn.linear_model import LinearRegression

In [14]:
# initialise the model

model = LinearRegression()

In [15]:
# fit the model

model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [19]:
# the coefficients of the linear model

model.coef_

array([ 1.38612304, -1.09355081,  0.04844696,  0.75855419,  0.42774178,
       -0.21274342, -0.33142602,  0.03024277])

In [16]:
# accuracy score

model.score(X_test, y_test)

0.49338277738255487

In [17]:
# predictions

model.predict(X_test)

array([5.19565032, 7.11168942, 2.76223632, 4.59039876, 3.6202144 ,
       5.21067124, 3.83335489, 6.19646177, 3.49397908, 4.97512099,
       6.91664367, 4.36221109, 4.98601492, 4.05963205, 1.8539353 ,
       4.2962094 , 4.97782502, 4.84482567, 4.4802414 , 4.32136769,
       5.76006607, 3.55423223, 2.91462598, 5.12666867, 3.89793931,
       3.88870414, 5.47962807, 5.52857494, 3.24964883, 4.01845497,
       5.1657068 , 5.54172837, 5.56581217, 4.76102693, 5.50315323,
       5.64915667, 4.75807226, 6.0539852 , 3.04711004, 3.10699005,
       4.21448302, 4.88745004, 4.53053696, 6.59594952, 4.44339369,
       5.82538709, 4.87224649, 5.76631862, 6.822445  , 4.68692341,
       4.16394466, 4.80348547, 3.83417454, 6.13343861, 4.99662216,
       4.53266117, 4.50214505, 5.66811111, 6.10050541, 6.49511316,
       5.73970293, 4.60861273, 3.79874276, 4.10552701, 4.46114661,
       5.24447796, 5.06683018, 3.60922694, 5.61480881, 3.96157231,
       4.06908406, 6.13244346, 5.21366785, 4.62088013, 6.12178

In [18]:
# Another accuracy measure

from sklearn.metrics import mean_squared_error

mean_squared_error(y_test, model.predict(X_test))

1.3183190456388498