# Theory
$$
X = \begin{bmatrix}
    1 & x_{11} & x_{12} & \dots & x_{1n} \\
    1 & x_{21} & x_{22} & \dots & x_{2n} \\
    \vdots & \vdots & \vdots & \vdots & \vdots \\
    1 & x_{m1} & x_{m2} & \dots & x_{mn}
\end{bmatrix}_{(m, n+1)}\text{has}\ m \ \text{datapoints and}\ n+1 \ \text{features}
$$
$$
Y = \begin{bmatrix}
y_1 \\ y_2 \\ \vdots \\ y_m
\end{bmatrix}_{(m, 1)} \text{has}\ m \ \text{datapoints and 1 feature and } \ \
\theta = \begin{bmatrix}
\theta_1 \\ \theta_2 \\ \vdots \\ \theta_{n+1}
\end{bmatrix}_{(n+1, 1)} \text{has}\ n+1 \ \text{parameters (one per column of}\ X\text{)}
$$
$$
\hat{Y} = \begin{bmatrix}
\hat{y}_1 \\ \hat{y}_2 \\ \vdots \\ \hat{y}_m
\end{bmatrix}_{(m, 1)} = \begin{bmatrix}
    1 & x_{11} & x_{12} & \dots & x_{1n} \\
    1 & x_{21} & x_{22} & \dots & x_{2n} \\
    \vdots & \vdots & \vdots & \vdots & \vdots \\
    1 & x_{m1} & x_{m2} & \dots & x_{mn}
\end{bmatrix}_{(m, n+1)}\begin{bmatrix}
\theta_1 \\ \theta_2 \\ \vdots \\ \theta_{n+1}
\end{bmatrix}_{(n+1, 1)}
$$
## Cost function
$$
\text{Cost} = \frac{1}{2m}\sum_{i=1}^m(y_i - \hat{y}_i)^2
$$
## Gradient Descent
$$
\theta = \theta + \alpha\times\delta\theta
$$
$$
\delta\theta_{(n+1,1)} = \frac{1}{m}X^T_{(n+1,m)}\left(Y-\hat{Y}\right)_{(m,1)}
$$
## Points to note
- $m=$ number of data-points.
- $\theta \ $ is the parameter (weight) vector of our linear model; learning the model means finding $\theta$.
- $Y$ is the set of actual values.
- $\hat{Y}$ is the set of predicted values.
- $\alpha=$ learning rate.

We initialize $\theta = [0, 0, \dots, 0]$ and repeat the gradient-descent update for a fixed number of iterations (typically around 100–1000; the training cell in this notebook uses 10,000).
***
# Loading libraries and Dataframe

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the solubility dataset from a CSV file; the path is relative to the
# current working directory.
solubility_data = pd.read_csv("./solubility_data.csv")
# Bare last expression: the notebook renders the DataFrame as the cell output.
solubility_data

Unnamed: 0,MolLogP,MolWt,NumRotatableBonds,AromaticProportion,logS
0,2.59540,167.850,0.0,0.000000,-2.180
1,2.37650,133.405,0.0,0.000000,-2.000
2,2.59380,167.850,1.0,0.000000,-1.740
3,2.02890,133.405,1.0,0.000000,-1.480
4,2.91890,187.375,1.0,0.000000,-3.040
...,...,...,...,...,...
1139,1.98820,287.343,8.0,0.000000,1.144
1140,3.42130,286.114,2.0,0.333333,-4.925
1141,3.60960,308.333,4.0,0.695652,-3.893
1142,2.56214,354.815,3.0,0.521739,-3.790


# Training Data

In [3]:
# Training split: the first 100 rows by position, taken in file order (no
# shuffling). The slice still contains the 'logS' target column at this point.
solubility_data_train = solubility_data.iloc[:100]
solubility_data_train

Unnamed: 0,MolLogP,MolWt,NumRotatableBonds,AromaticProportion,logS
0,2.59540,167.850,0.0,0.000000,-2.180
1,2.37650,133.405,0.0,0.000000,-2.000
2,2.59380,167.850,1.0,0.000000,-1.740
3,2.02890,133.405,1.0,0.000000,-1.480
4,2.91890,187.375,1.0,0.000000,-3.040
...,...,...,...,...,...
95,3.14822,142.201,0.0,0.909091,-3.700
96,4.30142,192.261,0.0,0.933333,-5.850
97,-0.92640,126.115,0.0,0.666667,-0.807
98,2.54540,144.173,0.0,0.909091,-2.220


# Test Data

In [4]:
# Test split: the next 100 rows by position (positions 100-199), so it does not
# overlap with the training slice taken with iloc[:100].
solubility_data_test = solubility_data.iloc[100:200]
solubility_data_test

Unnamed: 0,MolLogP,MolWt,NumRotatableBonds,AromaticProportion,logS
100,2.7480,173.171,1.0,0.769231,-3.54
101,0.6731,89.094,2.0,0.000000,-0.80
102,2.7293,144.258,7.0,0.000000,-3.01
103,3.5329,126.243,6.0,0.000000,-5.05
104,2.9801,124.227,5.0,0.000000,-4.24
...,...,...,...,...,...
195,0.9854,72.107,1.0,0.000000,0.52
196,0.7614,70.091,1.0,0.000000,0.32
197,0.7954,118.176,5.0,0.000000,-0.42
198,2.4138,106.596,1.0,0.000000,-2.51


# Extracting dependent and independent features

In [5]:
# Training targets: the 'logS' column of the training split as a NumPy array.
# This must run before 'logS' is dropped from solubility_data_train.
dependent_feature = np.array(solubility_data_train['logS'])
dependent_feature

array([-2.18 , -2.   , -1.74 , -1.48 , -3.04 , -1.29 , -1.64 , -0.43 ,
       -4.57 , -4.37 , -4.63 , -4.   , -3.2  , -6.98 , -5.56 , -4.59 ,
       -4.5  , -3.59 , -3.31 ,  0.62 , -3.5  , -1.68 , -3.05 , -1.06 ,
       -1.6  , -2.74 , -0.77 , -3.28 , -3.1  , -0.59 , -5.6  , -4.48 ,
       -3.4  , -2.89 ,  0.81 , -1.87 , -3.54 , -3.04 , -1.62 , -1.46 ,
       -2.   , -4.29 , -2.29 , -0.17 , -2.06 , -4.07 , -3.27 , -3.75 ,
       -1.97 , -4.14 , -3.39 , -2.09 , -4.679, -2.68 , -2.68 , -1.74 ,
       -3.999, -4.22 , -2.43 , -2.37 , -4.43 , -3.81 , -4.35 , -5.06 ,
       -3.08 , -1.73 ,  0.   , -1.94 , -1.24 , -1.32 , -2.   , -2.03 ,
       -4.   , -3.12 , -3.93 , -2.73 , -1.47 , -3.63 , -5.51 , -4.8  ,
       -4.17 , -1.81 , -3.73 , -3.01 , -7.   , -1.24 , -3.23 , -0.59 ,
       -2.36 , -2.96 , -4.81 , -4.55 , -2.29 , -3.27 , -5.22 , -3.7  ,
       -5.85 , -0.807, -2.22 , -1.92 ])

In [6]:
# Keep only the predictor columns for training. The name is rebound to the
# reduced frame, so on a second run of this cell 'logS' is already gone;
# errors='ignore' makes the drop a no-op in that case instead of raising
# KeyError, which keeps the cell safe to re-run.
solubility_data_train = solubility_data_train.drop(columns=['logS'], errors='ignore')
solubility_data_train

Unnamed: 0,MolLogP,MolWt,NumRotatableBonds,AromaticProportion
0,2.59540,167.850,0.0,0.000000
1,2.37650,133.405,0.0,0.000000
2,2.59380,167.850,1.0,0.000000
3,2.02890,133.405,1.0,0.000000
4,2.91890,187.375,1.0,0.000000
...,...,...,...,...
95,3.14822,142.201,0.0,0.909091
96,4.30142,192.261,0.0,0.933333
97,-0.92640,126.115,0.0,0.666667
98,2.54540,144.173,0.0,0.909091


# Training the model
### We first scale down the MolWt column (divide by 100) so that it is on a scale comparable to the other features

In [7]:
# Design matrix for the hand-written model: the predictor columns of the
# training frame followed by a trailing column of ones (the bias/intercept
# term, so the intercept weight is the LAST entry of theta).
MOLWT_COLUMN = 1     # position of the column being scaled (MolWt, per the heading above)
MOLWT_SCALE = 100.0  # divisor applied to that column

# One bias entry per training row.
bias_column = np.ones((len(solubility_data_train), 1))

# np.concatenate allocates a new array, so the in-place scaling below never
# touches the data owned by the DataFrame. axis=1 appends as a column.
feature_matrix = np.concatenate([solubility_data_train.to_numpy(), bias_column], axis=1)

# Vectorised scaling of the whole column (replaces the per-row Python loop).
feature_matrix[:, MOLWT_COLUMN] /= MOLWT_SCALE
# feature_matrix

In [8]:
def costFunction(theta_arr, features=None, targets=None):
    """Return the halved mean squared error of a linear model.

    Computes ``sum((targets - features @ theta_arr) ** 2) / (2 * m)`` where
    ``m`` is the number of target values.

    Parameters
    ----------
    theta_arr : array-like, shape (k,)
        Model parameters, one per column of ``features``.
    features : array-like, shape (m, k), optional
        Design matrix. Defaults to the notebook-level ``feature_matrix``.
    targets : array-like, shape (m,), optional
        Observed values. Defaults to the notebook-level ``dependent_feature``.

    Returns
    -------
    numpy.float64
        The cost. It is a sum of squares divided by a positive count, so no
        ``abs`` is needed to keep it non-negative.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if features is None:
        features = feature_matrix
    if targets is None:
        targets = dependent_feature

    residuals = np.subtract(targets, np.matmul(features, np.transpose(theta_arr)))
    # Vectorised sum of squares instead of a Python-level list comprehension.
    return np.sum(residuals ** 2) / (2.0 * float(len(targets)))

In [9]:
def gradientDescent(theta_arr, learning_rate, features=None, targets=None):
    """Perform one batch gradient-descent update and return the new parameters.

    The update is ``theta + learning_rate * (1 / m) * X^T (y - X @ theta)``,
    where ``m`` is the number of target values.

    Parameters
    ----------
    theta_arr : array-like, shape (k,)
        Current model parameters, one per column of ``features``.
    learning_rate : float
        Step size multiplying the averaged update direction.
    features : array-like, shape (m, k), optional
        Design matrix. Defaults to the notebook-level ``feature_matrix``.
    targets : array-like, shape (m,), optional
        Observed values. Defaults to the notebook-level ``dependent_feature``.

    Returns
    -------
    numpy.ndarray
        The updated parameter vector; ``theta_arr`` itself is not modified.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if features is None:
        features = feature_matrix
    if targets is None:
        targets = dependent_feature

    # Each intermediate gets its own name instead of re-binding a single variable.
    residuals = np.subtract(targets, np.matmul(features, np.transpose(theta_arr)))
    # X^T (y - y_hat) averaged over the samples; vectorised division.
    step = np.matmul(np.transpose(features), residuals) / float(len(targets))
    return np.add(theta_arr, step * learning_rate)

In [10]:
# Starting state for the hand-written gradient descent.
currentCost = 0  # placeholder until a cost has actually been computed
theta_array = [0] * len(feature_matrix[0])  # one zero weight per design-matrix column
lr_value = 5 * 10 ** -5  # learning rate

In [11]:
# Apply a fixed budget of batch gradient-descent updates to theta_array,
# then evaluate and report the cost of the final parameters only.
for epoch in range(10000):
    theta_array = gradientDescent(theta_array, lr_value)

currentCost = costFunction(theta_array)
print(f"Current Cost: {currentCost}")

Current Cost: 0.2911920630008252


# Training the model using scikit-learn

In [12]:
from sklearn.linear_model import LinearRegression

# Reference model: scikit-learn's LinearRegression fitted on the same training
# predictors and targets. It receives the DataFrame directly, so MolWt is NOT
# divided by 100 here, unlike in feature_matrix.
lr = LinearRegression()
# The return value of fit() is bound to a throwaway name, presumably only to
# keep the notebook from echoing it as the cell output.
dummy_variable = lr.fit(solubility_data_train, dependent_feature)

In [13]:
# Ground-truth 'logS' values of the test split as a NumPy array. This must run
# before 'logS' is dropped from solubility_data_test.
dependent_feature_test = np.array(solubility_data_test['logS'])

In [14]:
# Keep only the predictor columns for testing. The name is rebound to the
# reduced frame, so on a second run of this cell 'logS' is already gone;
# errors='ignore' makes the drop a no-op in that case instead of raising
# KeyError, which keeps the cell safe to re-run.
solubility_data_test = solubility_data_test.drop(columns=['logS'], errors='ignore')
solubility_data_test

Unnamed: 0,MolLogP,MolWt,NumRotatableBonds,AromaticProportion
100,2.7480,173.171,1.0,0.769231
101,0.6731,89.094,2.0,0.000000
102,2.7293,144.258,7.0,0.000000
103,3.5329,126.243,6.0,0.000000
104,2.9801,124.227,5.0,0.000000
...,...,...,...,...
195,0.9854,72.107,1.0,0.000000
196,0.7614,70.091,1.0,0.000000
197,0.7954,118.176,5.0,0.000000
198,2.4138,106.596,1.0,0.000000


# Predicted values using scikit-learn model

In [15]:
# Predict logS for the test predictors with the fitted scikit-learn model.
sci_predicted = lr.predict(solubility_data_test)
sci_predicted

array([ -3.39406247,  -0.60058505,  -3.51283118,  -4.24573447,
        -3.53407106,  -8.6246974 ,  -2.94484605,  -3.67774933,
        -2.96608592,  -6.92074199,  -1.24089064,  -1.97379393,
        -1.26213052,  -2.08754635,  -0.10492037,  -6.35275686,
       -12.09363613, -10.4804068 ,  -8.87290139, -10.4804068 ,
        -8.87290139,  -7.27326637,  -9.67582563,  -9.67582563,
        -8.87290139,  -8.07192647,  -8.07192647,  -8.87290139,
        -7.27326637,  -2.65349934,  -8.87290139,  -8.87290139,
        -7.27326637,  -8.07192647,  -3.22148448,  -3.78946961,
        -7.27326637,  -1.45463269,  -2.34186144,  -2.90984657,
        -2.02261782,  -0.78875169,  -5.68487476,  -8.87290139,
        -9.67582563,  -8.87290139,  -8.07192647,  -8.07192647,
        -7.27326637,  -4.74578071,  -4.74578071,  -6.47738405,
        -6.47738405,  -3.95204486,  -3.16092897,  -4.74578071,
        -6.47738405,  -3.95204486,  -3.95204486,  -3.51251169,
        -3.16403294,  -2.09376601,  -2.28130593,  -4.05

# Predicted values using custom trained model

In [16]:
# Build the test design matrix the same way as the training one: predictor
# columns, then a trailing column of ones for the intercept, with column 1
# (MolWt) divided by 100.
test_predictors = solubility_data_test.to_numpy()

# One bias entry per TEST row. Sizing this from the training matrix only
# works while both splits happen to have the same number of rows.
bias_column = np.ones((len(test_predictors), 1))

# np.concatenate allocates a new array; axis=1 appends the bias as a column.
feature_matrix_test = np.concatenate([test_predictors, bias_column], axis=1)

# Vectorised scaling, matching what was applied to the training matrix.
feature_matrix_test[:, 1] /= 100.0

custom_predicted = np.matmul(feature_matrix_test, np.transpose(theta_array))
custom_predicted

array([ -3.26021171,  -1.31350984,  -3.69813837,  -4.13381324,
        -3.58429116,  -7.95389966,  -3.225276  ,  -3.66095088,
        -3.11142879,  -6.53531256,  -1.80668891,  -2.24236378,
        -1.6928417 ,  -2.2551385 ,  -0.86096418,  -6.0624502 ,
       -10.1004748 ,  -8.81202324,  -7.52524176,  -8.81202324,
        -7.52524176,  -6.24075666,  -8.16839078,  -8.16839078,
        -7.52524176,  -6.88266151,  -6.88266151,  -7.52524176,
        -6.24075666,  -2.69507579,  -7.52524176,  -7.52524176,
        -6.24075666,  -6.88266151,  -3.16793815,  -3.64080052,
        -6.24075666,  -1.94211479,  -2.44675985,  -2.91962221,
        -2.41497715,  -1.35636236,  -4.95955208,  -7.52524176,
        -8.16839078,  -7.52524176,  -6.88266151,  -6.88266151,
        -6.24075666,  -4.33266093,  -4.33266093,  -5.59966229,
        -5.59966229,  -3.69219285,  -3.16917184,  -4.33266093,
        -5.59966229,  -3.69219285,  -3.69219285,  -3.42585664,
        -3.05339485,  -2.30758578,  -2.44799354,  -3.63

# Actual test results

In [17]:
# Display the ground-truth test targets for comparison with the two prediction arrays.
dependent_feature_test

array([ -3.54,  -0.8 ,  -3.01,  -5.05,  -4.24,  -8.4 ,  -2.39,  -4.44,
        -3.66,  -6.35,  -0.6 ,  -2.68,  -1.64,  -0.92,   0.62,  -5.84,
       -11.6 ,  -9.16,  -8.01,  -9.15,  -8.6 ,  -7.28,  -7.92,  -8.94,
        -7.68,  -7.21,  -7.43,  -7.42,  -6.47,  -4.36,  -8.56,  -8.71,
        -6.57,  -7.32,  -4.74,  -5.05,  -7.39,  -1.04,  -3.55,  -4.36,
        -1.52,  -0.4 ,  -5.27,  -7.82,  -7.66,  -7.39,  -7.8 ,  -7.92,
        -7.25,  -3.15,  -3.1 ,  -6.29,  -6.26,  -2.67,  -4.8 ,  -3.37,
        -6.01,  -2.67,  -2.64,  -3.48,  -1.3 ,  -2.4 ,  -3.65,  -4.72,
        -4.28,   0.38,  -6.27,  -6.25,  -2.21,  -6.14,  -2.34,  -2.05,
        -3.22,  -1.55,  -0.92,  -1.22,  -1.3 ,  -4.26,  -1.19,   0.38,
        -2.82,  -5.28,  -5.25,  -1.79,  -4.89,  -1.29,   0.45,  -3.  ,
        -5.21,  -1.94,  -5.17,  -4.4 ,  -4.4 ,  -1.59,  -2.23,   0.52,
         0.32,  -0.42,  -2.51,  -2.46])

# Total cost using Custom-trained model

In [18]:
# Halved mean squared error of the hand-written model on the test split.
custom_residuals = np.subtract(dependent_feature_test, custom_predicted)
# Normalise by the number of TEST samples (the training length is only
# coincidentally the same). A sum of squares needs no abs().
custom_model_cost = np.sum(custom_residuals ** 2) / (2.0 * len(dependent_feature_test))
print(f"Custom-trained model cost = {custom_model_cost}")

Custom-trained model cost = 0.5259537336478181


# Total cost using Scikit-learn model

In [19]:
# Halved mean squared error of the scikit-learn model on the test split.
y_cap_scikit = np.subtract(dependent_feature_test, sci_predicted)  # residuals: actual - predicted
# Normalise by the number of TEST samples (the training length is only
# coincidentally the same). A sum of squares needs no abs().
predicted_model_cost = np.sum(y_cap_scikit ** 2) / (2.0 * len(dependent_feature_test))
print(f"Scikit-learn model cost = {predicted_model_cost}")

Scikit-learn model cost = 0.5274742054607415


***