# Load Data #

In [None]:
import pandas as pd

# Load the CSV into a DataFrame and show it as the cell output.
csv_path = "/content/delaney_solubility_with_descriptors.csv"
df = pd.read_csv(csv_path)
df

# Data Preparation #

## Data separation into X and y ##

In [None]:
# Target values: the "logS" column of the loaded frame.
y = df.loc[:, "logS"]
y

In [None]:
# Feature matrix: every column of the frame except the "logS" target.
X = df.drop(columns="logS")
X

## Data Splitting ##

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

In [None]:
# Show the training-split features returned by train_test_split.
X_train

In [None]:
# Show the held-out (test_size=0.2) features returned by train_test_split.
X_test

In [None]:
# Show the "logS" targets that belong to the training split.
y_train

In [None]:
# Show the "logS" targets that belong to the held-out test split.
y_test

# Model Building #

## Linear Regression ##

### **Training the model**

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression with default settings, fitted on the training split.
# The fit call is the last expression, so the estimator is shown as cell output.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

### **Applying the model to make predictions**

In [None]:
# Predict with the fitted linear model on both splits so that training and
# test performance can be compared afterwards.
y_lr_train_predict, y_lr_test_predict = (
    lr.predict(features) for features in (X_train, X_test)
)

print(y_lr_train_predict, y_lr_test_predict)

### **Evaluate model performance**

In [None]:
# NOTE(review): accuracy_score is imported here but not used in this cell.
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

# Mean squared error and R^2 of the linear model on the training split ...
lr_train_mse, lr_train_r2 = (
    metric(y_train, y_lr_train_predict)
    for metric in (mean_squared_error, r2_score)
)

# ... and the same two scores on the held-out test split.
lr_test_mse, lr_test_r2 = (
    metric(y_test, y_lr_test_predict)
    for metric in (mean_squared_error, r2_score)
)

In [None]:
# Print each linear-regression score on its own line, label first.
for label, value in (
    ("Lr MSE (train): ", lr_train_mse),
    ("Lr r2 (train): ", lr_train_r2),
    ("Lr MSE (test): ", lr_test_mse),
    ("Lr r2 (test): ", lr_test_r2),
):
    print(label, value)

In [None]:
# One-row summary table of the linear-regression scores.
# The row is passed together with its column names so pandas infers a dtype
# per column: the four metric columns stay numeric. Building a single
# mixed-type column and transposing it (the previous approach) stores every
# value, including the metrics, with object dtype.
lr_results = pd.DataFrame(
    [["Linear regression", lr_train_mse, lr_train_r2, lr_test_mse, lr_test_r2]],
    columns=['Method', 'Training MSE', 'Training R2', 'Test MSE', 'Test R2'],
)
lr_results

## **Random Forest**

### Training the model

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest whose trees are capped at depth 5; random_state=100 keeps the
# fit repeatable. The fit call is last so the estimator is shown as output.
rf = RandomForestRegressor(random_state=100, max_depth=5)
rf.fit(X=X_train, y=y_train)

### **Applying model**

In [None]:
# Random-forest predictions for the training rows and for the held-out rows.
y_rf_train_pred, y_rf_test_pred = rf.predict(X_train), rf.predict(X_test)

### **Evaluate model performance**

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Training-split scores of the random forest.
rf_train_mse = mean_squared_error(y_true=y_train, y_pred=y_rf_train_pred)
rf_train_r2 = r2_score(y_true=y_train, y_pred=y_rf_train_pred)

# Test-split scores of the random forest.
rf_test_mse = mean_squared_error(y_true=y_test, y_pred=y_rf_test_pred)
rf_test_r2 = r2_score(y_true=y_test, y_pred=y_rf_test_pred)

In [None]:
# One-row summary table of the random-forest scores.
# The row is passed together with its column names so pandas infers a dtype
# per column: the four metric columns stay numeric. Building a single
# mixed-type column and transposing it (the previous approach) stores every
# value, including the metrics, with object dtype.
rf_results = pd.DataFrame(
    [['Random forest', rf_train_mse, rf_train_r2, rf_test_mse, rf_test_r2]],
    columns=['Method', 'Training MSE', 'Training R2', 'Test MSE', 'Test R2'],
)
rf_results

## **XGBoost**

### Training the model

In [None]:
from xgboost import XGBRegressor

# XGBoost regressor with trees capped at depth 3 and a fixed random_state.
xg = XGBRegressor(random_state=100, max_depth=3)

# Fit on the training split; as the last expression, the fitted estimator is
# shown as the cell output.
xg.fit(X_train, y_train)

### **Applying model**

In [None]:
# Predict with the fitted XGBoost model on both the training and the test features.
y_xg_train_pred, y_xg_test_pred = (
    xg.predict(features) for features in (X_train, X_test)
)

### **Evaluate model performance**

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Mean squared error on the training and test splits for the XGBoost model.
xg_train_mse, xg_test_mse = (
    mean_squared_error(y_train, y_xg_train_pred),
    mean_squared_error(y_test, y_xg_test_pred),
)

# R^2 on the same two splits.
xg_train_r2, xg_test_r2 = (
    r2_score(y_train, y_xg_train_pred),
    r2_score(y_test, y_xg_test_pred),
)

In [None]:
# One-row summary table of the XGBoost scores.
# The row is passed together with its column names so pandas infers a dtype
# per column: the four metric columns stay numeric. Building a single
# mixed-type column and transposing it (the previous approach) stores every
# value, including the metrics, with object dtype.
xg_results = pd.DataFrame(
    [['XGboost', xg_train_mse, xg_train_r2, xg_test_mse, xg_test_r2]],
    columns=['Method', 'Training MSE', 'Training R2', 'Test MSE', 'Test R2'],
)
xg_results

## **Model comparison**

In [None]:
# Stack the three one-row summaries into a single comparison table.
# ignore_index=True numbers the rows 0..n-1. Without it each row keeps the
# label 0 from its one-row source frame, so df_models ends up with a
# duplicated index.
df_models = pd.concat(
    [lr_results, rf_results, xg_results],
    axis=0,
    ignore_index=True,
)
df_models

In [None]:
# Displays a copy of the comparison table with a fresh 0..n-1 index.
# The result is not assigned, so df_models itself keeps the index it already had.
df_models.reset_index(drop=True)
     

# Data visualization of prediction results

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Experimental vs. predicted logS for the linear-regression model on the
# training split. An explicit Axes is used and the figure is titled so it can
# be told apart from the other models' plots.
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(x=y_train, y=y_lr_train_predict, c="#7CAE00", alpha=0.3)

# Degree-1 least-squares fit of the predictions against the experimental
# values, drawn as a trend line on top of the scatter.
z = np.polyfit(y_train, y_lr_train_predict, 1)
p = np.poly1d(z)

ax.plot(y_train, p(y_train), color='#F8766D')
ax.set_title('Linear regression (training set)')
ax.set_ylabel('Predict LogS')
# Trailing semicolon keeps the Text(...) repr out of the cell output.
ax.set_xlabel('Experimental LogS');
     

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Experimental vs. predicted logS for the random-forest model on the training
# split. An explicit Axes is used and the figure is titled so it can be told
# apart from the other models' plots.
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(x=y_train, y=y_rf_train_pred, c="#7CAE00", alpha=0.3)

# Degree-1 least-squares fit of the predictions against the experimental
# values, drawn as a trend line on top of the scatter.
z = np.polyfit(y_train, y_rf_train_pred, 1)
p = np.poly1d(z)

ax.plot(y_train, p(y_train), color='#F8766D')
ax.set_title('Random forest (training set)')
ax.set_ylabel('Predict LogS')
# Trailing semicolon keeps the Text(...) repr out of the cell output.
ax.set_xlabel('Experimental LogS');

 

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Experimental vs. predicted logS for the XGBoost model on the training
# split. An explicit Axes is used and the figure is titled so it can be told
# apart from the other models' plots.
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(x=y_train, y=y_xg_train_pred, c="#7CAE00", alpha=0.3)

# Degree-1 least-squares fit of the predictions against the experimental
# values, drawn as a trend line on top of the scatter.
z = np.polyfit(y_train, y_xg_train_pred, 1)
p = np.poly1d(z)

ax.plot(y_train, p(y_train), color='#F8766D')
ax.set_title('XGBoost (training set)')
ax.set_ylabel('Predict LogS')
# Trailing semicolon keeps the Text(...) repr out of the cell output.
ax.set_xlabel('Experimental LogS');

In [None]:
# Shell command: convert the notebook at the given path to HTML with nbconvert.
# NOTE(review): the path reuses the CSV's base name under /content; confirm it
# matches where this notebook is actually saved.
!jupyter nbconvert --to html /content/delaney_solubility_with_descriptors.ipynb