# Module 1: Basics of Machine Learning
## Part 1: Regression

In this notebook, we use PyTorch to train a neural network regression model that predicts the aqueous solubility of different molecules.

### 1. Install and load python libraries

In [None]:
!pip install torch numpy matplotlib scikit-learn pandas rdkit-pypi

In [None]:
# 1. Required Libraries
import torch  # PyTorch main package
import torch.nn as nn  # Neural network modules
import torch.optim as optim  # Optimization algorithms
import numpy as np  # Numerical computations
import matplotlib.pyplot as plt  # Plotting library
from sklearn.datasets import make_regression  # To generate synthetic regression data
from sklearn.model_selection import train_test_split  # To split data into train/test sets
from sklearn.preprocessing import StandardScaler  # To standardize data
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
import pandas as pd # Pandas for handling input data
from rdkit import Chem # Work with molecules
from rdkit.Chem import Draw # Draw molecules

### 2. Load and Prepare Solubility Data

In [None]:
!wget https://raw.githubusercontent.com/mcsorkun/AqSolDB/refs/heads/master/results/data_curated.csv
data = pd.read_csv('data_curated.csv',nrows=1000)

In [None]:
# Draw the first ten molecules of the data set as a grid (5 per row).
# Each SMILES string is parsed into an RDKit molecule before drawing.
smiles_list = data['SMILES'].iloc[:10]
mols = list(map(Chem.MolFromSmiles, smiles_list))
Draw.MolsToGridImage(mols, molsPerRow=5)

In [None]:
# Descriptor columns used as model inputs; their order defines the column order of X
descriptor_names = [
    'MolWt', 'MolLogP', 'MolMR', 'HeavyAtomCount',
    'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumRotatableBonds',
    'NumValenceElectrons', 'NumAromaticRings', 'NumSaturatedRings',
    'NumAliphaticRings', 'RingCount', 'TPSA', 'LabuteASA', 'BalabanJ', 'BertzCT',
]

# Feature matrix X: one row per molecule, one column per descriptor
feature_frame = data[descriptor_names]
X = feature_frame.to_numpy()

# Target y: the solubility (logS) column
y = data['Solubility'].to_numpy()

In [None]:
# Prepare the data set
y = y.reshape(-1, 1)  # Make y a column vector (2D, as StandardScaler expects)

# Split into training and test sets (80/20); the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale input features and target.
# Both scalers are fitted on the training split only and then applied to the test split.
scaler_X = StandardScaler()
X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)

scaler_y = StandardScaler()
y_train = scaler_y.fit_transform(y_train)
y_test = scaler_y.transform(y_test)

# Convert to float32 PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

# Show one (unscaled) input feature against the (unscaled) target
plot_feature = "MolWt"
plt.figure(figsize=(6, 4))
# The label gives plt.legend() an entry to show (an unlabeled plot triggers a warning)
plt.scatter(X[:, descriptor_names.index(plot_feature)], y, alpha=0.5, label='Molecules')
plt.xlabel(plot_feature)  # Name the descriptor actually plotted
plt.ylabel('Solubility')
plt.title('Scatter Plot of One Input Feature vs Solubility')
plt.grid(True)
plt.legend()
plt.show()

### 3. Define Machine Learning Setup (Neural Network Model, Loss, Optimizer)

In [None]:
class RegressionNN(nn.Module):
    """Fully connected regression network: input_size -> 64 -> 32 -> 1.

    Both hidden layers are followed by a ReLU; the output layer is purely
    linear so the network can produce any real value.

    Args:
        input_size: Number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()

        # First hidden block
        self.layer1 = nn.Linear(in_features=input_size, out_features=64)
        self.activation1 = nn.ReLU()

        # Second hidden block
        self.layer2 = nn.Linear(in_features=64, out_features=32)
        self.activation2 = nn.ReLU()

        # Linear head, no activation (regression output)
        self.output_layer = nn.Linear(in_features=32, out_features=1)

    def forward(self, x):
        """Map a batch of feature rows to one prediction per row."""
        hidden = self.layer1(x)
        hidden = self.activation1(hidden)
        hidden = self.layer2(hidden)
        hidden = self.activation2(hidden)
        return self.output_layer(hidden)

In [None]:
# Seed PyTorch so the random weight initialisation (and therefore the training
# curves and metrics below) is the same on every Restart & Run All
torch.manual_seed(42)

# Instantiate Model (one input unit per feature column of X_train)
input_size = X_train.shape[1]
model = RegressionNN(input_size)

# Define Loss Function (MSE for regression)
criterion = nn.MSELoss()

# Define Optimizer (Adam with learning rate)
optimizer = optim.Adam(model.parameters(), lr=0.001)

### 4. Train the Model

In [None]:
epochs = 200
train_losses = []  # Training loss recorded after every epoch
test_losses = []   # Test loss recorded after every epoch

for epoch in range(epochs):
    # --------- Training: one full-batch gradient step ---------
    model.train()
    predictions = model(X_train)            # Forward pass on the whole training set
    loss = criterion(predictions, y_train)  # Compute loss
    optimizer.zero_grad()                   # Clear gradients left from the previous step
    loss.backward()                         # Backpropagation
    optimizer.step()                        # Update weights
    train_loss_value = loss.item()
    train_losses.append(train_loss_value)

    # --------- Testing: evaluate without tracking gradients ---------
    model.eval()
    with torch.no_grad():
        test_preds = model(X_test)
        test_loss = criterion(test_preds, y_test)
    test_loss_value = test_loss.item()
    test_losses.append(test_loss_value)

    # Report progress every 20 epochs only
    if (epoch + 1) % 20 != 0:
        continue
    print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {train_loss_value:.4f}, Test Loss: {test_loss_value:.4f}")

In [None]:
# Training and test loss curves on one set of axes
fig, ax = plt.subplots(figsize=(8, 5))
for curve, curve_label in ((train_losses, 'Train Loss'), (test_losses, 'Test Loss')):
    ax.plot(curve, label=curve_label)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss (MSE)')
ax.set_title('Training and Test Loss Over Epochs')
ax.legend()
ax.grid(True)
plt.show()

### 5. Make and Visualize Predictions

In [None]:
# Predict on the test set without tracking gradients
model.eval()
with torch.no_grad():
    scaled_preds = model(X_test)

# Undo the target scaling so predictions and ground truth are in the original solubility units
y_pred_all = scaler_y.inverse_transform(scaled_preds.numpy())
y_actual_all = scaler_y.inverse_transform(y_test.numpy())

In [None]:
# Compute MAE and RMSE on the un-scaled test-set values
mae = mean_absolute_error(y_actual_all, y_pred_all)
rmse = root_mean_squared_error(y_actual_all, y_pred_all)
print("Mean Absolute Error (MAE) is: ", mae)
# Label fixed: this line reports the RMSE, not the MAE
print("Root Mean Squared Error (RMSE) is: ", rmse)

In [None]:
# Parity plot: predicted vs. actual solubility on the test set
# Both axes share the range of the actual values; the dashed red diagonal marks perfect predictions
axis_min = y_actual_all.min()
axis_max = y_actual_all.max()
axis_range = [axis_min, axis_max]

fig, ax = plt.subplots(figsize=(6,6))
ax.scatter(y_actual_all, y_pred_all, alpha=0.5)
ax.plot(axis_range, axis_range, 'r--')
ax.set_xlim(axis_range)
ax.set_ylim(axis_range)
ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Predicted vs Actual Solubility (Test Set)")
ax.grid(True)
plt.show()

In [None]:
# Histogram of the prediction errors (actual - predicted) on the test set
residuals = (y_actual_all - y_pred_all).ravel()  # Flatten the (n, 1) column into one data series

plt.figure(figsize=(6,6))
plt.hist(residuals, alpha=0.8, range=(-8,8), bins=25)
# Axis labels describe the histogram: x is the error, y is the number of molecules per bin
plt.xlabel("Prediction Error (Actual - Predicted)")
plt.ylabel("Count")
plt.title("Solubility error histogram (Test Set)")
plt.grid(True)
plt.show()

In [None]:
# Use gradients to estimate feature importance
model.eval()

# Differentiate with respect to a detached copy of the test inputs so that X_test itself
# is left untouched (no requires_grad flag flipped, no .grad buffer attached to it)
X_attr = X_test.detach().clone().requires_grad_(True)
output = model(X_attr)

# torch.autograd.grad returns the gradient directly instead of accumulating into .grad,
# so re-running this cell gives the same values and the model's parameter gradients are not touched
(input_grads,) = torch.autograd.grad(output.mean(), X_attr)

# Feature importance = average of absolute gradients across samples
feature_importance = input_grads.abs().mean(dim=0)

print("Feature Importance:", feature_importance)

In [None]:
# Horizontal bar chart of the feature importances, scaled so the largest bar equals 1
relative_importance = feature_importance / torch.max(feature_importance)

fig, ax = plt.subplots(figsize=(6,6))
ax.barh(descriptor_names, relative_importance)
ax.set_title("Feature importance (Test set)")
ax.set_xlabel("Feature importance")
plt.show()