In [1]:
# Imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import multiprocessing
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import csv
import mysql.connector
from mysql.connector import Error
import config
from rdkit.DataStructs import TanimotoSimilarity
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import gc
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import vstack, hstack, csr_matrix
from rdkit.Chem import Draw
from sklearn.model_selection import cross_val_score
import xgboost as xgb
import torch
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             classification_report, confusion_matrix, roc_curve, auc, average_precision_score)

In [2]:
def connect_to_database():
    """Connect to the MySQL database using settings from the config module."""
    try:
        print("Connecting to the MySQL database...")
        conn = mysql.connector.connect(**config.DATABASE_CONFIG)
        if conn.is_connected():
            print("Connection established.")
        else:
            print("Connection failed.")
        return conn
    except Error as e:
        print(f"The error '{e}' occurred")
        return None

def close_connection(conn):
    """Close the database connection."""
    if conn.is_connected():
        conn.close()
        print("The connection is closed.")

def fetch_protein_mapping(cursor):
    query = "SELECT protein_name, protein_numeric FROM protein_mapping"
    cursor.execute(query)
    mapping_data = cursor.fetchall()
    print('fetched protein mapping...')
    return {name: num for name, num in mapping_data}

# Getting CSV Ready for Training

In [None]:
train_df = pd.read_csv('Cleaned-csv/train_data_strings.csv')
test_df = pd.read_csv('Cleaned-csv/test_data_strings.csv')

In [None]:
train_df['molVector'] = train_df['molVector'].apply(lambda x: [int(i) for i in x.split(',')])
test_df['molVector'] = test_df['molVector'].apply(lambda x: [int(i) for i in x.split(',')])

In [None]:
first_row_vector = train_df['molVector'].iloc[0]
max_len = len(first_row_vector)
print(max_len)

In [None]:
"""about 6 minutes for this code"""
molVector_train_df = pd.DataFrame(train_df['molVector'].to_list(), columns=[f"molVect{i + 1}" for i in range(max_len)])
molVector_test_df = pd.DataFrame(test_df['molVector'].to_list(), columns=[f"molVect{i + 1}" for i in range(max_len)])

In [None]:
print(molVector_train_df.dtypes)

In [None]:
"""Takes around 12 minutes for this code"""
for col in molVector_train_df.columns:
    molVector_train_df[col] = molVector_train_df[col].astype(np.int8)
print('Changed type to int8...')
train_df = pd.concat([train_df.drop(['molVector'], axis=1), molVector_train_df], axis=1)
print('combined training set...')

for col in molVector_test_df.columns:
    molVector_test_df[col] = molVector_test_df[col].astype(np.int8)
print('Changed type to int8...')
test_df = pd.concat([test_df.drop(['molVector'], axis=1), molVector_test_df], axis=1)
print('combined testing set...')


In [None]:
train_df.to_csv('Cleaned-csv/Cleaned_train_df-512bit-2rad.csv', index=False)
test_df.to_csv('Cleaned-csv/Cleaned_test_df-512bit-2rad.csv', index=False)

# Load Cleaned CSV

In [2]:
train_df = pd.read_csv('Cleaned-csv/Cleaned_train_df-512bit-2rad.csv')
test_df = pd.read_csv('Cleaned-csv/Cleaned_test_df-512bit-2rad.csv')

In [3]:
print(train_df.dtypes)

id                 int64
binds              int64
Protein_numeric    int64
molVect1           int64
molVect2           int64
                   ...  
molVect508         int64
molVect509         int64
molVect510         int64
molVect511         int64
molVect512         int64
Length: 515, dtype: object


In [4]:
# Drop 'id' column from training and testing data
train_df = train_df.drop(columns=['id'])
test_df = test_df.drop(columns=['id'])

# Define target variable for training data
y_train = train_df['binds']
X_train = train_df.drop(columns=['binds'])

# Define target variable for testing data (if applicable)
y_test = test_df['binds']
X_test = test_df.drop(columns=['binds'])


In [5]:
del train_df, test_df
gc.collect()

73

# XGBoost Training

In [6]:
# Create XGBoost specific DMatrix data format
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [7]:
# Set parameters
params = {
    'objective': 'binary:logistic',
    'n_estimators': 1000,  
    'learning_rate':0.005,
    'alpha': 10,  
    'lambda': 1,  
    'max_depth': 3,
    'min_child_weight': 50,
    'n_jobs': -1
}

# Train model
model = xgb.train(params, dtrain)

Parameters: { "n_estimators" } are not used.



In [9]:
model.save_model('Trained-Models/xgb_model-2-0.json')

# Neural Network

In [12]:
import torch

# Check if CUDA is available
cuda_available = torch.cuda.is_available()
print("CUDA Available: ", cuda_available)

# Get the number of GPUs available
if cuda_available:
    num_gpus = torch.cuda.device_count()
    print("Number of GPUs Available: ", num_gpus)
    for i in range(num_gpus):
        print("GPU ", i, ": ", torch.cuda.get_device_name(i))
else:
    print("No GPU available, using CPU instead.")


CUDA Available:  True
Number of GPUs Available:  1
GPU  0 :  NVIDIA GeForce RTX 3070


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split

In [15]:
# Assuming 'X_train', 'X_test' are DataFrames or Series and 'y_train', 'y_test' are Series
X_train_np = X_train.values  # Convert DataFrame to NumPy array
y_train_np = y_train.values  # Convert Series to NumPy array
X_test_np = X_test.values    # Convert DataFrame to NumPy array
y_test_np = y_test.values    # Convert Series to NumPy array
# Convert numpy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train_np, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_np, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_np, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_np, dtype=torch.float32)

In [30]:

# Define the dataset class
class BinaryDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Ensure that tensors returned by __getitem__ are of type FloatTensor
        return self.features[idx], self.labels[idx]

class BinaryClassifier(nn.Module):
    def __init__(self, input_size):
        super(BinaryClassifier, self).__init__()
        # More nuanced layer design
        self.layer1 = nn.Linear(input_size, 512)
        self.norm1 = nn.BatchNorm1d(512)
        self.act1 = nn.LeakyReLU(negative_slope=0.01)  # LeakyReLU to prevent dying neurons
        self.dropout1 = nn.Dropout(0.3)  # Slightly lower dropout for this layer

        self.layer2 = nn.Linear(512, 256)
        self.norm2 = nn.BatchNorm1d(256)
        self.act2 = nn.LeakyReLU(negative_slope=0.01)
        self.dropout2 = nn.Dropout(0.3)

        self.layer3 = nn.Linear(256, 128)
        self.norm3 = nn.BatchNorm1d(128)
        self.act3 = nn.LeakyReLU(negative_slope=0.01)
        self.dropout3 = nn.Dropout(0.4)  # Increased dropout for deeper layers

        self.layer4 = nn.Linear(128, 64)
        self.norm4 = nn.BatchNorm1d(64)
        self.act4 = nn.LeakyReLU(negative_slope=0.01)
        self.dropout4 = nn.Dropout(0.4)

        self.output_layer = nn.Linear(64, 1)
        self.output_act = nn.Sigmoid()  # Output layer activation

    def forward(self, x):
        x = self.layer1(x)
        x = self.norm1(x)
        x = self.act1(x)
        x = self.dropout1(x)

        x = self.layer2(x)
        x = self.norm2(x)
        x = self.act2(x)
        x = self.dropout2(x)

        x = self.layer3(x)
        x = self.norm3(x)
        x = self.act3(x)
        x = self.dropout3(x)

        x = self.layer4(x)
        x = self.norm4(x)
        x = self.act4(x)
        x = self.dropout4(x)

        x = self.output_layer(x)
        x = self.output_act(x)
        return x

# Assuming X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor are already defined
# Create dataset instances
train_dataset = BinaryDataset(X_train_tensor, y_train_tensor)
test_dataset = BinaryDataset(X_test_tensor, y_test_tensor)

# Create DataLoader instances for batch processing and shuffling
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Initialize the model and move it to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BinaryClassifier(input_size=513).to(device)

# Define the loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# Training and validation functions
def train_model(model, train_loader, criterion, optimizer, epochs):
    model.train()  # Set the model to training mode

    for epoch in range(epochs):
        total_loss = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)  # Move data to GPU
            optimizer.zero_grad()  # Clear gradients
            outputs = model(inputs)
            loss = criterion(outputs, labels.unsqueeze(1))  # Calculate loss
            loss.backward()  # Backpropagation
            optimizer.step()  # Update weights
            total_loss += loss.item()  # Accumulate loss

        avg_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}')
        validate_model(model, test_loader, criterion)

def validate_model(model, test_loader, criterion):
    model.eval()
    total_loss = 0
    all_labels = []
    all_predictions = []
    all_probabilities = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels.unsqueeze(1))
            total_loss += loss.item()

            # Collect labels and predictions for metrics calculation
            all_labels.extend(labels.cpu().numpy())
            probabilities = outputs.cpu().numpy()  # Assuming sigmoid output
            all_probabilities.extend(probabilities)
            predicted = (probabilities > 0.5).astype(int)
            all_predictions.extend(predicted.squeeze())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_predictions)
    precision = precision_score(all_labels, all_predictions)
    recall = recall_score(all_labels, all_predictions)
    f1 = f1_score(all_labels, all_predictions)
    average_precision = average_precision_score(all_labels, all_probabilities)

    # Print results
    print(f"Validation Loss: {total_loss / len(test_loader):.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Mean Average Precision: {average_precision:.4f}")

    # Confusion Matrix
    conf_matrix = confusion_matrix(all_labels, all_predictions)
    print("Confusion Matrix:\n", conf_matrix)


# Run training
train_model(model, train_loader, criterion, optimizer, 10)  # Train for 100 epochs

Epoch 1/10, Loss: 0.0144
Validation Loss: 29.0437
Accuracy: 0.5800
Precision: 0.5436
Recall: 0.9979
F1 Score: 0.7038
Average Precision: 0.6287
Confusion Matrix:
 [[ 95593 494313]
 [  1266 588640]]
Epoch 2/10, Loss: 0.0111
Validation Loss: 11.7015
Accuracy: 0.5740
Precision: 0.5401
Recall: 0.9972
F1 Score: 0.7007
Average Precision: 0.7523
Confusion Matrix:
 [[ 88966 500940]
 [  1675 588231]]
Epoch 3/10, Loss: 0.0106
Validation Loss: 4.8915
Accuracy: 0.5756
Precision: 0.5409
Recall: 0.9992
F1 Score: 0.7019
Average Precision: 0.9097
Confusion Matrix:
 [[ 89597 500309]
 [   461 589445]]
Epoch 4/10, Loss: 0.0103
Validation Loss: 4.0628
Accuracy: 0.5751
Precision: 0.5406
Recall: 0.9995
F1 Score: 0.7017
Average Precision: 0.9331
Confusion Matrix:
 [[ 88821 501085]
 [   275 589631]]
Epoch 5/10, Loss: 0.0102
Validation Loss: 3.8709
Accuracy: 0.5739
Precision: 0.5399
Recall: 0.9994
F1 Score: 0.7011
Average Precision: 0.9049
Confusion Matrix:
 [[ 87607 502299]
 [   377 589529]]
Epoch 6/10, Loss: 

In [31]:
torch.save(model, 'Trained-Models/Torch-model1-1.pth')

In [None]:
# Define a dataset class for PyTorch
class MolecularDataset(Dataset):
    def __init__(self, features, labels=None):
        self.features = torch.tensor(features, dtype=torch.float32)
        if labels is not None:
            self.labels = torch.tensor(labels, dtype=torch.float32)
        else:
            self.labels = None

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.features[idx], self.labels[idx]
        else:
            return self.features[idx]

# Convert SMILES to fingerprint
def smiles_to_fingerprint(smiles, radius=2, nBits=512):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        return list(AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits))
    else:
        return [0]*nBits

# Process SMILES strings in batches
def process_in_batches(smiles_series, batch_size=1000):
    results = []
    for i in tqdm(range(0, len(smiles_series), batch_size), desc="Processing Batches"):
        batch = smiles_series[i:i+batch_size].apply(smiles_to_fingerprint)
        results.extend(batch)
    return results

# Load and process the data
test_submission_data = pd.read_csv('leash-BELKA/test.csv')
test_submission_df = pd.DataFrame()
test_submission_df['molecule_smiles'] = process_in_batches(test_submission_data['molecule_smiles'], batch_size=500)

In [5]:

# Connect to DB and fetch protein mappings
conn = connect_to_database()
cursor = conn.cursor()
protein_mapping = fetch_protein_mapping(cursor)
cursor.close()
conn.close()

# Map protein names to numeric
test_submission_df['Protein_numeric'] = test_submission_data['protein_name'].map(protein_mapping)

# Create dataset
features = np.column_stack((test_submission_df['molecule_smiles'].tolist(), test_submission_df['Protein_numeric'].values))
test_dataset = MolecularDataset(features)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

Connecting to the MySQL database...
Connection established.
fetched protein mapping...


In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load PyTorch model and ensure it is on the correct device
model = torch.load('Trained-Models/Torch-model1-1.pth')
model.to(device)
model.eval()  # Set model to evaluation mode

# Predict function for PyTorch model
def predict(model, dataloader):
    predictions = []
    model.eval()
    with torch.no_grad():
        for data in dataloader:
            data = data.to(device)  # Move data to the same device as the model
            outputs = model(data)
            predicted_probs = torch.sigmoid(outputs).cpu().numpy()  # Convert outputs to probabilities and move to CPU
            predictions.extend(predicted_probs.flatten())
    return predictions

# Perform predictions
predictions = predict(model, test_loader)

# Save predictions
test_submission_data['binds'] = predictions
test_submission_data[['id', 'binds']].to_csv('Submission-Files/Submission2Pytorch.csv', index=False)

Using device: cuda


# Model Results

In [10]:
# Make the prediction
raw_predictions_train = model.predict(dtrain)

# Create a dataframe with the predicted probabilities
probabilities_train = pd.DataFrame(raw_predictions_train)
print(probabilities_train.head())

# Convert probabilities into binary output
predictions_train = [1 if proba > 0.5 else 0 for proba in raw_predictions_train]

# Now print the accuracy with these classes
print("Accuracy:", accuracy_score(y_train, predictions_train))
print("Precision:", precision_score(y_train, predictions_train))
print("Recall:", recall_score(y_train, predictions_train))
print("F1 Score:", f1_score(y_train, predictions_train))

print(classification_report(y_train, predictions_train, target_names=['Class 0', 'Class 1']))

conf_matrix_train = confusion_matrix(y_train, predictions_train)
print("Confusion Matrix:\n", conf_matrix_train)

fpr_train, tpr_train, thresholds_train = roc_curve(y_train, probabilities_train)
roc_auc_train = auc(fpr_train, tpr_train)

print("AUC Score (Training):", roc_auc_train)

# Average Precision Score
average_precision_train = average_precision_score(y_train, probabilities_train)
print('Mean Average Precision (Training): {0:0.2f}'.format(average_precision_train))


          0
0  0.475814
1  0.524442
2  0.524442
3  0.475814
4  0.524442
Accuracy: 0.9973715
Precision: 1.0
Recall: 0.994743
F1 Score: 0.997364572779551
              precision    recall  f1-score   support

     Class 0       0.99      1.00      1.00   1000000
     Class 1       1.00      0.99      1.00   1000000

    accuracy                           1.00   2000000
   macro avg       1.00      1.00      1.00   2000000
weighted avg       1.00      1.00      1.00   2000000

Confusion Matrix:
 [[1000000       0]
 [   5257  994743]]
AUC Score (Training): 0.9973719999999999
Mean Average Precision (Training): 1.00


In [11]:
# Make the prediction
raw_predictions = model.predict(dtest)

# Create a dataframe with the predicted probabilities  
probabilities = pd.DataFrame(raw_predictions)
print(probabilities.head())

# Convert probabilities into binary output
predictions = [1 if proba > 0.5 else 0 for proba in raw_predictions]

# Now print the accuracy with these classes
print("Accuracy:", accuracy_score(y_test, predictions))
print("Precision:", precision_score(y_test, predictions))
print("Recall:", recall_score(y_test, predictions))
print("F1 Score:", f1_score(y_test, predictions))

print(classification_report(y_test, predictions, target_names=['Class 0', 'Class 1']))

conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:\n", conf_matrix)

fpr, tpr, thresholds = roc_curve(y_test, probabilities)
roc_auc = auc(fpr, tpr)

print("AUC Score:", roc_auc)

# Average Precision Score
average_precision = average_precision_score(y_test, probabilities)
print('Mean Average Precision (micro): {0:0.2f}'.format(average_precision))

          0
0  0.524442
1  0.524442
2  0.524442
3  0.524442
4  0.524412
Accuracy: 0.5733727068380386
Precision: 0.5396096777086548
Recall: 0.9995694229250084
F1 Score: 0.7008636407938935
              precision    recall  f1-score   support

     Class 0       1.00      0.15      0.26    589906
     Class 1       0.54      1.00      0.70    589906

    accuracy                           0.57   1179812
   macro avg       0.77      0.57      0.48   1179812
weighted avg       0.77      0.57      0.48   1179812

Confusion Matrix:
 [[ 86820 503086]
 [   254 589652]]
AUC Score: 0.9889508116939492
Mean Average Precision (micro): 0.99


# Code for Creating Submission Using Model

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm
import gc

# Load CSV data
test_submission_data = pd.read_csv('leash-BELKA/test.csv')

# Function to convert SMILES string to a fingerprint
def smiles_to_fingerprint(smiles, radius=2, nBits=512):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        return list(AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits))
    else:
        return [0]*nBits  # Return a zero vector if the molecule parsing fails

# Process SMILES strings in batches
def process_in_batches(smiles_series, batch_size=1000):
    num_batches = (len(smiles_series) + batch_size - 1) // batch_size
    results = []
    for i in tqdm(range(num_batches), desc="Processing Batches"):
        batch = smiles_series.iloc[i*batch_size:(i+1)*batch_size].apply(smiles_to_fingerprint)
        results.extend(batch)
    return pd.Series(results)

# Create dataframe and process fingerprints
test_submission_df = pd.DataFrame()
test_submission_df['id'] = test_submission_data['id']
test_submission_df['molecule_smiles'] = process_in_batches(test_submission_data['molecule_smiles'], batch_size=500)

In [None]:
# Assuming you have a way to fetch protein mappings similar to the database method shown

conn = connect_to_database()
cursor = conn.cursor()
protein_mapping = fetch_protein_mapping(cursor)
cursor.close()
conn.close()

In [None]:
test_submission_df['Protein_numeric'] = test_submission_data['protein_name'].map(protein_mapping)

In [None]:
# Adjust based on your fingerprint length
"""Expect about 9 minutes for this code"""
molVector_test_df = pd.DataFrame(test_submission_df['molecule_smiles'].tolist(), columns=[f"molVect{i + 1}" for i in range(max_len)])
print('converted to many columns...')
for col in molVector_test_df.columns:
    molVector_test_df[col] = molVector_test_df[col].astype(np.int8)
print('Converted type to int8...')
test_submission_df = pd.concat([test_submission_df.drop(['molecule_smiles'], axis=1), molVector_test_df], axis=1)

In [29]:
test_submission_df['id'] = test_submission_data['id']
test_submission_df.to_csv('Cleaned-csv/final-submission-cleaned-512bits-2rad.csv', index=False)

In [None]:
# To load model
model = xgb.Booster()
model.load_model('Trained-Models/xgb_model.json')

In [None]:

# Prepare data for prediction
dtest1 = xgb.DMatrix(test_submission_df.drop(['id'], axis=1))

In [None]:
raw_predictions = model.predict(dtest1)

# Save predictions
test_submission_df['binds'] = raw_predictions

In [None]:
test_submission_df[['id', 'binds']].to_csv('Submission-Files/Submission2XGBoost.csv', index=False)