In [None]:
# Imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import multiprocessing
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import csv
import mysql.connector
from mysql.connector import Error
import config
from rdkit.DataStructs import TanimotoSimilarity
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import gc
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import roc_auc_score, roc_curve, auc, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import vstack, hstack, csr_matrix
from rdkit.Chem import Draw
from sklearn.model_selection import cross_val_score
import xgboost as xgb
from scipy.sparse import vstack

In [None]:
# Load the pre-split training and evaluation tables from the working directory.
# The generator is consumed left to right, so the training file is read first.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('train_data.csv', 'test_data.csv')
)

In [None]:
# Separate model inputs from the target for each split: 'id' and 'binds' are
# dropped from the feature frames, and 'binds' alone becomes the label series.
X_train, y_train = train_df.drop(columns=['id', 'binds']), train_df['binds']
X_test, y_test = test_df.drop(columns=['id', 'binds']), test_df['binds']

In [None]:
# Inspect the feature frame as loaded.
# NOTE: no conversion is applied to X_train in this cell. The previous
# self-assignment of 'molVector' was a no-op, so the column still holds
# whatever pd.read_csv produced (pd.read_csv does not parse list-like cells).
print(X_train.dtypes)
print(X_train.columns)
# Show the Python type(s) of the first few 'molVector' cells so it is obvious
# whether they are raw strings or already-parsed sequences.
print(X_train['molVector'].head().map(type).unique())

In [None]:
def _parse_mol_vector(value):
    """Return one 'molVector' cell as a 1-D numeric NumPy array.

    Cells loaded with pd.read_csv arrive as text, so strings are parsed by
    stripping the surrounding brackets and splitting on commas/whitespace.
    This accepts both "[0, 1, 0]" and "[0 1 0]".
    NOTE(review): assumes the text is a bracketed list of plain numbers --
    confirm against the code that wrote train_data.csv.
    Values that are already sequences or arrays go through np.asarray as-is.

    Raises:
        ValueError: if a token in a string cell is not a number.
    """
    if isinstance(value, str):
        tokens = value.strip().strip('[]').replace(',', ' ').split()
        return np.array(tokens, dtype=np.float32)
    return np.asarray(value)


def _mol_vectors_to_sparse(column):
    """Stack every cell of a 'molVector' column into one CSR matrix (one row per cell).

    np.vstack raises ValueError if the parsed vectors differ in length or the
    column is empty, which surfaces malformed input instead of hiding it.
    """
    rows = [_parse_mol_vector(cell) for cell in column]
    return csr_matrix(np.vstack(rows))


# Initialize OneHotEncoder for categorical variable transformation
encoder = OneHotEncoder(sparse_output=True)
print('Generating encoder values...')
# Fit on the training split only so the test split is encoded with the same categories
encoder.fit(train_df[['Protein_numeric']])

print('Applying values to training dataframe...')
protein_onehot_train = encoder.transform(train_df[['Protein_numeric']])
print('Applying values to testing dataframe...')
protein_onehot_test = encoder.transform(test_df[['Protein_numeric']])

# Convert 'molVector' to sparse matrices; string, list and ndarray cells are all handled
molVector_train_sparse = _mol_vectors_to_sparse(train_df['molVector'])
molVector_test_sparse = _mol_vectors_to_sparse(test_df['molVector'])

# Concatenate 'molVector' with the one-hot 'Protein_numeric' columns.
# format='csr' is requested because hstack otherwise returns COO, which does
# not support the row slicing needed to feed the data in chunks.
X_train = hstack([molVector_train_sparse, protein_onehot_train], format='csr')
X_test = hstack([molVector_test_sparse, protein_onehot_test], format='csr')

# Directly use 'binds' column for targets
y_train = train_df['binds'].values
y_test = test_df['binds'].values

In [None]:
# Release the large intermediates that are no longer needed now that the
# feature matrices and label arrays have been built.
# globals().pop(name, None) is used instead of `del` so the cell can be re-run
# without raising NameError once the names are already gone.
# Imported classes (SMOTE, TanimotoSimilarity, RandomForestClassifier) are
# deliberately NOT deleted: removing the names frees no meaningful memory and
# would break any later cell that uses them.
for _name in ('train_df', 'test_df', 'protein_onehot_train', 'protein_onehot_test'):
    globals().pop(_name, None)
gc.collect()

In [None]:
def load_data_in_chunks(X_data, y_data, chunk_size=100000):
    """Yield consecutive (X_chunk, y_chunk) row slices of the inputs.

    Args:
        X_data: feature matrix; a SciPy sparse matrix (any format) or anything
            np.asarray can turn into an array with samples along axis 0.
        y_data: labels, one per row of X_data.
        chunk_size: maximum number of rows per yielded chunk (must be > 0).

    Yields:
        Tuples (X_chunk, y_chunk). Sparse input yields CSR slices; dense input
        yields NumPy arrays. The last chunk may be shorter than chunk_size.

    Raises:
        ValueError: if chunk_size is not positive or the row counts of X_data
            and y_data differ. Because this is a generator, the error is
            raised when iteration starts, not when the function is called.
    """
    from scipy.sparse import issparse

    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    if issparse(X_data):
        # np.array() on a sparse matrix yields a 0-d object array, so sparse
        # input is kept sparse and converted to CSR, which supports row slicing.
        X_data = X_data.tocsr()
    else:
        X_data = np.asarray(X_data)
    y_data = np.asarray(y_data)

    n_samples = X_data.shape[0]
    if y_data.shape[0] != n_samples:
        raise ValueError(
            f"X_data has {n_samples} rows but y_data has {y_data.shape[0]} labels"
        )

    for start in range(0, n_samples, chunk_size):
        end = start + chunk_size
        yield X_data[start:end], y_data[start:end]


# Parameters for the native xgb.train API.
# 'n_estimators' was removed: xgb.train ignores it (the number of trees is set
# by num_boost_round). 'random_state' is spelled 'seed' in the native API, and
# the thread count is left at XGBoost's default, which uses all available threads.
model_params = {'objective': 'binary:logistic', 'seed': 42}

# Train incrementally, one chunk at a time. `model` starts as None so the first
# call trains from scratch; every later call passes the current booster through
# `xgb_model`, which continues boosting and adds 10 more rounds fitted on the
# new chunk. The result is re-assigned to `model` so the updates accumulate.
# NOTE(review): continued boosting over chunks is not equivalent to fitting all
# rows at once -- confirm this trade-off is acceptable for the analysis.
model = None
for X_chunk, y_chunk in load_data_in_chunks(X_train, y_train):
    dtrain = xgb.DMatrix(X_chunk, label=y_chunk)
    model = xgb.train(model_params, dtrain, num_boost_round=10, xgb_model=model)

if model is None:
    raise ValueError("No training data: load_data_in_chunks yielded no chunks")


In [None]:
# `model` is a native xgboost Booster: it predicts on a DMatrix (not a raw
# matrix) and has no predict_proba. With the 'binary:logistic' objective,
# Booster.predict returns the probability of the positive class directly.
# X_test is the SciPy sparse matrix built with hstack; convert it to CSR for DMatrix.
dtest = xgb.DMatrix(X_test.tocsr())
probabilities = model.predict(dtest)
# Hard class labels: positive when the predicted probability exceeds 0.5
predictions = (probabilities > 0.5).astype(int)

print("Accuracy:", accuracy_score(y_test, predictions))
print("Precision:", precision_score(y_test, predictions))
print("Recall:", recall_score(y_test, predictions))
print("F1 Score:", f1_score(y_test, predictions))

print(classification_report(y_test, predictions, target_names=['Class 0', 'Class 1']))

conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:\n", conf_matrix)

# The ROC curve is computed from the probabilities, not the thresholded labels
fpr, tpr, thresholds = roc_curve(y_test, probabilities)
roc_auc = auc(fpr, tpr)

plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line: performance of a random classifier
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

print("AUC Score:", roc_auc)
