<a href="https://colab.research.google.com/github/BrouthenKamel/HAICK-2023/blob/main/HAICK_2023_PCMB_challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only setup: mount Google Drive so the CSV files under MyDrive are readable.
from google.colab import drive
drive.mount('/gdrive')
# Change the working directory to the mount point (IPython magic, not plain Python).
%cd /gdrive
# Folder prefix that the cells below concatenate with each CSV file name;
# the trailing slash is required because paths are built with `+`.
data_path = '/gdrive/MyDrive/molecules/'

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Training files, one per target. Order matters: each entry is paired
# positionally with the entry at the same index of `test_paths`.
train_paths = [
    'ERBB1.csv',
    'ERBB2.csv',
    'FLT-3.csv',
    'HDACL1.csv',
    'LCK.csv',
]
# Files to predict on, in the same target order as `train_paths`.
test_paths = [
    'test_ERBB1_Target_Descriptors.csv',
    'test_ERBB2_Target_Descriptors.csv',
    'test_FLT-3_Target_Descriptors.csv',
    'test_HDACL1_Target_Descriptors.csv',
    'test_LCK_Target_Descriptors.csv',
]

In [None]:
def train_func(path, train_paths, test_paths, pca_rate, test_size, param_grid=None):
  """Train one grid-searched XGBoost classifier per target and predict its test file.

  For every (training file, test file) pair the function imputes missing
  values, holds out a stratified test split, fits StandardScaler + PCA on the
  training split only, tunes an XGBClassifier with 4-fold cross-validation on
  that same training split, scores the tuned model on the hold-out split and
  finally labels the rows of the test file.

  Args:
    path: Folder prefix concatenated with each file name (must end with a
      path separator because plain string concatenation is used).
    train_paths: Training CSV names. Each file must contain the columns
      'Activity' ('active' / 'nonactive') and 'CHEMBL_ID'; all remaining
      columns are used as numeric features.
    test_paths: CSV names to predict on, paired positionally with
      `train_paths`. Each file must contain 'CHEMBL_ID' and every feature
      column of the matching training file.
    pca_rate: Passed to `PCA(n_components=...)`.
    test_size: Passed to `train_test_split(test_size=...)`.
    param_grid: Optional hyper-parameter grid for `GridSearchCV`. When None,
      the notebook's original grid is used.

  Returns:
    (accuracies, submissions): `accuracies` holds one hold-out accuracy per
    training file, in input order; `submissions` is a dict with the keys
    'Id' and 'Activity' accumulating the predictions of all test files.

  Raises:
    ValueError: If a training file has an 'Activity' value other than
      'active' or 'nonactive' (including missing values).
  """
  label_to_int = {'active': 1, 'nonactive': 0}

  if param_grid is None:
    param_grid = {
        # NOTE(review): 1.02 lies outside XGBoost's documented [0, 1] range for
        # learning_rate and looks like a typo (0.12? 0.02?). Kept unchanged so
        # the default search space is the same as before — please confirm.
        'learning_rate': [0.1, 0.08, 1.02],
        'max_depth': [15, 20],
        'n_estimators': [700, 800],
    }

  accuracies = []
  submissions = {
      'Id' : [] ,
      'Activity' : []
  }

  for train_file, test_file in zip(train_paths, test_paths):
    # Prepare data
    print("\nTreating the file : ", train_file)
    df = pd.read_csv(path + train_file)
    # numeric_only=True: the frame also holds string columns (Activity,
    # CHEMBL_ID), on which a plain mean() raises TypeError in pandas >= 2.
    df = df.fillna(df.mean(numeric_only=True))
    df = df.drop_duplicates()

    # Encode the labels with an explicit assignment; the chained
    # `df[col].replace(..., inplace=True)` form does not reliably write back.
    y = df["Activity"].map(label_to_int)
    if y.isna().any():
      raise ValueError(
          "Unexpected or missing 'Activity' labels in " + train_file
      )
    y = y.astype(int)
    X = df.drop(columns = ['Activity', 'CHEMBL_ID'])

    # Split first so that nothing fitted below ever sees the hold-out rows.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=42
    )

    # Standardize + PCA, fitted on the training split only
    scaler = StandardScaler()
    pca = PCA(n_components = pca_rate)
    X_train = pca.fit_transform(scaler.fit_transform(X_train_raw))
    X_test = pca.transform(scaler.transform(X_test_raw))

    # Hyper-parameter search, restricted to the training split
    grid_search = GridSearchCV(
        XGBClassifier(n_jobs=-1),
        param_grid=param_grid, cv=4, scoring='accuracy', n_jobs=-1
    )
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters and score
    print("Best hyperparameters:", grid_search.best_params_)
    print("Best score:", grid_search.best_score_)

    # GridSearchCV refits the best configuration on everything it was given
    # (refit=True by default), so this model is already trained on the whole
    # training split and needs no second fit.
    xgb = grid_search.best_estimator_

    # Hold-out evaluation
    accuracies.append(accuracy_score(y_test, xgb.predict(X_test)))

    # Prediction on the test file
    df_test = pd.read_csv(path + test_file)
    ids = df_test['CHEMBL_ID']
    # Select exactly the training features, in training order; this also
    # discards CHEMBL_ID and any exported index column such as "Unnamed: 0".
    test_features = df_test[X.columns]
    test_features = test_features.fillna(test_features.mean(numeric_only=True))
    test_pca = pca.transform(scaler.transform(test_features))

    predictions = xgb.predict(test_pca)

    # Construct submission
    submissions["Id"].extend(ids)
    submissions["Activity"].extend(
        np.where(predictions == 1, 'active', 'nonactive').tolist()
    )

  return accuracies, submissions

In [None]:
# Run the full training pipeline once per PCA explained-variance threshold and
# keep each (accuracies, submissions) pair under the threshold's string form.
dataframes = {}
for pca_rate in (0.95, 0.99):
  acc, submission = train_func(data_path, train_paths, test_paths, pca_rate, 0.1)
  dataframes[str(pca_rate)] = acc, submission

In [None]:
# Report the hold-out accuracies of every PCA setting that was actually
# trained. Iterating over the stored results (rather than a separate hard-coded
# list of rates) avoids a KeyError for rates that were never run.
for pca_rate, (accuracies, _) in dataframes.items():
  print(pca_rate, " -> ", accuracies)
  # Average over however many target files were trained, not a fixed 5.
  mean = sum(accuracies) / len(accuracies) if accuracies else float('nan')
  print("mean = ", mean)

In [None]:
# Turn the predictions obtained with pca_rate=0.99 into a table and write it
# out as the submission CSV (without the pandas index column).
submit = pd.DataFrame(data=dataframes["0.99"][1])
submit.to_csv(path_or_buf='/content/pcmb_pause.csv', index=False)

## Deep learning

In [None]:
# Load the first training file and prepare it for the neural network.
data = pd.read_csv(data_path + train_paths[0])
# numeric_only=True: the frame also holds string columns (Activity, CHEMBL_ID),
# on which a plain mean() raises TypeError in pandas >= 2.
data = data.fillna(data.mean(numeric_only=True))
data = data.drop_duplicates()
# Encode the labels with an explicit assignment; the chained
# `data[col].replace(..., inplace=True)` form does not reliably write back.
data["Activity"] = data["Activity"].map({'active': 1, 'nonactive': 0})
if data["Activity"].isna().any():
  raise ValueError("Unexpected or missing 'Activity' labels in " + train_paths[0])
data["Activity"] = data["Activity"].astype(int)
X = data.drop(columns = ['Activity', 'CHEMBL_ID'])
y = data["Activity"]

# NOTE(review): the scaler and PCA below are fitted on all rows before the
# split, so the hold-out rows influence the preprocessing. The variable names
# are kept as they are for the cells that follow; consider splitting first.

# Standardize
scaler = StandardScaler()
X_std = scaler.fit_transform(X)
X_std = pd.DataFrame(X_std, columns=X.columns)

#PCA
pca = PCA(n_components = 0.99)
X_pca = pca.fit_transform(X_std)
X_pca = pd.DataFrame(X_pca)

#Split
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.3, stratify = y , random_state=42)

In [None]:
import tensorflow as tf

In [None]:
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and dropout masks are repeatable between runs.
# (Some GPU kernels may still be non-deterministic.)
tf.keras.utils.set_random_seed(42)

# Define the model architecture: three ReLU hidden layers (64 -> 32 -> 16),
# each followed by 20% dropout, and one sigmoid unit for the binary label.
model = tf.keras.Sequential([
    # Explicit Input object instead of the deprecated `input_shape=` layer kwarg.
    tf.keras.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
# Configure training: binary cross-entropy matches the single sigmoid output,
# Adam is the optimizer, and accuracy is reported alongside the loss.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Fit for 20 epochs, with 20% of the training data set aside for validation.
model.fit(
    x=X_train,
    y=y_train,
    validation_split=0.2,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fab180df250>

In [None]:
# Score the trained network on the held-out split; with the single 'accuracy'
# metric configured at compile time, evaluate() yields (loss, accuracy).
test_loss, test_acc = model.evaluate(x=X_test, y=y_test)

