# Model training

This cell installs the RDKit library, which is used for cheminformatics (processing chemical information). The RDKit library is necessary for working with molecular data.

In [None]:
!pip install rdkit



In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler, PolynomialFeatures, StandardScaler, Normalizer, MinMaxScaler, QuantileTransformer, OneHotEncoder
from sklearn.metrics import average_precision_score, roc_curve, accuracy_score, roc_auc_score, classification_report, precision_recall_curve, auc, f1_score, precision_score, recall_score, fbeta_score, confusion_matrix
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from sklearn.pipeline import make_pipeline

This function converts an RDKit molecule object (obtained by parsing a SMILES string with `Chem.MolFromSmiles`) into a Morgan (circular) fingerprint and returns it as a list; it returns `None` when the molecule is `None`. This transformation is often used to convert molecular structures into numerical representations suitable for machine learning models.

In [None]:
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator

def smiles_to_fingerprint(mol, radius=2, bits=256):
    """Return the Morgan (circular) fingerprint of a molecule as a list.

    Despite the name, the argument is an already parsed molecule object
    (the notebook passes the result of ``Chem.MolFromSmiles``), not a SMILES
    string.

    Args:
        mol: Molecule to fingerprint, or ``None`` when parsing failed upstream.
        radius: Radius handed to ``GetMorganGenerator``.
        bits: Fingerprint size handed to ``GetMorganGenerator`` as ``fpSize``.

    Returns:
        A ``list`` with one entry per fingerprint position, or ``None`` when
        ``mol`` is ``None``.
    """
    # Check the input first: there is no point in building a generator for a
    # molecule that could not be parsed.
    if mol is None:
        return None
    generator = GetMorganGenerator(radius=radius, fpSize=bits)
    return list(generator.GetFingerprint(mol))

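This function fits a scaler + model pipeline for each of several scalers (RobustScaler, StandardScaler, Normalizer, MinMaxScaler, QuantileTransformer) on the training data, using whichever model is passed in, and measures the accuracy of each pipeline on the test data. It returns a one-entry dictionary that maps the name of the selected scaler to its accuracy, so that runs with different models can be compared.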

In [None]:
def test_scaler(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.DataFrame, y_test: pd.DataFrame, model)-> dict:
    """Find the feature scaler that gives ``model`` the highest test accuracy.

    Each candidate scaler is combined with ``model`` in a pipeline, the
    pipeline is fitted on the training data and scored with
    ``accuracy_score`` on the test data.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        model: Estimator placed after the scaler in the pipeline. The same
            instance is refitted for every scaler, so after the call it holds
            the fit produced with the last scaler in the list.

    Returns:
        A one-entry dict ``{scaler_class_name: accuracy}`` for the scaler with
        the highest accuracy.
    """
    scaler_list = [RobustScaler(), StandardScaler(), Normalizer(), MinMaxScaler(), QuantileTransformer(output_distribution='uniform')]
    accuracy_by_scaler = {}
    for scaler in scaler_list:
        pipeline = make_pipeline(scaler, model)
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        accuracy_by_scaler[scaler.__class__.__name__] = accuracy_score(y_test, y_pred)
    # Accuracy is a higher-is-better score, so the best scaler is the one with
    # the maximum value (the previous min() picked the worst one).
    best_scaler = max(accuracy_by_scaler, key=accuracy_by_scaler.get)
    return {best_scaler: accuracy_by_scaler[best_scaler]}

In [None]:
# Location of the prepared training table
train_path = '/content/prepared_train.csv'
# Read the CSV file into a pandas DataFrame
train = pd.read_csv(train_path)

# Balance the classes: draw 30000 rows for each `binds` label (0 first, then 1)
# with a fixed seed, stack them and renumber the index from zero
train = pd.concat(
    [
        train.loc[train['binds'] == label].sample(n=30000, random_state=42)
        for label in (0, 1)
    ]
).reset_index(drop=True)


In [None]:
# Work on a copy so the sampled `train` frame is left as loaded
df = train.copy()

# Parse each SMILES column into molecule objects stored in a companion column
for smiles_column, mol_column in (
    ('molecule_smiles', 'mol'),
    ('buildingblock1_smiles', 'bb1_mol'),
    ('buildingblock2_smiles', 'bb2_mol'),
    ('buildingblock3_smiles', 'bb3_mol'),
):
    df[mol_column] = df[smiles_column].apply(Chem.MolFromSmiles)

In [None]:
# Fingerprint of the full molecule
df['ecfp'] = df['mol'].apply(smiles_to_fingerprint)

# Molecular weight of the full molecule and of each of its building blocks
for weight_column, mol_column in (
    ('molecular_weight', 'mol'),
    ('bb1_w', 'bb1_mol'),
    ('bb2_w', 'bb2_mol'),
    ('bb3_w', 'bb3_mol'),
):
    df[weight_column] = df[mol_column].apply(Descriptors.MolWt)


In [None]:
# Replace the categorical `protein_name` column with one indicator column per
# protein (named protein_name_<value>)
df = pd.get_dummies(
    data=df,
    columns=['protein_name'],
)

In [None]:
# Convert the one-hot indicator columns to floating-point numbers
for protein_column in ('protein_name_BRD4', 'protein_name_HSA', 'protein_name_sEH'):
    df[protein_column] = df[protein_column].astype(float)

In [None]:
# Merge all feature columns into one flat list per row: the fingerprint first,
# followed by the weights and the protein indicators
df['combined'] = df.apply(
    lambda row: row['ecfp'] + [
        row[column]
        for column in (
            'molecular_weight',
            'bb1_w',
            'bb2_w',
            'bb3_w',
            'protein_name_BRD4',
            'protein_name_HSA',
            'protein_name_sEH',
        )
    ],
    axis=1,
)


This cell separates features (X) and the target variable (y), and splits the data into training and testing sets.

In [None]:
# Feature matrix as a list of per-row feature lists
X = df.loc[:, 'combined'].to_list()

In [None]:
# Target labels as a plain list
y = df.loc[:, 'binds'].to_list()

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train the model: a single-step pipeline wrapping a 1000-tree random forest
model = make_pipeline(
    RandomForestClassifier(criterion='entropy', n_estimators=1000, random_state=42)
).fit(X_train, y_train)

# Evaluate the model on the held-out test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8891666666666667


In [None]:
# Predicted class probabilities for the test set; column 1 holds the
# probability of the positive class
class_probabilities = model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

# Average precision of the positive-class scores against the true labels
map_score = average_precision_score(y_true=y_test, y_score=y_pred_proba)
print(f"Mean Average Precision (mAP): {map_score:.2f}")

Mean Average Precision (mAP): 0.96


/\

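Average precision (AP) — reported above under the label "Mean Average Precision (mAP)" — is a more nuanced metric often used in information retrieval and classification tasks, especially when dealing with imbalanced classes. It summarizes the precision-recall curve as a weighted mean of the precision values reached at each decision threshold, where each weight is the increase in recall from the previous threshold. Because the target here is binary, there is a single positive class, so the score is the AP of that class rather than a mean over several classes.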

Precision measures how many of the items retrieved by your model are relevant.
Recall measures how many of the relevant items are retrieved by your model.
mAP essentially combines these two metrics to give you a single value that represents the average precision across all recall levels for your model.

A mAP of 0.96 means that on average, your model's precision is very high across different levels of recall. This is a strong indication that your model is not only good at retrieving relevant items but also maintains high precision when doing so.

Key Points:
Accuracy (0.8892): Indicates that your model's overall correctness is about 88.92%. This means it gets about 88.92% of its predictions right.
mAP (0.96): Shows that your model's precision is consistently high across different levels of recall, with an average precision score of 0.96. This suggests your model is very good at retrieving relevant items and has a high level of reliability in its predictions.
These metrics suggest that your model is performing well, both in general accuracy and in maintaining high precision across various levels of recall.

In [None]:
# Averaging modes, in the order the results are unpacked below:
# None -> one score per class, then micro, macro and weighted averages
averaging_modes = (None, 'micro', 'macro', 'weighted')

# Precision under every averaging mode
precision_per_class, micro_precision, macro_precision, weighted_precision = (
    precision_score(y_test, y_pred, average=mode) for mode in averaging_modes
)

# Recall under every averaging mode
recall_per_class, micro_recall, macro_recall, weighted_recall = (
    recall_score(y_test, y_pred, average=mode) for mode in averaging_modes
)

# F1-score under every averaging mode
f1_score_per_class, micro_f1_score, macro_f1_score, weighted_f1_score = (
    f1_score(y_test, y_pred, average=mode) for mode in averaging_modes
)

# Print the results; a leading "\n" in a label starts a new group of metrics
report = [
    ("Precision per class:", precision_per_class),
    ("Recall per class:", recall_per_class),
    ("F1-score per class:", f1_score_per_class),
    ("\nMicro-average Precision:", micro_precision),
    ("Micro-average Recall:", micro_recall),
    ("Micro-average F1-score:", micro_f1_score),
    ("\nMacro-average Precision:", macro_precision),
    ("Macro-average Recall:", macro_recall),
    ("Macro-average F1-score:", macro_f1_score),
    ("\nWeighted-average Precision:", weighted_precision),
    ("Weighted-average Recall:", weighted_recall),
    ("Weighted-average F1-score:", weighted_f1_score),
]
for label, value in report:
    print(label, value)

Precision per class: [0.87016977 0.90937231]
Recall per class: [0.91081401 0.86816615]
F1-score per class: [0.89002811 0.88829162]

Micro-average Precision: 0.8891666666666667
Micro-average Recall: 0.8891666666666667
Micro-average F1-score: 0.8891666666666667

Macro-average Precision: 0.8897710392727531
Macro-average Recall: 0.8894900796485992
Macro-average F1-score: 0.8891598653928514

Weighted-average Precision: 0.8900683252573686
Weighted-average Recall: 0.8891666666666667
Weighted-average F1-score: 0.8891466969690816


The metrics printed above are broken down in detail below:

### Precision, Recall, and F1-score per Class
- **Precision per class:**
  - Class 0: 0.87016977
  - Class 1: 0.90937231
  - Precision for a class is the proportion of the samples predicted as that class that truly belong to it. For class 0, 87.02% of the samples predicted as class 0 are actually class 0. For class 1, 90.94% of the samples predicted as class 1 are actually class 1.

- **Recall per class:**
  - Class 0: 0.91081401
  - Class 1: 0.86816615
  - Recall for a class is the proportion of the samples that truly belong to that class which the model assigns to it. For class 0, 91.08% of the actual class 0 samples are correctly identified. For class 1, 86.82% of the actual class 1 samples are correctly identified.

- **F1-score per class:**
  - Class 0: 0.89002811
  - Class 1: 0.88829162
  - The F1-score is the harmonic mean of precision and recall, providing a single metric that balances both. For class 0, the F1-score is 0.8900, and for class 1, it is 0.8883.

### Micro-Average Metrics
- **Micro-average Precision:** 0.8891666666666667
- **Micro-average Recall:** 0.8891666666666667
- **Micro-average F1-score:** 0.8891666666666667

Micro-averaging calculates the metrics globally by counting the total true positives, false negatives, and false positives. It is useful when you want to know the overall performance of your model without focusing on individual classes. Here, all three metrics are 0.8892: in a single-label classification problem like this one, micro-averaged precision, recall, and F1-score are all equal to the accuracy, so they repeat the accuracy reported earlier.

### Macro-Average Metrics
- **Macro-average Precision:** 0.8897710392727531
- **Macro-average Recall:** 0.8894900796485992
- **Macro-average F1-score:** 0.8891598653928514

Macro-averaging calculates the metrics for each class independently and then takes the average, giving equal weight to each class regardless of its frequency. This is useful for evaluating the performance of your model on each class individually and then averaging it out. The macro-average precision, recall, and F1-score are all around 0.8892, indicating that the model performs fairly consistently across different classes.

### Weighted-Average Metrics
- **Weighted-average Precision:** 0.8900683252573686
- **Weighted-average Recall:** 0.8891666666666667
- **Weighted-average F1-score:** 0.8891466969690816

Weighted-averaging calculates the metrics for each class and then takes the average, weighted by the number of instances of each class. This means that classes with more instances will have a bigger impact on the overall metric. The weighted-average precision, recall, and F1-score are very close to the micro-average scores, suggesting that the class distribution does not heavily skew the overall performance metrics.

### Summary
- **Per Class Metrics:** Shows precision, recall, and F1-score for each class individually.
- **Micro-Average:** Evaluates overall performance considering all instances equally.
- **Macro-Average:** Evaluates performance by averaging metrics for each class, treating each class equally.
- **Weighted-Average:** Evaluates performance by averaging metrics for each class, considering class frequency.

These metrics provide a comprehensive view of your model's performance, indicating that it performs consistently well across different classes and on the dataset as a whole.

# Testing different scalers and methods

This loop tests different regularization penalties for LogisticRegression using the previously defined test_scaler function, comparing the performance of the L1 and L2 penalties.



In [None]:
penalty_list = ['l1', 'l2'] #, 'elasticnet'
for penalty in penalty_list:
    # 'elasticnet' gets the 'saga' solver and a mixing ratio;
    # every other penalty uses 'liblinear' with no ratio
    uses_elasticnet = penalty == 'elasticnet'
    solver = 'saga' if uses_elasticnet else 'liblinear'
    l1_ratio = 0.5 if uses_elasticnet else None  # or any other value between 0 and 1
    classifier = LogisticRegression(
        penalty=penalty,
        max_iter=1500,
        solver=solver,
        l1_ratio=l1_ratio,
        random_state=42,
    )
    result = test_scaler(X_train, X_test, y_train, y_test, classifier)
    print(f"{penalty}: {result}")


l1: {'Normalizer': 0.695}
l2: {'Normalizer': 0.5554166666666667}


This loop tests different splitting criteria for the DecisionTreeClassifier using the previously defined test_scaler function, comparing the performance of the gini, entropy, and log_loss criteria.



In [None]:
# Run the scaler comparison for a decision tree under each splitting criterion
criterion_list = ['gini', 'entropy', 'log_loss']
for criterion in criterion_list:
    tree = DecisionTreeClassifier(criterion=criterion, random_state=42)
    result = test_scaler(X_train, X_test, y_train, y_test, tree)
    print(f"{criterion}: {result}")

gini: {'Normalizer': 0.8046666666666666}
entropy: {'Normalizer': 0.8156666666666667}
log_loss: {'Normalizer': 0.8156666666666667}


This loop tests different splitting criteria for the RandomForestClassifier using the previously defined test_scaler function, comparing the performance of the gini, entropy, and log_loss criteria.

In [None]:
# Run the scaler comparison for a 100-tree random forest under each splitting criterion
criterion_list = ['gini', 'entropy', 'log_loss']
for criterion in criterion_list:
    forest = RandomForestClassifier(n_estimators=100, criterion=criterion, random_state=42)
    result = test_scaler(X_train, X_test, y_train, y_test, forest)
    print(f"{criterion}: {result}")

gini: {'Normalizer': 0.8698333333333333}
entropy: {'Normalizer': 0.8706666666666667}
log_loss: {'Normalizer': 0.8706666666666667}


This loop tests different split-quality criteria for the GradientBoostingClassifier using the test_scaler function, comparing the performance of the friedman_mse and squared_error criteria.

In [None]:
# Run the scaler comparison for gradient boosting under each split-quality criterion
criterion_list = ['friedman_mse', 'squared_error']
for criterion in criterion_list:
    booster = GradientBoostingClassifier(criterion=criterion, random_state=42)
    result = test_scaler(X_train, X_test, y_train, y_test, booster)
    print(f"{criterion}: {result}")

friedman_mse: {'Normalizer': 0.8300833333333333}
squared_error: {'Normalizer': 0.8300833333333333}


This loop tests different loss functions for the GradientBoostingClassifier using the test_scaler function, comparing the performance with different loss functions (log_loss and exponential).

In [None]:
# Run the scaler comparison for gradient boosting under each loss function
loss_list = ['log_loss', 'exponential']
for loss in loss_list:
    booster = GradientBoostingClassifier(loss=loss, random_state=42)
    result = test_scaler(X_train, X_test, y_train, y_test, booster)
    print(f"{loss}: {result}")

log_loss: {'Normalizer': 0.8300833333333333}
exponential: {'Normalizer': 0.82725}
