Классификация: превышает ли значение SI медианное значение выборки

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
df = pd.read_excel("cleaned_data.xlsx", index_col=0)
df.head()

Unnamed: 0,"IC50, mM","CC50, mM",SI,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,1.979535,5.173221,3.371597,5.094096,0.387225,0.387225,0.417362,42.928571,384.652,0,...,0,0,0,0,0,0,0,0,3,0
1,0.572014,1.856738,2.079442,3.961417,0.533868,0.533868,0.462473,45.214286,388.684,0,...,0,0,0,0,0,0,0,0,3,0
2,5.41525,5.088474,0.542324,2.627117,0.543231,0.543231,0.260923,42.1875,446.808,0,...,0,0,0,0,0,0,0,0,3,0
3,0.995333,4.690023,3.695524,5.09736,0.390603,0.390603,0.377846,41.862069,398.679,0,...,0,0,0,0,0,0,0,0,4,0
4,4.683348,4.943576,0.832909,5.15051,0.270476,0.270476,0.429038,36.514286,466.713,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Определим целевые и обучаемые данные
X = df.drop(df.columns[2:], axis=1)
y_si = df['SI']

In [4]:
# Подсчитаем медианы для целевой переменной
median_si = y_si.median()

In [5]:
# Создаем классы
y_class_si_median = (y_si > median_si).astype(int)

In [6]:
# Разделим данные на тестовые и тренировочные
X_train, X_test, y_train_class_si_med, y_test_class_si_med = train_test_split(X, y_class_si_median, test_size=0.3, random_state=42, stratify=y_class_si_median)

# Масштабируем наши данные
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [7]:
lr_class_si_med = LogisticRegression(random_state=42)

rf_class_si_med = RandomForestClassifier(random_state=42)

gb_class_si_med = GradientBoostingClassifier(random_state=42)

svc_class_si_med = SVC(random_state=42)

In [8]:
# Определим сетки перебора
param_grids = {}

param_grids['gb_class_si_med'] = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

param_grids['rf_class_si_med'] = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

param_grids['svc_class_si_med'] = {'C': [0.1, 1, 10, 100], 
			'gamma': [1, 0.1, 0.01, 0.001], 
			'kernel': ['rbf']} 

# Проведем cross-validation, в качестве метрики accuracy
best_models = {}
for model_name, model, target in [
    ('lr_class_si_med', lr_class_si_med, y_train_class_si_med),
    ('rf_class_si_med', rf_class_si_med, y_train_class_si_med),
    ('gb_class_si_med', gb_class_si_med, y_train_class_si_med),
    ('svc_class_si_med', svc_class_si_med, y_train_class_si_med)]:
    grid_search = GridSearchCV(model, param_grids.get(model_name, {}), scoring='accuracy', cv=5)
    grid_search.fit(X_train_scaled, target)
    best_models[model_name] = {
        'model': grid_search.best_estimator_,
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_
    }
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best score for {model_name}: {grid_search.best_score_}")

Best parameters for lr_class_si_med: {}
Best score for lr_class_si_med: 0.9342857142857144
Best parameters for rf_class_si_med: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best score for rf_class_si_med: 0.9685714285714286
Best parameters for gb_class_si_med: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50}
Best score for gb_class_si_med: 0.9642857142857142
Best parameters for svc_class_si_med: {'C': 100, 'gamma': 1, 'kernel': 'rbf'}
Best score for svc_class_si_med: 0.9885714285714287


In [9]:
# Оценим обученные модели на тестовых данных
classification_models = {
    'lr_class_si_med': best_models['lr_class_si_med']['model'],
    'rf_class_si_med': best_models['rf_class_si_med']['model'],
    'gb_class_si_med': best_models['gb_class_si_med']['model'],
    'svc_class_si_med': best_models['svc_class_si_med']['model']
}

classification_targets = {
    'si': y_test_class_si_med
}

for model_name, model in classification_models.items():
    target_name = '_'.join(model_name.split('_')[2:]) if 'median' in model_name or '8' in model_name else model_name.split('_')[2]
    y_test_pred = model.predict(X_test_scaled)
    y_test_pred_proba = model.predict_proba(X_test_scaled)[:, 1] if hasattr(model, 'predict_proba') else None

    accuracy = accuracy_score(classification_targets[target_name], y_test_pred)
    precision = precision_score(classification_targets[target_name], y_test_pred)
    recall = recall_score(classification_targets[target_name], y_test_pred)
    f1 = f1_score(classification_targets[target_name], y_test_pred)
    roc_auc = roc_auc_score(classification_targets[target_name], y_test_pred_proba) if y_test_pred_proba is not None else 'N/A'

    print(f"\nModel: {model_name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc}")


Model: lr_class_si_med
Accuracy: 0.9336
Precision: 0.9710
Recall: 0.8933
F1-score: 0.9306
ROC AUC: 0.9901545253863135

Model: rf_class_si_med
Accuracy: 0.9734
Precision: 0.9863
Recall: 0.9600
F1-score: 0.9730
ROC AUC: 0.9972406181015452

Model: gb_class_si_med
Accuracy: 0.9668
Precision: 0.9730
Recall: 0.9600
F1-score: 0.9664
ROC AUC: 0.9909492273730685

Model: svc_class_si_med
Accuracy: 0.9934
Precision: 1.0000
Recall: 0.9867
F1-score: 0.9933
ROC AUC: N/A
