Регрессия для CC50

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [2]:
df = pd.read_excel("cleaned_data.xlsx", index_col=0)
df.head()

Unnamed: 0,"IC50, mM","CC50, mM",SI,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,1.979535,5.173221,3.371597,5.094096,0.387225,0.387225,0.417362,42.928571,384.652,0,...,0,0,0,0,0,0,0,0,3,0
1,0.572014,1.856738,2.079442,3.961417,0.533868,0.533868,0.462473,45.214286,388.684,0,...,0,0,0,0,0,0,0,0,3,0
2,5.41525,5.088474,0.542324,2.627117,0.543231,0.543231,0.260923,42.1875,446.808,0,...,0,0,0,0,0,0,0,0,3,0
3,0.995333,4.690023,3.695524,5.09736,0.390603,0.390603,0.377846,41.862069,398.679,0,...,0,0,0,0,0,0,0,0,4,0
4,4.683348,4.943576,0.832909,5.15051,0.270476,0.270476,0.429038,36.514286,466.713,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Определим целевые и обучаемые данные
X = df.drop(['IC50, mM', 'CC50, mM', 'SI'], axis=1)
y_cc50 = df['CC50, mM']

In [4]:
# Разделим данные на тестовые и тренировочные
X_train, X_test, y_train_cc50, y_test_cc50 = train_test_split(X, y_cc50, test_size=0.3, random_state=42)

# Масштабируем наши данные
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)


In [5]:
X_train.shape, X_test.shape

((700, 139), (301, 139))

In [6]:
lr_cc50 = LinearRegression()
rf_cc50 = RandomForestRegressor(random_state=42)
gb_cc50 = GradientBoostingRegressor(random_state=42)

In [7]:
# Определим сетки перебора
param_grids = {}

param_grids['gb_cc50'] = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

param_grids['rf_cc50'] = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


# Проведем cross-validation, в качестве метрики среднеквадратичная ошибка
best_models = {}
for model_name, model in [('rf_cc50', rf_cc50), ('lr_cc50', lr_cc50), ('gb_cc50', gb_cc50)]:
  grid_search = GridSearchCV(model, param_grids.get(model_name, {}), scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
  grid_search.fit(X_train_scaled, y_train_cc50)
  best_models[model_name] = {
      'model': grid_search.best_estimator_,
      'best_params': grid_search.best_params_,
      'best_score': grid_search.best_score_
  }
  print(f"Best parameters for {model_name}: {grid_search.best_params_}")
  print(f"Best score for {model_name}: {grid_search.best_score_}")

Best parameters for rf_cc50: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best score for rf_cc50: -1.4472490685190373
Best parameters for lr_cc50: {}
Best score for lr_cc50: -4.148788449950536
Best parameters for gb_cc50: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Best score for gb_cc50: -1.4841260722369887


In [8]:
# Оценим обученные модели на тестовых данных
regression_models = {
    'lr_cc50': best_models['lr_cc50']['model'],
    'rf_cc50': best_models['rf_cc50']['model'],
    'gb_cc50': best_models['gb_cc50']['model'],
}

regression_targets = {
    'cc50': y_test_cc50
}

for model_name, model in regression_models.items():
    target_name = model_name.split('_')[1]
    y_test_pred = model.predict(X_test)
    r2 = r2_score(regression_targets[target_name], y_test_pred)
    mse = mean_squared_error(regression_targets[target_name], y_test_pred)
    rmse = np.sqrt(mse)
    print(f"\nModel: {model_name}")
    print(f"R-squared: {r2:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")


Model: lr_cc50
R-squared: -14589412886887407616.0000
MSE: 32381882702777688064.0000
RMSE: 5690508123.4260

Model: rf_cc50
R-squared: -0.9856
MSE: 4.4071
RMSE: 2.0993

Model: gb_cc50
R-squared: -1.9747
MSE: 6.6026
RMSE: 2.5695
