In [125]:
import pandas as pd
import numpy as np

In [126]:
data = pd.read_excel("tumor-data-without-healthy.xlsx")

In [127]:
labels = [value for column, value in data.iloc[0].items()]
data = data.set_axis(labels, axis=1)
data = data[data.columns[1:]].apply(
    lambda x: pd.to_numeric(x.replace('<', '').replace(',', '.'), errors='coerce')
)
data.drop(index=0, inplace=True)

In [128]:
data["Гормональная активность 0-нет               1-да"] = data[
    "Гормональная активность 0-нет               1-да"].astype(float)
data.rename(columns={"Гормональная активность 0-нет               1-да":"hormones_activity"}, inplace=True)

In [129]:
data.columns = data.columns.astype(str)
data.columns = [col.replace(' ', '_').replace('-', '_').replace(':', '').replace('(', '').replace(')', '').replace(',', '') for col in data.columns]

In [130]:
""" dropping some columns """
data.dropna(axis=1, how="all", inplace=True)
data.dropna(axis=0, thresh=int(data.shape[1] * 0.6), inplace=True)
data.dropna(axis=1, thresh=int(data.shape[0] * 0.6), inplace=True)
data.drop(["Пол____0_жен_1_муж"], inplace=True, axis=1)

In [131]:
X = data.drop(columns=["КАН", "АКР"])
y = np.argmax(data[["КАН", "АКР"]], axis=1)

In [132]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=314)

In [133]:
from lightgbm import LGBMClassifier
import optuna

def objective(trial):
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    model = LGBMClassifier(**param, n_jobs=-1, random_state=314)
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2024-12-22 13:55:58,911] A new study created in memory with name: no-name-186fa0e3-f8ee-4426-8847-79a73868f159
[I 2024-12-22 13:55:58,931] Trial 0 finished with value: 0.36363636363636365 and parameters: {'lambda_l1': 1.6335122896687013, 'lambda_l2': 3.7325920899212554e-07, 'num_leaves': 70, 'feature_fraction': 0.6095973650932056, 'bagging_fraction': 0.703892067480292, 'bagging_freq': 6, 'min_child_samples': 83}. Best is trial 0 with value: 0.36363636363636365.
[I 2024-12-22 13:55:58,945] Trial 1 finished with value: 0.36363636363636365 and parameters: {'lambda_l1': 3.763124193920426e-05, 'lambda_l2': 3.590377470801795, 'num_leaves': 207, 'feature_fraction': 0.8035048878528352, 'bagging_fraction': 0.7978918386205073, 'bagging_freq': 5, 'min_child_samples': 65}. Best is trial 0 with value: 0.36363636363636365.
[I 2024-12-22 13:55:58,959] Trial 2 finished with value: 0.36363636363636365 and parameters: {'lambda_l1': 5.481522531247009e-07, 'lambda_l2': 0.0005646856508541937, 'num_leave

In [134]:
study.best_params, study.best_value

({'lambda_l1': 0.006822516771075539,
  'lambda_l2': 8.959515710646349e-07,
  'num_leaves': 28,
  'feature_fraction': 0.5360804548923979,
  'bagging_fraction': 0.7864415848691042,
  'bagging_freq': 1,
  'min_child_samples': 29},
 1.0)

In [135]:
model = LGBMClassifier(**study.best_params, random_state=314)

In [143]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=63)

cv = cross_validate(model, X, y, n_jobs=-1,
                    scoring=["accuracy",
                             "f1",
                             "precision",
                             "recall"], cv=skf)
for i in range(len(cv) - 1):
    print(
        " accuracy", cv["test_accuracy"][i], "\n",
        "f1", cv["test_f1"][i], "\n",
        "recall", cv["test_recall"][i], "\n",
        "precision", cv["test_precision"][i], "\n"
    )
    print("---------------------------------------------")

 accuracy 0.9523809523809523 
 f1 0.9473684210526315 
 recall 0.9 
 precision 1.0 

---------------------------------------------
 accuracy 0.9 
 f1 0.8888888888888888 
 recall 0.8888888888888888 
 precision 0.8888888888888888 

---------------------------------------------
 accuracy 0.95 
 f1 0.9473684210526315 
 recall 1.0 
 precision 0.9 

---------------------------------------------
 accuracy 0.95 
 f1 0.9411764705882353 
 recall 0.8888888888888888 
 precision 1.0 

---------------------------------------------
 accuracy 0.95 
 f1 0.9411764705882353 
 recall 0.8888888888888888 
 precision 1.0 

---------------------------------------------


In [144]:
model.fit(X_train, y_train)

In [146]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         7

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11

