In [3]:
import pandas as pd
import numpy as np

In [4]:
data = pd.read_excel("tumor-data-without-healthy.xlsx")

In [5]:
labels = [value for column, value in data.iloc[0].items()]
data = data.set_axis(labels, axis=1)
data = data[data.columns[1:]].apply(
    lambda x: pd.to_numeric(x.replace('<', '').replace(',', '.'), errors='coerce')
)
data.drop(index=0, inplace=True)
data["Гормональная активность 0-нет               1-да"] = data["Гормональная активность 0-нет               1-да"].astype(str)
data["Гормональная активность 0-нет               1-да"] = data["Гормональная активность 0-нет               1-да"].fillna("missing")

In [6]:
""" dropping some columns """

data.dropna(axis=1, how="all", inplace=True)
data.dropna(axis=0, thresh=int(data.shape[1]*0.6), inplace=True)
data.dropna(axis=1, thresh=int(data.shape[0]*0.6), inplace=True)
data.drop(["Пол:    0-жен, 1-муж"], inplace=True, axis=1)

In [7]:
X = data.drop(columns=["КАН", "АКР"])
y = np.argmax(data[["КАН", "АКР"]], axis=1)

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=314, shuffle=True, stratify=True)

In [9]:
from catboost import CatBoostClassifier
import optuna

def objective(trial):
    param = {
    'verbose':0,
    'thread_count':-1,
    'cat_features':["Гормональная активность 0-нет               1-да"],
    'loss_function': "Logloss",
    'iterations': trial.suggest_int('iterations', 10, 100),
    'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
    'depth': trial.suggest_int('depth', 3, 10),
    'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
    'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
    'max_ctr_complexity': trial.suggest_int('max_ctr_complexity', 0, 8)
    }

    model = CatBoostClassifier(**param, task_type='GPU', random_state=314)
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2024-12-22 14:02:40,756] A new study created in memory with name: no-name-2ca072f9-f0a0-44e5-8c6f-58d1fbf2e5b8
[I 2024-12-22 14:02:42,145] Trial 0 finished with value: 1.0 and parameters: {'iterations': 67, 'learning_rate': 0.010160552336636488, 'depth': 4, 'l2_leaf_reg': 2, 'boosting_type': 'Plain'}. Best is trial 0 with value: 1.0.
[I 2024-12-22 14:02:48,536] Trial 1 finished with value: 0.9090909090909091 and parameters: {'iterations': 90, 'learning_rate': 0.004453115456053704, 'depth': 10, 'l2_leaf_reg': 9, 'boosting_type': 'Ordered'}. Best is trial 0 with value: 1.0.
[I 2024-12-22 14:02:52,552] Trial 2 finished with value: 1.0 and parameters: {'iterations': 91, 'learning_rate': 0.004384164385033835, 'depth': 9, 'l2_leaf_reg': 3, 'boosting_type': 'Ordered'}. Best is trial 0 with value: 1.0.
[I 2024-12-22 14:02:56,378] Trial 3 finished with value: 1.0 and parameters: {'iterations': 98, 'learning_rate': 0.01595076638051762, 'depth': 8, 'l2_leaf_reg': 1, 'boosting_type': 'Ordered'}

In [18]:
model = CatBoostClassifier(**study.best_params, random_state=314)

In [19]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(shuffle=True, n_splits=5, random_state=314)

cv = cross_validate(model, X, y, n_jobs=-1,
                    scoring=["accuracy",
                             "f1",
                             "precision",
                             "recall"],
                    cv=skf)

In [22]:
for i in range(len(cv)-1):
    print(
        " accuracy", cv["test_accuracy"][i], "\n",
        "f1", cv["test_f1"][i], "\n",
        "recall", cv["test_recall"][i], "\n",
        "precision", cv["test_precision"][i], "\n"
    )
    print("---------------------------------------------")

 accuracy 0.9047619047619048 
 f1 0.9 
 recall 0.9 
 precision 0.9 

---------------------------------------------
 accuracy 0.9047619047619048 
 f1 0.8888888888888888 
 recall 0.8 
 precision 1.0 

---------------------------------------------
 accuracy 1.0 
 f1 1.0 
 recall 1.0 
 precision 1.0 

---------------------------------------------
 accuracy 0.95 
 f1 0.9473684210526315 
 recall 1.0 
 precision 0.9 

---------------------------------------------
 accuracy 0.9 
 f1 0.8888888888888888 
 recall 0.8888888888888888 
 precision 0.8888888888888888 

---------------------------------------------


In [23]:
model.fit(X_train, y_train)

0:	learn: 0.6794216	total: 1.13ms	remaining: 74.5ms
1:	learn: 0.6660967	total: 2.42ms	remaining: 78.6ms
2:	learn: 0.6578233	total: 3.28ms	remaining: 70ms
3:	learn: 0.6435387	total: 4.12ms	remaining: 64.8ms
4:	learn: 0.6331872	total: 4.95ms	remaining: 61.3ms
5:	learn: 0.6227694	total: 5.87ms	remaining: 59.7ms
6:	learn: 0.6133301	total: 6.75ms	remaining: 57.9ms
7:	learn: 0.6022798	total: 7.67ms	remaining: 56.6ms
8:	learn: 0.5929776	total: 8.78ms	remaining: 56.6ms
9:	learn: 0.5817819	total: 9.86ms	remaining: 56.2ms
10:	learn: 0.5702668	total: 10.7ms	remaining: 54.5ms
11:	learn: 0.5608023	total: 11.6ms	remaining: 53.2ms
12:	learn: 0.5502164	total: 12.5ms	remaining: 51.9ms
13:	learn: 0.5400840	total: 13.4ms	remaining: 50.5ms
14:	learn: 0.5323064	total: 14.3ms	remaining: 49.5ms
15:	learn: 0.5241583	total: 15.3ms	remaining: 48.8ms
16:	learn: 0.5165194	total: 16.2ms	remaining: 47.6ms
17:	learn: 0.5077651	total: 17ms	remaining: 46.3ms
18:	learn: 0.4983267	total: 17.8ms	remaining: 45.1ms
19:	lea

<catboost.core.CatBoostClassifier at 0x2e27b485c60>

In [24]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         7

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11



In [25]:
feature_importances = model.get_feature_importance()
feature_names = X.columns

importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
3,Weiss,50.021715
1,"Максимальный размер, мм",7.50131
14,Et,5.657164
27,dP3_3А,3.547257
50,aTHF_THF,2.364783
24,P2,2.341994
26,dP2,2.046051
32,THS,1.898927
18,16DHEA-3b,1.384081
9,свободный кортизол мочи (ВЭЖХ),1.194525
