In [None]:
import pandas as pd
import numpy as np

In [None]:
data = pd.read_excel("tumor-data-without-healthy.xlsx")

In [None]:
labels = [value for column, value in data.iloc[0].items()]
data = data.set_axis(labels, axis=1)
data = data[data.columns[1:]].apply(
    lambda x: pd.to_numeric(x.replace('<', '').replace(',', '.'), errors='coerce')
)
data.drop(index=0, inplace=True)
data["Гормональная активность 0-нет               1-да"] = data["Гормональная активность 0-нет               1-да"].astype(str)
data["Гормональная активность 0-нет               1-да"] = data["Гормональная активность 0-нет               1-да"].fillna("missing")

In [None]:
""" dropping some columns """

data.dropna(axis=1, how="all", inplace=True)
data.dropna(axis=0, thresh=int(data.shape[1]*0.6), inplace=True)
data.dropna(axis=1, thresh=int(data.shape[0]*0.6), inplace=True)
data.drop(["Пол:    0-жен, 1-муж"], inplace=True, axis=1)

In [None]:
X = data.drop(columns=["КАН", "АКР"])
y = np.argmax(data[["КАН", "АКР"]], axis=1)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=42)

In [None]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(loss_function = "Logloss",
                           cat_features = ["Гормональная активность 0-нет               1-да"],
                           verbose=5,
                           thread_count=-1)

In [None]:
"""from sklearn.model_selection import GridSearchCV

param_grid = {
    'iterations': [20, 40, 60],
    'learning_rate': [0.01, 0.1, 0.5],
    'depth': [6, 8, 10],
    'l2_leaf_reg': [1, 3, 5]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)

grid_search.fit(X_train, y_train)
print(grid_search.best_params_)"""

In [275]:
best_model = CatBoostClassifier(iterations=20,
                                depth=10,
                                l2_leaf_reg=3,
                                learning_rate=0.1,
                                loss_function = "Logloss",
                                custom_loss=["F1", "Accuracy", "AUC", "Recall"],
                                verbose=5,
                                thread_count=-1)

In [None]:
start_idx = 0
end_idx = int(len(X)*0.8)
step = 2
while end_idx <= X.shape[0]:
    current_training_data = (X[start_idx:end_idx], y[start_idx:end_idx])
    if start_idx == 0:
        current_validating_data = (X[end_idx:], y[end_idx:])
    else:
        current_validating_data = (np.concatenate((X[end_idx:], X[:start_idx])),  np.concatenate((y[end_idx:], y[:start_idx])))
    if start_idx == 0 :
        best_model.fit(current_training_data[0], current_training_data[1], eval_set=current_validating_data, plot=True)
    else:
        best_model.fit(current_training_data[0], current_training_data[1], eval_set=current_validating_data, plot=True,
                       init_model=best_model)
    start_idx += step
    end_idx += step

In [276]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
    print(f"Fold {i}:")
    xtrain, xtest = X_train.iloc[train_index], X_train.iloc[test_index]
    ytrain, ytest = y_train[train_index], y_train[test_index]
    if i == 0:
        best_model.fit(xtrain, ytrain, eval_set=(xtest, ytest), plot=True)
    else:
        best_model.fit(xtrain, ytrain, eval_set=(xtest, ytest), init_model=best_model, plot=True)

Fold 0:


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6255571	test: 0.6733228	best: 0.6733228 (0)	total: 13.1ms	remaining: 248ms
5:	learn: 0.3605728	test: 0.4844073	best: 0.4844073 (5)	total: 58ms	remaining: 135ms
10:	learn: 0.2535011	test: 0.4325968	best: 0.4325968 (10)	total: 120ms	remaining: 98ms
15:	learn: 0.1759936	test: 0.3869786	best: 0.3869786 (15)	total: 175ms	remaining: 43.7ms
19:	learn: 0.1299474	test: 0.3233390	best: 0.3233390 (19)	total: 205ms	remaining: 0us

bestTest = 0.3233390364
bestIteration = 19

Fold 1:


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.1699543	test: 0.1268378	best: 0.1268378 (0)	total: 12.1ms	remaining: 231ms
5:	learn: 0.1298347	test: 0.1161036	best: 0.1161036 (5)	total: 64.9ms	remaining: 151ms
10:	learn: 0.1038303	test: 0.1113384	best: 0.1113384 (10)	total: 114ms	remaining: 93.3ms
15:	learn: 0.0853426	test: 0.1052391	best: 0.1052391 (15)	total: 160ms	remaining: 39.9ms
19:	learn: 0.0747605	test: 0.1036375	best: 0.1036375 (19)	total: 195ms	remaining: 0us

bestTest = 0.1036374785
bestIteration = 19

Fold 2:


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.0821973	test: 0.0617478	best: 0.0617478 (0)	total: 10.3ms	remaining: 195ms
5:	learn: 0.0703678	test: 0.0571484	best: 0.0571484 (5)	total: 52.7ms	remaining: 123ms
10:	learn: 0.0599429	test: 0.0537117	best: 0.0537117 (10)	total: 92.1ms	remaining: 75.4ms
15:	learn: 0.0511397	test: 0.0501905	best: 0.0501905 (15)	total: 138ms	remaining: 34.6ms
19:	learn: 0.0465602	test: 0.0491148	best: 0.0491148 (19)	total: 176ms	remaining: 0us

bestTest = 0.04911478707
bestIteration = 19

Fold 3:


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.0480806	test: 0.0391918	best: 0.0391918 (0)	total: 10.4ms	remaining: 198ms
5:	learn: 0.0432845	test: 0.0385988	best: 0.0385988 (5)	total: 60.8ms	remaining: 142ms
10:	learn: 0.0391574	test: 0.0383335	best: 0.0383335 (10)	total: 113ms	remaining: 92.5ms
15:	learn: 0.0357416	test: 0.0381091	best: 0.0381091 (15)	total: 162ms	remaining: 40.5ms
19:	learn: 0.0332635	test: 0.0377223	best: 0.0375750 (17)	total: 202ms	remaining: 0us

bestTest = 0.03757501655
bestIteration = 17

Shrink model to first 18 iterations.
Fold 4:


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.0361078	test: 0.0277129	best: 0.0277129 (0)	total: 10ms	remaining: 190ms
5:	learn: 0.0330275	test: 0.0270366	best: 0.0270366 (5)	total: 59.1ms	remaining: 138ms
10:	learn: 0.0294106	test: 0.0247587	best: 0.0247587 (10)	total: 85.1ms	remaining: 69.6ms
15:	learn: 0.0272333	test: 0.0239442	best: 0.0239442 (15)	total: 131ms	remaining: 32.8ms
19:	learn: 0.0255199	test: 0.0234402	best: 0.0234402 (19)	total: 165ms	remaining: 0us

bestTest = 0.02344023861
bestIteration = 19



In [277]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = best_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.9090909090909091
              precision    recall  f1-score   support

           0       0.80      1.00      0.89         4
           1       1.00      0.86      0.92         7

    accuracy                           0.91        11
   macro avg       0.90      0.93      0.91        11
weighted avg       0.93      0.91      0.91        11



In [274]:
feature_importances = best_model.get_feature_importance()
feature_names = X.columns

importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
3,Weiss,26.672042
1,"Максимальный размер, мм",14.815787
28,16dP2_3A,7.656574
12,Индекс свободный кортизон/свободный кортизол в...,4.182032
48,THF_THE1,2.502675
49,THF_THE,2.409132
37,alloTHF,2.328603
14,Et,2.259443
4,Кортизол крови утро,2.228118
51,aTHB_THB,2.189375
