In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_excel("tumor-data-without-healthy.xlsx")

In [3]:
labels = [value for column, value in data.iloc[0].items()]
data = data.set_axis(labels, axis=1)
data = data[data.columns[1:]].apply(
    lambda x: pd.to_numeric(x.replace('<', '').replace(',', '.'), errors='coerce')
)
data.drop(index=0, inplace=True)
data["Гормональная активность 0-нет               1-да"] = data["Гормональная активность 0-нет               1-да"].astype(str)
data["Гормональная активность 0-нет               1-да"] = data["Гормональная активность 0-нет               1-да"].fillna("missing")

In [4]:
""" dropping some columns """

data.dropna(axis=1, how="all", inplace=True)
data.dropna(axis=0, thresh=int(data.shape[1]*0.6), inplace=True)
data.dropna(axis=1, thresh=int(data.shape[0]*0.6), inplace=True)
data.drop(["Пол:    0-жен, 1-муж"], inplace=True, axis=1)

In [5]:
X = data.drop(columns=["КАН", "АКР"])
y = np.argmax(data[["КАН", "АКР"]], axis=1)

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [7]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(loss_function = "Logloss",
                           cat_features = ["Гормональная активность 0-нет               1-да"],
                           verbose=5,
                           thread_count=-1)

In [19]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'iterations': [10, 20, 30],
    'learning_rate': [0.01, 0.1, 0.5],
    'depth': [6, 8, 10],
    'l2_leaf_reg': [1, 3, 5]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)

grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

0:	learn: 0.6163453	total: 203ms	remaining: 3.86s
5:	learn: 0.3845463	total: 555ms	remaining: 1.29s
10:	learn: 0.2562240	total: 915ms	remaining: 748ms
15:	learn: 0.1754480	total: 1.18s	remaining: 296ms
19:	learn: 0.1378558	total: 1.41s	remaining: 0us
{'depth': 10, 'iterations': 20, 'l2_leaf_reg': 3, 'learning_rate': 0.1}


In [8]:
best_model = CatBoostClassifier(iterations=100,
                                depth=10,
                                l2_leaf_reg=3,
                                learning_rate=0.1,
                                loss_function = "Logloss",
                                custom_loss=["AUC", "Accuracy"],
                                cat_features = ["Гормональная активность 0-нет               1-да"],
                                verbose=5,
                                thread_count=-1)

In [9]:
best_model.fit(X_train, y_train, eval_set=(X_test, y_test), plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6163453	test: 0.6327289	best: 0.6327289 (0)	total: 171ms	remaining: 3.24s
5:	learn: 0.3845463	test: 0.5569464	best: 0.5569464 (5)	total: 418ms	remaining: 974ms
10:	learn: 0.2562240	test: 0.4942507	best: 0.4942507 (10)	total: 656ms	remaining: 537ms
15:	learn: 0.1754480	test: 0.4447519	best: 0.4447519 (15)	total: 874ms	remaining: 219ms
19:	learn: 0.1378558	test: 0.4280364	best: 0.4280364 (19)	total: 1.08s	remaining: 0us

bestTest = 0.4280363607
bestIteration = 19



<catboost.core.CatBoostClassifier at 0x2cdbc9c1cc0>