In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_excel("tumor-data-without-healthy.xlsx")

In [3]:
labels = [value for column, value in data.iloc[0].items()]
data = data.set_axis(labels, axis=1)
data = data[data.columns[1:]].apply(
    lambda x: pd.to_numeric(x.replace('<', '').replace(',', '.'), errors='coerce')
)

In [4]:
data.drop(index=0, inplace=True)
data["Гормональная активность 0-нет               1-да"] = data[
    "Гормональная активность 0-нет               1-да"].astype(str)
data["Гормональная активность 0-нет               1-да"] = data[
"Гормональная активность 0-нет               1-да"].fillna("missing")

In [5]:
""" dropping some columns """

data.dropna(axis=1, how="all", inplace=True)
# data.dropna(axis=0, thresh=int(data.shape[1] * 0.6), inplace=True)
data.dropna(axis=1, thresh=int(data.shape[0] * 0.5), inplace=True)
data.drop(["Пол:    0-жен, 1-муж"], inplace=True, axis=1)

In [6]:
X = data.drop(columns=["КАН", "АКР"])
y = np.argmax(data[["КАН", "АКР"]], axis=1)

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=42)

In [8]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

import optuna

def objective(trial):
    param = {
        'C': trial.suggest_float('C', 1e-1, 10.0, log=False),
        'gamma': trial.suggest_categorical('gamma', ['scale', 'auto']),
        'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
        'coef0': trial.suggest_float('coef0', 1e-1, 10.0, log=True)
    }

    pipeline = make_pipeline(
        SimpleImputer(strategy='median'),
        SVC(**param, random_state=42)
    )
    pipeline.fit(X_train, y_train)
    score = pipeline.score(X_test, y_test)
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2024-12-24 16:30:35,940] A new study created in memory with name: no-name-fa23d57d-96f3-4a6e-b2bf-d1bd5322b83a
[I 2024-12-24 16:30:36,056] Trial 0 finished with value: 0.8235294117647058 and parameters: {'C': 0.808584706694697, 'gamma': 'scale', 'kernel': 'linear', 'coef0': 0.11148085968286899}. Best is trial 0 with value: 0.8235294117647058.
[I 2024-12-24 16:30:36,072] Trial 1 finished with value: 0.6470588235294118 and parameters: {'C': 1.884505885697037, 'gamma': 'auto', 'kernel': 'poly', 'coef0': 0.33725793320476377}. Best is trial 0 with value: 0.8235294117647058.
[I 2024-12-24 16:30:36,169] Trial 2 finished with value: 0.8235294117647058 and parameters: {'C': 4.86904047244377, 'gamma': 'auto', 'kernel': 'linear', 'coef0': 2.297161625776418}. Best is trial 0 with value: 0.8235294117647058.
[I 2024-12-24 16:30:36,185] Trial 3 finished with value: 0.6470588235294118 and parameters: {'C': 3.8438468650463675, 'gamma': 'auto', 'kernel': 'poly', 'coef0': 1.3505088962178322}. Best is 

In [9]:
pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    SVC(**study.best_params)
)

In [18]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=314)

cv = cross_validate(pipeline, X, y, n_jobs=-1,
                    scoring=["accuracy",
                             "f1",
                             "precision",
                             "recall"],
                    cv=skf,
                    return_estimator=True)

In [19]:
cv["test_accuracy"].mean()

0.8724264705882353

In [20]:
for i in range(len(cv)-1):
    print(
        " accuracy", cv["test_accuracy"][i], "\n",
        "f1", cv["test_f1"][i], "\n",
        "recall", cv["test_recall"][i], "\n",
        "precision", cv["test_precision"][i], "\n"
    )
    print("---------------------------------------------")

 accuracy 0.9411764705882353 
 f1 0.9333333333333333 
 recall 0.875 
 precision 1.0 

---------------------------------------------
 accuracy 0.7647058823529411 
 f1 0.7142857142857143 
 recall 0.625 
 precision 0.8333333333333334 

---------------------------------------------
 accuracy 0.9411764705882353 
 f1 0.9333333333333333 
 recall 0.875 
 precision 1.0 

---------------------------------------------
 accuracy 0.7647058823529411 
 f1 0.75 
 recall 0.75 
 precision 0.75 

---------------------------------------------
 accuracy 0.8125 
 f1 0.7692307692307693 
 recall 0.625 
 precision 1.0 

---------------------------------------------
 accuracy 1.0 
 f1 1.0 
 recall 1.0 
 precision 1.0 

---------------------------------------------
