In [35]:
import pandas as pd
import numpy as np

In [36]:
data = pd.read_excel("tumor-data-without-healthy.xlsx")

In [37]:
labels = [value for column, value in data.iloc[0].items()]
data = data.set_axis(labels, axis=1)
data = data[data.columns[1:]].apply(
    lambda x: pd.to_numeric(x.replace('<', '').replace(',', '.'), errors='coerce')
)
data.drop(index=0, inplace=True)

In [38]:
data["Гормональная активность 0-нет               1-да"] = data[
    "Гормональная активность 0-нет               1-да"].astype(str)
data["Гормональная активность 0-нет               1-да"] = data[
"Гормональная активность 0-нет               1-да"].fillna("missing")

In [39]:
""" dropping some columns """

data.dropna(axis=1, how="all", inplace=True)
data.dropna(axis=0, thresh=int(data.shape[1] * 0.6), inplace=True)
data.dropna(axis=1, thresh=int(data.shape[0] * 0.6), inplace=True)
data.drop(["Пол:    0-жен, 1-муж"], inplace=True, axis=1)

In [40]:
X = data.drop(columns=["КАН", "АКР"])
y = np.argmax(data[["КАН", "АКР"]], axis=1)

In [41]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=42)

In [78]:
import optuna

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

def objective(trial):
    param = {
        "C":trial.suggest_float('C', 1e-5, 1e5, log=True),
        "max_iter":trial.suggest_int('max_iter', 300, 2000),
        "solver":trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'saga']),
        "penalty":trial.suggest_categorical('penalty', ['l2']),
        'fit_intercept' : trial.suggest_categorical('fit_intercept' , [True, False]),
    }

    model = make_pipeline(
        SimpleImputer(strategy="median"),
        LogisticRegression(**param, random_state=42)
    )

    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2024-12-22 14:20:44,743] A new study created in memory with name: no-name-7420c1ea-9643-4159-9fa5-b43de17ac9dc
[I 2024-12-22 14:20:44,777] Trial 0 finished with value: 0.9090909090909091 and parameters: {'C': 86.30376378357053, 'max_iter': 710, 'solver': 'saga', 'penalty': 'l2', 'fit_intercept': False}. Best is trial 0 with value: 0.9090909090909091.
[I 2024-12-22 14:20:44,842] Trial 1 finished with value: 0.9090909090909091 and parameters: {'C': 0.056136943443333434, 'max_iter': 1515, 'solver': 'lbfgs', 'penalty': 'l2', 'fit_intercept': True}. Best is trial 0 with value: 0.9090909090909091.
[I 2024-12-22 14:20:44,853] Trial 2 finished with value: 0.9090909090909091 and parameters: {'C': 388.5951768835664, 'max_iter': 1001, 'solver': 'liblinear', 'penalty': 'l2', 'fit_intercept': True}. Best is trial 0 with value: 0.9090909090909091.
[I 2024-12-22 14:20:44,862] Trial 3 finished with value: 0.9090909090909091 and parameters: {'C': 0.00014776767342647128, 'max_iter': 348, 'solver': 'l

In [79]:
model = make_pipeline(
    SimpleImputer(strategy="median"),
    LogisticRegression(**study.best_params)
)

In [80]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
    print(f"Fold {i}:")
    xtrain, xtest = X_train.iloc[train_index], X_train.iloc[test_index]
    ytrain, ytest = y_train[train_index], y_train[test_index]
    model.fit(xtrain, ytrain)

    y_pred = model.predict(xtest)

    print("Accuracy:", accuracy_score(ytest, y_pred))
    print("F1:", f1_score(ytest, y_pred))
    print("Precision:", precision_score(ytest, y_pred))
    print("Recall:", recall_score(ytest, y_pred))
    print()

Fold 0:
Accuracy: 0.7894736842105263
F1: 0.75
Precision: 0.75
Recall: 0.75

Fold 1:
Accuracy: 0.7777777777777778
F1: 0.75
Precision: 0.75
Recall: 0.75

Fold 2:
Accuracy: 0.8333333333333334
F1: 0.8235294117647058
Precision: 0.7777777777777778
Recall: 0.875

Fold 3:
Accuracy: 0.7777777777777778
F1: 0.6666666666666666
Precision: 1.0
Recall: 0.5

Fold 4:
Accuracy: 0.8888888888888888
F1: 0.875
Precision: 0.875
Recall: 0.875



In [81]:
model.fit(X_train, y_train)

In [82]:
from sklearn.model_selection import cross_validate

cv = cross_validate(model, X, y, n_jobs=-1,
                    scoring=["accuracy",
                             "f1",
                             "precision",
                             "recall"])

In [83]:
for i in range(len(cv)-1):
    print(
        " accuracy", cv["test_accuracy"][i], "\n",
        "f1", cv["test_f1"][i], "\n",
        "recall", cv["test_recall"][i], "\n",
        "precision", cv["test_precision"][i], "\n"
    )
    print("---------------------------------------------")

 accuracy 0.9523809523809523 
 f1 0.9473684210526315 
 recall 0.9 
 precision 1.0 

---------------------------------------------
 accuracy 0.8095238095238095 
 f1 0.75 
 recall 0.6 
 precision 1.0 

---------------------------------------------
 accuracy 0.7 
 f1 0.7 
 recall 0.7777777777777778 
 precision 0.6363636363636364 

---------------------------------------------
 accuracy 0.8 
 f1 0.75 
 recall 0.6666666666666666 
 precision 0.8571428571428571 

---------------------------------------------
 accuracy 0.7 
 f1 0.7 
 recall 0.7777777777777778 
 precision 0.6363636363636364 

---------------------------------------------


In [84]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         7

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11



In [85]:
lr = model["logisticregression"]

feature_importances = lr.coef_.T.reshape(55,)
feature_names = X.columns

importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
15,dA2_17B,0.410032
32,THS,0.323919
28,16dP2_3A,0.277667
17,16DHEA-3a,0.23971
10,свободный кортизон мочи (ВЭЖХ),0.23486
44,HHB,0.206659
35,alloTHB,0.187273
14,Et,0.167964
27,dP3_3А,0.146995
40,bCN,0.126973
