In [231]:
import pandas as pd
import numpy as np

In [232]:
data = pd.read_excel("tumor-data-without-healthy.xlsx")

In [233]:
labels = [value for column, value in data.iloc[0].items()]
data = data.set_axis(labels, axis=1)
data = data[data.columns[1:]].apply(
    lambda x: pd.to_numeric(x.replace('<', '').replace(',', '.'), errors='coerce')
)

In [234]:
data.drop(index=0, inplace=True)
data["Гормональная активность 0-нет               1-да"] = data[
    "Гормональная активность 0-нет               1-да"].astype(str)
data["Гормональная активность 0-нет               1-да"] = data[
"Гормональная активность 0-нет               1-да"].fillna("missing")

In [235]:
""" dropping some columns """

data.dropna(axis=1, how="all", inplace=True)
data.dropna(axis=0, thresh=int(data.shape[1] * 0.6), inplace=True)
data.dropna(axis=1, thresh=int(data.shape[0] * 0.6), inplace=True)
data.drop(["Пол:    0-жен, 1-муж"], inplace=True, axis=1)
X = data.drop(columns=["КАН", "АКР"])
y = np.argmax(data[["КАН", "АКР"]], axis=1)

In [236]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=42)

In [386]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    SVC(C=1, gamma="scale", kernel="linear")
)

In [383]:
pipeline.fit(X_train, y_train)

In [387]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
    print(f"Fold {i}:")
    xtrain, xtest = X_train.iloc[train_index], X_train.iloc[test_index]
    ytrain, ytest = y_train[train_index], y_train[test_index]
    pipeline.fit(xtrain, ytrain)

    y_pred = pipeline.predict(xtest)

    print("Accuracy:", accuracy_score(ytest, y_pred))
    print("F1:", f1_score(ytest, y_pred))
    print("Precision:", precision_score(ytest, y_pred))
    print("Recall:", recall_score(ytest, y_pred))
    print()

Fold 0:
Accuracy: 0.7368421052631579
F1: 0.7058823529411765
Precision: 0.6666666666666666
Recall: 0.75

Fold 1:
Accuracy: 0.7777777777777778
F1: 0.7777777777777778
Precision: 0.7
Recall: 0.875

Fold 2:
Accuracy: 0.7777777777777778
F1: 0.75
Precision: 0.75
Recall: 0.75

Fold 3:
Accuracy: 0.7777777777777778
F1: 0.7142857142857143
Precision: 0.8333333333333334
Recall: 0.625

Fold 4:
Accuracy: 0.7777777777777778
F1: 0.7777777777777778
Precision: 0.7
Recall: 0.875



In [388]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.9090909090909091
              precision    recall  f1-score   support

           0       0.80      1.00      0.89         4
           1       1.00      0.86      0.92         7

    accuracy                           0.91        11
   macro avg       0.90      0.93      0.91        11
weighted avg       0.93      0.91      0.91        11



In [389]:
model = pipeline["svc"]

feature_importances = model.coef_.T.reshape(55,)
feature_names = X.columns

importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
10,свободный кортизон мочи (ВЭЖХ),0.004286289
15,dA2_17B,0.004244245
27,dP3_3А,0.002716458
32,THS,0.002389717
44,HHB,0.002388691
5,Кортизол крови вечер,0.002185041
35,alloTHB,0.002029986
2,"Нативная плотность, НU",0.002017247
17,16DHEA-3a,0.001787526
14,Et,0.001605701
