In [1]:
import math
from pathlib import Path

import cv2
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms

from src import datasource

sns.set_theme()
%matplotlib inline

In [2]:
MAIN_DATA_DIR = "../data"
data_dir = f"{MAIN_DATA_DIR}/all_data"
labels_file = Path(data_dir, "labels.csv")

In [3]:
df_data = datasource.get_data_frame()
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4976 entries, 0 to 4975
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   class     4976 non-null   object
 1   filename  4976 non-null   object
dtypes: object(2)
memory usage: 77.9+ KB


In [4]:
train, val, test = datasource.get_train_validate_test(df_data=df_data)

In [5]:
features = ["asymmetry", "border_irregularity", "color_variation", "diameter", "abcd_score"]
train_df = datasource.build_tabular_data(train)
val_df = datasource.build_tabular_data(val)
test_df = datasource.build_tabular_data(test)

X_train = train_df[features]
y_train = train_df["label"]
X_val = val_df[features]
y_val = val_df["label"]
X_test = test_df[features]
y_test = test_df["label"]

In [6]:
import xgboost as xgb
from sklearn.metrics import f1_score

# Преобразуем данные в DMatrix, который является основным форматом данных для XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

# Если у вас трёхклассовая задача, можно использовать:
params = {
    "objective": "multi:softmax",
    "eval_metric": "mlogloss",
    "eta": 0.1,
    "max_depth": 6,
    "seed": 42,
    "num_class": 3,
}

evals = [(dtrain, "train"), (dval, "eval")]

# Обучаем модель
num_rounds = 1000
model_xgb = xgb.train(params, dtrain, num_boost_round=num_rounds, evals=evals, early_stopping_rounds=50)

# Предсказываем на тестовом наборе
y_pred = model_xgb.predict(dtest)
f1 = f1_score(y_test, y_pred, average="weighted")
print("F1-score:", f1)

[0]	train-mlogloss:0.99264	eval-mlogloss:0.99414
[1]	train-mlogloss:0.90364	eval-mlogloss:0.90672
[2]	train-mlogloss:0.82813	eval-mlogloss:0.83339
[3]	train-mlogloss:0.76235	eval-mlogloss:0.77058
[4]	train-mlogloss:0.70565	eval-mlogloss:0.71641
[5]	train-mlogloss:0.65593	eval-mlogloss:0.66889
[6]	train-mlogloss:0.61206	eval-mlogloss:0.62757
[7]	train-mlogloss:0.57349	eval-mlogloss:0.59169
[8]	train-mlogloss:0.53869	eval-mlogloss:0.55944
[9]	train-mlogloss:0.50745	eval-mlogloss:0.53099
[10]	train-mlogloss:0.47991	eval-mlogloss:0.50630
[11]	train-mlogloss:0.45591	eval-mlogloss:0.48387
[12]	train-mlogloss:0.43435	eval-mlogloss:0.46327
[13]	train-mlogloss:0.41512	eval-mlogloss:0.44528
[14]	train-mlogloss:0.39804	eval-mlogloss:0.43016
[15]	train-mlogloss:0.38230	eval-mlogloss:0.41588
[16]	train-mlogloss:0.36819	eval-mlogloss:0.40310
[17]	train-mlogloss:0.35541	eval-mlogloss:0.39189
[18]	train-mlogloss:0.34360	eval-mlogloss:0.38212
[19]	train-mlogloss:0.33313	eval-mlogloss:0.37331
[20]	train