In [1]:
import json
import pickle
import pprint

import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Загрузка данных

Воспользуемся функцией `load_breast_cancer` из библиотеки `sklearn` для загрузки данных. Сохраним эти данные в `.csv` файл для дальнейшей работы с ними в `airflow` пайплайнах.

In [2]:
data = load_breast_cancer(return_X_y=False, as_frame=True)
data["frame"].to_csv(f"results/{data['filename']}")
df = data["frame"]

# EDA

## Основные статистики

In [3]:
df.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946,0.627417
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061,0.483918
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,0.0
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146,0.0
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004,1.0
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208,1.0
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,1.0


## Типы данных и пропуски
убедимся что в датасете отсутствуют пропуски и все типы данных являются числовыми

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

# Предобработка данных
Отделим целевую переменную от набора данных. Произведём стандартизацию а так же разделим все данные на `train` и `test` выборки для обучения и валидации модели.
Сохраним стандартизированные и разбитые данные как артефакты.

In [5]:
X, y = data["data"], data["target"]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
np.save("results/X_scaled.npy", X_scaled)
np.save("results/X_train.npy", X_train)
np.save("results/X_test.npy", X_test)
np.save("results/y_train.npy", y_train)
np.save("results/y_test.npy", y_test)

# Построение модели
Создадим и обучим модель логистической регрессии. Саму модель и метрики полученные на тестовой выборке сохраним в файлы как артефакты работы.

In [6]:
model = LogisticRegression(random_state=42, max_iter=10000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

results = {
    "accuracy": accuracy_score(y_test, y_pred),
    "recall": recall_score(y_test, y_pred),
    "f1_score": f1_score(y_test, y_pred),
    "precision": precision_score(y_test, y_pred),
}
pprint.pprint(results)
# сохраним результаты предсказаний
np.save("results/y_pred.npy", y_pred)
# сохраним результаты метрик полученных на тестовой выборке
with open("results/metrics.json", "w") as f:
    json.dump(results, f)
# сохраним саму модель как pickle файл
with open("results/logistic_regression.pickle", "wb") as f:
    pickle.dump(model, f)

{'accuracy': 0.956140350877193,
 'f1_score': 0.9655172413793104,
 'precision': 0.9459459459459459,
 'recall': 0.9859154929577465}
