In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import optuna
from sklearn.cluster import KMeans, DBSCAN, OPTICS
from sklearn.metrics import adjusted_rand_score
from imblearn.over_sampling import SMOTE
import warnings

warnings.filterwarnings("ignore")

*Воспользуемся обработкой файла из прошлого домашнего задания и применим оверсемплинг, так как есть сильна перебалансировка классов*

In [None]:
df = pd.read_csv("/content/drive/MyDrive/healthcare-dataset-stroke-data.csv", usecols = lambda x: x != "id")

In [None]:
numerical = ["age", "avg_glucose_level", "bmi"]
categorical = [column for column in df.columns if (column not in numerical)]

df["bmi"] = df["bmi"].fillna(df["bmi"].median())

encoder = OneHotEncoder(drop = "first")
encoded_features = encoder.fit_transform(df[categorical[:-1]])
encoded_df = pd.DataFrame(encoded_features.toarray(), columns=encoder.get_feature_names_out(categorical[:-1]))
df = pd.concat([df.drop(columns = categorical[:-1]), encoded_df], axis = 1)

X = df.drop("stroke", axis = 1)
y = df["stroke"]

smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

*С помощью optuna подберём гиперпараметры для максимизации метрики ARI и воспользуемся тремя методами класторизации: KMeans, DBSCAN, OPTICS*

In [None]:
def optimize_kmeans(trial):
    n_clusters = trial.suggest_int("n_clusters", 2, 10)
    kmeans = KMeans(n_clusters = n_clusters, random_state = 42)
    kmeans.fit(X_train)
    labels = kmeans.predict(X_test)
    return adjusted_rand_score(y_test, labels)

def optimize_dbscan(trial):
    eps = trial.suggest_float("eps", 0.1, 1.0, step = 0.1)
    min_samples = trial.suggest_int("min_samples", 3, 20)
    dbscan = DBSCAN(eps = eps, min_samples = min_samples)
    labels = dbscan.fit_predict(X_test)
    if len(set(labels)) == 1:
        return -1.0
    return adjusted_rand_score(y_test, labels)

def optimize_optics(trial):
    min_samples = trial.suggest_int("min_samples", 3, 20)
    max_eps = trial.suggest_float("max_eps", 0.1, 2.0, step = 0.1)
    optics = OPTICS(min_samples = min_samples, max_eps = max_eps)
    labels = optics.fit_predict(X_test)
    if len(set(labels)) == 1:
        return -1.0
    return adjusted_rand_score(y_test, labels)

In [None]:
study_kmeans = optuna.create_study(direction="maximize")
study_kmeans.optimize(optimize_kmeans, n_trials=20)

study_dbscan = optuna.create_study(direction="maximize")
study_dbscan.optimize(optimize_dbscan, n_trials=20)

study_optics = optuna.create_study(direction="maximize")
study_optics.optimize(optimize_optics, n_trials=20)

[I 2024-12-21 22:16:55,651] A new study created in memory with name: no-name-15910e78-7dfc-453a-a6ea-dc258e87cd04
[I 2024-12-21 22:16:55,689] Trial 0 finished with value: 0.038318757485446976 and parameters: {'n_clusters': 8}. Best is trial 0 with value: 0.038318757485446976.
[I 2024-12-21 22:16:55,725] Trial 1 finished with value: 0.047731975654157745 and parameters: {'n_clusters': 6}. Best is trial 1 with value: 0.047731975654157745.
[I 2024-12-21 22:16:55,768] Trial 2 finished with value: 0.03857334138281314 and parameters: {'n_clusters': 9}. Best is trial 1 with value: 0.047731975654157745.
[I 2024-12-21 22:16:55,806] Trial 3 finished with value: 0.0223805734074346 and parameters: {'n_clusters': 3}. Best is trial 1 with value: 0.047731975654157745.
[I 2024-12-21 22:16:55,871] Trial 4 finished with value: 0.047731975654157745 and parameters: {'n_clusters': 6}. Best is trial 1 with value: 0.047731975654157745.
[I 2024-12-21 22:16:55,930] Trial 5 finished with value: 0.038573341382813

In [None]:
study_kmeans.best_params, study_dbscan.best_params, study_optics.best_params

({'n_clusters': 4},
 {'eps': 1.0, 'min_samples': 11},
 {'min_samples': 18, 'max_eps': 1.3000000000000003})

In [None]:
kmeans_best = KMeans(**study_kmeans.best_params, random_state = 42)
kmeans_best.fit(X_train)
kmeans_best_labels = kmeans_best.predict(X_test)
adjusted_rand_score(y_test, kmeans_best_labels)

0.05708070812955797

In [None]:
dbscan_best = DBSCAN(**study_dbscan.best_params)
dbscan_best.fit(X_train)
dbscan_labels = dbscan_best.fit_predict(X_test)
adjusted_rand_score(y_test, dbscan_labels)

0.05005276600170577

In [None]:
optics_best = OPTICS(**study_optics.best_params)
optics_best.fit(X_train)
optics_labels = optics_best.fit_predict(X_test)
adjusted_rand_score(y_test, optics_labels)

0.0541290835708016

*Даже с использованием оверсемплинг кластеризация происходит очень плохо*