In [2]:
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, Birch, MiniBatchKMeans, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
import hdbscan
from minisom import MiniSom
import numpy as np

# Create clusters

In [17]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

def is_minmax_scaled(X: np.ndarray, eps: float = 1e-3) -> bool:
    """
    Check whether every value of ``X`` lies within ``[0 - eps, 1 + eps]``.

    Parameters
    ----------
    X : array-like
        Values to test; converted with ``np.asarray`` so lists work too.
    eps : float
        Tolerance allowed on both ends of the ``[0, 1]`` interval.

    Returns
    -------
    bool
        A plain Python ``bool`` (not ``np.bool_``). ``True`` for an empty
        input. NaN compares false against both bounds, so an input that
        contains NaN is reported as not scaled.
    """
    values = np.asarray(X)
    above_lower = np.all(values >= -eps)
    below_upper = np.all(values <= 1 + eps)
    # Cast so the result matches the annotated return type.
    return bool(above_lower and below_upper)


# Methods backed by an estimator that exposes ``fit_predict``; "som" is handled separately.
_FIT_PREDICT_METHODS = (
    "kmeans",
    "dbscan",
    "agglomerative",
    "gmm",
    "birch",
    "hdbscan",
    "spectral",
    "minibatch_kmeans",
)


def _fit_predict_labels(method, X, params):
    """Instantiate the estimator registered for ``method`` with ``params`` and return ``fit_predict(X)``."""
    if method == "hdbscan":
        import hdbscan  # imported only when this backend is requested

        estimator_cls = hdbscan.HDBSCAN
    else:
        estimator_cls = {
            "kmeans": KMeans,
            "dbscan": DBSCAN,
            "agglomerative": AgglomerativeClustering,
            "gmm": GaussianMixture,
            "birch": Birch,
            "spectral": SpectralClustering,
            "minibatch_kmeans": MiniBatchKMeans,
        }[method]
    return estimator_cls(**params).fit_predict(X)


def _som_labels(X, params):
    """Train a MiniSom on ``X`` and return one flat grid-cell label per row of ``X``."""
    from minisom import MiniSom  # imported only when this backend is requested

    som_x = params.get("x", 4)
    som_y = params.get("y", 4)
    som = MiniSom(
        som_x,
        som_y,
        X.shape[1],
        sigma=params.get("sigma", 1.0),
        learning_rate=params.get("learning_rate", 0.5),
        random_seed=params.get("random_seed"),
    )
    som.random_weights_init(X)
    som.train_random(X, params.get("num_iteration", 100))

    labels = []
    for sample in X:
        # Look up the winning node once per sample and flatten (row, col) row-major.
        row, col = som.winner(sample)
        labels.append(row * som_y + col)
    return labels


def add_clusters(df: pd.DataFrame, features: list[str], method: str, cluster_col: str = None, **kwargs) -> pd.DataFrame:
    """
    Return a copy of ``df`` with a column of cluster labels added.

    The feature matrix is taken from ``df[features]``. If its values are not
    already inside ``[0, 1]`` (as judged by ``is_minmax_scaled``), it is
    rescaled with ``MinMaxScaler`` before clustering. The input frame itself
    is never modified.

    Parameters
    ----------
    df : pd.DataFrame
        Source data.
    features : list[str]
        Columns of ``df`` used as clustering features.
    method : str
        Case-insensitive method name: ``"kmeans"``, ``"dbscan"``,
        ``"agglomerative"``, ``"gmm"``, ``"birch"``, ``"hdbscan"``,
        ``"spectral"``, ``"minibatch_kmeans"`` or ``"som"``.
    cluster_col : str, optional
        Name of the label column; defaults to ``f"cluster_{method}"``.
    **kwargs
        For every method except ``"som"``: forwarded unchanged to the
        estimator constructor. For ``"som"``: the keys ``x``, ``y`` (grid
        size, default 4x4), ``sigma`` (1.0), ``learning_rate`` (0.5),
        ``num_iteration`` (100) and ``random_seed`` (None) are read; any
        other keys are ignored.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with ``cluster_col`` holding one label per row.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    KeyError
        If any entry of ``features`` is not a column of ``df``.
    """
    method = method.lower()
    # Validate up front so a typo does not cost a full scaling pass first.
    if method != "som" and method not in _FIT_PREDICT_METHODS:
        raise ValueError(f"Unknown clustering method: {method}")

    missing = [col for col in features if col not in df.columns]
    if missing:
        raise KeyError(f"Features not found in DataFrame: {missing}")

    cluster_col = cluster_col or f"cluster_{method}"

    # Extract the feature matrix.
    X = df[features].values

    # Auto-scaling: only applied when the values fall outside [0, 1].
    if not is_minmax_scaled(X):
        X = MinMaxScaler().fit_transform(X)

    # Clustering.
    if method == "som":
        labels = _som_labels(X, kwargs)
    else:
        labels = _fit_predict_labels(method, X, kwargs)

    # Attach the labels to a copy so the caller's frame stays untouched.
    result = df.copy()
    result[cluster_col] = labels
    return result


# Save clusters

In [1]:
# Parquet file that is loaded into `df`.
PATH = "data/ksa/pre/train_20250624_144503.parquet"
# Directory the clustered frames are written to.
SAVE_DIR = "data_with_cluster/ksa"

In [2]:
import pandas as pd

# Read the parquet file at PATH, then show summary statistics as the cell output.
df = pd.read_parquet(PATH)

df.describe()

Unnamed: 0,loantenure,principalamount,lengthofservicewithcurrentemployer,basicsalary,grosssalary,netincome,noofclosedloans,disclosedexpenses,flate_rate,housingallowance,...,simahscore,food_inflation_lag1,consumer_price_index_cpi,interest_rate_mean_lag1_w3,dayspastdue,target_90,target,is_train,gender_tme,maritalstatus_tme
count,82793.0,82793.0,82793.0,82793.0,82793.0,82793.0,82793.0,82793.0,82793.0,82793.0,...,82793.0,82793.0,82793.0,82793.0,82793.0,82793.0,82793.0,82793.0,82793.0,82793.0
mean,0.199793,0.177556,0.071779,0.016418,0.017861,0.018183,0.018571,0.002727,0.265508,0.019975,...,0.514624,0.224511,0.8097,0.552024,0.052032,0.208109,0.208109,0.150061,0.204563,0.204694
std,0.123228,0.129193,0.118611,0.014613,0.016403,0.016884,0.048138,0.004627,0.12811,0.030993,...,0.142507,0.154921,0.135447,0.374815,0.10672,0.405958,0.405958,0.357133,0.046397,0.054662
min,0.0,0.0,0.0,0.0,-0.000526,-0.000576,0.0,0.0,0.0,0.0,...,0.0,0.0,0.45098,0.0,0.0,0.0,0.0,0.0,0.180306,0.124286
25%,0.084746,0.083041,0.006012,0.009621,0.008359,0.008191,0.0,2e-06,0.154158,0.0,...,0.41527,0.099656,0.735294,0.15,0.0,0.0,0.0,0.0,0.180306,0.124286
50%,0.186441,0.13009,0.022044,0.011201,0.012484,0.013129,0.0,0.00116,0.263692,0.016461,...,0.538175,0.230928,0.843954,0.6,0.0,0.0,0.0,0.0,0.180306,0.241853
75%,0.288136,0.198397,0.076152,0.018303,0.021016,0.021441,0.0,0.00381,0.395538,0.025,...,0.625698,0.308591,0.929739,0.916667,0.027848,0.0,0.0,0.0,0.180306,0.241853
max,1.0,1.442072,1.0,1.0,1.0,1.0,1.0,0.487555,1.0,1.0,...,1.024209,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.293306,0.241853


In [3]:
# Display the column labels of the loaded frame.
df.columns

Index(['gender', 'loantenure', 'principalamount',
       'lengthofservicewithcurrentemployer', 'basicsalary', 'grosssalary',
       'netincome', 'maritalstatus', 'noofclosedloans', 'disclosedexpenses',
       'flate_rate', 'housingallowance', 'otherallowances', 'simahscore',
       'food_inflation_lag1', 'consumer_price_index_cpi',
       'interest_rate_mean_lag1_w3', 'dayspastdue', 'target_90', 'target',
       'is_train', 'gender_tme', 'maritalstatus_tme'],
      dtype='object')

In [4]:
# Columns used as clustering features, in the order they form the feature matrix.
feature_list = [
    "principalamount", "lengthofservicewithcurrentemployer", "otherallowances",
    "basicsalary", "grosssalary", "netincome",
    "disclosedexpenses", "housingallowance",
    "food_inflation_lag1", "interest_rate_mean_lag1_w3",
    "loantenure", "consumer_price_index_cpi",
    "gender_tme", "maritalstatus_tme",
]


In [5]:
# Print dtypes and non-null counts for the selected feature columns.
df[feature_list].info()

<class 'pandas.core.frame.DataFrame'>
Index: 82793 entries, 0 to 84645
Data columns (total 14 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   principalamount                     82793 non-null  float64
 1   lengthofservicewithcurrentemployer  82793 non-null  float64
 2   otherallowances                     82793 non-null  float64
 3   basicsalary                         82793 non-null  float64
 4   grosssalary                         82793 non-null  float64
 5   netincome                           82793 non-null  float64
 6   disclosedexpenses                   82793 non-null  float64
 7   housingallowance                    82793 non-null  float64
 8   food_inflation_lag1                 82793 non-null  float64
 9   interest_rate_mean_lag1_w3          82793 non-null  float64
 10  loantenure                          82793 non-null  float64
 11  consumer_price_index_cpi            82793 non-

In [None]:
import os

import pandas as pd

# Fixed seed passed to the estimators that accept `random_state`,
# so re-running the cell reproduces the same labels.
SEED = 42


def cluster_and_save(frame, features, method, **params):
    """
    Label ``frame`` with ``add_clusters`` and save the result.

    The labelled frame is written to ``{SAVE_DIR}/df_{method}.parquet``
    without its index and is also returned. ``params`` are forwarded to
    ``add_clusters`` unchanged.
    """
    clustered = add_clusters(frame, features, method=method, **params)
    clustered.to_parquet(f"{SAVE_DIR}/df_{method}.parquet", index=False)
    return clustered


df = pd.read_parquet(PATH)
features = feature_list

# Make sure the output directory exists before the first write.
os.makedirs(SAVE_DIR, exist_ok=True)

df_kmeans = cluster_and_save(df, features, "kmeans", n_clusters=4, random_state=SEED)

df_dbscan = cluster_and_save(df, features, "dbscan", eps=0.3, min_samples=5)

df_agglomerative = cluster_and_save(df, features, "agglomerative", n_clusters=4, linkage="ward")

df_gmm = cluster_and_save(df, features, "gmm", n_components=4, covariance_type="full", random_state=SEED)

df_birch = cluster_and_save(df, features, "birch", n_clusters=4, threshold=0.5)

df_hdbscan = cluster_and_save(df, features, "hdbscan", min_cluster_size=10)

df_spectral = cluster_and_save(df, features, "spectral", n_clusters=4, affinity="nearest_neighbors", random_state=SEED)

df_minibatch = cluster_and_save(df, features, "minibatch_kmeans", n_clusters=4, batch_size=100, random_state=SEED)

df_som = cluster_and_save(df, features, "som", x=4, y=4)
