In [2]:
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, Birch, MiniBatchKMeans, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
import hdbscan
from minisom import MiniSom
import numpy as np

# Create clusters

In [17]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

def is_minmax_scaled(X: np.ndarray, eps: float = 1e-3) -> bool:
    """
    Check whether every value of ``X`` lies in the range ``[0 - eps, 1 + eps]``.

    Parameters
    ----------
    X : array-like of numbers
        Values to inspect; converted with ``np.asarray`` so lists and
        DataFrames are accepted as well as arrays.
    eps : float, default 1e-3
        Tolerance allowed below 0 and above 1.

    Returns
    -------
    bool
        ``True`` when all values are inside the tolerated range (an empty
        input is vacuously ``True``). Any NaN makes the result ``False``
        because comparisons against NaN are false.
    """
    values = np.asarray(X)
    in_range = (values >= (0 - eps)) & (values <= (1 + eps))
    # np.all returns numpy.bool_; convert so the declared return type holds.
    return bool(np.all(in_range))


def _som_labels(X: np.ndarray, **kwargs) -> np.ndarray:
    """Train a MiniSom grid on ``X`` and return one flat grid-cell index per row."""
    # Local import, as in the original code: only needed for method="som".
    from minisom import MiniSom

    som_x = kwargs.get("x", 4)
    som_y = kwargs.get("y", 4)
    som = MiniSom(
        som_x,
        som_y,
        X.shape[1],
        sigma=kwargs.get("sigma", 1.0),
        learning_rate=kwargs.get("learning_rate", 0.5),
        random_seed=kwargs.get("random_seed"),  # None keeps MiniSom's default
    )
    som.random_weights_init(X)
    som.train_random(X, kwargs.get("num_iteration", 100))

    # Look up the winning cell once per row, then flatten (i, j) -> i * som_y + j.
    winners = np.array([som.winner(row) for row in X], dtype=int).reshape(-1, 2)
    return winners[:, 0] * som_y + winners[:, 1]


def add_clusters(df: pd.DataFrame, features: list[str], method: str, cluster_col: str = None, **kwargs) -> pd.DataFrame:
    """
    Return a copy of ``df`` with a column of cluster labels added.

    The feature matrix is taken from ``df[features]``. If any value falls
    outside ``[0, 1]`` (within the tolerance of ``is_minmax_scaled``), all
    features are rescaled with ``MinMaxScaler`` before clustering.

    Parameters
    ----------
    df : pd.DataFrame
        Input data; it is not modified.
    features : list[str]
        Names of the columns used for clustering; must not be empty.
    method : str
        Case-insensitive algorithm name: "kmeans", "dbscan", "agglomerative",
        "gmm", "birch", "hdbscan", "spectral", "minibatch_kmeans" or "som".
    cluster_col : str, optional
        Name of the label column; defaults to ``f"cluster_{method}"``.
    **kwargs
        Forwarded unchanged to the estimator constructor. For "som" only
        ``x``, ``y``, ``sigma``, ``learning_rate``, ``num_iteration``
        (default 100) and ``random_seed`` (default None) are read.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with ``cluster_col`` holding one label per row.

    Raises
    ------
    ValueError
        If ``method`` is not a supported name or ``features`` is empty.
    """
    method = method.lower()
    cluster_col = cluster_col or f"cluster_{method}"

    def _make_hdbscan():
        # Local import, as in the original code: only needed for method="hdbscan".
        import hdbscan
        return hdbscan.HDBSCAN(**kwargs)

    # Factories are called lazily so that only the requested estimator is built.
    model_factories = {
        "kmeans": lambda: KMeans(**kwargs),
        "dbscan": lambda: DBSCAN(**kwargs),
        "agglomerative": lambda: AgglomerativeClustering(**kwargs),
        "gmm": lambda: GaussianMixture(**kwargs),
        "birch": lambda: Birch(**kwargs),
        "hdbscan": _make_hdbscan,
        "spectral": lambda: SpectralClustering(**kwargs),
        "minibatch_kmeans": lambda: MiniBatchKMeans(**kwargs),
    }

    # Validate the request before extracting or scaling any data.
    if method != "som" and method not in model_factories:
        raise ValueError(f"Unknown clustering method: {method}")
    if len(features) == 0:
        raise ValueError("features must contain at least one column name")

    # Extract the feature matrix.
    X = df[features].values

    # Auto-scaling: only applied when the data is not already in [0, 1].
    if not is_minmax_scaled(X):
        X = MinMaxScaler().fit_transform(X)

    # Clustering.
    if method == "som":
        labels = _som_labels(X, **kwargs)
    else:
        labels = model_factories[method]().fit_predict(X)

    # Attach the labels to a copy so the caller's frame stays untouched.
    result = df.copy()
    result[cluster_col] = labels
    return result


# Save clusters

In [None]:
# Parquet file that the cells below load into `df`.
PATH = "/content/data_pre_clster/data/DRC/pre/train_20250625_102539.parquet"
# Directory that the clustered DataFrames are written to.
SAVE_DIR = "/content/data_pre_clster/data_with_cluster/DRC"

In [27]:
import pandas as pd 

# Load the dataset stored at PATH.
df = pd.read_parquet(path=PATH)

# Summary statistics of every numeric column (displayed as the cell output).
df.describe()

Unnamed: 0,target,is_train,amount,duration,int_rate,credit_history_count,previous_loan_count,previous_loan_amount,previous_loan_duration,previous_loan_cum_days,time_difference_from_last_loan,currency_tme,product_type_tme,branch_tme,gender_tme,marital_status_tme,education_level_tme,is_collateral_tme
count,105770.0,105770.0,105770.0,105770.0,105770.0,105770.0,105770.0,105770.0,105770.0,105770.0,105770.0,105770.0,105770.0,105770.0,105770.0,105770.0,105770.0,105770.0
mean,0.0922,0.849995,0.05,0.228466,0.937388,0.137157,0.063035,0.037927,0.206301,0.022706,0.169247,0.000359,0.01087,0.260917,0.404122,0.227335,0.22107,0.38215
std,0.28931,0.357078,0.078926,0.153899,0.05565,0.089528,0.083791,0.049551,0.127403,0.048922,0.112593,0.018951,0.056569,0.133211,0.490724,0.260201,0.110104,0.485915
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,0.013428,0.157895,0.916129,0.071429,0.0,0.011427,0.157895,0.0,0.085359,0.0,0.0,0.17878,0.0,0.145995,0.175163,0.0
50%,0.0,1.0,0.027436,0.157895,0.935484,0.107143,0.058824,0.023434,0.157895,0.00627,0.154713,0.0,0.0,0.245241,0.0,0.145995,0.213946,0.0
75%,0.0,1.0,0.059454,0.263158,0.967742,0.178571,0.117647,0.043445,0.263158,0.025078,0.218731,0.0,0.0,0.323299,1.0,0.145995,0.213946,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.058824,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [25]:
# Show the column labels of the loaded frame (displayed as the cell output).
df.columns

Index(['target', 'is_train', 'amount', 'duration', 'int_rate',
       'credit_history_count', 'previous_loan_count', 'previous_loan_amount',
       'previous_loan_duration', 'previous_loan_cum_days',
       'time_difference_from_last_loan', 'currency_tme', 'product_type_tme',
       'branch_tme', 'gender_tme', 'marital_status_tme', 'education_level_tme',
       'is_collateral_tme'],
      dtype='object')

In [28]:
# Columns used as clustering features: every column of `df` except
# 'target' and 'is_train'.
feature_list = [
    "amount",
    "duration",
    "int_rate",
    "credit_history_count",
    "previous_loan_count",
    "previous_loan_amount",
    "previous_loan_duration",
    "previous_loan_cum_days",
    "time_difference_from_last_loan",
    # columns carrying the `_tme` suffix
    "currency_tme",
    "product_type_tme",
    "branch_tme",
    "gender_tme",
    "marital_status_tme",
    "education_level_tme",
    "is_collateral_tme",
]


In [29]:
# Print dtype and non-null count for each selected feature column.
df[feature_list].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105770 entries, 0 to 105769
Data columns (total 16 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   amount                          105770 non-null  float64
 1   duration                        105770 non-null  float64
 2   int_rate                        105770 non-null  float64
 3   credit_history_count            105770 non-null  float64
 4   previous_loan_count             105770 non-null  float64
 5   previous_loan_amount            105770 non-null  float64
 6   previous_loan_duration          105770 non-null  float64
 7   previous_loan_cum_days          105770 non-null  float64
 8   time_difference_from_last_loan  105770 non-null  float64
 9   currency_tme                    105770 non-null  float64
 10  product_type_tme                105770 non-null  float64
 11  branch_tme                      105770 non-null  float64
 12  gender_tme      

In [None]:
import os

import pandas as pd

# Reload the source data so this cell does not depend on earlier cell state.
df = pd.read_parquet(PATH)
features = feature_list

# DataFrame.to_parquet does not create missing directories, so make sure the
# output directory exists before the first write.
os.makedirs(SAVE_DIR, exist_ok=True)


def _cluster_and_save(method: str, **params) -> pd.DataFrame:
    """Cluster `df` with `method`, write it to SAVE_DIR/df_<method>.parquet and return it."""
    clustered = add_clusters(df, features, method=method, **params)
    clustered.to_parquet(f"{SAVE_DIR}/df_{method}.parquet", index=False)
    return clustered


# NOTE(review): the frame has ~105k rows; agglomerative (ward), spectral and
# DBSCAN may be very slow or memory-hungry at that size -- confirm they finish
# on the target runtime, or fit them on a sample.
df_kmeans = _cluster_and_save("kmeans", n_clusters=4)

df_dbscan = _cluster_and_save("dbscan", eps=0.3, min_samples=5)

df_agglomerative = _cluster_and_save("agglomerative", n_clusters=4, linkage="ward")

df_gmm = _cluster_and_save("gmm", n_components=4, covariance_type="full")

df_birch = _cluster_and_save("birch", n_clusters=4, threshold=0.5)

df_hdbscan = _cluster_and_save("hdbscan", min_cluster_size=10)

df_spectral = _cluster_and_save("spectral", n_clusters=4, affinity="nearest_neighbors")

df_minibatch = _cluster_and_save("minibatch_kmeans", n_clusters=4, batch_size=100)

df_som = _cluster_and_save("som", x=4, y=4)
