# [Clusterization tesk](https://www.kaggle.com/competitions/clustering-physical-activity-data/)

In [54]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import RobustScaler
from tqdm import tqdm

In [55]:
df = pd.read_parquet("data.parquet")

In [56]:
# Кастомный трансформер для масштабирования внутри групп пользователей
class GroupRobustScaler(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.scalers = {}

    def fit(self, X, y=None):
        self.scalers = {}
        groups = X['subject_id']
        features = X.drop(columns=['subject_id', 'timestamp'])
        
        for group in tqdm(groups.unique(), desc='Scaling groups'):
            group_features = features[groups == group]
            scaler = RobustScaler().fit(group_features)
            self.scalers[group] = scaler
        return self

    def transform(self, X):
        groups = X['subject_id']
        features = X.drop(columns=['subject_id', 'timestamp'])
        transformed = features.copy()
        
        for group in tqdm(groups.unique(), desc='Transforming groups'):
            mask = (groups == group)
            scaler = self.scalers.get(group)
            if scaler:
                transformed.loc[mask] = scaler.transform(features[mask])
        return transformed

In [57]:
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Создание и выполнение пайплайна
def process_data(df):
    pipeline = Pipeline([
        ('group_scaler', GroupRobustScaler()),
        ('pca', PCA(n_components=0.95)),
        ('cluster', KMeans(n_clusters=5, verbose=1))
    ])

    with tqdm(total=3, desc='Processing pipeline') as pbar:
        pipeline.fit(df)
        pbar.update(1)
        
        processed_data = pipeline['group_scaler'].transform(df)
        pbar.update(1)
        
        pca_data = pipeline['pca'].transform(processed_data)
        pbar.update(1)
    
    return pipeline, pca_data

In [58]:
# Обработка данных
pipeline, pca_data = process_data(df)

# Получение меток и экспорт
labels = pipeline.named_steps['cluster'].labels_
result_df = pd.DataFrame({
    'Index': df.index,
    'activityID': labels
})

Scaling groups: 100%|██████████| 8/8 [00:00<00:00, 13.63it/s]
Transforming groups: 100%|██████████| 8/8 [00:00<00:00, 42.41it/s]
Processing pipeline:   0%|          | 0/3 [00:00<?, ?it/s]


ValueError: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
def normalize_cluster_labels(cluster_series, column_name):
    cluster_series[column_name] = pd.Series(pd.factorize(cluster_series[column_name])[0] + 1)
    return cluster_series

result_df = normalize_cluster_labels(result_df, "activityID")
result_df.to_csv("activity_clusters.csv", index=False)

In [61]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from tqdm import tqdm
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Загрузка данных с обработкой NaN
df = pd.read_parquet("data.parquet").dropna(subset=['subject_id'])

class GroupRobustScaler(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.scalers = {}
        self.group_imputer = SimpleImputer(strategy='median')

    def fit(self, X, y=None):
        self.scalers = {}
        groups = X['subject_id']
        features = X.drop(columns=['subject_id', 'timestamp'])
        
        # Импутация и масштабирование для каждой группы
        for group in tqdm(groups.unique(), desc='Scaling groups'):
            mask = (groups == group)
            group_data = self.group_imputer.fit_transform(features[mask])
            scaler = RobustScaler().fit(group_data)
            self.scalers[group] = (scaler, self.group_imputer)
        return self

    def transform(self, X):
        groups = X['subject_id']
        features = X.drop(columns=['subject_id', 'timestamp'])
        transformed = features.copy()
        
        for group in tqdm(groups.unique(), desc='Transforming groups'):
            mask = (groups == group)
            if group in self.scalers:
                scaler, imputer = self.scalers[group]
                imputed_data = imputer.transform(features[mask])
                scaled_data = scaler.transform(imputed_data)
                transformed.loc[mask] = scaled_data
        return transformed.dropna()

def process_data(df):
    pipeline = Pipeline([
        ('group_scaler', GroupRobustScaler()),
        ('final_imputer', SimpleImputer(strategy='median')),
        ('pca', PCA(n_components=0.95, random_state=42)),
        ('cluster', KMeans(n_clusters=5, n_init='auto', random_state=42))
    ])

    with tqdm(total=4, desc='Processing pipeline') as pbar:
        # Полная обработка данных
        clean_df = df.dropna(subset=df.columns.difference(['subject_id', 'timestamp']))
        pipeline.fit(clean_df)
        pbar.update(1)
        
        # Получение преобразованных данных
        processed_data = pipeline[:-1].transform(clean_df)
        pbar.update(1)
        
        # Проверка на NaN
        assert not np.isnan(processed_data).any(), "NaN values detected after transformation"
        pbar.update(1)
        
        # Кластеризация
        labels = pipeline.named_steps['cluster'].fit_predict(processed_data)
        pbar.update(1)
    
    return pipeline, processed_data, labels

# Измененная секция обработки данных и экспорта
try:
    pipeline, pca_data, labels = process_data(df)
except Exception as e:
    print(f"Error in processing: {str(e)}")
    raise

# Получаем индексы из очищенных данных
clean_df = df.dropna(subset=df.columns.difference(['subject_id', 'timestamp']))
filtered_index = clean_df.index

# Нормализация меток кластеров с сохранением индексов
def normalize_cluster_labels(labels, index):
    return pd.Series(
        pd.factorize(labels)[0] + 1,
        index=index,
        name='activityID'
    )

# Создаем результирующий DataFrame только с валидными индексами
result_df = pd.DataFrame({
    'Index': filtered_index,
    'activityID': normalize_cluster_labels(labels, filtered_index)
})

# Экспорт результатов
result_df.to_csv("activity_clusters.csv", index=False)


'''
# Оптимизация кластеров с обработкой ошибок
def optimize_clusters(data):
    scores = []
    for k in tqdm(range(2, 11), desc='Optimizing clusters'):
        try:
            kmeans = KMeans(n_clusters=k, n_init='auto', random_state=42)
            labels = kmeans.fit_predict(data)
            scores.append(silhouette_score(data, labels))
        except Exception as e:
            print(f"Error at k={k}: {str(e)}")
            scores.append(-1)
    return np.argmax(scores) + 2

if not np.isnan(pca_data).any():
    optimal_k = optimize_clusters(pca_data)
    print(f"Optimal clusters: {optimal_k}")
else:
    print("Cannot optimize clusters due to NaN values")'''

Scaling groups: 100%|██████████| 8/8 [00:01<00:00,  4.93it/s]
Transforming groups: 100%|██████████| 8/8 [00:00<00:00, 29.09it/s]
Transforming groups: 100%|██████████| 8/8 [00:00<00:00, 34.82it/s]
Processing pipeline: 100%|██████████| 4/4 [00:04<00:00,  1.04s/it]


'\n# Оптимизация кластеров с обработкой ошибок\ndef optimize_clusters(data):\n    scores = []\n    for k in tqdm(range(2, 11), desc=\'Optimizing clusters\'):\n        try:\n            kmeans = KMeans(n_clusters=k, n_init=\'auto\', random_state=42)\n            labels = kmeans.fit_predict(data)\n            scores.append(silhouette_score(data, labels))\n        except Exception as e:\n            print(f"Error at k={k}: {str(e)}")\n            scores.append(-1)\n    return np.argmax(scores) + 2\n\nif not np.isnan(pca_data).any():\n    optimal_k = optimize_clusters(pca_data)\n    print(f"Optimal clusters: {optimal_k}")\nelse:\n    print("Cannot optimize clusters due to NaN values")'