In [17]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

def transform_features(data_path, output_path=None, skew_threshold=1.0,
                       poly_degree=2, poly_threshold=0.1,
                       visualize=True, interaction_terms=True, use_poly=True):
    """
    Automatically transform features based on their characteristics.

    Args:
        data_path (str): Path to the input CSV file
        output_path (str, optional): Path to save the transformed data. If None, will use 'transformed_[original_name].csv'
        skew_threshold (float): Threshold for considering a feature highly skewed
        poly_degree (int): Maximum degree for polynomial features
        poly_threshold (float): Correlation threshold for creating polynomial features
        visualize (bool): Whether to create visualization of original vs. transformed features
        interaction_terms (bool): Whether to create interaction terms between features
        use_poly (bool): Whether to create polynomial features

    Returns:
        pd.DataFrame: Transformed dataframe
    """
    # Set default output path
    if output_path is None:
        output_path = f"transformed_{data_path.split('/')[-1]}"

    # Load the data
    print(f"Loading data from {data_path}...")
    if data_path.endswith('.csv'):
        df = pd.read_csv(data_path)
    else:
        df = pd.read_excel(data_path)

    print(f"Original data shape: {df.shape}")

    # Find the target column if it exists
    target_col = None
    if 'group' in df.columns:
        target_col = 'group'
        labels = df[target_col].copy()
        df_features = df.drop(columns=[target_col])
    else:
        df_features = df.copy()

    # Get numeric columns
    numeric_cols = df_features.select_dtypes(include=['number']).columns.tolist()
    print(f"Number of numeric features: {len(numeric_cols)}")

    # Calculate skewness for each feature
    skewness = df_features[numeric_cols].apply(lambda x: stats.skew(x.dropna()))

    # Identify highly skewed features
    highly_skewed = skewness[abs(skewness) > skew_threshold].index.tolist()
    print(f"Number of highly skewed features: {len(highly_skewed)}")

    # Create a new dataframe for transformed features
    transformed_df = pd.DataFrame(index=df_features.index)
    transformation_log = {}

    # Process each feature
    for col in numeric_cols:
        # Get the feature data
        x = df_features[col].values

        # Check if the feature is highly skewed
        if col in highly_skewed:
            # Check if the feature has non-positive values (can't apply log directly)
            if np.min(x) <= 0:
                # Shift data to make it positive
                shift = abs(np.min(x)) + 1.0
                x_transformed = np.log1p(x + shift)
                transformation_log[col] = f"log1p(x + {shift})"
            else:
                # Apply log transformation
                x_transformed = np.log1p(x)
                transformation_log[col] = "log1p(x)"

            # Add the transformed feature
            transformed_df[f"{col}"] = x_transformed
        else:
            # Keep the original feature
            transformed_df[f"{col}"] = x
            transformation_log[col] = "original"

    # Only create polynomial features if use_poly is True
    if use_poly:
        # Create polynomial features for a subset of important features
        # First, identify important features using correlation with target or variance
        if target_col is not None:
            # Use correlation with target to find important features
            important_features = []
            for col in transformed_df.columns:
                if abs(np.corrcoef(transformed_df[col], pd.get_dummies(labels).iloc[:, 0])[0, 1]) > poly_threshold:
                    important_features.append(col)
        else:
            # Use variance as a measure of importance
            variances = transformed_df.var().sort_values(ascending=False)
            important_features = variances.index[:int(len(transformed_df.columns) * poly_threshold)].tolist()

        print(f"Number of features selected for polynomial transformation: {len(important_features)}")

        # Generate polynomial features
        for col in important_features:
            x = transformed_df[col].values
            for degree in range(2, poly_degree + 1):
                transformed_df[f"{col}_pow{degree}"] = x ** degree
                transformation_log[f"{col}_pow{degree}"] = f"{col}^{degree}"

        # Create interaction terms between important features if requested
        if interaction_terms and len(important_features) >= 2:
            print("Generating interaction terms...")
            for i in range(len(important_features)):
                for j in range(i+1, len(important_features)):
                    col1, col2 = important_features[i], important_features[j]
                    new_col = f"{col1}_mul_{col2}"
                    transformed_df[new_col] = transformed_df[col1] * transformed_df[col2]
                    transformation_log[new_col] = f"{col1} * {col2}"
    
    # Standardize all features
    print("Standardizing features...")
    for col in transformed_df.columns:
        mean = transformed_df[col].mean()
        std = transformed_df[col].std()
        if std > 0:  # Avoid division by zero
            transformed_df[col] = (transformed_df[col] - mean) / std

    # Add back the target column if it exists
    if target_col is not None:
        transformed_df[target_col] = labels

    # Save the transformed data
    transformed_df.to_csv(output_path, index=False)
    print(f"Transformed data saved to {output_path}")
    print(f"Final data shape: {transformed_df.shape}")

    # Print transformation summary
    print("\nTransformation Summary:")
    for col, transform in transformation_log.items():
        print(f"{col}: {transform}")

    # Create visualizations if requested
    if visualize:
        sample_cols = min(5, len(highly_skewed))
        if sample_cols > 0:
            plt.figure(figsize=(15, 3 * sample_cols))
            for i, col in enumerate(highly_skewed[:sample_cols]):
                # Original distribution
                plt.subplot(sample_cols, 2, 2*i + 1)
                plt.hist(df_features[col].dropna(), bins=30)
                plt.title(f"Original: {col}")

                # Transformed distribution
                plt.subplot(sample_cols, 2, 2*i + 2)
                plt.hist(transformed_df[col].dropna(), bins=30)
                plt.title(f"Transformed: {col}")

            plt.tight_layout()
            plt.savefig("feature_transformations.png")
            plt.close()
            print("Visualizations saved to 'feature_transformations.png'")

    return transformed_df

# Example usage:
# transformed_data = transform_features("cancer_dataset.csv", skew_threshold=1.0)

class MyPCA:
    def __init__(self, n_components):
        """
        Initialize PCA with the number of components.

        Args:
            n_components (int): Number of principal components to keep
        """
        self.n_components = n_components
        self.components_ = None
        self.mean_ = None
        self.explained_variance_ = None
        self.explained_variance_ratio_ = None
        self.singular_values_ = None

    def fit(self, X):
        """
        Fit the PCA model with X.

        Args:
            X (array-like): Training data, shape (n_samples, n_features)

        Returns:
            self: Returns the instance itself
        """
        # Convert to numpy array if it's not
        X = np.array(X)

        # Store dimensions
        n_samples, n_features = X.shape

        # Center the data
        self.mean_ = np.mean(X, axis=0)
        X_centered = X - self.mean_

        # Compute covariance matrix
        cov_matrix = np.dot(X_centered.T, X_centered) / (n_samples - 1)

        # Compute eigenvalues and eigenvectors of covariance matrix
        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

        # Sort eigenvalues and eigenvectors in decreasing order
        idx = np.argsort(eigenvalues)[::-1]
        eigenvalues = eigenvalues[idx]
        eigenvectors = eigenvectors[:, idx]

        # Store components (eigenvectors)
        self.components_ = eigenvectors[:, :self.n_components].T

        # Store eigenvalues
        self.explained_variance_ = eigenvalues[:self.n_components]

        # Calculate explained variance ratio
        self.explained_variance_ratio_ = self.explained_variance_ / np.sum(eigenvalues)

        # Calculate cumulative explained variance
        self.cumulative_explained_variance_ratio_ = np.cumsum(self.explained_variance_ratio_)

        # Store singular values
        self.singular_values_ = np.sqrt((n_samples - 1) * self.explained_variance_)

        return self

    def transform(self, X):
        """
        Apply dimensionality reduction to X.

        Args:
            X (array-like): Data to transform, shape (n_samples, n_features)

        Returns:
            X_new (array-like): Transformed data, shape (n_samples, n_components)
        """
        # Convert to numpy array if it's not
        X = np.array(X)

        # Center the data
        X_centered = X - self.mean_

        # Project the data onto the principal components
        X_transformed = np.dot(X_centered, self.components_.T)

        return X_transformed

    def fit_transform(self, X):
        """
        Fit the model with X and apply the dimensionality reduction on X.

        Args:
            X (array-like): Training data, shape (n_samples, n_features)

        Returns:
            X_new (array-like): Transformed data, shape (n_samples, n_components)
        """
        self.fit(X)
        return self.transform(X)

class MyKMeans:
    def __init__(self, n_clusters=2, n_init=10, max_iter=300, tol=1e-4, random_state=None, track_history=False):
        self.n_clusters = n_clusters            # Số nhóm cần phân
        self.n_init = n_init                    # Số lần chọn lại để ra chọn kết quả tốt nhất
        self.max_iter = max_iter                # Số vòng lặp tối đa
        self.tol = tol                          # Ngưỡng dừng nếu thay đổi quá nhỏ
        self.random_state = random_state        # Giúp chạy lại ra kết quả giống nhau
        self.track_history = track_history      # Có lưu lại lịch sử hay không
        self.centroids = None                   # Các điểm trung tâm cuối cùng
        self.labels_ = None                     # Nhóm mà mỗi điểm dữ liệu thuộc về
        self.inertia_ = None                    # Tổng khoảng cách giữa dữ liệu và trung tâm nhóm
        
        # Theo dõi tiến trình
        self.history_centroids = []             # Lưu các vị trí trung tâm theo từng vòng
        self.history_inertia = []               # Lưu sai số theo từng vòng

    def init_centroids(self, X):
        # Chọn ngẫu nhiên một vài điểm ban đầu làm trung tâm nhóm
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, self.n_clusters, replace=False)
        return X[indices]

    def compute_inertia(self, X, centroids, labels):
        # Tính tổng khoảng cách từ điểm đến trung tâm nhóm của nó
        return np.sum((np.linalg.norm(X - centroids[labels], axis=1)) ** 2)

    def fit(self, X):
        X = np.array(X)
        best_inertia = float('inf')     # Sai số nhỏ nhất từng thấy
        best_centroids = None
        best_labels = None
        best_history_centroids = []
        best_history_inertia = []

        # Giữ cho việc chọn ngẫu nhiên luôn giống nhau nếu đặt trước
        if self.random_state is not None:
            np.random.seed(self.random_state)

        # Thử lại nhiều lần để chọn được cách chia tốt nhất
        for _ in range(self.n_init):
            centroids = self.init_centroids(X)  # Chọn ngẫu nhiên trung tâm ban đầu
            history_centroids = []
            history_inertia = []

            for _ in range(self.max_iter):
                # Bước 1: Gán từng điểm vào nhóm gần nó nhất
                distances = np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)
                labels = np.argmin(distances, axis=1)

                # Bước 2: Tính lại trung tâm mỗi nhóm
                new_centroids = np.array([
                    X[labels == i].mean(axis=0) if np.any(labels == i) else X[np.random.choice(X.shape[0])]
                    for i in range(self.n_clusters)
                ])

                # Lưu lại quá trình nếu cần
                if self.track_history:
                    history_centroids.append(centroids.copy())
                    inertia = self.compute_inertia(X, centroids, labels)
                    history_inertia.append(inertia)

                # Kiểm tra xem các trung tâm có thay đổi nhiều không
                if np.all(np.linalg.norm(centroids - new_centroids, axis=1) < self.tol):
                    break

                centroids = new_centroids  # Cập nhật

            # Tính sai số sau khi đã hội tụ
            inertia = self.compute_inertia(X, centroids, labels)

            # Nếu lần này tốt hơn trước, thì lưu lại
            if inertia < best_inertia:
                best_inertia = inertia
                best_centroids = centroids
                best_labels = labels
                best_history_centroids = history_centroids
                best_history_inertia = history_inertia

        # Ghi nhận kết quả tốt nhất
        self.centroids = best_centroids
        self.labels_ = best_labels
        self.inertia_ = best_inertia
        if self.track_history:
            self.history_centroids = best_history_centroids
            self.history_inertia = best_history_inertia

        return self

    def predict(self, X):
        X = np.array(X)
        distances = np.linalg.norm(X[:, np.newaxis] - self.centroids, axis=2)
        return np.argmin(distances, axis=1)

    def fit_predict(self, X):
        self.fit(X)
        return self.labels_

class MyGMM:
    def __init__(self, n_components=2, max_iter=100, tol=1e-4, random_state=None):
        # Khởi tạo tham số
        self.n_components = n_components        # Số cụm (số thành phần Gaussian)
        self.max_iter = max_iter                
        self.tol = tol                          # Ngưỡng hội tụ (tolerance)
        self.random_state = random_state        # Seed cho reproducibility
        self.means_ = None                      # Trung bình của các Gaussian
        self.covariances_ = None                # Ma trận hiệp phương sai của các Gaussian
        self.weights_ = None                    # Trọng số (xác suất) của các Gaussian
        self.labels_ = None                     # Nhãn dự đoán sau khi phân cụm
        self.reg_covar = 1e-6                   # Regularization thêm vào để tránh ma trận suy biến

    def _initialize(self, X):
        # Khởi tạo ban đầu các tham số
        n_samples, n_features = X.shape
        
        # Đảm bảo reproducibility
        if self.random_state is not None:
            np.random.seed(self.random_state)
        
        # Sử dụng K-means để khởi tạo tốt hơn
        kmeans = MyKMeans(n_clusters=self.n_components, n_init=1, random_state=self.random_state)
        kmeans.fit(X)
        self.means_ = kmeans.centroids
        
        # Khởi tạo ma trận hiệp phương sai cho mỗi cụm
        self.covariances_ = np.zeros((self.n_components, n_features, n_features))
        for k in range(self.n_components):
            mask = (kmeans.labels_ == k)
            if np.any(mask):
                # Nếu cụm có điểm dữ liệu, tính toán hiệp phương sai
                X_k = X[mask]
                self.covariances_[k] = np.cov(X_k, rowvar=False) + self.reg_covar * np.eye(n_features)
            else:
                # Nếu cụm không có điểm dữ liệu, sử dụng ma trận đơn vị
                self.covariances_[k] = np.eye(n_features)
        
        # Khởi tạo trọng số dựa trên số lượng điểm trong mỗi cụm
        counts = np.bincount(kmeans.labels_, minlength=self.n_components)
        self.weights_ = counts / n_samples
        # Tránh trọng số bằng 0
        self.weights_[self.weights_ < 1e-10] = 1e-10
        self.weights_ = self.weights_ / np.sum(self.weights_)

    def _e_step(self, X):
        # Bước E: Tính trách nhiệm (responsibilities)
        n_samples = X.shape[0]
        resp = np.zeros((n_samples, self.n_components))
        
        # Tính xác suất cho từng điểm dữ liệu thuộc về từng Gaussian
        for k in range(self.n_components):
            resp[:, k] = self.weights_[k] * self._gaussian(X, self.means_[k], self.covariances_[k])
        
        # Chuẩn hóa để tổng mỗi hàng bằng 1 (soft assignment)
        # và xử lý trường hợp tổng bằng 0
        resp_sum = resp.sum(axis=1, keepdims=True)
        resp = np.divide(resp, resp_sum, out=np.full_like(resp, 1.0/self.n_components), where=resp_sum>1e-15)
        return resp

    def _m_step(self, X, resp):
        # Bước M: Cập nhật tham số mô hình
        n_samples, n_features = X.shape
        Nk = resp.sum(axis=0)  # Tổng trọng số (responsibility) cho từng cụm
        
        # Cập nhật trọng số với regularization để tránh trọng số = 0
        self.weights_ = (Nk + 1e-10) / (n_samples + self.n_components * 1e-10)
        
        # Cập nhật trung bình mới cho mỗi cụm
        self.means_ = np.zeros((self.n_components, n_features))
        for k in range(self.n_components):
            if Nk[k] > 1e-10:
                self.means_[k] = (resp[:, k, np.newaxis] * X).sum(axis=0) / Nk[k]
            else:
                # Nếu cụm không có điểm, giữ nguyên trung tâm cũ hoặc tạo mới
                self.means_[k] = X[np.random.choice(n_samples)]
        
        # Cập nhật ma trận hiệp phương sai
        self.covariances_ = np.zeros((self.n_components, n_features, n_features))
        for k in range(self.n_components):
            diff = X - self.means_[k]
            if Nk[k] > 1e-10:
                # Tính ma trận hiệp phương sai weighted
                cov_k = (resp[:, k, np.newaxis, np.newaxis] * 
                        np.einsum('ij,ik->ijk', diff, diff)).sum(axis=0) / Nk[k]
            else:
                # Nếu cụm không có điểm, sử dụng ma trận đơn vị
                cov_k = np.eye(n_features)
                
            # Thêm regularization để đảm bảo ma trận khả nghịch
            self.covariances_[k] = cov_k + self.reg_covar * np.eye(n_features)

    def _gaussian(self, X, mean, cov):
        """Tính giá trị phân phối Gaussian đa biến với xử lý tốt hơn"""
        n_samples, n_features = X.shape
        
        try:
            # Sử dụng phân hủy Cholesky để tính determinant và inverse
            # Cách này giúp tránh được việc cần tính trực tiếp ma trận nghịch đảo
            chol = np.linalg.cholesky(cov)
            
            # Tính log(det(cov)) từ cholesky: log(det(cov)) = 2 * sum(log(diag(chol)))
            log_det = 2 * np.sum(np.log(np.diag(chol)))
            
            # Tính (x-mean)^T * cov^-1 * (x-mean) bằng cách giải hệ phương trình
            diff = X - mean
            alpha = np.linalg.solve(chol, diff.T)
            mahalanobis = np.sum(alpha**2, axis=0)
            
            # Tính log-probability
            log_prob = -0.5 * (n_features * np.log(2 * np.pi) + log_det + mahalanobis)
            return np.exp(log_prob)
            
        except np.linalg.LinAlgError:
            # Nếu phân hủy Cholesky thất bại (ma trận không khả định), 
            # sử dụng cách tiếp cận khác
            # Thêm regularization
            reg_cov = cov + self.reg_covar * np.eye(n_features)
            
            # Tính determinant và inverse
            det = np.linalg.det(reg_cov)
            inv_cov = np.linalg.inv(reg_cov)
            
            # Tránh underflow khi det quá nhỏ
            if det < 1e-15:
                det = 1e-15
                
            # Tính mũ cho hàm Gaussian
            norm = 1. / np.sqrt((2 * np.pi) ** n_features * det)
            diff = X - mean
            
            # Thay vì tính toán trực tiếp ma trận, sử dụng einsum để tối ưu
            # Tính (x-mean)^T * inv_cov * (x-mean) cho mỗi điểm
            mahalanobis = np.einsum('ij,jk,ik->i', diff, inv_cov, diff)
            
            # Tính xác suất
            return norm * np.exp(-0.5 * mahalanobis)

    def fit(self, X):
        # Huấn luyện mô hình GMM
        X = np.array(X)
        self._initialize(X)
        log_likelihood_old = -np.inf
        
        # Vòng lặp EM
        for iteration in range(self.max_iter):
            # Bước E: Tính trách nhiệm (responsibilities)
            resp = self._e_step(X)
            
            # Bước M: Cập nhật tham số
            self._m_step(X, resp)
            
            # Tính log-likelihood mới
            log_likelihood = 0
            for i in range(X.shape[0]):
                likelihood_sample = 0
                for k in range(self.n_components):
                    likelihood_sample += self.weights_[k] * self._gaussian(X[i:i+1], self.means_[k], self.covariances_[k])
                log_likelihood += np.log(max(likelihood_sample, 1e-15))
            
            # Kiểm tra hội tụ
            if abs(log_likelihood - log_likelihood_old) < self.tol:
                break
                
            log_likelihood_old = log_likelihood
        
        # Gán nhãn (label) cho từng điểm dữ liệu
        self.labels_ = self.predict(X)
        return self

    def predict(self, X):
        """Dự đoán nhãn cụm cho dữ liệu mới"""
        X = np.array(X)
        resp = np.zeros((X.shape[0], self.n_components))
        
        for k in range(self.n_components):
            resp[:, k] = self.weights_[k] * self._gaussian(X, self.means_[k], self.covariances_[k])
            
        # Tránh việc tất cả các xác suất đều bằng 0
        if np.all(resp == 0, axis=1).any():
            # Tính khoảng cách Euclidean cho các điểm có xác suất = 0
            zero_prob_indices = np.where(np.all(resp == 0, axis=1))[0]
            for idx in zero_prob_indices:
                # Gán nhãn dựa trên khoảng cách Euclidean gần nhất
                distances = np.linalg.norm(X[idx] - self.means_, axis=1)
                resp[idx, np.argmin(distances)] = 1.0
                
        return np.argmax(resp, axis=1)

    def fit_predict(self, X):
        """Huấn luyện và trả về nhãn cụm"""
        self.fit(X)
        return self.labels_

def run_experiment(dataset_path, output_dir='./', skew_threshold=1.0,
                  component_range=None, k=2, visualize=True, algorithm='KMeans', use_poly=True):
    """
    Run a complete experiment with feature transformation, PCA, and clustering.

    Args:
        dataset_path (str): Path to the input dataset CSV
        output_dir (str): Directory to save outputs
        skew_threshold (float): Threshold for considering a feature highly skewed
        component_range (list): List of n_components values to test
        k (int): Number of clusters
        visualize (bool): Whether to create visualizations
    """
    import os
    from datetime import datetime

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Generate a timestamp for the experiment
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    experiment_dir = os.path.join(output_dir, f"experiment_{timestamp}")
    os.makedirs(experiment_dir, exist_ok=True)

    # Define file paths
    transformed_path = os.path.join(experiment_dir, "transformed_data.csv")
    results_path = os.path.join(experiment_dir, "experiment_results.csv")
    log_path = os.path.join(experiment_dir, "experiment_log.txt")

    # Set up logging
    with open(log_path, 'w') as log_file:
        def log(message):
            print(message)
            log_file.write(message + '\n')
            log_file.flush()

        log(f"=== Experiment started at {timestamp} ===")
        log(f"Dataset: {dataset_path}")

        # Load and transform features
        log("\n--- Feature Transformation ---")
        transformed_data = transform_features(
            dataset_path,
            output_path=transformed_path,
            skew_threshold=skew_threshold,
            visualize=visualize,
            use_poly=use_poly
        )

        # Extract features and target
        if 'group' in transformed_data.columns:
            y = transformed_data['group'].copy()
            X = transformed_data.drop(columns=['group'])
        else:
            y = None
            X = transformed_data

        log(f"Transformed data shape: {X.shape}")

        # Determine component range if not provided
        if component_range is None:
            max_components = min(X.shape[0], X.shape[1])
            component_range = [10, 20, 30, 50, 70, 100, 150]
            component_range = [c for c in component_range if c <= max_components]

        # Prepare results storage
        results = []

        # Run PCA with different n_components
        log("\n--- PCA and Clustering Experiments ---")
        for n_components in component_range:
            log(f"\nTesting with n_components={n_components}")

            # Perform PCA
            pca = MyPCA(n_components=n_components)
            X_pca = pca.fit_transform(X)

            # Record basic PCA stats
            evr = pca.explained_variance_ratio_.sum()
            log(f"Explained variance ratio: {evr:.4f}")

            if algorithm == 'KMeans':
                # Implement KMeans clustering
                kmeans = MyKMeans(n_clusters=k, max_iter=100, random_state=42, track_history=True)
                kmeans.fit(X_pca)

                # Lấy kết quả
                labels = kmeans.labels_
                centroids = kmeans.centroids
            elif algorithm == "GMM":
                gmm = MyGMM(n_components=k, max_iter=100, random_state=42)
                gmm.fit(X_pca)

                # Lấy kết quả
                labels = gmm.labels_

            # Evaluate clustering if we have true labels
            if y is not None:
                # Convert labels to numeric if they're categorical
                if not pd.api.types.is_numeric_dtype(y):
                    label_map = {label: i for i, label in enumerate(y.unique())}
                    y_numeric = y.map(label_map)
                else:
                    y_numeric = y

                # Calculate accuracy (after finding best label mapping)
                from scipy.optimize import linear_sum_assignment

                # Create confusion matrix
                conf_matrix = np.zeros((k, k))
                for i in range(len(labels)):
                    conf_matrix[labels[i], y_numeric.iloc[i]] += 1

                # Find optimal assignment
                row_ind, col_ind = linear_sum_assignment(-conf_matrix)

                # Remap cluster labels
                remapped_labels = np.zeros_like(labels)
                for i in range(k):
                    remapped_labels[labels == row_ind[i]] = col_ind[i]

                # Calculate accuracy
                accuracy = np.sum(remapped_labels == y_numeric) / len(y_numeric)
                log(f"Clustering accuracy: {accuracy:.4f}")

                # Calculate F1 score
                from sklearn.metrics import f1_score
                f1 = f1_score(y_numeric, remapped_labels, average='weighted')
                log(f"F1 score: {f1:.4f}")

                # Calculate silhouette score
                from sklearn.metrics import silhouette_score
                if len(np.unique(labels)) > 1:
                    silhouette = silhouette_score(X_pca, labels)
                    log(f"Silhouette score: {silhouette:.4f}")
                else:
                    silhouette = None
                    log("Silhouette score: Không thể tính vì chỉ có 1 cụm được tạo ra.")

                # Store results
                results.append({
                    'n_components': n_components,
                    'explained_variance_ratio': evr,
                    'accuracy': accuracy,
                    'f1_score': f1,
                    'silhouette_score': silhouette
                })

            # Visualize 2D projection for the first experiment
            if visualize and n_components >= 2:
                plt.figure(figsize=(16, 6))

                # Plot clustering results
                plt.subplot(1, 2, 1)
                plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='viridis', alpha=0.7)
                plt.title(f"Clustering Results (n_components={n_components})")
                plt.xlabel("PC1")
                plt.ylabel("PC2")
                plt.colorbar()

                # Plot true labels if available
                if y is not None:
                    plt.subplot(1, 2, 2)
                    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_numeric, cmap='viridis', alpha=0.7)
                    plt.title("True Labels")
                    plt.xlabel("PC1")
                    plt.ylabel("PC2")
                    plt.colorbar()

                plt.tight_layout()
                plt.savefig(os.path.join(experiment_dir, f"pca_n{n_components}.png"))
                plt.close()

        # Save results to CSV
        if results:
            results_df = pd.DataFrame(results)
            results_df.to_csv(results_path, index=False)
            log(f"\nResults saved to {results_path}")

            # Find the best configuration
            best_accuracy_idx = results_df['accuracy'].idxmax()
            best_config = results_df.iloc[best_accuracy_idx]
            log(f"\nBest configuration:")
            log(f"n_components: {best_config['n_components']}")
            log(f"Accuracy: {best_config['accuracy']:.4f}")
            log(f"F1 score: {best_config['f1_score']:.4f}")
            log(f"Silhouette score: {best_config['silhouette_score']:.4f}")

            # Plot metrics vs n_components
            if visualize and len(component_range) > 1:
                plt.figure(figsize=(15, 10))

                # Plot explained variance vs components
                plt.subplot(2, 2, 1)
                plt.plot(results_df['n_components'], results_df['explained_variance_ratio'], 'o-')
                plt.title('Explained Variance vs. Components')
                plt.xlabel('Number of Components')
                plt.ylabel('Cumulative Explained Variance')

                # Plot accuracy vs components
                plt.subplot(2, 2, 2)
                plt.plot(results_df['n_components'], results_df['accuracy'], 'o-')
                plt.title('Accuracy vs. Components')
                plt.xlabel('Number of Components')
                plt.ylabel('Accuracy')

                # Plot F1 score vs components
                plt.subplot(2, 2, 3)
                plt.plot(results_df['n_components'], results_df['f1_score'], 'o-')
                plt.title('F1 Score vs. Components')
                plt.xlabel('Number of Components')
                plt.ylabel('F1 Score')

                # Plot silhouette score vs components
                plt.subplot(2, 2, 4)
                plt.plot(results_df['n_components'], results_df['silhouette_score'], 'o-')
                plt.title('Silhouette Score vs. Components')
                plt.xlabel('Number of Components')
                plt.ylabel('Silhouette Score')

                plt.tight_layout()
                plt.savefig(os.path.join(experiment_dir, "metrics_vs_components.png"))
                plt.close()

        log(f"\n=== Experiment completed ===")
        return experiment_dir

In [19]:
# Example usage:
experiment_dir = run_experiment("../Dataset/ABIDE2.csv", skew_threshold=0.75, component_range=[1, 2, 5, 10, 20, 50, 100, 200, 500], algorithm="GMM", use_poly=False)

=== Experiment started at 20250512_011510 ===
Dataset: ../Dataset/ABIDE2.csv

--- Feature Transformation ---
Loading data from ../Dataset/ABIDE2.csv...
Original data shape: (1004, 1445)
Number of numeric features: 1443
Number of highly skewed features: 153


  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  tra

Standardizing features...


  transformed_df[target_col] = labels


Transformed data saved to ./experiment_20250512_011510\transformed_data.csv
Final data shape: (1004, 1444)

Transformation Summary:
Unnamed: 0: original
subject: original
age: log1p(x)
fsArea_L_V1_ROI: original
fsArea_L_MST_ROI: original
fsArea_L_V6_ROI: original
fsArea_L_V2_ROI: original
fsArea_L_V3_ROI: original
fsArea_L_V4_ROI: original
fsArea_L_V8_ROI: original
fsArea_L_4_ROI: original
fsArea_L_3b_ROI: original
fsArea_L_FEF_ROI: original
fsArea_L_PEF_ROI: original
fsArea_L_55b_ROI: log1p(x)
fsArea_L_V3A_ROI: original
fsArea_L_RSC_ROI: log1p(x)
fsArea_L_POS2_ROI: original
fsArea_L_V7_ROI: original
fsArea_L_IPS1_ROI: original
fsArea_L_FFC_ROI: original
fsArea_L_V3B_ROI: original
fsArea_L_LO1_ROI: original
fsArea_L_LO2_ROI: original
fsArea_L_PIT_ROI: original
fsArea_L_MT_ROI: original
fsArea_L_A1_ROI: original
fsArea_L_PSL_ROI: original
fsArea_L_SFL_ROI: original
fsArea_L_PCV_ROI: original
fsArea_L_STV_ROI: original
fsArea_L_7Pm_ROI: original
fsArea_L_7m_ROI: original
fsArea_L_POS1_RO

In [20]:
import seaborn as sns

def run_multi_threshold_experiment(dataset_path, output_dir='./',
                                 skew_thresholds=[0.5, 0.75, 1.0, 1.25, 1.5],
                                 component_range=[10, 20, 30, 50, 100],
                                 k=2, algorithms=["KMeans", "GMM"],
                                 poly_features=[False, True]):
    """
    Run experiments with multiple skew thresholds, algorithms, and polynomial features option.

    Args:
        dataset_path (str): Path to dataset
        output_dir (str): Output directory
        skew_thresholds (list): List of skew thresholds to test
        component_range (list): List of n_components values
        k (int): Number of clusters
        algorithms (list): List of clustering algorithms to use
        poly_features (list): List of boolean values whether to use polynomial features or not
    """
    import os
    from datetime import datetime
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    experiment_dir = os.path.join(output_dir, f"multi_threshold_{timestamp}")
    os.makedirs(experiment_dir, exist_ok=True)

    # Store results for all thresholds and algorithms
    all_results = []

    # Run experiments for each threshold, algorithm, and poly_feature option
    for threshold in skew_thresholds:
        for algorithm in algorithms:
            for use_poly in poly_features:
                exp_dir = run_experiment(
                    dataset_path,
                    output_dir=experiment_dir,
                    skew_threshold=threshold,
                    component_range=component_range,
                    k=k,
                    visualize=True,
                    algorithm=algorithm,
                    poly_features=use_poly
                )

                # Read results
                results_df = pd.read_csv(os.path.join(exp_dir, "experiment_results.csv"))
                results_df['skew_threshold'] = threshold
                results_df['algorithm'] = algorithm
                results_df['poly_features'] = use_poly
                all_results.append(results_df)

    # Combine all results
    combined_results = pd.concat(all_results)

    # Save combined results
    combined_results.to_csv(os.path.join(experiment_dir, 'combined_results.csv'), index=False)

    # Create visualizations for each algorithm with polynomial feature comparison
    for algorithm in algorithms:
        alg_results = combined_results[combined_results['algorithm'] == algorithm]
        
        # Visualizations comparing with and without polynomial features
        plt.figure(figsize=(20, 15))
        plt.suptitle(f'Results for {algorithm} - Polynomial Features Comparison', fontsize=16)
        
        # 1. Line plot: Accuracy vs Components for different thresholds and poly options
        ax1 = plt.subplot(221)
        for threshold in skew_thresholds:
            for use_poly in poly_features:
                data = alg_results[(alg_results['skew_threshold'] == threshold) & 
                                  (alg_results['poly_features'] == use_poly)]
                poly_label = "With Poly" if use_poly else "Without Poly"
                linestyle = '-' if use_poly else '--'
                ax1.plot(data['n_components'], data['accuracy'], 
                        marker='o', linestyle=linestyle,
                        label=f'Threshold={threshold}, {poly_label}')
        ax1.set_xlabel('Number of Components')
        ax1.set_ylabel('Accuracy')
        ax1.set_title(f'Accuracy vs Components - {algorithm}')
        ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        
        # 2. Box plot: Accuracy distribution by polynomial features
        ax2 = plt.subplot(222)
        sns.boxplot(data=alg_results, x='poly_features', y='accuracy', ax=ax2)
        ax2.set_xticklabels(['Without Poly', 'With Poly'])
        ax2.set_title(f'Accuracy Distribution by Polynomial Features - {algorithm}')
        
        # 3. Heatmap: Components vs Threshold for non-poly
        ax3 = plt.subplot(223)
        non_poly_data = alg_results[alg_results['poly_features'] == False]
        pivot_non_poly = non_poly_data.pivot(
            index='skew_threshold', 
            columns='n_components',
            values='accuracy'
        )
        sns.heatmap(pivot_non_poly, annot=True, fmt='.3f', cmap='YlOrRd', ax=ax3)
        ax3.set_title(f'Accuracy Heatmap WITHOUT Polynomial Features - {algorithm}')
        
        # 4. Heatmap: Components vs Threshold for poly
        ax4 = plt.subplot(224)
        poly_data = alg_results[alg_results['poly_features'] == True]
        pivot_poly = poly_data.pivot(
            index='skew_threshold',
            columns='n_components',
            values='accuracy'
        )
        sns.heatmap(pivot_poly, annot=True, fmt='.3f', cmap='YlOrRd', ax=ax4)
        ax4.set_title(f'Accuracy Heatmap WITH Polynomial Features - {algorithm}')

        plt.tight_layout(rect=[0, 0, 1, 0.96])  # Make room for suptitle
        plt.savefig(os.path.join(experiment_dir, f'{algorithm}_poly_comparison.png'))
        plt.close()

    # Create comparison visualizations between algorithms and polynomial features
    plt.figure(figsize=(20, 10))
    plt.suptitle('Algorithm and Polynomial Features Comparison', fontsize=16)

    # 1. Bar plot: Mean accuracy by Algorithm and Poly Features
    ax1 = plt.subplot(121)
    alg_poly_comparison = combined_results.groupby(['algorithm', 'poly_features'])['accuracy'].mean().reset_index()
    alg_poly_comparison['poly_features'] = alg_poly_comparison['poly_features'].map({False: 'Without Poly', True: 'With Poly'})
    sns.barplot(data=alg_poly_comparison, x='algorithm', y='accuracy', hue='poly_features', ax=ax1)
    ax1.set_title('Mean Accuracy by Algorithm and Polynomial Features')
    ax1.set_ylabel('Mean Accuracy')
    ax1.set_xlabel('Algorithm')
    
    # 2. Box plot: Accuracy distribution for each algorithm and poly feature
    ax2 = plt.subplot(122)
    combined_results_plot = combined_results.copy()
    combined_results_plot['algorithm_poly'] = combined_results_plot.apply(
        lambda row: f"{row['algorithm']} {'Poly' if row['poly_features'] else 'No-Poly'}", axis=1
    )
    sns.boxplot(data=combined_results_plot, x='algorithm_poly', y='accuracy', ax=ax2)
    ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45)
    ax2.set_title('Accuracy Distribution by Algorithm and Polynomial Features')
    ax2.set_ylabel('Accuracy')
    ax2.set_xlabel('Algorithm & Polynomial Feature')

    plt.tight_layout(rect=[0, 0, 1, 0.96])  # Make room for suptitle
    plt.savefig(os.path.join(experiment_dir, 'algorithm_poly_comparison.png'))
    plt.close()

    # Find best configurations
    best_overall_idx = combined_results['accuracy'].idxmax()
    best_overall_config = combined_results.iloc[best_overall_idx]

    # Find best for each algorithm with and without polynomial features
    best_configs = {}
    for algorithm in algorithms:
        for use_poly in poly_features:
            filtered_df = combined_results[(combined_results['algorithm'] == algorithm) & 
                                          (combined_results['poly_features'] == use_poly)]
            if not filtered_df.empty:
                best_idx = filtered_df['accuracy'].idxmax()
                best_config = filtered_df.iloc[best_idx]
                key = f"{algorithm}_{'poly' if use_poly else 'no_poly'}"
                best_configs[key] = best_config

    print("\nBest Overall Configuration:")
    print(f"Algorithm: {best_overall_config['algorithm']}")
    print(f"Polynomial Features: {'Yes' if best_overall_config['poly_features'] else 'No'}")
    print(f"Skew Threshold: {best_overall_config['skew_threshold']}")
    print(f"Number of Components: {best_overall_config['n_components']}")
    print(f"Accuracy: {best_overall_config['accuracy']:.4f}")
    print(f"F1 Score: {best_overall_config['f1_score']:.4f}")
    print(f"Silhouette Score: {best_overall_config['silhouette_score']:.4f}")
    
    print("\nBest Configurations by Algorithm and Polynomial Features:")
    for config_name, config in best_configs.items():
        print(f"\n{config_name}:")
        print(f"Skew Threshold: {config['skew_threshold']}")
        print(f"Number of Components: {config['n_components']}")
        print(f"Accuracy: {config['accuracy']:.4f}")
        print(f"F1 Score: {config['f1_score']:.4f}")
        print(f"Silhouette Score: {config['silhouette_score']:.4f}")

    return experiment_dir, combined_results

In [21]:
experiment_dir, results = run_multi_threshold_experiment(
    "../Dataset/ABIDE2.csv",
    skew_thresholds=[5],
    component_range=[1, 2, 10, 20, 50, 100, 150, 200, 500],
    algorithms=["KMeans", "GMM"],
    poly_features=[False, True]
)

=== Experiment started at 20250512_012213 ===
Dataset: ../Dataset/ABIDE2.csv

--- Feature Transformation ---
Loading data from ../Dataset/ABIDE2.csv...
Original data shape: (1004, 1445)
Number of numeric features: 1443
Number of highly skewed features: 0


  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transf

Standardizing features...


  transformed_df[target_col] = labels


Transformed data saved to ./multi_threshold_20250512_012213\experiment_20250512_012213\transformed_data.csv
Final data shape: (1004, 1444)

Transformation Summary:
Unnamed: 0: original
subject: original
age: original
fsArea_L_V1_ROI: original
fsArea_L_MST_ROI: original
fsArea_L_V6_ROI: original
fsArea_L_V2_ROI: original
fsArea_L_V3_ROI: original
fsArea_L_V4_ROI: original
fsArea_L_V8_ROI: original
fsArea_L_4_ROI: original
fsArea_L_3b_ROI: original
fsArea_L_FEF_ROI: original
fsArea_L_PEF_ROI: original
fsArea_L_55b_ROI: original
fsArea_L_V3A_ROI: original
fsArea_L_RSC_ROI: original
fsArea_L_POS2_ROI: original
fsArea_L_V7_ROI: original
fsArea_L_IPS1_ROI: original
fsArea_L_FFC_ROI: original
fsArea_L_V3B_ROI: original
fsArea_L_LO1_ROI: original
fsArea_L_LO2_ROI: original
fsArea_L_PIT_ROI: original
fsArea_L_MT_ROI: original
fsArea_L_A1_ROI: original
fsArea_L_PSL_ROI: original
fsArea_L_SFL_ROI: original
fsArea_L_PCV_ROI: original
fsArea_L_STV_ROI: original
fsArea_L_7Pm_ROI: original
fsArea_L_7

  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transf

Number of features selected for polynomial transformation: 66
Generating interaction terms...


  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** 

Standardizing features...


  transformed_df[target_col] = labels


Transformed data saved to ./multi_threshold_20250512_012213\experiment_20250512_012225\transformed_data.csv
Final data shape: (1004, 3655)

Transformation Summary:
Unnamed: 0: original
subject: original
age: original
fsArea_L_V1_ROI: original
fsArea_L_MST_ROI: original
fsArea_L_V6_ROI: original
fsArea_L_V2_ROI: original
fsArea_L_V3_ROI: original
fsArea_L_V4_ROI: original
fsArea_L_V8_ROI: original
fsArea_L_4_ROI: original
fsArea_L_3b_ROI: original
fsArea_L_FEF_ROI: original
fsArea_L_PEF_ROI: original
fsArea_L_55b_ROI: original
fsArea_L_V3A_ROI: original
fsArea_L_RSC_ROI: original
fsArea_L_POS2_ROI: original
fsArea_L_V7_ROI: original
fsArea_L_IPS1_ROI: original
fsArea_L_FFC_ROI: original
fsArea_L_V3B_ROI: original
fsArea_L_LO1_ROI: original
fsArea_L_LO2_ROI: original
fsArea_L_PIT_ROI: original
fsArea_L_MT_ROI: original
fsArea_L_A1_ROI: original
fsArea_L_PSL_ROI: original
fsArea_L_SFL_ROI: original
fsArea_L_PCV_ROI: original
fsArea_L_STV_ROI: original
fsArea_L_7Pm_ROI: original
fsArea_L_7

  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transf

Standardizing features...


  transformed_df[target_col] = labels


Transformed data saved to ./multi_threshold_20250512_012213\experiment_20250512_012307\transformed_data.csv
Final data shape: (1004, 1444)

Transformation Summary:
Unnamed: 0: original
subject: original
age: original
fsArea_L_V1_ROI: original
fsArea_L_MST_ROI: original
fsArea_L_V6_ROI: original
fsArea_L_V2_ROI: original
fsArea_L_V3_ROI: original
fsArea_L_V4_ROI: original
fsArea_L_V8_ROI: original
fsArea_L_4_ROI: original
fsArea_L_3b_ROI: original
fsArea_L_FEF_ROI: original
fsArea_L_PEF_ROI: original
fsArea_L_55b_ROI: original
fsArea_L_V3A_ROI: original
fsArea_L_RSC_ROI: original
fsArea_L_POS2_ROI: original
fsArea_L_V7_ROI: original
fsArea_L_IPS1_ROI: original
fsArea_L_FFC_ROI: original
fsArea_L_V3B_ROI: original
fsArea_L_LO1_ROI: original
fsArea_L_LO2_ROI: original
fsArea_L_PIT_ROI: original
fsArea_L_MT_ROI: original
fsArea_L_A1_ROI: original
fsArea_L_PSL_ROI: original
fsArea_L_SFL_ROI: original
fsArea_L_PCV_ROI: original
fsArea_L_STV_ROI: original
fsArea_L_7Pm_ROI: original
fsArea_L_7

  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transf

Number of features selected for polynomial transformation: 66
Generating interaction terms...


  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** 

Standardizing features...


  transformed_df[target_col] = labels


Transformed data saved to ./multi_threshold_20250512_012213\experiment_20250512_012356\transformed_data.csv
Final data shape: (1004, 3655)

Transformation Summary:
Unnamed: 0: original
subject: original
age: original
fsArea_L_V1_ROI: original
fsArea_L_MST_ROI: original
fsArea_L_V6_ROI: original
fsArea_L_V2_ROI: original
fsArea_L_V3_ROI: original
fsArea_L_V4_ROI: original
fsArea_L_V8_ROI: original
fsArea_L_4_ROI: original
fsArea_L_3b_ROI: original
fsArea_L_FEF_ROI: original
fsArea_L_PEF_ROI: original
fsArea_L_55b_ROI: original
fsArea_L_V3A_ROI: original
fsArea_L_RSC_ROI: original
fsArea_L_POS2_ROI: original
fsArea_L_V7_ROI: original
fsArea_L_IPS1_ROI: original
fsArea_L_FFC_ROI: original
fsArea_L_V3B_ROI: original
fsArea_L_LO1_ROI: original
fsArea_L_LO2_ROI: original
fsArea_L_PIT_ROI: original
fsArea_L_MT_ROI: original
fsArea_L_A1_ROI: original
fsArea_L_PSL_ROI: original
fsArea_L_SFL_ROI: original
fsArea_L_PCV_ROI: original
fsArea_L_STV_ROI: original
fsArea_L_7Pm_ROI: original
fsArea_L_7

  ax2.set_xticklabels(['Without Poly', 'With Poly'])
  ax2.set_xticklabels(['Without Poly', 'With Poly'])



Best Overall Configuration:
Algorithm: KMeans
Polynomial Features: No
Skew Threshold: 5
Number of Components: 2
Accuracy: 0.5398
F1 Score: 0.5335
Silhouette Score: 0.3884

Best Configurations by Algorithm and Polynomial Features:

KMeans_no_poly:
Skew Threshold: 5
Number of Components: 2
Accuracy: 0.5398
F1 Score: 0.5335
Silhouette Score: 0.3884

KMeans_poly:
Skew Threshold: 5
Number of Components: 2
Accuracy: 0.5807
F1 Score: 0.5789
Silhouette Score: 0.4462

GMM_no_poly:
Skew Threshold: 5
Number of Components: 10
Accuracy: 0.5697
F1 Score: 0.4982
Silhouette Score: 0.2686

GMM_poly:
Skew Threshold: 5
Number of Components: 1
Accuracy: 0.5598
F1 Score: 0.4555
Silhouette Score: 0.5649


  ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45)
