In [22]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

def transform_features(data_path, output_path=None, skew_threshold=1.0,
                       poly_degree=2, poly_threshold=0.1,
                       visualize=True, interaction_terms=True):
    """
    Automatically transform features based on their characteristics.

    Args:
        data_path (str): Path to the input CSV file
        output_path (str, optional): Path to save the transformed data. If None, will use 'transformed_[original_name].csv'
        skew_threshold (float): Threshold for considering a feature highly skewed
        poly_degree (int): Maximum degree for polynomial features
        poly_threshold (float): Correlation threshold for creating polynomial features
        visualize (bool): Whether to create visualization of original vs. transformed features
        interaction_terms (bool): Whether to create interaction terms between features

    Returns:
        pd.DataFrame: Transformed dataframe
    """
    # Set default output path
    if output_path is None:
        output_path = f"transformed_{data_path.split('/')[-1]}"

    # Load the data
    print(f"Loading data from {data_path}...")
    if data_path.endswith('.csv'):
        df = pd.read_csv(data_path)
    else:
        df = pd.read_excel(data_path)

    print(f"Original data shape: {df.shape}")

    # Find the target column if it exists
    target_col = None
    if 'group' in df.columns:
        target_col = 'group'
        labels = df[target_col].copy()
        df_features = df.drop(columns=[target_col])
    else:
        df_features = df.copy()

    # Get numeric columns
    numeric_cols = df_features.select_dtypes(include=['number']).columns.tolist()
    print(f"Number of numeric features: {len(numeric_cols)}")

    # Calculate skewness for each feature
    skewness = df_features[numeric_cols].apply(lambda x: stats.skew(x.dropna()))

    # Identify highly skewed features
    highly_skewed = skewness[abs(skewness) > skew_threshold].index.tolist()
    print(f"Number of highly skewed features: {len(highly_skewed)}")

    # Create a new dataframe for transformed features
    transformed_df = pd.DataFrame(index=df_features.index)
    transformation_log = {}

    # Process each feature
    for col in numeric_cols:
        # Get the feature data
        x = df_features[col].values

        # Check if the feature is highly skewed
        if col in highly_skewed:
            # Check if the feature has non-positive values (can't apply log directly)
            if np.min(x) <= 0:
                # Shift data to make it positive
                shift = abs(np.min(x)) + 1.0
                x_transformed = np.log1p(x + shift)
                transformation_log[col] = f"log1p(x + {shift})"
            else:
                # Apply log transformation
                x_transformed = np.log1p(x)
                transformation_log[col] = "log1p(x)"

            # Add the transformed feature
            transformed_df[f"{col}"] = x_transformed
        else:
            # Keep the original feature
            transformed_df[f"{col}"] = x
            transformation_log[col] = "original"

    # Create polynomial features for a subset of important features
    # First, identify important features using correlation with target or variance
    if target_col is not None:
        # Use correlation with target to find important features
        important_features = []
        for col in transformed_df.columns:
            if abs(np.corrcoef(transformed_df[col], pd.get_dummies(labels).iloc[:, 0])[0, 1]) > poly_threshold:
                important_features.append(col)
    else:
        # Use variance as a measure of importance
        variances = transformed_df.var().sort_values(ascending=False)
        important_features = variances.index[:int(len(transformed_df.columns) * poly_threshold)].tolist()

    print(f"Number of features selected for polynomial transformation: {len(important_features)}")

    # Generate polynomial features
    for col in important_features:
        x = transformed_df[col].values
        for degree in range(2, poly_degree + 1):
            transformed_df[f"{col}_pow{degree}"] = x ** degree
            transformation_log[f"{col}_pow{degree}"] = f"{col}^{degree}"

    # Create interaction terms between important features if requested
    if interaction_terms and len(important_features) >= 2:
        print("Generating interaction terms...")
        for i in range(len(important_features)):
            for j in range(i+1, len(important_features)):
                col1, col2 = important_features[i], important_features[j]
                new_col = f"{col1}_mul_{col2}"
                transformed_df[new_col] = transformed_df[col1] * transformed_df[col2]
                transformation_log[new_col] = f"{col1} * {col2}"

    # Standardize all features
    print("Standardizing features...")
    for col in transformed_df.columns:
        mean = transformed_df[col].mean()
        std = transformed_df[col].std()
        if std > 0:  # Avoid division by zero
            transformed_df[col] = (transformed_df[col] - mean) / std

    # Add back the target column if it exists
    if target_col is not None:
        transformed_df[target_col] = labels

    # Save the transformed data
    transformed_df.to_csv(output_path, index=False)
    print(f"Transformed data saved to {output_path}")
    print(f"Final data shape: {transformed_df.shape}")

    # Print transformation summary
    print("\nTransformation Summary:")
    for col, transform in transformation_log.items():
        print(f"{col}: {transform}")

    # Create visualizations if requested
    if visualize:
        sample_cols = min(5, len(highly_skewed))
        if sample_cols > 0:
            plt.figure(figsize=(15, 3 * sample_cols))
            for i, col in enumerate(highly_skewed[:sample_cols]):
                # Original distribution
                plt.subplot(sample_cols, 2, 2*i + 1)
                plt.hist(df_features[col].dropna(), bins=30)
                plt.title(f"Original: {col}")

                # Transformed distribution
                plt.subplot(sample_cols, 2, 2*i + 2)
                plt.hist(transformed_df[col].dropna(), bins=30)
                plt.title(f"Transformed: {col}")

            plt.tight_layout()
            plt.savefig("feature_transformations.png")
            plt.close()
            print("Visualizations saved to 'feature_transformations.png'")

    return transformed_df

# Example usage:
# transformed_data = transform_features("cancer_dataset.csv", skew_threshold=1.0)


class MyPCA:
    def __init__(self, n_components):
        """
        Initialize PCA with the number of components.

        Args:
            n_components (int): Number of principal components to keep
        """
        self.n_components = n_components
        self.components_ = None
        self.mean_ = None
        self.explained_variance_ = None
        self.explained_variance_ratio_ = None
        self.singular_values_ = None

    def fit(self, X):
        """
        Fit the PCA model with X.

        Args:
            X (array-like): Training data, shape (n_samples, n_features)

        Returns:
            self: Returns the instance itself
        """
        # Convert to numpy array if it's not
        X = np.array(X)

        # Store dimensions
        n_samples, n_features = X.shape

        # Center the data
        self.mean_ = np.mean(X, axis=0)
        X_centered = X - self.mean_

        # Compute covariance matrix
        cov_matrix = np.dot(X_centered.T, X_centered) / (n_samples - 1)

        # Compute eigenvalues and eigenvectors of covariance matrix
        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

        # Sort eigenvalues and eigenvectors in decreasing order
        idx = np.argsort(eigenvalues)[::-1]
        eigenvalues = eigenvalues[idx]
        eigenvectors = eigenvectors[:, idx]

        # Store components (eigenvectors)
        self.components_ = eigenvectors[:, :self.n_components].T

        # Store eigenvalues
        self.explained_variance_ = eigenvalues[:self.n_components]

        # Calculate explained variance ratio
        self.explained_variance_ratio_ = self.explained_variance_ / np.sum(eigenvalues)

        # Calculate cumulative explained variance
        self.cumulative_explained_variance_ratio_ = np.cumsum(self.explained_variance_ratio_)

        # Store singular values
        self.singular_values_ = np.sqrt((n_samples - 1) * self.explained_variance_)

        return self

    def transform(self, X):
        """
        Apply dimensionality reduction to X.

        Args:
            X (array-like): Data to transform, shape (n_samples, n_features)

        Returns:
            X_new (array-like): Transformed data, shape (n_samples, n_components)
        """
        # Convert to numpy array if it's not
        X = np.array(X)

        # Center the data
        X_centered = X - self.mean_

        # Project the data onto the principal components
        X_transformed = np.dot(X_centered, self.components_.T)

        return X_transformed

    def fit_transform(self, X):
        """
        Fit the model with X and apply the dimensionality reduction on X.

        Args:
            X (array-like): Training data, shape (n_samples, n_features)

        Returns:
            X_new (array-like): Transformed data, shape (n_samples, n_components)
        """
        self.fit(X)
        return self.transform(X)


def run_experiment(dataset_path, output_dir='./', skew_threshold=1.0,
                  component_range=None, k=2, visualize=True):
    """
    Run a complete experiment with feature transformation, PCA, and clustering.

    Args:
        dataset_path (str): Path to the input dataset CSV
        output_dir (str): Directory to save outputs
        skew_threshold (float): Threshold for considering a feature highly skewed
        component_range (list): List of n_components values to test
        k (int): Number of clusters
        visualize (bool): Whether to create visualizations
    """
    import os
    from datetime import datetime

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Generate a timestamp for the experiment
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    experiment_dir = os.path.join(output_dir, f"experiment_{timestamp}")
    os.makedirs(experiment_dir, exist_ok=True)

    # Define file paths
    transformed_path = os.path.join(experiment_dir, "transformed_data.csv")
    results_path = os.path.join(experiment_dir, "experiment_results.csv")
    log_path = os.path.join(experiment_dir, "experiment_log.txt")

    # Set up logging
    with open(log_path, 'w') as log_file:
        def log(message):
            print(message)
            log_file.write(message + '\n')
            log_file.flush()

        log(f"=== Experiment started at {timestamp} ===")
        log(f"Dataset: {dataset_path}")

        # Load and transform features
        log("\n--- Feature Transformation ---")
        transformed_data = transform_features(
            dataset_path,
            output_path=transformed_path,
            skew_threshold=skew_threshold,
            visualize=visualize
        )

        # Extract features and target
        if 'group' in transformed_data.columns:
            y = transformed_data['group'].copy()
            X = transformed_data.drop(columns=['group'])
        else:
            y = None
            X = transformed_data

        log(f"Transformed data shape: {X.shape}")

        # Determine component range if not provided
        if component_range is None:
            max_components = min(X.shape[0], X.shape[1])
            component_range = [10, 20, 30, 50, 70, 100, 150]
            component_range = [c for c in component_range if c <= max_components]

        # Prepare results storage
        results = []

        # Run PCA with different n_components
        log("\n--- PCA and Clustering Experiments ---")
        for n_components in component_range:
            log(f"\nTesting with n_components={n_components}")

            # Perform PCA
            pca = MyPCA(n_components=n_components)
            X_pca = pca.fit_transform(X)

            # Record basic PCA stats
            evr = pca.explained_variance_ratio_.sum()
            log(f"Explained variance ratio: {evr:.4f}")

            # Implement KMeans clustering
            from scipy.spatial.distance import cdist

            # Initialize centroids randomly
            centroids = X_pca[np.random.choice(X_pca.shape[0], k, replace=False)]
            prev_labels = None

            # Kmeans iterations
            for i in range(100):  # Max iterations
                # Assign points to nearest centroid
                distances = cdist(X_pca, centroids)
                labels = np.argmin(distances, axis=1)

                # Check convergence
                if prev_labels is not None and np.array_equal(labels, prev_labels):
                    log(f"K-means converged after {i} iterations")
                    break

                # Update centroids
                prev_labels = labels
                for j in range(k):
                    points = X_pca[labels == j]
                    if len(points) > 0:
                        centroids[j] = points.mean(axis=0)

            # Evaluate clustering if we have true labels
            if y is not None:
                # Convert labels to numeric if they're categorical
                if not pd.api.types.is_numeric_dtype(y):
                    label_map = {label: i for i, label in enumerate(y.unique())}
                    y_numeric = y.map(label_map)
                else:
                    y_numeric = y

                # Calculate accuracy (after finding best label mapping)
                from scipy.optimize import linear_sum_assignment

                # Create confusion matrix
                conf_matrix = np.zeros((k, k))
                for i in range(len(labels)):
                    conf_matrix[labels[i], y_numeric.iloc[i]] += 1

                # Find optimal assignment
                row_ind, col_ind = linear_sum_assignment(-conf_matrix)

                # Remap cluster labels
                remapped_labels = np.zeros_like(labels)
                for i in range(k):
                    remapped_labels[labels == row_ind[i]] = col_ind[i]

                # Calculate accuracy
                accuracy = np.sum(remapped_labels == y_numeric) / len(y_numeric)
                log(f"Clustering accuracy: {accuracy:.4f}")

                # Calculate F1 score
                from sklearn.metrics import f1_score
                f1 = f1_score(y_numeric, remapped_labels, average='weighted')
                log(f"F1 score: {f1:.4f}")

                # Calculate silhouette score
                from sklearn.metrics import silhouette_score
                silhouette = silhouette_score(X_pca, labels)
                log(f"Silhouette score: {silhouette:.4f}")

                # Store results
                results.append({
                    'n_components': n_components,
                    'explained_variance_ratio': evr,
                    'accuracy': accuracy,
                    'f1_score': f1,
                    'silhouette_score': silhouette
                })

            # Visualize 2D projection for the first experiment
            if visualize and n_components >= 2:
                plt.figure(figsize=(16, 6))

                # Plot clustering results
                plt.subplot(1, 2, 1)
                plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='viridis', alpha=0.7)
                plt.title(f"Clustering Results (n_components={n_components})")
                plt.xlabel("PC1")
                plt.ylabel("PC2")
                plt.colorbar()

                # Plot true labels if available
                if y is not None:
                    plt.subplot(1, 2, 2)
                    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_numeric, cmap='viridis', alpha=0.7)
                    plt.title("True Labels")
                    plt.xlabel("PC1")
                    plt.ylabel("PC2")
                    plt.colorbar()

                plt.tight_layout()
                plt.savefig(os.path.join(experiment_dir, f"pca_n{n_components}.png"))
                plt.close()

        # Save results to CSV
        if results:
            results_df = pd.DataFrame(results)
            results_df.to_csv(results_path, index=False)
            log(f"\nResults saved to {results_path}")

            # Find the best configuration
            best_accuracy_idx = results_df['accuracy'].idxmax()
            best_config = results_df.iloc[best_accuracy_idx]
            log(f"\nBest configuration:")
            log(f"n_components: {best_config['n_components']}")
            log(f"Accuracy: {best_config['accuracy']:.4f}")
            log(f"F1 score: {best_config['f1_score']:.4f}")
            log(f"Silhouette score: {best_config['silhouette_score']:.4f}")

            # Plot metrics vs n_components
            if visualize and len(component_range) > 1:
                plt.figure(figsize=(15, 10))

                # Plot explained variance vs components
                plt.subplot(2, 2, 1)
                plt.plot(results_df['n_components'], results_df['explained_variance_ratio'], 'o-')
                plt.title('Explained Variance vs. Components')
                plt.xlabel('Number of Components')
                plt.ylabel('Cumulative Explained Variance')

                # Plot accuracy vs components
                plt.subplot(2, 2, 2)
                plt.plot(results_df['n_components'], results_df['accuracy'], 'o-')
                plt.title('Accuracy vs. Components')
                plt.xlabel('Number of Components')
                plt.ylabel('Accuracy')

                # Plot F1 score vs components
                plt.subplot(2, 2, 3)
                plt.plot(results_df['n_components'], results_df['f1_score'], 'o-')
                plt.title('F1 Score vs. Components')
                plt.xlabel('Number of Components')
                plt.ylabel('F1 Score')

                # Plot silhouette score vs components
                plt.subplot(2, 2, 4)
                plt.plot(results_df['n_components'], results_df['silhouette_score'], 'o-')
                plt.title('Silhouette Score vs. Components')
                plt.xlabel('Number of Components')
                plt.ylabel('Silhouette Score')

                plt.tight_layout()
                plt.savefig(os.path.join(experiment_dir, "metrics_vs_components.png"))
                plt.close()

        log(f"\n=== Experiment completed ===")
        return experiment_dir

In [26]:
# Example usage:
experiment_dir = run_experiment("../Dataset/ABIDE2.csv", skew_threshold=0.75, component_range=[1, 2, 5, 10, 20, 50, 100, 200, 500, 1000])

=== Experiment started at 20250511_114458 ===
Dataset: ../Dataset/ABIDE2.csv

--- Feature Transformation ---
Loading data from ../Dataset/ABIDE2.csv...
Original data shape: (1004, 1445)
Number of numeric features: 1443
Number of highly skewed features: 1220


  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x_transformed
  transformed_df[f"{col}"] = x_transformed
  transform

Number of features selected for polynomial transformation: 88
Generating interaction terms...


  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** 

Standardizing features...


  transformed_df[target_col] = labels


Transformed data saved to ./experiment_20250511_114458\transformed_data.csv
Final data shape: (1004, 5360)

Transformation Summary:
Unnamed: 0: original
subject: log1p(x)
age: log1p(x)
fsArea_L_V1_ROI: original
fsArea_L_MST_ROI: log1p(x)
fsArea_L_V6_ROI: log1p(x)
fsArea_L_V2_ROI: log1p(x)
fsArea_L_V3_ROI: log1p(x)
fsArea_L_V4_ROI: log1p(x)
fsArea_L_V8_ROI: original
fsArea_L_4_ROI: log1p(x)
fsArea_L_3b_ROI: log1p(x)
fsArea_L_FEF_ROI: log1p(x)
fsArea_L_PEF_ROI: log1p(x)
fsArea_L_55b_ROI: log1p(x)
fsArea_L_V3A_ROI: log1p(x)
fsArea_L_RSC_ROI: log1p(x)
fsArea_L_POS2_ROI: original
fsArea_L_V7_ROI: log1p(x)
fsArea_L_IPS1_ROI: log1p(x)
fsArea_L_FFC_ROI: original
fsArea_L_V3B_ROI: log1p(x)
fsArea_L_LO1_ROI: log1p(x)
fsArea_L_LO2_ROI: log1p(x)
fsArea_L_PIT_ROI: log1p(x)
fsArea_L_MT_ROI: log1p(x)
fsArea_L_A1_ROI: log1p(x)
fsArea_L_PSL_ROI: log1p(x)
fsArea_L_SFL_ROI: log1p(x)
fsArea_L_PCV_ROI: log1p(x)
fsArea_L_STV_ROI: log1p(x)
fsArea_L_7Pm_ROI: log1p(x)
fsArea_L_7m_ROI: original
fsArea_L_POS1_RO

In [29]:
import seaborn as sns

def run_multi_threshold_experiment(dataset_path, output_dir='./',
                                 skew_thresholds=[0.5, 0.75, 1.0, 1.25, 1.5],
                                 component_range=[10, 20, 30, 50, 100],
                                 k=2):
    """
    Run experiments with multiple skew thresholds and visualize results.

    Args:
        dataset_path (str): Path to dataset
        output_dir (str): Output directory
        skew_thresholds (list): List of skew thresholds to test
        component_range (list): List of n_components values
        k (int): Number of clusters
    """
    import os
    from datetime import datetime
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    experiment_dir = os.path.join(output_dir, f"multi_threshold_{timestamp}")
    os.makedirs(experiment_dir, exist_ok=True)

    # Store results for all thresholds
    all_results = []

    # Run experiments for each threshold
    for threshold in skew_thresholds:
        exp_dir = run_experiment(
            dataset_path,
            output_dir=experiment_dir,
            skew_threshold=threshold,
            component_range=component_range,
            k=k,
            visualize=False
        )

        # Read results
        results_df = pd.read_csv(os.path.join(exp_dir, "experiment_results.csv"))
        results_df['skew_threshold'] = threshold
        all_results.append(results_df)

    # Combine all results
    combined_results = pd.concat(all_results)

    # Create visualizations
    plt.figure(figsize=(20, 15))

    # 1. 3D Surface plot: Components vs Threshold vs Accuracy
    ax1 = plt.subplot(221, projection='3d')
    X, Y = np.meshgrid(component_range, skew_thresholds)
    Z = combined_results.pivot(
        index='skew_threshold',
        columns='n_components',
        values='accuracy'
    ).values
    surf = ax1.plot_surface(X, Y, Z, cmap='viridis')
    ax1.set_xlabel('Components')
    ax1.set_ylabel('Skew Threshold')
    ax1.set_zlabel('Accuracy')
    ax1.set_title('Accuracy Surface Plot')
    plt.colorbar(surf, ax=ax1)

    # 2. Heatmap: Components vs Threshold
    ax2 = plt.subplot(222)
    pivot_data = combined_results.pivot(
        index='skew_threshold',
        columns='n_components',
        values='accuracy'
    )
    sns.heatmap(pivot_data, annot=True, fmt='.3f', cmap='YlOrRd', ax=ax2)
    ax2.set_title('Accuracy Heatmap')

    # 3. Line plot: Accuracy vs Components for different thresholds
    ax3 = plt.subplot(223)
    for threshold in skew_thresholds:
        threshold_data = combined_results[combined_results['skew_threshold'] == threshold]
        ax3.plot(threshold_data['n_components'],
                threshold_data['accuracy'],
                'o-',
                label=f'Threshold={threshold}')
    ax3.set_xlabel('Number of Components')
    ax3.set_ylabel('Accuracy')
    ax3.set_title('Accuracy vs Components')
    ax3.legend()

    # 4. Box plot: Accuracy distribution for each threshold
    ax4 = plt.subplot(224)
    sns.boxplot(data=combined_results, x='skew_threshold', y='accuracy', ax=ax4)
    ax4.set_title('Accuracy Distribution by Threshold')

    plt.tight_layout()
    plt.savefig(os.path.join(experiment_dir, 'multi_threshold_analysis.png'))
    plt.close()

    # Save combined results
    combined_results.to_csv(os.path.join(experiment_dir, 'combined_results.csv'), index=False)

    # Find best configuration
    best_idx = combined_results['accuracy'].idxmax()
    best_config = combined_results.iloc[best_idx]

    print("\nBest Configuration Found:")
    print(f"Skew Threshold: {best_config['skew_threshold']}")
    print(f"Number of Components: {best_config['n_components']}")
    print(f"Accuracy: {best_config['accuracy']:.4f}")
    print(f"F1 Score: {best_config['f1_score']:.4f}")
    print(f"Silhouette Score: {best_config['silhouette_score']:.4f}")

    return experiment_dir, combined_results

In [31]:
experiment_dir, results = run_multi_threshold_experiment(
    "../Dataset/ABIDE2.csv",
    skew_thresholds=[5],
    component_range=[1, 2, 10, 20, 50, 100, 150, 200, 500, 1000],
)

=== Experiment started at 20250511_121530 ===
Dataset: ../Dataset/ABIDE2.csv

--- Feature Transformation ---
Loading data from ../Dataset/ABIDE2.csv...
Original data shape: (1004, 1445)
Number of numeric features: 1443
Number of highly skewed features: 0


  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transformed_df[f"{col}"] = x
  transf

Number of features selected for polynomial transformation: 66
Generating interaction terms...


  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** degree
  transformed_df[f"{col}_pow{degree}"] = x ** 

Standardizing features...


  transformed_df[target_col] = labels


Transformed data saved to ./multi_threshold_20250511_121530\experiment_20250511_121530\transformed_data.csv
Final data shape: (1004, 3655)

Transformation Summary:
Unnamed: 0: original
subject: original
age: original
fsArea_L_V1_ROI: original
fsArea_L_MST_ROI: original
fsArea_L_V6_ROI: original
fsArea_L_V2_ROI: original
fsArea_L_V3_ROI: original
fsArea_L_V4_ROI: original
fsArea_L_V8_ROI: original
fsArea_L_4_ROI: original
fsArea_L_3b_ROI: original
fsArea_L_FEF_ROI: original
fsArea_L_PEF_ROI: original
fsArea_L_55b_ROI: original
fsArea_L_V3A_ROI: original
fsArea_L_RSC_ROI: original
fsArea_L_POS2_ROI: original
fsArea_L_V7_ROI: original
fsArea_L_IPS1_ROI: original
fsArea_L_FFC_ROI: original
fsArea_L_V3B_ROI: original
fsArea_L_LO1_ROI: original
fsArea_L_LO2_ROI: original
fsArea_L_PIT_ROI: original
fsArea_L_MT_ROI: original
fsArea_L_A1_ROI: original
fsArea_L_PSL_ROI: original
fsArea_L_SFL_ROI: original
fsArea_L_PCV_ROI: original
fsArea_L_STV_ROI: original
fsArea_L_7Pm_ROI: original
fsArea_L_7