<a href="https://colab.research.google.com/github/Feranie/Hierarchical-Classification-Project/blob/main/Inconsistency%20Rate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
import time
import random
from collections import defaultdict, Counter
from typing import List, Tuple, Dict, Any

# --- ARFF File Reading ---
def read_arff_file(file_path):
    """
    Read a dense ARFF file and return its data section as a pandas DataFrame.

    Only the subset of ARFF needed by this script is supported: ``%`` comment
    lines, ``@attribute`` declarations and comma-separated data rows. Every
    value is returned as a string; attribute types are not interpreted.

    Args:
        file_path: Path of the ARFF file to read (decoded as UTF-8).

    Returns:
        DataFrame with one column per ``@attribute`` declaration and one row
        per data line whose number of values matches the number of attributes.
        Data lines with a different number of values are skipped.
    """
    # The attribute name is either a quoted token (which may contain spaces)
    # or a run of non-whitespace characters; it must be followed by the type.
    attribute_pattern = re.compile(
        r'@attribute\s+(\'[^\']*\'|"[^"]*"|\S+)\s+', re.IGNORECASE)

    data = []
    attributes = []
    in_data_section = False

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line or line.startswith('%'):
                continue

            if in_data_section:
                # Plain comma split. Empty fields are discarded, so a row with
                # an empty field ends up with too few values and is skipped.
                # The previous regex glued the separating comma onto the
                # second-to-last value when the last value was one character.
                values = [v.strip('" ') for v in line.split(',') if v]
                if len(values) == len(attributes):
                    data.append(values)
                continue

            # Header keywords are matched as line prefixes so that they are
            # not confused with text appearing elsewhere on a line.
            lowered = line.lower()
            if lowered.startswith('@attribute'):
                match = attribute_pattern.match(line)
                if match:
                    attributes.append(match.group(1))
            elif lowered.startswith('@data'):
                in_data_section = True
            # '@relation' and any other header line carry no column data.

    return pd.DataFrame(data, columns=attributes)

# --- ARFF File Saving ---
def save_to_arff(df, file_path, relation_name="filtered_data"):
    """
    Write a DataFrame to disk in ARFF format.

    A column whose distinct values are all strings is declared as a nominal
    attribute listing those values in sorted order; any other column is
    declared ``numeric``. Rows are written as comma-joined ``str()`` values.

    Args:
        df: DataFrame to serialize.
        file_path: Destination path (written as UTF-8, overwritten if present).
        relation_name: Name placed on the ``@relation`` line.
    """
    with open(file_path, 'w', encoding='utf-8') as out:
        out.write(f"@relation {relation_name}\n\n")

        # Header: one @attribute declaration per column.
        for name in df.columns:
            distinct = df[name].unique()
            is_nominal = all(isinstance(item, str) for item in distinct)
            if not is_nominal:
                out.write(f"@attribute {name} numeric\n")
                continue
            domain = ','.join(sorted(set(distinct)))
            out.write(f"@attribute {name} {{{domain}}}\n")

        # Body: one comma-separated line per row.
        out.write("\n@data\n")
        for _, record in df.iterrows():
            cells = [str(cell) for cell in record]
            out.write(",".join(cells) + "\n")

class ConsistencyMeasure:
    """
    Implementation of the consistency measure (inconsistency rate) as described in:
    M. Dash, H. Liu / Artificial Intelligence 151 (2003) 155–176

    Instances that share the same values on a feature subset form a pattern.
    The inconsistency count of a pattern is the number of its instances minus
    the size of its largest class; the inconsistency rate of the subset is the
    sum of these counts divided by the total number of instances.
    """

    def __init__(self, data: pd.DataFrame, class_column: str):
        """
        Initialize the consistency measure calculator.

        Args:
            data: DataFrame containing the dataset
            class_column: Name of the class/target column
        """
        self.data = data
        self.class_column = class_column
        self.feature_columns = [col for col in data.columns if col != class_column]
        self.total_instances = len(data)

    def calculate_inconsistency_rate(self, feature_subset: List[str]) -> float:
        """
        Calculate the inconsistency rate for a given feature subset.

        An empty subset puts every instance in one single pattern, so its rate
        is the share of instances outside the majority class (the same value
        analyze_pattern_details reports for an empty subset).

        Args:
            feature_subset: List of feature names to consider

        Returns:
            Inconsistency rate (IR) for the feature subset; 0.0 when the
            dataset has no instances.

        Raises:
            ValueError: If a name in feature_subset is not a feature column.
        """
        invalid_features = [f for f in feature_subset if f not in self.feature_columns]
        if invalid_features:
            raise ValueError(f"Invalid features: {invalid_features}")

        # No instances means no inconsistent instances (and avoids 0/0).
        if self.total_instances == 0:
            return 0.0

        patterns = self._group_by_pattern(feature_subset)
        total_inconsistency_count = sum(
            self._calculate_pattern_inconsistency(class_labels)
            for class_labels in patterns.values()
        )
        return total_inconsistency_count / self.total_instances

    def _group_by_pattern(self, feature_subset: List[str]) -> Dict[Tuple, List[Any]]:
        """
        Group instances by their pattern (feature value combinations).

        Args:
            feature_subset: List of feature names

        Returns:
            Dictionary mapping patterns to lists of class labels
        """
        patterns = defaultdict(list)
        class_labels = self.data[self.class_column].tolist()

        if feature_subset:
            # Walk the selected columns in lockstep rather than building a
            # Series per row with iterrows(); this method is called once per
            # evaluated subset, so it dominates the search time.
            value_columns = [self.data[feature].tolist() for feature in feature_subset]
            row_patterns = zip(*value_columns)
        else:
            # With no features every instance has the same (empty) pattern.
            row_patterns = (() for _ in class_labels)

        for pattern, class_label in zip(row_patterns, class_labels):
            patterns[pattern].append(class_label)

        return patterns

    def _calculate_pattern_inconsistency(self, class_labels: List[Any]) -> int:
        """
        Calculate inconsistency count for a single pattern.

        Args:
            class_labels: List of class labels for instances with the same pattern

        Returns:
            Number of instances that do not belong to the pattern's most
            frequent class (0 for patterns with at most one instance).
        """
        if len(class_labels) <= 1:
            return 0

        label_counts = Counter(class_labels)
        return len(class_labels) - max(label_counts.values())

    def analyze_pattern_details(self, feature_subset: List[str]) -> Dict:
        """
        Provide detailed analysis of patterns and their inconsistencies.

        Args:
            feature_subset: List of feature names to analyze

        Returns:
            Dictionary with the keys 'total_patterns', 'inconsistent_patterns',
            'pattern_details' (one dict per pattern),
            'total_inconsistency_count' and 'inconsistency_rate'.
        """
        patterns = self._group_by_pattern(feature_subset)
        analysis = {
            'total_patterns': len(patterns),
            'inconsistent_patterns': 0,
            'pattern_details': [],
            'total_inconsistency_count': 0
        }

        for pattern, class_labels in patterns.items():
            label_counts = Counter(class_labels)
            inconsistency_count = self._calculate_pattern_inconsistency(class_labels)

            analysis['pattern_details'].append({
                'pattern': pattern,
                'total_instances': len(class_labels),
                'class_distribution': dict(label_counts),
                'inconsistency_count': inconsistency_count,
                'is_inconsistent': inconsistency_count > 0
            })
            analysis['total_inconsistency_count'] += inconsistency_count

            if inconsistency_count > 0:
                analysis['inconsistent_patterns'] += 1

        # Guard against an empty dataset, where the rate would be 0/0.
        if self.total_instances:
            analysis['inconsistency_rate'] = (
                analysis['total_inconsistency_count'] / self.total_instances
            )
        else:
            analysis['inconsistency_rate'] = 0.0

        return analysis

    def compare_feature_subsets(self, feature_subsets: List[List[str]]) -> pd.DataFrame:
        """
        Compare inconsistency rates across multiple feature subsets.

        Args:
            feature_subsets: List of feature subset lists to compare

        Returns:
            DataFrame with one row per subset, sorted by ascending
            inconsistency rate. An empty input yields an empty DataFrame that
            still has the result columns.
        """
        columns = ['subset_id', 'features', 'num_features',
                   'inconsistency_rate', 'consistency_score']
        results = []

        for i, subset in enumerate(feature_subsets):
            ir = self.calculate_inconsistency_rate(subset)
            results.append({
                'subset_id': i,
                'features': subset,
                'num_features': len(subset),
                'inconsistency_rate': ir,
                'consistency_score': 1 - ir  # Higher is better
            })

        # Naming the columns explicitly keeps sort_values from raising a
        # KeyError when there are no results.
        return pd.DataFrame(results, columns=columns).sort_values('inconsistency_rate')

class RandomRestartHillClimbing:
    """
    Random Restart Hill Climbing algorithm to find optimal feature subset
    that minimizes inconsistency rate (IRH).

    A candidate subset is considered better than the current one when its
    inconsistency rate is lower, or when the rate is equal and it uses fewer
    features. The tie-break matters: removing a feature merges patterns and
    can therefore never lower the rate, so with a strict "lower rate only"
    rule removal moves would never be accepted and the search could not
    shrink a subset.
    """

    def __init__(self, consistency_measure: "ConsistencyMeasure", max_restarts=10,
                 max_iterations=100, min_features=1, random_state=None):
        """
        Initialize RRHC optimizer.

        Args:
            consistency_measure: Object exposing ``feature_columns`` and
                ``calculate_inconsistency_rate(subset)``
            max_restarts: Maximum number of random restarts
            max_iterations: Maximum iterations per restart
            min_features: Minimum number of features to keep
            random_state: Optional seed for reproducible restarts. When None,
                the shared module-level ``random`` generator is used.
        """
        self.cm = consistency_measure
        self.max_restarts = max_restarts
        self.max_iterations = max_iterations
        self.min_features = min_features
        self.all_features = list(self.cm.feature_columns)
        self._rng = random if random_state is None else random.Random(random_state)

    @staticmethod
    def _is_better(candidate_ir, candidate_size, reference_ir, reference_size):
        """Return True if the candidate has a lower IR, or the same IR with fewer features."""
        if candidate_ir < reference_ir:
            return True
        return candidate_ir == reference_ir and candidate_size < reference_size

    def optimize(self, verbose=True):
        """
        Run Random Restart Hill Climbing to find optimal feature subset.

        Args:
            verbose: Print progress information

        Returns:
            Dictionary with the keys 'best_subset', 'best_ir', 'best_restart',
            'all_results' (one hill-climbing result per restart) and
            'total_evaluations'. With ``max_restarts`` of 0 no search is run
            and 'best_subset' is None.

        Raises:
            ValueError: If min_features exceeds the number of available features.
        """
        n_features = len(self.all_features)
        smallest_start = max(0, self.min_features)
        if smallest_start > n_features:
            raise ValueError(
                f"min_features ({self.min_features}) exceeds the number of "
                f"available features ({n_features})"
            )

        best_subset = None
        best_ir = float('inf')
        best_restart = -1
        all_results = []

        for restart in range(self.max_restarts):
            if verbose:
                print(f"\n--- Restart {restart + 1}/{self.max_restarts} ---")

            # Generate random initial subset
            initial_size = self._rng.randint(smallest_start, n_features)
            current_subset = self._rng.sample(self.all_features, initial_size)
            current_ir = self.cm.calculate_inconsistency_rate(current_subset)

            if verbose:
                print(f"Initial subset: {len(current_subset)} features, IR: {current_ir:.4f}")

            # Hill climbing from this starting point
            result = self._hill_climb(current_subset, current_ir, verbose)
            all_results.append(result)

            # Update global best (lower IR first, then fewer features)
            if best_subset is None or self._is_better(
                    result['final_ir'], len(result['final_subset']),
                    best_ir, len(best_subset)):
                best_ir = result['final_ir']
                best_subset = result['final_subset'].copy()
                best_restart = restart

            if verbose:
                print(f"Final: {len(result['final_subset'])} features, IR: {result['final_ir']:.4f}")

        return {
            'best_subset': best_subset,
            'best_ir': best_ir,
            'best_restart': best_restart,
            'all_results': all_results,
            'total_evaluations': sum(r['evaluations'] for r in all_results)
        }

    def _hill_climb(self, initial_subset, initial_ir, verbose=False):
        """
        Perform first-improvement hill climbing from an initial subset.

        Each iteration scans the neighbours in order and moves to the first
        one that is better than the current subset. The climb stops when no
        neighbour is better or after max_iterations iterations. Every accepted
        move strictly decreases the pair (IR, subset size), so no subset is
        visited twice.

        Returns:
            Dictionary with 'final_subset', 'final_ir', 'iterations',
            'evaluations' and 'improvements'.
        """
        current_subset = initial_subset.copy()
        current_ir = initial_ir
        iteration = 0
        evaluations = 1  # Count initial evaluation
        improvements = 0

        while iteration < self.max_iterations:
            iteration += 1
            improved = False

            for neighbor in self._generate_neighbors(current_subset):
                if len(neighbor) < self.min_features:
                    continue

                neighbor_ir = self.cm.calculate_inconsistency_rate(neighbor)
                evaluations += 1

                if self._is_better(neighbor_ir, len(neighbor),
                                   current_ir, len(current_subset)):
                    current_subset = neighbor.copy()
                    current_ir = neighbor_ir
                    improved = True
                    improvements += 1

                    if verbose and iteration % 10 == 0:
                        print(f"  Iteration {iteration}: {len(current_subset)} features, IR: {current_ir:.4f}")
                    break

            if not improved:
                break

        return {
            'final_subset': current_subset,
            'final_ir': current_ir,
            'iterations': iteration,
            'evaluations': evaluations,
            'improvements': improvements
        }

    def _generate_neighbors(self, current_subset):
        """
        Generate neighbor subsets by adding/removing one feature.

        Removal neighbours come first (only when the subset is larger than
        min_features), followed by one neighbour per feature not yet selected.
        """
        neighbors = []
        current_set = set(current_subset)

        # Remove one feature (if possible)
        if len(current_subset) > self.min_features:
            for feature in current_subset:
                neighbors.append([f for f in current_subset if f != feature])

        # Add one feature
        for feature in self.all_features:
            if feature not in current_set:
                neighbors.append(current_subset + [feature])

        return neighbors

def generate_feature_subsets(feature_columns, max_subset_size=None, include_all_sizes=True):
    """
    Enumerate feature subsets (as lists) from the given column names.

    Args:
        feature_columns: List of feature column names from the dataset
        max_subset_size: Largest subset size to produce; defaults to the
            number of features
        include_all_sizes: When True, produce every subset of size 1 up to
            max_subset_size (capped at the number of features), smallest
            sizes first; when False, produce only subsets of exactly
            max_subset_size

    Returns:
        List of feature subsets, in itertools.combinations order per size
    """
    from itertools import combinations

    size_limit = len(feature_columns) if max_subset_size is None else max_subset_size

    if not include_all_sizes:
        return [list(combo) for combo in combinations(feature_columns, size_limit)]

    collected = []
    stop = min(size_limit + 1, len(feature_columns) + 1)
    for k in range(1, stop):
        collected.extend(list(combo) for combo in combinations(feature_columns, k))
    return collected

def _sample_unique_subsets(features, size, count, enumeration_limit=10000):
    """Return up to `count` distinct random subsets of `size` features without enumerating a huge combination space."""
    from itertools import combinations, islice
    import random

    if count <= 0 or size > len(features):
        return []

    # Peek at the combination space: when it is small, enumerate it and
    # sample exactly; never materialize more than enumeration_limit + 1 items.
    head = list(islice(combinations(features, size), enumeration_limit + 1))
    if len(head) <= enumeration_limit:
        return [list(combo) for combo in random.sample(head, min(count, len(head)))]

    # Large space: draw random index sets and reject repeats. Collisions are
    # rare because the space is much larger than the (small) sample size; the
    # attempt cap only guarantees termination.
    chosen = []
    seen = set()
    attempts_left = count * 20
    positions = range(len(features))
    while len(chosen) < count and attempts_left > 0:
        attempts_left -= 1
        index_key = tuple(sorted(random.sample(positions, size)))
        if index_key not in seen:
            seen.add(index_key)
            chosen.append([features[i] for i in index_key])
    return chosen


def generate_smart_feature_subsets(feature_columns, max_combinations=200, include_large_subsets=True):
    """
    Generate a smart selection of feature subsets for large datasets.
    Uses strategic sampling to avoid combinatorial explosion.

    The result always starts with every single-feature subset (even if that
    alone exceeds max_combinations). The remaining budget is then spent on
    pairs (all of them for at most 20 features, a random sample otherwise),
    on random samples of sizes 3, 4, 5, 10, 15 and 20 (at most 20 per size,
    with the remaining budget shared among the sizes still to come), and
    finally on random subsets of 2 to 25 features, which may repeat.

    Args:
        feature_columns: List of feature column names
        max_combinations: Maximum number of subsets to generate
        include_large_subsets: Whether to include some larger subsets

    Returns:
        List of feature subsets
    """
    from itertools import combinations
    import random

    features = list(feature_columns)
    feature_subsets = []
    n_features = len(features)

    print(f"Génération intelligente de sous-ensembles pour {n_features} attributs...")

    # Always include individual features
    print("Ajout des attributs individuels...")
    feature_subsets.extend([feature] for feature in features)

    remaining_budget = max_combinations - len(feature_subsets)

    # Add pairs strategically
    if n_features <= 20:
        # Small enough to include all pairs (as far as the budget allows)
        print("Ajout de toutes les paires...")
        for pair in combinations(features, 2):
            if remaining_budget <= 0:
                break
            feature_subsets.append(list(pair))
            remaining_budget -= 1
    else:
        # Sample pairs; clamp at 0 so an exhausted budget cannot go negative
        pair_budget = max(0, min(100, remaining_budget // 2))
        print(f"Échantillonnage de {pair_budget} paires...")
        sampled_pairs = _sample_unique_subsets(features, 2, pair_budget)
        feature_subsets.extend(sampled_pairs)
        remaining_budget -= len(sampled_pairs)

    # Add some larger subsets if requested and budget allows
    if include_large_subsets and remaining_budget > 0:
        candidate_sizes = [s for s in (3, 4, 5, 10, 15, 20) if s <= n_features]
        for position, size in enumerate(candidate_sizes):
            if remaining_budget <= 0:
                break

            # Share what is left evenly among the sizes not yet handled.
            sizes_left = len(candidate_sizes) - position
            size_budget = min(20, remaining_budget // sizes_left)

            if size_budget > 0:
                print(f"Échantillonnage de {size_budget} sous-ensembles de taille {size}...")
                sampled_combinations = _sample_unique_subsets(features, size, size_budget)
                feature_subsets.extend(sampled_combinations)
                remaining_budget -= len(sampled_combinations)

    # Add some random subsets of various sizes (needs at least 2 features)
    if remaining_budget > 0 and n_features >= 2:
        print(f"Ajout de {remaining_budget} sous-ensembles aléatoires...")
        largest_size = min(25, n_features)
        for _ in range(remaining_budget):
            size = random.randint(2, largest_size)
            feature_subsets.append(random.sample(features, size))

    print(f"Total généré: {len(feature_subsets)} sous-ensembles")
    return feature_subsets

def evaluate_subsets_with_progress(cm, feature_subsets, max_time_minutes=30):
    """
    Evaluate feature subsets with progress tracking and time limit.

    Subsets are evaluated in order. Before each evaluation the elapsed time
    is checked and the loop stops once it exceeds the limit. A subset whose
    evaluation raises is reported and skipped, so one bad subset does not
    abort the whole run.

    Args:
        cm: ConsistencyMeasure instance
        feature_subsets: List of feature subsets to evaluate
        max_time_minutes: Maximum time to spend on evaluation

    Returns:
        DataFrame with one row per successfully evaluated subset, sorted by
        ascending inconsistency rate; an empty DataFrame when nothing was
        evaluated.
    """
    import time

    results = []
    total_subsets = len(feature_subsets)
    max_time_seconds = max_time_minutes * 60
    best_ir = None
    # monotonic() is unaffected by system clock adjustments.
    start_time = time.monotonic()

    print(f"\nÉvaluation de {total_subsets} sous-ensembles...")
    print(f"Temps maximum alloué: {max_time_minutes} minutes")

    for i, subset in enumerate(feature_subsets):
        # Check time limit
        if time.monotonic() - start_time > max_time_seconds:
            print(f"\nArrêt dû à la limite de temps ({max_time_minutes} min)")
            print(f"Évalué {i} sous-ensembles sur {total_subsets}")
            break

        # Deliberately broad: this is a best-effort batch evaluation.
        try:
            ir = cm.calculate_inconsistency_rate(subset)
        except Exception as e:
            print(f"Erreur lors de l'évaluation du sous-ensemble {i}: {e}")
            continue

        results.append({
            'subset_id': i,
            'features': subset,
            'num_features': len(subset),
            'inconsistency_rate': ir,
            'consistency_score': 1 - ir
        })
        if best_ir is None or ir < best_ir:
            best_ir = ir

        # Progress update on the first subset and then every 50 evaluations
        completed = i + 1
        if completed % 50 == 0 or i == 0:
            # Measure after the evaluation so the estimate includes the work
            # just done (otherwise the first ETA is always ~0).
            elapsed = time.monotonic() - start_time
            avg_time_per_eval = elapsed / completed
            eta_minutes = (total_subsets - completed) * avg_time_per_eval / 60

            progress_pct = (completed / total_subsets) * 100
            print(f"Progression: {progress_pct:.1f}% ({completed}/{total_subsets}) - "
                  f"ETA: {eta_minutes:.1f} min - "
                  f"Meilleur IR: {best_ir:.4f}")

    if not results:
        return pd.DataFrame()
    return pd.DataFrame(results).sort_values('inconsistency_rate')

# Example usage and demonstration
def demonstrate_consistency_measure(data=None, class_column=None):
    """
    Walk through the consistency measure on a dataset and print the findings.

    When no data is given, a ten-row sample dataset with three binary
    features and a 'class' column is used. Feature subsets are generated
    according to the number of features, at most the first 20 of them are
    compared, and the pattern analysis of the best-ranked subset is printed.

    Args:
        data: Optional DataFrame to analyze instead of the sample dataset
        class_column: Name of the class column in ``data``

    Returns:
        DataFrame of compared subsets, sorted by ascending inconsistency rate
    """
    divider = "\n" + "="*50

    if data is not None:
        print("Using provided dataset:")
        print(f"Shape: {data.shape}")
        print(f"Features: {[col for col in data.columns if col != class_column]}")
        print(f"Class column: {class_column}")
    else:
        # Built-in sample with deliberately conflicting class labels.
        np.random.seed(42)
        data = pd.DataFrame({
            'feature1': [0, 0, 1, 1, 0, 0, 1, 1, 0, 1],
            'feature2': [1, 1, 0, 0, 1, 1, 0, 0, 1, 0],
            'feature3': [1, 0, 1, 0, 1, 0, 1, 0, 1, 1],
            'class': [1, 0, 1, 0, 1, 1, 1, 0, 0, 1]
        })
        class_column = 'class'
        print("Sample Dataset:")
        print(data)

    print(divider)

    cm = ConsistencyMeasure(data, class_column)
    feature_count = len(cm.feature_columns)

    # Pick a generation strategy that matches the number of features.
    print(f"\nGenerating feature subsets from {feature_count} features...")

    if feature_count > 15:
        print("Large dataset: using smart subset sampling")
        feature_subsets = generate_smart_feature_subsets(cm.feature_columns, max_combinations=50)
    elif feature_count > 5:
        print("Medium dataset: generating subsets up to size 4")
        feature_subsets = generate_feature_subsets(cm.feature_columns, max_subset_size=4)
    else:
        print("Small dataset: generating all possible subsets")
        feature_subsets = generate_feature_subsets(cm.feature_columns, max_subset_size=feature_count)

    print(f"Generated {len(feature_subsets)} feature subsets")

    # Keep the demonstration short.
    if len(feature_subsets) > 20:
        print("Limiting to first 20 subsets for demonstration...")
        feature_subsets = feature_subsets[:20]

    print("\nTesting feature subsets:")
    for position, subset in enumerate(feature_subsets[:10], start=1):
        print(f"  {position}: {subset}")
    hidden = len(feature_subsets) - 10
    if hidden > 0:
        print(f"  ... and {hidden} more subsets")

    print(divider)

    print("\nInconsistency Rates for Feature Subsets:")
    comparison = cm.compare_feature_subsets(feature_subsets)

    print("\nTop 10 best feature subsets (lowest inconsistency rate):")
    print(comparison.head(10).to_string(index=False))

    print(divider)

    # Drill into the best-ranked subset.
    best_subset = comparison.iloc[0]['features']
    print(f"\nDetailed Analysis for Best Subset: {best_subset}")
    analysis = cm.analyze_pattern_details(best_subset)
    rate = analysis['inconsistency_rate']

    print(f"Total patterns: {analysis['total_patterns']}")
    print(f"Inconsistent patterns: {analysis['inconsistent_patterns']}")
    print(f"Inconsistency rate: {rate:.4f}")
    print(f"Consistency score: {1 - rate:.4f}")

    if analysis['inconsistent_patterns'] > 0:
        print("\nInconsistent Patterns (first 5):")
        flagged = [entry for entry in analysis['pattern_details'] if entry['is_inconsistent']]
        for number, entry in enumerate(flagged[:5], start=1):
            print(f"  {number}. Pattern {entry['pattern']}: {entry['total_instances']} instances")
            print(f"     Class distribution: {entry['class_distribution']}")
            print(f"     Inconsistency count: {entry['inconsistency_count']}")

    return comparison

# --- Main Function ---
def _improvement_pct(initial_ir, new_ir):
    """Return the percentage by which new_ir improves on initial_ir, without dividing by zero."""
    if initial_ir == 0:
        # A fully consistent baseline cannot be improved on; anything above
        # zero is infinitely worse in relative terms.
        return 0.0 if new_ir == 0 else float('-inf')
    return (initial_ir - new_ir) / initial_ir * 100


def main(input_file_path='/content/GPCR-PrositeTRA0.arff',
         output_file_path='/content/GPCR-PrositeTRA0OptimizedRRHCIR.arff'):
    """
    Main function to load data, optimize features, and save results.

    Phase 1 evaluates generated feature subsets (exhaustively for at most 10
    features, sampled otherwise). Phase 2 runs Random Restart Hill Climbing.
    The subset with the lower inconsistency rate (RRHC wins ties) is written
    to the output ARFF file together with the class column, which is assumed
    to be the last column of the input.

    Args:
        input_file_path: ARFF file to read
        output_file_path: ARFF file to write the reduced dataset to

    Returns:
        Dictionary with the keys 'optimized_data', 'best_subset', 'best_ir',
        'initial_ir', 'improvement', 'subset_comparison' and 'rrhc_result'.

    Raises:
        ValueError: If the input yields no rows or fewer than two columns.
    """
    print("Chargement des données...")
    start_time = time.time()
    data = read_arff_file(input_file_path)
    load_time = time.time() - start_time
    print(f"Chargé {len(data)} instances avec {len(data.columns)-1} attributs en {load_time:.2f}s")

    # Fail early with a clear message instead of an IndexError or a
    # division by zero further down.
    if data.empty or len(data.columns) < 2:
        raise ValueError(
            f"No usable data in {input_file_path!r}: need at least one "
            f"instance, one feature column and one class column"
        )

    # Assume last column is class (adjust if needed)
    class_column = data.columns[-1]
    print(f"Colonne de classe: {class_column}")
    print(f"Distribution des classes: {data[class_column].value_counts().to_dict()}")

    # Initialize consistency measure
    print("\nInitialisation de la mesure de cohérence...")
    cm = ConsistencyMeasure(data, class_column)

    # Calculate initial inconsistency rate with all features
    print("Calcul du taux d'incohérence initial...")
    initial_ir = cm.calculate_inconsistency_rate(cm.feature_columns)
    print(f"Taux d'incohérence initial (tous les {len(cm.feature_columns)} attributs): {initial_ir:.4f}")

    print("\n" + "="*60)
    print("PHASE 1: ÉVALUATION INTELLIGENTE DES SOUS-ENSEMBLES")
    print("="*60)

    # Generate feature subsets intelligently based on dataset size
    if len(cm.feature_columns) <= 10:
        print("Petit dataset: génération exhaustive de tous les sous-ensembles")
        feature_subsets = generate_feature_subsets(cm.feature_columns)
        print(f"Générés {len(feature_subsets)} sous-ensembles")
        subset_comparison = cm.compare_feature_subsets(feature_subsets)
    else:
        print("Grand dataset: génération intelligente de sous-ensembles")
        feature_subsets = generate_smart_feature_subsets(cm.feature_columns, max_combinations=500)

        # Evaluate with progress tracking and time limit
        subset_comparison = evaluate_subsets_with_progress(cm, feature_subsets, max_time_minutes=15)

        if subset_comparison.empty:
            print("Aucun résultat obtenu, utilisation d'un échantillon plus petit...")
            feature_subsets = generate_smart_feature_subsets(cm.feature_columns, max_combinations=100)
            subset_comparison = evaluate_subsets_with_progress(cm, feature_subsets, max_time_minutes=5)

    if not subset_comparison.empty:
        print(f"\nÉvaluation terminée: {len(subset_comparison)} sous-ensembles testés")

        # Show top results, numbered by rank (the DataFrame index holds the
        # original subset ids, not the position in the sorted table).
        print("\nTop 10 meilleurs sous-ensembles:")
        top_subsets = subset_comparison.head(10)
        for rank, (_, row) in enumerate(top_subsets.iterrows(), start=1):
            improvement = _improvement_pct(initial_ir, row['inconsistency_rate'])
            print(f"  {rank}: {row['num_features']} attributs, IR: {row['inconsistency_rate']:.4f} "
                  f"(amélioration: {improvement:.1f}%)")

        # Get the best subset
        best_exhaustive_subset = subset_comparison.iloc[0]['features']
        best_exhaustive_ir = subset_comparison.iloc[0]['inconsistency_rate']

        print(f"\nMeilleur sous-ensemble trouvé:")
        print(f"  Nombre d'attributs: {len(best_exhaustive_subset)}")
        print(f"  Taux d'incohérence: {best_exhaustive_ir:.4f}")
        print(f"  Amélioration: {_improvement_pct(initial_ir, best_exhaustive_ir):.2f}%")
    else:
        print("Aucun résultat d'évaluation disponible")
        best_exhaustive_subset = cm.feature_columns[:10]  # Fallback
        best_exhaustive_ir = cm.calculate_inconsistency_rate(best_exhaustive_subset)

    # Initialize and run Random Restart Hill Climbing
    print("\n" + "="*60)
    print("PHASE 2: OPTIMISATION RRHC")
    print("="*60)
    print("Lancement de l'optimisation Random Restart Hill Climbing...")

    optimizer = RandomRestartHillClimbing(
        consistency_measure=cm,
        max_restarts=10,  # Reduced for large datasets
        max_iterations=50,  # Reduced for large datasets
        min_features=1
    )

    optimization_start = time.time()
    result = optimizer.optimize(verbose=True)
    optimization_time = time.time() - optimization_start

    # Display results
    print("\n" + "="*60)
    print("RÉSULTATS DE L'OPTIMISATION")
    print("="*60)
    print(f"Meilleur sous-ensemble trouvé: {len(result['best_subset'])} attributs")
    print(f"Attributs sélectionnés: {result['best_subset']}")
    print(f"Taux d'incohérence optimisé: {result['best_ir']:.4f}")

    # Compare methods and select best solution
    if not subset_comparison.empty:
        print("\n" + "="*60)
        print("COMPARAISON DES MÉTHODES")
        print("="*60)
        print(f"Évaluation intelligente:")
        print(f"  Meilleur sous-ensemble: {len(best_exhaustive_subset)} attributs")
        print(f"  Taux d'incohérence: {best_exhaustive_ir:.4f}")
        print(f"\nRandom Restart Hill Climbing:")
        print(f"  Meilleur sous-ensemble: {len(result['best_subset'])} attributs")
        print(f"  Taux d'incohérence: {result['best_ir']:.4f}")
        print(f"  Évaluations totales: {result['total_evaluations']}")
        print(f"  Temps d'optimisation: {optimization_time:.2f}s")

        if result['best_ir'] <= best_exhaustive_ir:
            print(f"✓ RRHC a trouvé une solution égale ou meilleure!")
            final_best_subset = result['best_subset']
            final_best_ir = result['best_ir']
        else:
            print(f"✓ L'évaluation intelligente a trouvé une meilleure solution!")
            # Safe division: this branch implies result['best_ir'] > 0.
            improvement_possible = ((result['best_ir'] - best_exhaustive_ir) / result['best_ir'] * 100)
            print(f"  Amélioration de RRHC possible: {improvement_possible:.2f}%")
            final_best_subset = best_exhaustive_subset
            final_best_ir = best_exhaustive_ir
    else:
        print("\n" + "="*60)
        print("RÉSULTATS RRHC SEULEMENT")
        print("="*60)
        final_best_subset = result['best_subset']
        final_best_ir = result['best_ir']
        print(f"Évaluations totales: {result['total_evaluations']}")
        print(f"Temps d'optimisation: {optimization_time:.2f}s")

    final_improvement = _improvement_pct(initial_ir, final_best_ir)
    feature_reduction = ((len(cm.feature_columns) - len(final_best_subset))
                         / len(cm.feature_columns) * 100)

    print(f"\nSOLUTION FINALE:")
    print(f"  Nombre d'attributs sélectionnés: {len(final_best_subset)}")
    print(f"  Taux d'incohérence final: {final_best_ir:.4f}")
    print(f"  Amélioration vs initial: {final_improvement:.2f}%")
    print(f"  Réduction d'attributs: {feature_reduction:.1f}%")

    # Create optimized dataset with the final best subset
    selected_columns = final_best_subset + [class_column]
    optimized_data = data[selected_columns]

    # Save optimized dataset
    print(f"\nSauvegarde du dataset optimisé...")
    save_to_arff(optimized_data, output_file_path, "optimized_data_RRHC")
    print(f"Dataset sauvegardé: {output_file_path}")

    # Additional analysis
    print("\n" + "="*60)
    print("ANALYSE DÉTAILLÉE")
    print("="*60)

    # Pattern analysis for optimized subset
    print(f"\nAnalyse des motifs pour le sous-ensemble optimisé:")
    pattern_analysis = cm.analyze_pattern_details(final_best_subset)
    print(f"  Motifs totaux: {pattern_analysis['total_patterns']}")
    print(f"  Motifs incohérents: {pattern_analysis['inconsistent_patterns']}")
    print(f"  Score de cohérence: {(1 - pattern_analysis['inconsistency_rate']):.4f}")

    # Show some inconsistent patterns if they exist
    if pattern_analysis['inconsistent_patterns'] > 0:
        print(f"\nExemples de motifs incohérents (premiers 3):")
        inconsistent_patterns = [detail for detail in pattern_analysis['pattern_details']
                               if detail['is_inconsistent']][:3]
        for i, detail in enumerate(inconsistent_patterns):
            print(f"  {i+1}. Motif {detail['pattern']}: {detail['total_instances']} instances")
            print(f"     Distribution des classes: {detail['class_distribution']}")
            print(f"     Nombre d'incohérences: {detail['inconsistency_count']}")

    # Show selected features
    print(f"\nAttributs sélectionnés dans le meilleur sous-ensemble:")
    for i, feature in enumerate(final_best_subset, 1):
        print(f"  {i:2d}. {feature}")

    # Comparison by subset size (if we have evaluation data)
    if not subset_comparison.empty and len(subset_comparison) > 10:
        print(f"\nAnalyse par taille de sous-ensemble:")
        size_analysis = {}
        for _, row in subset_comparison.iterrows():
            size_analysis.setdefault(row['num_features'], []).append(row['inconsistency_rate'])

        for size in sorted(size_analysis.keys())[:10]:  # Show first 10 sizes
            rates = size_analysis[size]
            avg_rate = np.mean(rates)
            min_rate = min(rates)
            print(f"  Taille {size:2d}: IR moyen = {avg_rate:.4f}, IR minimum = {min_rate:.4f} "
                  f"({len(rates)} sous-ensembles)")

    return {
        'optimized_data': optimized_data,
        'best_subset': final_best_subset,
        'best_ir': final_best_ir,
        'initial_ir': initial_ir,
        'improvement': final_improvement,
        'subset_comparison': subset_comparison,
        'rrhc_result': result
    }

if __name__ == "__main__":
    # Script entry point. To try the measure on the small built-in sample
    # dataset instead of the full pipeline, call:
    # demonstrate_consistency_measure()

    # Run the full pipeline defined in main(): load the ARFF file, search for
    # a feature subset, and save the reduced dataset.
    main()