In [19]:
# Import required libraries
from ucimlrepo import fetch_ucirepo
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Load dataset using ucimlrepo
def load_dataset():
    """Fetch and preprocess the splice-junction gene sequences dataset"""
    dataset = fetch_ucirepo(id=69)
    
    # Extract features and target
    X = dataset.data.features
    y = dataset.data.targets['class']
    
    # Convert DNA sequences to one-hot encoded matrix
    X_sequences = X.iloc[:, 0].apply(lambda x: list(x)).values.tolist()  # Fixed index from θ to 0
    
    # Updated OneHotEncoder with sparse_output instead of sparse
    nucleotide_encoder = OneHotEncoder(
        categories=[['A', 'C', 'G', 'T']]*60, 
        sparse_output=False  # Changed from sparse to sparse_output
    )
    X_encoded = nucleotide_encoder.fit_transform(X_sequences)
    
    # Encode class labels
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    
    return X_encoded, y_encoded

# PSO Optimizer with Early Stopping
class PSO_MLP_Optimizer:
    def __init__(self, X_train, X_val, y_train, y_val, 
                 n_particles=15, max_iter=50, c1=0.5, c2=0.3, w=0.9,
                 early_stopping_rounds=5, tolerance=1e-4):
        """
        Initialize PSO optimizer for MLP hyperparameter tuning
        
        Parameters:
        - X_train, y_train: Training data
        - X_val, y_val: Validation data for early stopping
        - n_particles: Number of particles in the swarm
        - max_iter: Maximum number of iterations
        - c1, c2: Cognitive and social coefficients
        - w: Inertia weight
        - early_stopping_rounds: Stop if no improvement for N rounds
        - tolerance: Minimum improvement to reset early stopping counter
        """
        self.X_train = X_train
        self.X_val = X_val
        self.y_train = y_train
        self.y_val = y_val
        self.n_particles = n_particles
        self.max_iter = max_iter
        self.c1 = c1
        self.c2 = c2
        self.w = w
        self.early_stopping_rounds = early_stopping_rounds
        self.tolerance = tolerance
        
        # Search space boundaries
        self.hidden_min = 10
        self.hidden_max = 200
        self.alpha_min = 1e-6
        self.alpha_max = 1e-2
        
    def fitness_function(self, position):
        """Evaluate MLP performance for given hyperparameters"""
        hidden_size = int(position[0])
        alpha = 10**position[1]  # Convert from log scale
        
        # Create and train MLP with current parameters
        mlp = MLPClassifier(
            hidden_layer_sizes=(hidden_size,),
            alpha=alpha,
            max_iter=300,
            early_stopping=True,
            validation_fraction=0.1,
            n_iter_no_change=10,
            random_state=42
        )
        mlp.fit(self.X_train, self.y_train)
        return accuracy_score(self.y_val, mlp.predict(self.X_val))
    
    def optimize(self):
        """Run PSO optimization with early stopping"""
        # Initialize particles
        particles_pos = np.array([
            np.array([
                np.random.uniform(self.hidden_min, self.hidden_max),
                np.random.uniform(np.log10(self.alpha_min), np.log10(self.alpha_max))
            ]) for _ in range(self.n_particles)
        ])
        
        particles_vel = np.zeros((self.n_particles, 2))
        particles_best_pos = particles_pos.copy()
        particles_best_fitness = np.array([self.fitness_function(pos) for pos in particles_pos])
        
        global_best_idx = np.argmax(particles_best_fitness)
        global_best_pos = particles_best_pos[global_best_idx]
        global_best_fitness = particles_best_fitness[global_best_idx]
        
        # Early stopping variables
        no_improvement_count = 0
        best_iteration = 0
        history = []
        
        # PSO main loop
        for iteration in range(self.max_iter):
            for i in range(self.n_particles):
                # Update velocity
                r1, r2 = np.random.rand(2)
                cognitive = self.c1 * r1 * (particles_best_pos[i] - particles_pos[i])
                social = self.c2 * r2 * (global_best_pos - particles_pos[i])
                particles_vel[i] = self.w * particles_vel[i] + cognitive + social
                
                # Update position with bounds checking
                particles_pos[i] += particles_vel[i]
                particles_pos[i][0] = np.clip(particles_pos[i][0], self.hidden_min, self.hidden_max)
                particles_pos[i][1] = np.clip(particles_pos[i][1], 
                                            np.log10(self.alpha_min), 
                                            np.log10(self.alpha_max))
                
                # Evaluate new position
                current_fitness = self.fitness_function(particles_pos[i])
                
                # Update personal best
                if current_fitness > particles_best_fitness[i]:
                    particles_best_pos[i] = particles_pos[i].copy()
                    particles_best_fitness[i] = current_fitness
                    
                    # Update global best
                    if current_fitness > global_best_fitness + self.tolerance:
                        global_best_pos = particles_pos[i].copy()
                        global_best_fitness = current_fitness
                        best_iteration = iteration
                        no_improvement_count = 0
                    elif current_fitness > global_best_fitness:
                        # Small improvement that doesn't reset counter
                        global_best_pos = particles_pos[i].copy()
                        global_best_fitness = current_fitness
            
            # Early stopping check
            no_improvement_count += 1
            history.append(global_best_fitness)
            
            print(f"Iteration {iteration+1}: Best Accuracy = {global_best_fitness:.4f}")
            
            if no_improvement_count >= self.early_stopping_rounds:
                print(f"\nEarly stopping triggered at iteration {iteration+1}")
                print(f"No improvement for {self.early_stopping_rounds} consecutive iterations")
                break
        
        # Return best parameters and optimization history
        best_hidden = int(global_best_pos[0])
        best_alpha = 10**global_best_pos[1]
        return best_hidden, best_alpha, global_best_fitness, history, best_iteration

# Main execution
if __name__ == "__main__":
    # Load and preprocess data
    X, y = load_dataset()
    
    # Split data into train, validation, and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2
    
    print(f"Data shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
    
    # Initialize and run PSO optimizer
    pso_optimizer = PSO_MLP_Optimizer(
        X_train=X_train,
        X_val=X_val,
        y_train=y_train,
        y_val=y_val,
        n_particles=15,
        max_iter=50,
        early_stopping_rounds=5,
        tolerance=1e-4
    )
    
    best_hidden, best_alpha, best_score, history, best_iter = pso_optimizer.optimize()
    
    print(f"\nOptimization Results:")
    print(f"- Best hidden layer size: {best_hidden}")
    print(f"- Best alpha: {best_alpha:.6f}")
    print(f"- Validation accuracy: {best_score:.4f}")
    print(f"- Best iteration: {best_iter + 1}")
    
    # Train final model on combined train+val data with best parameters
    final_mlp = MLPClassifier(
        hidden_layer_sizes=(best_hidden,),
        alpha=best_alpha,
        max_iter=1000,
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=20,
        random_state=42
    )
    
    # Combine train and validation sets for final training
    X_final = np.vstack([X_train, X_val])
    y_final = np.concatenate([y_train, y_val])
    
    final_mlp.fit(X_final, y_final)
    
    # Evaluate on test set
    test_acc = final_mlp.score(X_test, y_test)
    print(f"\nFinal Model Performance:")
    print(f"- Training accuracy: {final_mlp.score(X_final, y_final):.4f}")
    print(f"- Test accuracy: {test_acc:.4f}")

ValueError: Shape mismatch: if categories is an array, it has to be of shape (n_features,).