In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [2]:
data = pd.read_csv('c:/Users/USER/Documents/Data Science Journy/StackingOptimization/driving_data.csv')
df = data

In [3]:

# Separate features and target variable
X = df.drop(columns=['Class'])  # Features
y = df['Class']  # Target

# Encode the target variable (Class)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Feature scaling (standardization)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Feature Selection
# Step 1: Random Forest Feature Importance
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_scaled, y_encoded)
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Select features above an importance threshold
threshold = 0.01
important_features = feature_importances[feature_importances['Importance'] > threshold]['Feature']
X_important = X[important_features]

# Step 2: RFE on Important Features
rfe = RFE(estimator=rf_model, n_features_to_select=10)     # 10 Features to selected.
X_rfe = rfe.fit_transform(X_important, y_encoded)
selected_features = important_features[rfe.support_]

print("Final Selected Features:", selected_features)


from sklearn.naive_bayes import GaussianNB
# Select these features "selected_features" from X_scaled
X_scaled_selected = pd.DataFrame(X_scaled, columns=X.columns)[selected_features]

# Split Dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled_selected, y_encoded, test_size=0.2, random_state=42)

# Define Available Models
available_models = [
    ("rf", RandomForestClassifier(random_state=42)),
    ("xgb", XGBClassifier(random_state=42)),
    ("lr", LogisticRegression()),
    ("svm", SVC(probability=True, random_state=42)),
    ("knn", KNeighborsClassifier()),
    ("nb", GaussianNB())

#    ("rf", RandomForestClassifier(random_state=42)),
#    ("lr", LogisticRegression()),
#    ("svc", SVC(probability=True, random_state=42)),
#    ("knn", KNeighborsClassifier())
]

# Define Meta Learner
meta_learner = LogisticRegression()



def fitness_function(params):
    """
    Example fitness function for CSA. This evaluates the fitness of a single crow's position.
    """
    selected_models = [

        ("rf", RandomForestClassifier(random_state=42)) if params[0] > 0.5 else None,
        ("xgb", XGBClassifier(random_state=42)) if params[1] > 0.5 else None,
        ("lr", LogisticRegression()) if params[2] > 0.5 else None,
        ("svm", SVC(random_state=42)) if params[3] > 0.5 else None,
        ("knn", KNeighborsClassifier()) if params[4] > 0.5 else None,
        ("nb", GaussianNB()) if params[5] > 0.5 else None

        #("rf", RandomForestClassifier(random_state=42)) if params[0] > 0.5 else None,
        #("lr", LogisticRegression(random_state=42)) if params[1] > 0.5 else None,
        #("svc", SVC()) if params[2] > 0.5 else None,
        #("knn", KNeighborsClassifier()) if params[3] > 0.5 else None
    ]
    selected_models = [model for model in selected_models if model is not None]

    if not selected_models:  # Penalize if no models are selected
        return float('inf')

    # Create stacking model
    stacking_model = StackingClassifier(estimators=selected_models, final_estimator=LogisticRegression(), cv=5)

    # Evaluate with cross-validation
    from sklearn.model_selection import cross_val_score
    try:
        X_train_cv, X_val, y_train_cv, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
        stacking_model.fit(X_train_cv, y_train_cv)
        score = stacking_model.score(X_val, y_val)  # Use validation accuracy
        return -score  # Minimize negative accuracy

        #scores = cross_val_score(stacking_model, X_train, y_train, cv=3, scoring='accuracy')
        #return -scores.mean()  # Minimize negative accuracy
    except Exception as e:
        print(f"Error during evaluation: {e}")
        return float('inf')


from joblib import Parallel, delayed

class ParallelCrowSearchAlgorithm:
    def __init__(self, n_crows, n_variables, lower_bound, upper_bound, max_iter, fitness_function):
        self.n_crows = n_crows
        self.n_variables = n_variables
        self.lower_bound = lower_bound
        self.upper_bound = upper_bound
        self.max_iter = max_iter
        self.fitness_function = fitness_function

        # Initialize positions and memory
        self.positions = np.random.uniform(low=lower_bound, high=upper_bound, size=(n_crows, n_variables))
        self.memory = np.copy(self.positions)
        self.memory_fitness = np.full(n_crows, np.inf)

    def optimize(self):
        for iteration in range(self.max_iter):
            print(f"Iteration {iteration + 1}/{self.max_iter}")

            # Evaluate fitness for all crows in parallel
            fitness_values = Parallel(n_jobs=-1)(delayed(self.fitness_function)(params) for params in self.positions)

            # Update memory if fitness improves
            for i, fitness in enumerate(fitness_values):
                if fitness < self.memory_fitness[i]:
                    self.memory[i] = self.positions[i]
                    self.memory_fitness[i] = fitness

            # Update positions
            for i in range(self.n_crows):
                random_crow = np.random.randint(0, self.n_crows)
                r = np.random.uniform(0, 1)
                new_position = self.positions[i] + r * (self.memory[random_crow] - self.positions[i])

                # Clip to bounds
                new_position = np.clip(new_position, self.lower_bound, self.upper_bound)
                self.positions[i] = new_position

        # Return the best solution
        best_index = np.argmin(self.memory_fitness)
        return self.memory[best_index], self.memory_fitness[best_index]


# Initialize Parallel CSA
csa = ParallelCrowSearchAlgorithm(
    n_crows=5,               # Number of crows
    n_variables=6,            # 4 binary variables for model selection
    lower_bound=[0, 0, 0, 0, 0, 0], # Lower bounds
    upper_bound=[1, 1, 1, 1, 1, 1], # Upper bounds
    max_iter=5,              # Maximum iterations
    fitness_function=fitness_function
)

# Optimize
best_params, best_fitness = csa.optimize()
print("Best Parameters (Model Selection):", best_params)
print("Best Fitness (Accuracy):", -best_fitness)


selected_models = [

        ("rf", RandomForestClassifier(random_state=42)) if best_params[0] > 0.5 else None,
        ("xgb", XGBClassifier(random_state=42)) if best_params[1] > 0.5 else None,
        ("lr", LogisticRegression()) if best_params[2] > 0.5 else None,
        ("svm", SVC(random_state=42)) if best_params[3] > 0.5 else None,
        ("knn", KNeighborsClassifier()) if best_params[4] > 0.5 else None,
        ("nb", GaussianNB()) if best_params[5] > 0.5 else None

    #("RandomForest" if best_params[0] > 0.5 else None),
    #("LogisticRegression" if best_params[1] > 0.5 else None),
    #("SVC" if best_params[2] > 0.5 else None),
    #("KNeighborsClassifier" if best_params[3] > 0.5 else None)
]

selected_models = [model for model in selected_models if model is not None]
print("Selected Models:", selected_models)


# Train Final Model with Selected Models

optimized_model = StackingClassifier(
    estimators=selected_models,
    final_estimator=meta_learner,
    cv=5
)
optimized_model.fit(X_train, y_train)

# Test Final Model
y_pred = optimized_model.predict(X_test)

# Evaluate Performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
print("Selected Models:", selected_models)
print("\nOptimized Stacking Model Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Final Selected Features: 7                Engine_soacking_time
11          Long_Term_Fuel_Trim_Bank1
34       Engine_coolant_temperature.1
14                 Torque_of_friction
51                            Time(s)
4                 Intake_air_pressure
52                          PathOrder
22    Maximum_indicated_engine_torque
6          Absolute_throttle_position
35     Wheel_velocity_front_left-hand
Name: Feature, dtype: object
Iteration 1/5
Iteration 2/5
Iteration 3/5
Iteration 4/5
Iteration 5/5
Best Parameters (Model Selection): [0.565875   0.06645818 0.97067514 0.83356297 0.39552836 0.95539619]
Best Fitness (Accuracy): 0.9978147142573339
Selected Models: [('rf', RandomForestClassifier(random_state=42)), ('lr', LogisticRegression()), ('svm', SVC(random_state=42)), ('nb', GaussianNB())]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Selected Models: [('rf', RandomForestClassifier(random_state=42)), ('lr', LogisticRegression()), ('svm', SVC(random_state=42)), ('nb', GaussianNB())]

Optimized Stacking Model Performance:
Accuracy: 0.9983
Precision: 0.9983
Recall: 0.9983
F1 Score: 0.9983
