In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


In [24]:
data = pd.read_csv('c:/Users/USER/Documents/Data Science Journy/StackingOptimization/driving_data.csv')

In [25]:
df = data

In [27]:
# Separate features and target variable
X = df.drop(columns=['Class'])  # Features
y = df['Class']  # Target

In [28]:
# Encode the target variable (Class)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [29]:
# Feature scaling (standardization)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [30]:
# Feature Selection
# Step 1: Random Forest Feature Importance
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_scaled, y_encoded)

In [31]:
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

In [32]:
# Select features above an importance threshold
threshold = 0.01
important_features = feature_importances[feature_importances['Importance'] > threshold]['Feature']
X_important = X[important_features]

In [33]:
# Step 2: RFE on Important Features
rfe = RFE(estimator=rf_model, n_features_to_select=10)     # 10 Features to selected. 
X_rfe = rfe.fit_transform(X_important, y_encoded)
selected_features = important_features[rfe.support_]

print("Final Selected Features:", selected_features)

Final Selected Features: 7                Engine_soacking_time
11          Long_Term_Fuel_Trim_Bank1
34       Engine_coolant_temperature.1
14                 Torque_of_friction
51                            Time(s)
4                 Intake_air_pressure
52                          PathOrder
22    Maximum_indicated_engine_torque
6          Absolute_throttle_position
35     Wheel_velocity_front_left-hand
Name: Feature, dtype: object


In [34]:
# Select these features "selected_features" from X_scaled
X_scaled_selected = pd.DataFrame(X_scaled, columns=X.columns)[selected_features]
#X_scaled_selected.head()

In [35]:
# Split Dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled_selected, y_encoded, test_size=0.2, random_state=42)

In [36]:
# Define Available Models
available_models = [
    ("rf", RandomForestClassifier(random_state=42)),
    ("lr", LogisticRegression()),
    ("svc", SVC(probability=True, random_state=42)),
    ("xgb", XGBClassifier(random_state=42))
]

# Define Meta Learner
meta_learner = LogisticRegression()

In [37]:
# Crow Search Algorithm for Model Selection
class CrowSearchAlgorithm:
    def __init__(self, n_crows, n_variables, lower_bound, upper_bound, max_iter, fitness_function):
        self.n_crows = n_crows
        self.n_variables = n_variables
        self.lower_bound = np.array(lower_bound)
        self.upper_bound = np.array(upper_bound)
        self.max_iter = max_iter
        self.fitness_function = fitness_function

        # Initialize positions and memory
        self.positions = np.random.uniform(low=self.lower_bound, high=self.upper_bound, size=(n_crows, n_variables))
        self.memory = np.copy(self.positions)
        self.memory_fitness = np.full(n_crows, np.inf)

    def optimize(self):
        for t in range(self.max_iter):
            for i in range(self.n_crows):
                # Generate new position
                rand_j = np.random.randint(0, self.n_crows)
                r = np.random.uniform(0, 1)
                new_position = self.positions[i] + r * (self.memory[rand_j] - self.positions[i])

                # Apply bounds
                new_position = np.clip(new_position, self.lower_bound, self.upper_bound)

                # Evaluate fitness
                new_fitness = self.fitness_function(new_position)

                # Update position and memory if fitness improves
                if new_fitness < self.memory_fitness[i]:
                    self.positions[i] = new_position
                    self.memory[i] = new_position
                    self.memory_fitness[i] = new_fitness

        # Return the best solution
        best_index = np.argmin(self.memory_fitness)
        return self.memory[best_index], self.memory_fitness[best_index]

In [41]:
# Run Crow Search Algorithm for Model Selection
csa = CrowSearchAlgorithm(
    n_crows=5,
    n_variables=4,  # One variable per model
    lower_bound=[0, 0, 0, 0],
    upper_bound=[1, 1, 1, 1],
    max_iter=5,
    fitness_function=fitness_function
)

#csa = CrowSearchAlgorithm(
#    n_crows=10,  # Smaller population
#    n_variables=2,  # Fewer models or hyperparameters
#    lower_bound = [0.1, 1],  # Reasonable starting values
#    upper_bound = [1.0, 10],   
#    max_iter=20,  # Fewer iterations
#    fitness_function=fitness_function
#)

In [44]:
best_params, best_fitness = csa.optimize()
print("Best Parameters (Model Selection):", best_params)
print("Best Fitness (Accuracy):", -best_fitness)

selected_models = [
    ("RandomForest" if best_params[0] > 0.5 else None),
    ("LogisticRegression" if best_params[1] > 0.5 else None),
    ("SVC" if best_params[2] > 0.5 else None),
    ("XGB" if best_params[3] > 0.5 else None)
]

#selected_models = [
 #       ("rf", RandomForestClassifier(random_state=42)) if params[0] > 0.5 else None,
 #       ("xgb", XGBClassifier(random_state=42)) if params[1] > 0.5 else None,
 #       ("lr", LogisticRegression()) if params[2] > 0.5 else None,
 #       ("svm", SVC(probability=True, random_state=42)) if params[3] > 0.5 else None,
 #       ("knn", KNeighborsClassifier()) if params[4] > 0.5 else None,
 #       ("nb", GaussianNB()) if params[5] > 0.5 else None
 #   ]

selected_models = [model for model in selected_models if model is not None]
print("Selected Models:", selected_models)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters (Model Selection): [0.18920361 0.88930813 0.         0.        ]
Best Fitness (Accuracy): 0.5196344612939541
Selected Models: ['LogisticRegression']


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [43]:
# Train Final Model with Selected Models
selected_models = [
    ("RandomForest", RandomForestClassifier(random_state=42)) if best_params[0] > 0.5 else None,
    ("LogisticRegression", LogisticRegression()) if best_params[1] > 0.5 else None,
    ("SVC", SVC(probability=True, random_state=42)) if best_params[2] > 0.5 else None,
    ("XGB", XGBClassifier(random_state=42)) if best_params[3] > 0.5 else None
]
selected_models = [model for model in selected_models if model is not None]


optimized_model = StackingClassifier(
    estimators=selected_models,
    final_estimator=meta_learner,
    cv=5
)
optimized_model.fit(X_train, y_train)

# Test Final Model
y_pred = optimized_model.predict(X_test)

# Evaluate Performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
print("Selected Models:", selected_models)
print("\nOptimized Stacking Model Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Selected Models: [('LogisticRegression', LogisticRegression())]

Optimized Stacking Model Performance:
Accuracy: 0.5250
Precision: 0.5253
Recall: 0.5250
F1 Score: 0.5224


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
