In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from sklearn.feature_selection import RFECV

# Define the Grey Wolf Optimizer (GWO) class
def GWO(func, bounds, population_size=30, generations=50):
    """
    Grey Wolf Optimizer algorithm.
    func: Objective function to minimize.
    bounds: Tuple of lower and upper bounds for each parameter.
    population_size: Number of wolves in the population.
    generations: Number of iterations to run the algorithm.
    """
    num_params = len(bounds)
    population = np.random.uniform(
        low=[b[0] for b in bounds], high=[b[1] for b in bounds], size=(population_size, num_params)
    )
    scores = np.apply_along_axis(func, 1, population)

    # Sort population based on fitness (ascending order)
    sorted_indices = np.argsort(scores)
    alpha, beta, delta = population[sorted_indices[:3]]

    for generation in range(generations):
        a = 2 - generation * (2 / generations)  # Linearly decreases from 2 to 0
        for i in range(population_size):
            for leader, coeff in zip([alpha, beta, delta], [1.5, 1.0, 0.5]):
                r1, r2 = np.random.rand(), np.random.rand()
                A = 2 * a * r1 - a
                C = 2 * r2
                D = abs(C * leader - population[i])
                X = leader - A * D
                population[i] = np.clip(X, [b[0] for b in bounds], [b[1] for b in bounds])

            scores[i] = func(population[i])

        # Update alpha, beta, delta
        sorted_indices = np.argsort(scores)
        alpha, beta, delta = population[sorted_indices[:3]]

    return alpha, scores[sorted_indices[0]]

# Objective function for optimization
def objective_function(params):
    rf_n_estimators, xgb_learning_rate = int(params[0]), params[1]
    base_learners = [
        ('rf', RandomForestRegressor(n_estimators=rf_n_estimators, random_state=42)),
        ('xgb', lgb.LGBMRegressor(n_estimators=200, learning_rate=xgb_learning_rate, random_state=42)),
        ('lr', LinearRegression())
    ]
    stacking_model = StackingRegressor(estimators=base_learners, final_estimator=LinearRegression())
    stacking_model.fit(X_train_scaled_selected, y_train)
    y_pred_log = stacking_model.predict(X_test_scaled_selected)
    y_pred = np.expm1(y_pred_log)
    mse = mean_squared_error(np.expm1(y_test), y_pred)
    return mse

# Define features (X) and target (y)
dataset = pd.read_csv('Height New.csv')
X = dataset.iloc[:, :3].values
y = dataset.iloc[:, 3].values

# Apply log transformation to the target variable
y_log = np.log1p(y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature Selection using RFECV
selector = RFECV(estimator=RandomForestRegressor(), step=1, cv=5)
X_train_scaled_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_scaled_selected = selector.transform(X_test_scaled)

# Optimize hyperparameters using GWO
bounds = [(100, 500), (0.01, 0.1)]  # Bounds for RF n_estimators and XGB learning_rate
best_params, best_mse = GWO(objective_function, bounds, population_size=20, generations=30)

# Print the results
print("Best Parameters (RF n_estimators, XGB learning_rate):", best_params)
print("Best MSE Achieved:", best_mse)

# Final Model Evaluation
rf_n_estimators, xgb_learning_rate = int(best_params[0]), best_params[1]
base_learners = [
    ('rf', RandomForestRegressor(n_estimators=rf_n_estimators, random_state=42)),
    ('xgb', lgb.LGBMRegressor(n_estimators=200, learning_rate=xgb_learning_rate, random_state=42)),
    ('lr', LinearRegression())
]
stacking_model = StackingRegressor(estimators=base_learners, final_estimator=LinearRegression())
stacking_model.fit(X_train_scaled_selected, y_train)
y_pred_log = stacking_model.predict(X_test_scaled_selected)
y_pred = np.expm1(y_pred_log)
final_mse = mean_squared_error(np.expm1(y_test), y_pred)
final_r2 = r2_score(np.expm1(y_test), y_pred)

# Print final evaluation results
print("Final Mean Squared Error:", final_mse)
print("Final R-squared Value:", final_r2)


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 12, number of used features: 0
[LightGBM] [Info] Start training from score 4.679290
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 9, number of used features: 0
[LightGBM] [Info] Start training from score 4.680312
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 9, number of used features: 0
[LightGBM] [Info] Start training from score 4.681657
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 10, number of used features: 0
[LightGBM] [Info] Start training from score 4.677708
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 10, number of used features: 0
[LightGBM] [Info] Start training from score 4.677154
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points i