In [23]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold
from deap import base, creator, tools, algorithms
import random
import warnings
warnings.filterwarnings("ignore")

# Load dataset
df = pd.read_csv("final_processed_data.csv", low_memory=False)

# Blacklist of features
blacklist_features = ['EUR', 'TOTAL_PRICE', 'PRICE_PER_KM']

X = df.drop(columns=blacklist_features)
y = df['TOTAL_PRICE']

# Nested Cross-Validation setup
outer_cv = KFold(n_splits=5, shuffle=False)

# Genetic algorithm parameters
num_features = X.shape[1]
population_size = 20
num_generations = 10

# DEAP setup
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", list, fitness=creator.FitnessMin)

toolbox = base.Toolbox()
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, num_features)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

def evalFitness(individual, X_train, y_train, X_valid, y_valid):
    selected_indices = [i for i, bit in enumerate(individual) if bit == 1]
    if len(selected_indices) == 0:
        return 1e6,  # penalty for no features

    X_tr_sel = X_train.iloc[:, selected_indices]
    X_val_sel = X_valid.iloc[:, selected_indices]

    model = GradientBoostingRegressor(random_state=42)
    model.fit(X_tr_sel, y_train)
    preds = model.predict(X_val_sel)
    mape = mean_absolute_percentage_error(y_valid, preds)
    return mape,

toolbox.register("evaluate", evalFitness, X_train=None, y_train=None, X_valid=None, y_valid=None)
toolbox.register("mate", tools.cxUniform, indpb=0.5)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.1)
toolbox.register("select", tools.selTournament, tournsize=3)

mape_scores = []

fold_number = 1
for train_index, test_index in outer_cv.split(X):
    print(f"\nProcessing Fold {fold_number}...")
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Inner CV for evolutionary feature selection
    inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)
    
    def fitness_wrapper(individual):
        inner_scores = []
        for inner_train_idx, inner_val_idx in inner_cv.split(X_train):
            X_inner_train, X_inner_val = X_train.iloc[inner_train_idx], X_train.iloc[inner_val_idx]
            y_inner_train, y_inner_val = y_train.iloc[inner_train_idx], y_train.iloc[inner_val_idx]
            
            score = evalFitness(individual, X_inner_train, y_inner_train, X_inner_val, y_inner_val)[0]
            inner_scores.append(score)
        return np.mean(inner_scores),

    toolbox.register("evaluate", fitness_wrapper)

    # GA execution
    population = toolbox.population(n=population_size)
    algorithms.eaSimple(population, toolbox, cxpb=0.7, mutpb=0.2, ngen=num_generations, verbose=False)

    # Best feature set from GA
    best_individual = tools.selBest(population, 1)[0]
    selected_features_idx = [i for i, bit in enumerate(best_individual) if bit == 1]
    selected_feature_names = X.columns[selected_features_idx]

    print(f"Fold {fold_number}: Selected {len(selected_feature_names)} features")

    # Final evaluation on outer test fold
    model = GradientBoostingRegressor(random_state=42)
    model.fit(X_train.iloc[:, selected_features_idx], y_train)
    y_pred = model.predict(X_test.iloc[:, selected_features_idx])
    fold_mape = mean_absolute_percentage_error(y_test, y_pred)

    print(f"Fold {fold_number}: MAPE = {fold_mape:.4f}")

    mape_scores.append(fold_mape)
    fold_number += 1

# Final results
print("\nCross-validation results:")
for i, score in enumerate(mape_scores, 1):
    print(f"Fold {i}: MAPE = {score:.4f}")

print(f"\nAverage MAPE across folds: {np.mean(mape_scores):.4f}")


Processing Fold 1...
Fold 1: Selected 28 features
Fold 1: MAPE = 0.3999

Processing Fold 2...
Fold 2: Selected 31 features
Fold 2: MAPE = 0.0570

Processing Fold 3...
Fold 3: Selected 29 features
Fold 3: MAPE = 0.0698

Processing Fold 4...
Fold 4: Selected 30 features
Fold 4: MAPE = 0.0706

Processing Fold 5...
Fold 5: Selected 28 features
Fold 5: MAPE = 0.0663

Cross-validation results:
Fold 1: MAPE = 0.3999
Fold 2: MAPE = 0.0570
Fold 3: MAPE = 0.0698
Fold 4: MAPE = 0.0706
Fold 5: MAPE = 0.0663

Average MAPE across folds: 0.1327
