In [2]:
import random
from collections import namedtuple
from pathlib import Path
import warnings

import pandas as pd
from deap import base, creator, tools, algorithms
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

In [3]:
dataset_path = Path('dataset/dataset_versions/polynomial_11_interpolation_rescaled_dataset.xlsx')
dataset = pd.read_excel(dataset_path)
dataset = dataset.iloc[:, 1:]
dataset

Unnamed: 0,Adjusted net national income (annual % growth),Adjusted net national income (constant 2015 US$),Adjusted net national income (current US$),Adjusted net national income per capita (annual % growth),Adjusted net national income per capita (constant 2015 US$),Adjusted net national income per capita (current US$),"Adjusted net savings, excluding particulate emission damage (current US$)","Adjusted net savings, including particulate emission damage (current US$)",Adjusted savings: carbon dioxide damage (current US$),Adjusted savings: consumption of fixed capital (current US$),...,"Unemployment, youth male (% of male labor force ages 15-24) (modeled ILO estimate)","Unemployment, youth male (% of male labor force ages 15-24) (national estimate)","Unemployment, youth total (% of total labor force ages 15-24) (modeled ILO estimate)","Unemployment, youth total (% of total labor force ages 15-24) (national estimate)","Vulnerable employment, female (% of female employment) (modeled ILO estimate)","Vulnerable employment, male (% of male employment) (modeled ILO estimate)","Vulnerable employment, total (% of total employment) (modeled ILO estimate)","Wage and salaried workers, female (% of female employment) (modeled ILO estimate)","Wage and salaried workers, male (% of male employment) (modeled ILO estimate)","Wage and salaried workers, total (% of total employment) (modeled ILO estimate)"
0,0.689136,0.000857,0.000130,0.692521,0.002794,0.310620,0.000531,0.066474,0.000099,0.962256,...,0.220160,0.028734,0.280167,0.025974,0.118821,0.207872,0.162652,0.920258,0.865579,0.895318
1,0.689147,0.000993,0.000169,0.692533,0.003089,0.311263,0.000531,0.066474,0.000224,0.962269,...,0.220160,0.028734,0.280167,0.025974,0.118821,0.207872,0.162652,0.920258,0.865579,0.895318
2,0.689141,0.001141,0.000254,0.692528,0.003403,0.312697,0.000531,0.066474,0.000421,0.962298,...,0.220160,0.028734,0.280167,0.025974,0.118821,0.207872,0.162652,0.920258,0.865579,0.895318
3,0.689034,0.001213,0.000362,0.692423,0.003500,0.314469,0.000531,0.066474,0.000625,0.962332,...,0.220160,0.028734,0.280167,0.025974,0.118821,0.207872,0.162652,0.920258,0.865579,0.895318
4,0.689003,0.001259,0.000435,0.692393,0.003530,0.315559,0.000531,0.066474,0.000836,0.962360,...,0.220160,0.028734,0.280167,0.025974,0.118821,0.207872,0.162652,0.920258,0.865579,0.895318
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,0.688948,0.000591,0.001035,0.692353,0.003757,0.343549,0.000450,0.066226,0.038400,0.962863,...,0.228317,0.028757,0.292249,0.026008,0.114790,0.202871,0.158013,0.923805,0.869330,0.899066
250,0.688995,0.000613,0.001239,0.692400,0.003806,0.349701,0.000468,0.066242,0.039675,0.962956,...,0.228362,0.028757,0.292205,0.026008,0.114882,0.202909,0.158078,0.923701,0.869301,0.899008
251,0.688947,0.000610,0.001302,0.692354,0.003753,0.351157,0.000446,0.066221,0.040959,0.963005,...,0.228431,0.028757,0.292267,0.026008,0.114886,0.203049,0.158152,0.923772,0.869209,0.898991
252,0.688959,0.000613,0.001234,0.692367,0.003712,0.348408,0.000437,0.066213,0.041808,0.962978,...,0.229280,0.028759,0.293309,0.026010,0.114948,0.203117,0.158220,0.923647,0.869076,0.898862


In [4]:
test_size = 0.2
seed = 7
target_feature_name = 'GDP per capita (current US$)'

SplittedDataset = namedtuple('SplittedDataset', ['name', 'x_train', 'y_train', 'x_test', 'y_test'])
model = dict()
model['name'] = 'polynomial_11_interpolation'
data_x = dataset.drop([target_feature_name], axis=1)
data_y = dataset[target_feature_name]
model['x_train'], model['x_test'], model['y_train'], model['y_test'] = train_test_split(data_x, data_y, test_size=test_size, random_state=seed)
splitted_dataset = SplittedDataset(model['name'], model['x_train'],  model['y_train'], model['x_test'], model['y_test'])

In [38]:
dataset = splitted_dataset
loss_variants = ['linear', 'square', 'exponential']

#  Genetic algorithm settings
population_size = 100
generation_size = 50
crossing_prob = 0.7
mutation_prob = 0.2

def calculate_dest_ind_params(individual):
    n_estimators = int(abs(individual[0]) % 10_000) or 1
    loss = loss_variants[int(abs(individual[1]) % 3)]
    learning_rate =  float(abs(individual[2]))
    return n_estimators, loss, learning_rate


def evaluate(individual):
    n_estimators, loss, learning_rate =  calculate_dest_ind_params(individual)
    model = AdaBoostRegressor(estimator=LinearRegression(),
                              learning_rate=learning_rate,
                              loss=loss,
                              n_estimators=n_estimators)
    model.fit(dataset.x_train, dataset.y_train)
    return r2_score(dataset.y_test, model.predict(dataset.x_test)),


creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("attr_float", random.uniform, 1, 1000)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_float, n=3)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxBlend, alpha=0.5)
toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=1, indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)

population = toolbox.population(n=population_size)

for gen in range(generation_size):
    offspring = algorithms.varAnd(population, toolbox, cxpb=crossing_prob, mutpb=mutation_prob)
    fits = toolbox.map(toolbox.evaluate, offspring)
    for fit, ind in zip(fits, offspring):
        ind.fitness.values = fit

    population = toolbox.select(offspring, k=len(population))
    best_ind = tools.selBest(population, k=1)[0]
    best_ind_params = calculate_dest_ind_params(best_ind)
    print(f'{gen+1} generation received. Best individual: n_estimators: {best_ind_params[0]}, loss: {best_ind_params[1]}, learning_rate: {best_ind_params[2]}. Fitness: {best_ind.fitness.values[0]}')

1 generation received. Best individual: n_estimators: 868, loss: exponential, learning_rate: 987.6468136822328. Fitness: 0.9961548516567726
2 generation received. Best individual: n_estimators: 54, loss: exponential, learning_rate: 1.710823956187916. Fitness: 0.998778247734321
3 generation received. Best individual: n_estimators: 54, loss: exponential, learning_rate: 1.710823956187916. Fitness: 0.9987863143181528
4 generation received. Best individual: n_estimators: 54, loss: exponential, learning_rate: 1.710823956187916. Fitness: 0.9986471633448014
5 generation received. Best individual: n_estimators: 54, loss: exponential, learning_rate: 1.710823956187916. Fitness: 0.9987972382967825
6 generation received. Best individual: n_estimators: 54, loss: exponential, learning_rate: 1.710823956187916. Fitness: 0.9987268663862736
7 generation received. Best individual: n_estimators: 54, loss: exponential, learning_rate: 1.710823956187916. Fitness: 0.9988369770209251
8 generation received. Best