In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn import datasets
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor

from sklearn.preprocessing import StandardScaler

from deap import algorithms
from deap import base
from deap import creator
from deap import tools

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [2]:
# Read the training features, training targets and test features (in that
# order) from CSV files located in the current working directory.
x_train, y_train, x_test = (
    pd.read_csv(csv_name)
    for csv_name in ("X_train.csv", "y_train.csv", "X_test.csv")
)

In [4]:
# Replace missing values with the mean of their column.
# A single frame-level fillna is used instead of calling
# `x_train[col].fillna(..., inplace=True)` per column: that form is chained
# assignment on a column selection, which warns in pandas 2.x and no longer
# modifies `x_train` once Copy-on-Write is active.
# `numeric_only=True` keeps the mean computation from failing on any
# non-numeric column; such columns are simply left untouched.
x_train = x_train.fillna(x_train.mean(numeric_only=True))

In [5]:
# Feature matrix: the training frame without the row identifier and the
# four excluded feature columns.
x = x_train.drop(
    columns=['id', 'x629', 'x193', 'x339', 'x297'],
)

In [6]:
# Target frame: everything in y_train except the row identifier column.
y = y_train.drop(columns=['id'])

In [7]:
# normalization
# Fit a StandardScaler on the selected features, then keep the standardized
# values both as a raw array (Xs) and as a DataFrame with the original
# column labels (dfs).
scaler = StandardScaler().fit(x)
Xs = scaler.transform(x)
dfs = pd.DataFrame(data=Xs, columns=x.columns)

In [8]:
# `load_boston` was removed in scikit-learn 1.2, where accessing it raises
# ImportError. The result is not used by any cell shown in this notebook, so
# the call is guarded: older installations behave as before, newer ones get
# `data = None` instead of a crashing cell.
try:
    data = datasets.load_boston()
except (ImportError, AttributeError):
    data = None

# NOTE(review): this rebinds `scaler` to a new, unfitted StandardScaler,
# replacing the instance fitted in the normalization cell - confirm that
# later cells refit it before calling `transform`.
scaler = StandardScaler()

# Size of the search space (one bit per candidate feature) and the raw
# feature values as a NumPy array.
number_of_variables = x.shape[1]
input_variables = x.values

# Seed every random number generator used by the notebook: NumPy, and the
# standard-library `random` module that HillClimbing draws from. Seeding
# only NumPy leaves the hill-climbing search non-reproducible.
seed = 1234
np.random.seed(seed)
random.seed(seed)

# 10-fold cross-validation with shuffling, reused for every evaluation so
# that all feature subsets are scored on the same splits.
kfolds = KFold(10,shuffle=True,random_state=seed)

In [9]:
def EvaluateFeatureSubsetSingleObjective(individual):
    """Score a binary feature mask with cross-validated linear regression.

    Parameters
    ----------
    individual : sequence
        One entry per column of the module-level frame ``x``; the column at
        position ``i`` is used when ``individual[i] == 1``.

    Returns
    -------
    The mean of the per-fold scores that ``cross_val_score`` reports for a
    ``LinearRegression`` fitted on the selected columns of ``x`` against
    ``y``, using the module-level ``kfolds`` splitter. No ``scoring`` is
    passed, so the estimator's default scorer is used.
    """
    # Translate the mask into column labels, preserving column order.
    chosen = [
        x.columns[position]
        for position, gene in enumerate(individual)
        if gene == 1
    ]

    regressor = linear_model.LinearRegression()
    fold_scores = cross_val_score(regressor, x[chosen], y, cv=kfolds)
    return fold_scores.mean()

In [10]:
def HillClimbing(number_of_variables,number_of_evaluations,evaluation_function,
                 perturbation_rate=0.1,verbose=True):
    """Stochastic hill climbing over binary feature masks.

    Starts from a random non-empty mask and repeatedly proposes a neighbour
    by flipping each bit independently with probability
    ``perturbation_rate``. A neighbour replaces the current mask only when
    its score is strictly greater than the best score so far.

    Parameters
    ----------
    number_of_variables : int
        Length of the mask (number of candidate features). Must be >= 1.
    number_of_evaluations : int
        Number of neighbour proposals. The evaluation of the starting mask
        is not counted, and a proposal that selects no feature is counted
        but not evaluated.
    evaluation_function : callable
        Maps a list of 0/1 values to a score where higher is better.
    perturbation_rate : float, optional
        Per-bit flip probability used to build a neighbour (default 0.1).
    verbose : bool, optional
        When true (default), print one trace line for the starting mask and
        one after every proposal. The final two summary lines are always
        printed.

    Returns
    -------
    tuple
        ``(best_feature_subset, best_performance)``.

    Raises
    ------
    ValueError
        If ``number_of_variables`` is smaller than 1.
    """
    if number_of_variables < 1:
        raise ValueError("number_of_variables must be at least 1")

    # current evaluation
    evaluations = 0

    # start from a random set of features; redraw in the (unlikely for large
    # masks) case that no feature is selected, because an empty subset cannot
    # be scored - the same rule the loop below applies to neighbours
    current_feature_subset = [random.randint(0,1) for _ in range(number_of_variables)]
    while sum(current_feature_subset) == 0:
        current_feature_subset = [random.randint(0,1) for _ in range(number_of_variables)]

    # that will also provide an initial evaluation of the best performance
    best_performance = evaluation_function(current_feature_subset)

    if verbose:
        print("%5d\t\t%3.2f\t%s"%(evaluations,best_performance,str(current_feature_subset)))

    # continue until all the evaluations have been performed
    while evaluations<number_of_evaluations:

        # generate a neighbor candidate by flipping each bit with probability
        # perturbation_rate (one random draw per bit)
        perturbation = [1-bit if random.random()<perturbation_rate else bit
                        for bit in current_feature_subset]

        # evaluate only if there is at least one variable
        if sum(perturbation)>0:
            performance = evaluation_function(perturbation)

            # strict improvement only: ties keep the current subset
            if performance>best_performance:
                best_performance = performance
                current_feature_subset = perturbation

        evaluations = evaluations + 1
        if verbose:
            print("%5d\t\t%3.2f\t%s"%(evaluations,best_performance,str(current_feature_subset)))

    print("Best Feature Subset = %s "%(str(current_feature_subset)))
    print("Performance = %3.2f"%(best_performance))

    # hand the result back so callers do not have to parse the printed text
    return current_feature_subset, best_performance

In [16]:
# Run the search over all candidate features with a budget of 23 neighbour
# proposals, scoring subsets with cross-validated linear regression.
HillClimbing(
    number_of_variables=number_of_variables,
    number_of_evaluations=23,
    evaluation_function=EvaluateFeatureSubsetSingleObjective,
)

    0		0.08	[1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 