In [133]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.cross_validation import cross_val_predict, LeaveOneOut
from sklearn.cross_validation import KFold
from itertools import combinations, chain
%matplotlib inline


Реализация методов отбора признаков: Add (последовательное добавление признаков), Delete (последовательное удаление признаков) и AddDelete (комбинация методов Add и Delete: они чередуются, пока ошибка уменьшается).

В качестве модели используется линейная регрессия из библиотеки sklearn, а в качестве критерия остановки — совпадение ошибок (ошибка перестала уменьшаться). Более сложный критерий не нужен, потому что всё, что реализовано, и так работает за разумное время:

In [182]:
def Add(X, y, cv, added_signs = None, mistake = float("inf")):
    """Greedy forward feature selection (the "Add" method).

    Starting from ``added_signs``, repeatedly adds the single column whose
    inclusion yields the lowest cross-validated mean squared error of a
    ``LinearRegression``, as long as that error is strictly lower than the
    best error seen so far.

    Parameters
    ----------
    X : 2-D array supporting ``X[:, columns]`` indexing; ``X.shape[1]`` is
        the number of candidate features.
    y : targets, forwarded to ``cross_val_predict`` and compared against the
        predictions with ``mean_squared_error``.
    cv : cross-validation splitter, forwarded unchanged to
        ``cross_val_predict``.
    added_signs : iterable of column indices or None
        Features that are already selected.  ``None`` (the default) means
        "start from the empty set".  The argument is never modified.
    mistake : float
        Error that a new feature set has to beat.  The starting set itself is
        not evaluated here, so pass its error if a real baseline is wanted.

    Returns
    -------
    (mistake, added_signs) : (float, set)
        The best error found and a new set with the selected column indices.
    """
    X_dim = X.shape[1]
    all_signs = set(range(X_dim))

    # Work on a private copy: a shared ``set()`` default (or the caller's own
    # set) must not be mutated, otherwise state leaks between calls.
    selected = set(added_signs) if added_signs is not None else set()

    while len(selected) < X_dim:
        index_to_add = None

        # Sorted order makes tie-breaking and column order deterministic.
        for i in sorted(all_signs - selected):
            candidate = selected | {i}

            lin_regression = LinearRegression()
            y_predicted = cross_val_predict(lin_regression, X[:, np.array(sorted(candidate))], y = y, cv = cv)
            new_mistake = mean_squared_error(y_predicted, y)

            if new_mistake < mistake:
                mistake = new_mistake
                index_to_add = i

        # No single addition improves the error: stop.
        if index_to_add is None:
            break

        selected.add(index_to_add)

    return mistake, selected


In [183]:
def Delete(X, y, cv, added_signs = None, mistake = float("inf")):
    """Greedy backward feature elimination (the "Delete" method).

    Starting from ``added_signs``, repeatedly removes the single column whose
    removal yields the lowest cross-validated mean squared error of a
    ``LinearRegression``, as long as that error is strictly lower than the
    best error seen so far.  At least one feature is always kept.

    Parameters
    ----------
    X : 2-D array supporting ``X[:, columns]`` indexing; ``X.shape[1]`` is
        the number of available features.
    y : targets, forwarded to ``cross_val_predict`` and compared against the
        predictions with ``mean_squared_error``.
    cv : cross-validation splitter, forwarded unchanged to
        ``cross_val_predict``.
    added_signs : iterable of column indices or None
        Features to start from.  ``None`` or an empty collection means
        "start from all columns".  The argument is never modified.
    mistake : float
        Error that a reduced feature set has to beat.  The starting set
        itself is not evaluated here, so pass its error if a real baseline
        is wanted.

    Returns
    -------
    (mistake, added_signs) : (float, set)
        The best error found and a new set with the remaining column indices.
    """
    X_dim = X.shape[1]

    # Private copy: neither the default nor the caller's set is mutated.
    if added_signs is None or len(added_signs) == 0:
        selected = set(range(X_dim))
    else:
        selected = set(added_signs)

    while len(selected) >= 2:
        index_to_delete = None

        # Iterate over a sorted snapshot and build each candidate as a new
        # set, instead of removing/re-adding elements of the set being
        # iterated (which relies on interpreter implementation details).
        for i in sorted(selected):
            candidate = selected - {i}

            lin_regression = LinearRegression()
            y_predicted = cross_val_predict(lin_regression, X[:, np.array(sorted(candidate))], y = y, cv = cv)
            new_mistake = mean_squared_error(y_predicted, y)

            if new_mistake < mistake:
                mistake = new_mistake
                index_to_delete = i

        # No single removal improves the error: stop.
        if index_to_delete is None:
            break

        selected.remove(index_to_delete)

    return mistake, selected

In [188]:
def AddDelete(X, y, cv):
    """Alternate Add and Delete passes until the error stops changing.

    Each round runs ``Add`` from the current feature set, then ``Delete``
    from the result of ``Add``.  The loop ends when a full round returns
    exactly the same error as the previous round.

    Returns
    -------
    (mistake, added_signs) : tuple
        The final error and the feature set as returned by the last
        ``Delete`` call.
    """
    selected = set()
    best_error = float("inf")

    converged = False
    while not converged:
        round_error, selected = Add(X, y, cv, added_signs = selected, mistake = best_error)
        round_error, selected = Delete(X, y, cv, added_signs = selected, mistake = round_error)

        # Equal errors mean neither pass improved anything in this round.
        converged = (best_error == round_error)
        best_error = round_error

    return best_error, selected

Реализация полного перебора для получения оптимального набора признаков:

In [185]:
def FullSearch(X, y, cv):
    """Exhaustive search over every non-empty subset of columns.

    Subsets are visited by increasing size (1 .. number of columns) and, for
    a given size, in the order produced by ``itertools.combinations``.  For
    each subset a ``LinearRegression`` is scored by the mean squared error of
    its ``cross_val_predict`` predictions; the first subset reaching the
    lowest error wins.

    Returns
    -------
    (mistake, signs) : (float, set)
        The lowest error and the corresponding column indices.  If ``X`` has
        no columns the result is ``(inf, set())``.
    """
    n_features = X.shape[1]
    best_error = float("inf")
    best_subset = ()

    for size in range(1, n_features + 1):
        for subset in combinations(range(n_features), size):
            columns = np.array(list(subset))
            model = LinearRegression()
            predictions = cross_val_predict(model, X[:, columns], y = y, cv = cv)
            error = mean_squared_error(predictions, y)

            if error < best_error:
                best_error, best_subset = error, subset

    return best_error, set(best_subset)

Загрузка данных (набор Boston из sklearn.datasets):

In [186]:
# Load the Boston dataset through sklearn.datasets; the cells below use its
# `data` array as the feature matrix X and its `target` array as y.
# NOTE(review): `load_boston` is reportedly absent from recent scikit-learn
# releases -- confirm the environment provides a version that still ships it.
boston = datasets.load_boston()

Запуск методов со скользящим контролем по 20 блокам (sklearn.cross_validation.KFold):

In [181]:
# 20-fold cross-validation over all samples of the dataset.
kf = KFold(len(boston.target), n_folds = 20)

# Run every selection method with the same splitter.  Each call returns an
# (error, feature set) tuple; print(...) with a single argument produces the
# same text on Python 2 and Python 3.
for method in (Add, Delete, AddDelete, FullSearch):
    print(method(boston.data, boston.target, kf))

(29.563348682882047, set([1, 3, 4, 5, 7, 10, 11, 12]))
(29.53752708017085, set([0, 1, 3, 4, 6, 7, 8, 9, 10, 11, 12]))
(29.563348682882047, set([1, 3, 4, 5, 7, 10, 11, 12]))
(29.53752708017085, set([0, 1, 3, 4, 6, 7, 8, 9, 10, 11, 12]))


Запуск методов с критерием Leave-one-out (sklearn.cross_validation.LeaveOneOut):

In [187]:
# Leave-one-out cross-validation: one split per sample of the dataset.
loo = LeaveOneOut(len(boston.target))

# Run every selection method with the same splitter.  Each call returns an
# (error, feature set) tuple; print(...) with a single argument produces the
# same text on Python 2 and Python 3.
for method in (Add, Delete, AddDelete, FullSearch):
    print(method(boston.data, boston.target, loo))

(23.516933914117438, set([0, 1, 3, 4, 5, 7, 8, 9, 10, 11, 12]))
(23.516933914117438, set([0, 1, 3, 4, 5, 7, 8, 9, 10, 11, 12]))
(23.516933914117438, set([0, 1, 3, 4, 5, 7, 8, 9, 10, 11, 12]))
(23.516933914117438, set([0, 1, 3, 4, 5, 7, 8, 9, 10, 11, 12]))
