# [75.06 / 95.58] Organización de Datos <br> Trabajo Práctico 2: Machine Learning
# Feature Selection

**Grupo 30: Datatouille**

**http://fdelmazo.github.io/7506-Datos/**

Fuentes:

* http://people.stat.sfu.ca/~lockhart/richard/350/08_2/lectures/VariableSelection/web.pdf

In [None]:
def get_feature_selection():
    """Return the feature lists chosen by each selection strategy.

    The data is wrapped in a function so that it can be imported.

    Returns:
        dict: one key per strategy in this notebook (progressive, forward,
        backward, stepwise), each mapped to its own list of feature names.
        All four lists are currently empty.
    """
    estrategias = (
        'best_features_progresivo',
        'best_features_forward',
        'best_features_backward',
        'best_features_stepwise',
    )
    # The comprehension creates a separate list object for every key.
    return {nombre: [] for nombre in estrategias}

In [None]:
import nbimporter # pip install nbimporter
import pandas as pd
from parameter_tuning import get_hiper_params
import submission_framework as SF

# Feature table read from data/user-features.csv, indexed by its 'person' column.
df_users = pd.read_csv('data/user-features.csv',low_memory=False).set_index('person')
# Training labels: rows are grouped by 'person' and summed, leaving one row per person.
df_y = pd.read_csv('data/labels_training_set.csv').groupby('person').sum()

# Passed as random_state to the classifier built further down in this notebook.
seed=42
# Hyper-parameters from parameter_tuning; later indexed by model name.
hiper_params = get_hiper_params()

# Notebook preview of the first rows of the feature table.
df_users.head()

---
---
---


Usando Random Forest, el algoritmo más estable de los definidos (XGBoost es poco estable), encontramos qué combinación de features es la más favorable (según la métrica Area Under Curve).

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Key under which this model's hyper-parameters are stored in hiper_params.
model_name = 'random_forest'
params = hiper_params[model_name]
# random_state is pinned to `seed` so the forest is built reproducibly.
model = RandomForestClassifier(**params,random_state=seed)
# (name, estimator) pair: the form in which the model is handed to SF in this notebook.
random_forest = (model_name,model)

# Baseline run; no `columns` argument is given here.
# NOTE(review): presumably this evaluates the model on every column of df_users —
# confirm in submission_framework.
SF.full_framework_wrapper(df_users,df_y,random_forest)

In [None]:
# Every column of df_users is a candidate feature.
full_features = df_users.columns.tolist()
cantidad_features = len(full_features)

# Cache of already-evaluated feature subsets: tuple of column names -> AUC.
# The selection strategies in this notebook all read and write it, so a subset
# (in a given column order) is only scored once.
tried = {}

## Cumulative Importance

Se parte de una lista de todos los features ordenados según importancia, y a partir de ella se genera una lista de listas acumulativa. Es decir, de `[a,b,c]` se pasa a `[ [a], [a,b], [a,b,c] ]`.

Esto se hace porque se está buscando el 'codo': los features que hacen que incremente el AUC.

In [None]:
# Feature names in the order given by the index of SF.get_feature_importances.
feature_importances = SF.get_feature_importances(df_users,df_y,random_forest)
features_ordenados = feature_importances.index.tolist()
# Growing prefixes of that ranking: [a], [a, b], ... The range stops one short
# of the full length, so the complete list itself is not among the prefixes.
lista_progresiva_de_cols = [
    features_ordenados[:hasta]
    for hasta in range(1, len(features_ordenados))
]

# Score of the run without a `columns` argument: the value every prefix has to beat.
model, max_auc = SF.full_framework_wrapper(df_users,df_y,random_forest)
best_features_progresivo = features_ordenados
# Last feature of every prefix that raised the best score (the "jumps").
features_con_saltos_progresivo = []

for i, cols in enumerate(lista_progresiva_de_cols):
    print(f'\n\nIteración {i+1} de {len(lista_progresiva_de_cols)}\n\n')
    if tuple(cols) in tried:
        # Already scored earlier: reuse the cached AUC.
        auc = tried[tuple(cols)]
    else:
        model, auc = SF.full_framework_wrapper(df_users,df_y,random_forest,columns=cols,verbosity=1)
        tried[tuple(cols)] = auc
    # A prefix only counts as better when it improves the best AUC by more than 0.0001.
    if auc > max_auc + 0.0001:
        features_con_saltos_progresivo.append(cols[-1])
        best_features_progresivo = cols
        max_auc = auc

In [None]:
best_features_progresivo

##  Forward Selection

Se parte de una lista vacía y se van agregando todos los features uno por uno.

In [None]:
def step_forward(features_usadas, features_sin_usar):
    """Find the unused feature whose addition gives the best AUC.

    Every candidate subset ``features_usadas + [f]`` is scored with
    ``SF.full_framework_wrapper``. Scores are memoised in the module-level
    ``tried`` dict, keyed by the tuple of column names, so a subset that was
    already evaluated (in the same column order) is not scored again.

    Args:
        features_usadas: list of feature names already selected. Not mutated.
        features_sin_usar: candidate feature names to try. Not mutated.

    Returns:
        tuple: ``(auc, feature)`` with the best score found and the feature
        that produced it. ``feature`` is ``None`` when there are no
        candidates (``auc`` is then the score of ``features_usadas`` alone)
        or when no candidate scores above 0.0001.
    """
    if not features_sin_usar:
        # Nothing left to add: just report the score of the current subset.
        return SF.full_framework_wrapper(df_users,df_y,random_forest,columns=features_usadas,verbosity=1)[1], None
    max_auc = 0
    la_posta = None
    for f in features_sin_usar:
        features_a_probar = features_usadas + [f]
        clave = tuple(features_a_probar)  # lists are unhashable; the tuple is the cache key
        if clave not in tried:
            # Only the second element of the wrapper's result is needed here.
            _, auc = SF.full_framework_wrapper(df_users,df_y,random_forest,columns=features_a_probar,verbosity=1)
            tried[clave] = auc
        else:
            auc = tried[clave]
        # A candidate replaces the current best only if it improves on it by
        # more than 0.0001, so near-ties keep the earlier candidate.
        if auc > max_auc + 0.0001:
            la_posta = f
            max_auc = auc

    return max_auc, la_posta

In [None]:
# No subset recorded yet; with max_auc at 0 the first round's score only has
# to exceed 0.0001 to be recorded.
best_features_forward = None
max_auc = 0
features_usadas = []
# Copy, so full_features itself is left untouched by the removals below.
features_sin_usar = full_features[:]

# One round per feature: every round moves exactly one feature from
# features_sin_usar to features_usadas.
for i in range(cantidad_features):
    print(f'\n\nIteración {i+1} de {cantidad_features}')
    print(f'Lo mejor al momento: {best_features_forward} (AUC: {max_auc:.4f}) \n\n')
    auc, feature_a_agregar = step_forward(features_usadas, features_sin_usar)
    # The chosen feature is kept even when it did not raise the best score.
    features_usadas.append(feature_a_agregar)
    features_sin_usar.remove(feature_a_agregar)
    # Snapshot (copy) the subset only when it beats the best AUC by more than 0.0001.
    if auc > max_auc + 0.0001:
        best_features_forward = features_usadas[:]
        max_auc = auc   

In [None]:
best_features_forward

##  Backward Elimination

Se parte de una lista con todos los features y se van sacando uno por uno, en búsqueda de cuál hace que incremente el AUC una vez que se lo remueve.

In [None]:
def step_backward(features):
    """Find the feature whose removal gives the best AUC.

    For every ``f`` in ``features`` the subset without (the first occurrence
    of) ``f`` is scored with ``SF.full_framework_wrapper``. Scores are
    memoised in the module-level ``tried`` dict, keyed by the tuple of column
    names.

    Args:
        features: list of feature names currently in use. Not mutated.

    Returns:
        tuple: ``(auc, feature)`` with the best score found and the feature
        whose removal produced it. ``feature`` is ``None`` when ``features``
        has a single element (``auc`` is then the score of that one feature)
        or when no reduced subset scores above 0.0001.
    """
    if len(features) == 1:
        # Marker printed when the single-feature case is reached.
        print('atroden')
        return SF.full_framework_wrapper(df_users,df_y,random_forest,columns=features,verbosity=1)[1], None
    max_auc = 0
    la_posta = None
    for f in features:
        # Work on a copy so the caller's list is left intact.
        features_a_probar = features[:]
        features_a_probar.remove(f)
        clave = tuple(features_a_probar)  # lists are unhashable; the tuple is the cache key
        if clave not in tried:
            # Only the second element of the wrapper's result is needed here.
            _, auc = SF.full_framework_wrapper(df_users,df_y,random_forest,columns=features_a_probar,verbosity=1)
            tried[clave] = auc
        else:
            auc = tried[clave]
        # A candidate replaces the current best only if it improves on it by
        # more than 0.0001, so near-ties keep the earlier candidate.
        if auc > max_auc + 0.0001:
            la_posta = f
            max_auc = auc

    return max_auc, la_posta

In [None]:
# Start from the complete feature list; its score is the value to beat.
features = full_features[:]
best_features_backward = features[:]
max_auc = SF.full_framework_wrapper(df_users,df_y,random_forest,columns=best_features_backward)[1]

# cantidad_features-1 rounds, each dropping one feature: the last call to
# step_backward receives two features and the loop ends with one left.
for i in range(1,cantidad_features):
    print(f'\n\nIteración {i} de {cantidad_features-1}')
    print(f'Lo mejor al momento: {best_features_backward} (AUC: {max_auc:.4f}) \n\n')
    auc, feature_a_borrar = step_backward(features)
    # The feature is dropped even when its removal did not raise the best score.
    features.remove(feature_a_borrar)
    # Snapshot (copy) the reduced set only when it beats the best AUC by more than 0.0001.
    if auc > max_auc + 0.0001:
        best_features_backward = features[:]
        max_auc = auc

In [None]:
best_features_backward

##  Stepwise Regression

Combinación de Forward Selection y Backward Elimination

In [None]:
# Stepwise search: every round adds one feature with step_forward and then
# tries to prune the resulting subset with repeated step_backward calls.
best_features_stepwise = []
max_auc = 0
features_usadas = []
# Copy, so full_features itself is left untouched by the removals below.
features_sin_usar = full_features[:]

for i in range(cantidad_features):
    print(f'\n\nIteración {i+1} de {cantidad_features}')
    print(f'Lo mejor al momento: {best_features_stepwise} (AUC: {max_auc:.4f}) \n\n')
    # Forward half: move the best candidate into the working subset.
    auc_loc, feature_a_agregar = step_forward(features_usadas, features_sin_usar)
    features_usadas.append(feature_a_agregar)
    features_sin_usar.remove(feature_a_agregar)
    # Backward half works on a copy; features_usadas is only replaced on improvement.
    features_to_stepwisear = features_usadas[:]
    iteraciones = len(features_to_stepwisear)
    
    print(f'\nStepWise: {features_to_stepwisear}\n')
    # The inner loop has its own variable so it no longer reuses the outer
    # loop's index `i`. It runs len-1 times, so step_backward never receives
    # fewer than two features.
    for _paso in range(1,iteraciones):
        auc, feature_a_borrar = step_backward(features_to_stepwisear)
        features_to_stepwisear.remove(feature_a_borrar)
        # Any strict improvement is accepted here (no 0.0001 margin).
        if auc > auc_loc:
            features_usadas = features_to_stepwisear[:]
            auc_loc = auc

    # Record the round's subset when it beats the best score seen so far.
    if auc_loc > max_auc:
        best_features_stepwise = features_usadas[:]
        max_auc = auc_loc

In [None]:
best_features_stepwise