In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [2]:
def load_aug_data():
    """Read the train/eval files and return them with extra engineered columns.

    Three tab-separated, header-less files are read from ``../data``:
    ``ticdata2000.txt`` (85 feature columns plus the response as the 86th
    column), ``ticeval2000.txt`` (85 feature columns) and ``tictgts2000.txt``
    (the response for the evaluation rows).

    Two groups of columns are appended to both feature matrices:

    * a copy of each column at the 0-based positions in ``vars_to_change``
      whose values 0-9 are merged pairwise into 1-5 (0,1 -> 1; 2,3 -> 2; ...),
      named ``X<k>_2``;
    * one-hot indicators for column ``X5`` over categories 1..10 with
      category 4 dropped, named ``X5_2_<category>``.

    Returns
    -------
    X_train : numpy.ndarray
        Training features: 85 original + 8 reduced + 9 indicator columns.
    y_train : numpy.ndarray
        Training response, 1-D.
    X_eval : numpy.ndarray
        Evaluation features with the same column layout as ``X_train``.
    y_eval : numpy.ndarray
        Evaluation response. NOTE: it is built from a one-column DataFrame, so
        it is 2-D (a column vector) while ``y_train`` is 1-D; the shape is kept
        as-is so existing callers are not affected.
    names : numpy.ndarray
        Column names matching the columns of ``X_train`` / ``X_eval``.
    """
    # Column names X1 .. X86; the last one (X86) is the response variable.
    names = np.array(['X' + str(i + 1) for i in range(86)])
    data = pd.read_csv('../data/ticdata2000.txt', delimiter="\t", header=None, names=names)

    X_train = data.iloc[:, 0:85]  # every column except the response
    y_train = data.iloc[:, 85]    # response only

    X_eval = pd.read_csv('../data/ticeval2000.txt', delimiter="\t", header=None, names=names[:-1])
    y_eval = pd.read_csv('../data/tictgts2000.txt', delimiter="\t", header=None, names=['X86'])

    # 0-based positions of the columns that get a reduced-class copy.
    vars_to_change = [9, 18, 22, 23, 29, 31, 35, 36]
    new_vars = np.array(['X' + str(i + 1) + '_2' for i in vars_to_change])
    names = np.concatenate((names[:-1], new_vars))
    # Merge the 10 original levels pairwise into 5.
    class_reducer = {0: 1, 1: 1, 2: 2, 3: 2, 4: 3, 5: 3, 6: 4, 7: 4, 8: 5, 9: 5}

    def _with_reduced_columns(frame):
        """Return ``frame`` plus reduced-class copies of ``vars_to_change``."""
        reduced = frame.iloc[:, vars_to_change].replace(class_reducer)
        augmented = pd.concat([frame, reduced], axis=1)
        augmented.columns = names
        return augmented

    # The same augmentation is applied to the training and evaluation sets.
    X_train = np.array(_with_reduced_columns(X_train))
    X_eval = np.array(_with_reduced_columns(X_eval))
    y_train = np.array(y_train)
    y_eval = np.array(y_eval)

    # One-hot encode column X5 (position 4) over a fixed category list so the
    # train and eval encodings always share the same layout.
    x5_categories = list(range(1, 11))
    x5_dropped = 4
    encoder_options = dict(drop=[x5_dropped], categories=[x5_categories])
    try:
        # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`.
        oh = OneHotEncoder(sparse_output=False, **encoder_options)
    except TypeError:
        # Older scikit-learn releases only know the `sparse` keyword.
        oh = OneHotEncoder(sparse=False, **encoder_options)

    dummies_train = oh.fit_transform(X_train[:, 4].reshape(-1, 1))
    dummies_eval = oh.transform(X_eval[:, 4].reshape(-1, 1))
    X_train = np.concatenate((X_train, dummies_train), axis=1)
    X_eval = np.concatenate((X_eval, dummies_eval), axis=1)

    # Name one indicator per kept category, derived from the encoder settings
    # so the names cannot drift from the encoded columns.
    X_5_kept_cats = [cat for cat in x5_categories if cat != x5_dropped]
    names = np.concatenate((names, ['X5_2_' + str(i) for i in X_5_kept_cats]))

    return X_train, y_train, X_eval, y_eval, names

In [3]:
# Load the augmented training/evaluation arrays and the matching column names.
X, y, X_eval, y_eval, feature_names = load_aug_data()

In [4]:
# Groups of column indices used to build candidate feature sets.
# NOTE(review): presumably 0-based positions into X / feature_names as returned
# by load_aug_data; the column names in the comments below assume that.
useful_num_vars = [43, 46, 53, 58, 60, 63]  # X44, X47, X54, X59, X61, X64
useful_zip_vars = [
    4, 17,               # X5, X18
    85, 86,              # X10_2, X19_2
    88, 89, 90, 91, 92,  # X24_2, X30_2, X32_2, X36_2, X37_2
]
X_5 = [*range(93, 102)]  # the nine X5_2_<category> indicator columns

In [5]:
# Sets_of features to try
from itertools import chain, combinations

def powerset(iterable):
    """Iterate over every non-empty combination of the items in ``iterable``.

    Combinations are produced by increasing size; the empty tuple is NOT
    included:

        powerset([1,2,3]) --> (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
    """
    items = list(iterable)
    sizes = range(1, len(items) + 1)  # start at 1 to skip the empty subset
    by_size = (combinations(items, size) for size in sizes)
    return chain.from_iterable(by_size)

# Every non-empty combination of the three index groups, and a parallel list
# with the group names in the same order (both come from the same powerset
# enumeration, so position k in one matches position k in the other).
pre_feature_sets = list(powerset((useful_num_vars, useful_zip_vars, X_5)))
feature_sets_names = list(powerset(('useful_num_vars', 'useful_zip_vars', 'X_5')))

# Flatten each combination of groups into a single flat list of indices.
feature_sets = []
for _set in pre_feature_sets:
    compact_set = list(chain.from_iterable(_set))
    feature_sets.append(compact_set)

In [8]:
def run_model(Model, model_vars, parameters=None, verbose=False):
    """Fit ``Model`` on a column subset and report test and evaluation metrics.

    Relies on names defined elsewhere in the notebook: ``prep_X_input``,
    ``model_metrics``, ``LinearRegression`` and the targets ``y_train``,
    ``y_test`` and ``y_eval``.

    Parameters
    ----------
    Model : callable
        Estimator class; it is instantiated as ``Model(**parameters)`` and
        fitted with ``.fit(X_train_subset, y_train)``.
    model_vars
        Column selection, forwarded unchanged to ``prep_X_input``.
    parameters : dict, optional
        Keyword arguments for the ``Model`` constructor. ``None`` (the
        default) means no arguments; a fresh dict is used on every call
        instead of a shared mutable default.
    verbose : bool, optional
        Accepted for backward compatibility; currently not used.

    Returns
    -------
    tuple
        ``(recall_test, svm_test, recall_eval, svm_eval)``: the pair returned
        by ``model_metrics`` for the test set followed by the pair for the
        evaluation set.
    """
    if parameters is None:
        parameters = {}

    # Subset the parent datasets down to the columns to use.
    X_train_subset, X_test_subset, X_eval_subset = prep_X_input(model_vars)

    mod = Model(**parameters).fit(X_train_subset, y_train)

    # LinearRegression has no predict_proba, so its raw predictions are used as
    # scores; every other model is scored with its second predict_proba column.
    # NOTE(review): the check is an exact class comparison, so any other
    # estimator without predict_proba will fail in the else branch.
    if (Model == LinearRegression):
        probs_test = mod.predict(X_test_subset)
        probs_eval = mod.predict(X_eval_subset)
    else:
        probs_test = mod.predict_proba(X_test_subset)[:, 1]
        probs_eval = mod.predict_proba(X_eval_subset)[:, 1]

    print("Utilizando la de test")
    recall_test, svm_test = model_metrics(probs_test, y_test)
    print("Utilizando la de evaluación")
    recall_eval, svm_eval = model_metrics(probs_eval, y_eval)

    return (recall_test, svm_test, recall_eval, svm_eval)

array([[33.,  1.,  3., ...,  1.,  0.,  0.],
       [37.,  1.,  2., ...,  1.,  0.,  0.],
       [37.,  1.,  2., ...,  1.,  0.,  0.],
       ...,
       [33.,  1.,  3., ...,  1.,  0.,  0.],
       [34.,  1.,  3., ...,  1.,  0.,  0.],
       [33.,  1.,  3., ...,  1.,  0.,  0.]])