# [75.06 / 95.58] Organización de Datos <br> Trabajo Práctico 2: Machine Learning

# Submission Framework

**Grupo 30: Datatouille**

**http://fdelmazo.github.io/7506-Datos/**

En este notebook se definen unas simples funciones de I/O para armar las postulaciones de predicciones del trabajo práctico.

Uso:

1. Se crean la matriz `X` y el vector `y` para entrenar

2. Se hace un split para generar los sets de entrenamiento y de prueba

3. Se ejecuta el algoritmo de ML (debe devolver un dataframe con person en el índice y labels como única columna)

4. Se ve la precisión de la predicción

5. Se obtiene la métrica AUC (Area Under the Receiver Operating Characteristic Curve)

6. Se predicen las probabilidades

7. Se ve información relevante de la ejecución

8. Se guarda como un csv

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn import preprocessing
import numpy as np
import pandas as pd
import os.path

# Per-user feature table, re-indexed by the `person` column.
df_users = pd.read_csv('data/user-features.csv', low_memory=False)
df_users = df_users.set_index('person')

# Training labels collapsed to one row per `person` (rows sharing a person are summed).
df_y = pd.read_csv('data/labels_training_set.csv')
df_y = df_y.groupby('person').sum()

In [None]:
def assert_equals(x, y):
    """Soft equality check: report a mismatch instead of raising.

    Returns True when ``x == y``. Otherwise prints a message naming both
    values and returns False, leaving it to the caller to decide how to bail out.
    """
    if x == y:
        return True

    print(f'{x} no equivale a {y}')
    return False

def df_label_xor(df1, df2):
    """Return the rows whose index appears in exactly one of the two frames.

    Both frames are outer-joined on their indexes with pandas' merge
    indicator enabled; rows flagged as present in both sides are discarded.
    The returned frame still carries the ``_merge`` indicator column.
    """
    outer_joined = df1.merge(
        df2,
        how='outer',
        left_index=True,
        right_index=True,
        indicator=True,
    )
    return outer_joined.query('_merge != "both"')

In [None]:
def fr1_extract_X_y(df, df_y, normalize=False):
    """Build the training feature matrix ``X`` and label array ``y``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; must have exactly 38829 rows.
    df_y : pandas.DataFrame
        Label table sharing ``df``'s index; must have exactly 19414 rows and
        contribute a ``label`` column to the merge.
    normalize : bool
        When True, ``X`` is rescaled with ``MinMaxScaler`` fitted on ``X`` itself.

    Returns
    -------
    (X, y) as numpy arrays, or ``None`` when any of the row-count checks
    fails (the mismatch is printed by ``assert_equals``).
    ``y`` is 2-D, with one column per column of ``df_y``.
    """
    if not assert_equals(len(df), 38829): return
    if not assert_equals(len(df_y), 19414): return

    data = df.merge(df_y, how='inner', left_index=True, right_index=True)
    if not assert_equals(len(data), 19414): return

    X = data.drop('label', axis=1).values
    if normalize:
        X = preprocessing.MinMaxScaler().fit_transform(X)

    # Read the labels from the merged frame, not from `df_y` directly: the
    # rows of `X` follow the merged frame's order, which is not guaranteed to
    # match `df_y`'s order. Selecting with `df_y.columns` keeps the same 2-D
    # shape that `df_y.values` had.
    y = data[df_y.columns].values

    return X, y


def fr2_train_test_split(X, y, seed, test_size):
    """Split ``X`` and ``y`` into train and test partitions.

    The split is stratified on ``y`` and seeded with ``seed``; ``test_size``
    is forwarded unchanged to ``train_test_split``.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    split_options = dict(test_size=test_size, stratify=y, random_state=seed)
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
    return (X_train, X_test, y_train, y_test)


def fr4_metric_score(X_test, y_test, model, model_name):
    """Score a fitted model on the held-out split.

    Accuracy is computed on the rounded output of ``model.predict``; AUC is
    computed by a ``roc_auc_score`` scorer invoked on the model itself.
    ``model_name`` is accepted but not used.

    Returns ``(accuracy, auc)``, or ``None`` when the number of predictions
    does not match ``len(y_test)``.
    """
    predicted = model.predict(X_test)
    if not assert_equals(len(y_test), len(predicted)):
        return

    # Predictions are rounded first, so a model that emits continuous scores
    # from `predict` can still be scored for accuracy.
    accuracy = accuracy_score(y_test, predicted.round())

    # NOTE(review): `needs_threshold` appears to be deprecated in recent
    # scikit-learn releases - confirm against the installed version.
    auc_scorer = make_scorer(roc_auc_score, needs_threshold=True)
    auc = auc_scorer(model, X_test, y_test)

    return accuracy, auc


def fr5_extract_X_to_predict(df, df_y, model):
    """Select the rows to predict and score them with ``model``.

    The rows are those present in only one of ``df`` / ``df_y`` (see
    ``df_label_xor``), with the ``label`` and ``_merge`` columns removed.
    Exactly 19415 such rows are expected.

    Returns ``(data, predictions)`` where ``data`` is that DataFrame and
    ``predictions`` is a 1-D numpy array holding the second column of
    ``model.predict_proba`` (presumably the positive-class probability).
    Returns ``None`` if any row-count check fails.
    """
    if not assert_equals(len(df), 38829):
        return
    if not assert_equals(len(df_y), 19414):
        return

    unlabeled = df_label_xor(df, df_y).drop(['label', '_merge'], axis=1)
    if not assert_equals(len(unlabeled), 19415):
        return

    class_probabilities = model.predict_proba(unlabeled.values)
    second_column = np.array([row[1] for row in class_probabilities])

    return unlabeled, second_column


def fr6_print_information(df, model, X_to_predict, with_features_importance):
    """Tabulate the model's feature importances, most important first.

    Returns a one-column DataFrame (``importance``) indexed by ``df.columns``
    and sorted in descending order, or ``None`` when
    ``with_features_importance`` is falsy. ``X_to_predict`` is not used.
    """
    if with_features_importance:
        importance_table = pd.DataFrame(
            model.feature_importances_,
            index=df.columns,
            columns=['importance'],
        )
        return importance_table.sort_values('importance', ascending=False)

    return None
    
    
def fr7_train_final_model(algorithm, X, y):
    """Fit ``algorithm`` on ``X`` and ``y`` and return whatever ``fit`` returns."""
    fitted = algorithm.fit(X, y)
    return fitted
    
    
def fr8_to_csv(df, predictions, model_name, accuracy):
    """Write the predictions to a submission CSV in the working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index identifies the predicted rows; only the index is used.
    predictions : array-like
        One value per row of ``df``; becomes the ``label`` column.
    model_name : str
        Embedded in the output file name.
    accuracy : float
        Score embedded in the file name, formatted with four decimals.

    Returns
    -------
    ``(name_csv, submission)`` - the file name written and the ``label``
    Series - or ``None`` when the submission does not have 19415 rows.
    """
    # Assign the column on a fresh frame that shares `df`'s index, so the
    # caller's DataFrame is not given a `label` column as a side effect.
    submission = pd.DataFrame(index=df.index)
    submission['label'] = predictions
    submission = submission['label']

    if not assert_equals(len(submission), 19415): return

    name_csv = f'submission-{model_name}-{accuracy:.4f}.csv'
    submission.to_csv(name_csv, header=True)
    return name_csv, submission

In [None]:
def _submit_and_report(model, auc, X_to_predict, predictions, model_name, columns):
    """Write the submission CSV, display its name and a run summary, and return the 4-tuple result."""
    csv_name, submission = fr8_to_csv(X_to_predict, predictions, model_name, auc)
    display(csv_name)
    message = f"{model_name} - {model.get_params()} - {columns}"
    display(message)
    return model, auc, csv_name, message


def full_framework_wrapper(model_name, model_function, 
                           columns=None, normalize=False, 
                           test_size=0.34, seed=42, 
                           verbosity=0, all_in=False, submit=False, n_ensamble=0):
    """Run the whole pipeline: extract, split, train, score, predict, optionally submit.

    Parameters
    ----------
    model_name : str
        Label used in printed output and in the submission file name. It gets
        an ``_ensamble_<n>`` and/or ``_all_in`` suffix when those modes are on.
    model_function : callable
        Called as ``model_function(X, y, seed)``; must return a fitted model.
    columns : list, optional
        Columns of ``df_users`` to use as features. ``None`` (the default)
        means every column of ``df_users`` at call time.
    normalize : bool
        Forwarded to ``fr1_extract_X_y``.
        NOTE(review): only the training matrix is scaled there; the rows
        scored in ``fr5_extract_X_to_predict`` are passed to the model
        unscaled - confirm this is intended before using ``normalize=True``.
    test_size, seed
        Forwarded to ``fr2_train_test_split``; ``seed`` is also given to
        ``model_function``.
    verbosity : int
        Single-model mode only: ``>= 1`` prints the columns and AUC, ``0``
        prints accuracy and AUC, negative values print nothing.
    all_in : bool
        After scoring on the held-out split, refit on the full ``X``/``y``
        before producing the final predictions.
    submit : bool
        When True, write the submission CSV and return the extended result.
    n_ensamble : int
        When non-zero, train that many models with different seeds and
        average their predictions and scores.

    Returns
    -------
    ``(model, auc)``, or ``(model, auc, csv_name, message)`` when ``submit``
    is True. In ensemble mode ``model`` is the last model trained and ``auc``
    is the mean over all members.
    """
    # Resolve the default at call time, so a reloaded `df_users` is picked up
    # rather than a column list frozen when the function was defined.
    if columns is None:
        columns = df_users.columns.tolist()

    model_df_x = df_users[columns]
    model_df_y = df_y

    if n_ensamble: model_name += f'_ensamble_{n_ensamble}'
    if all_in: model_name += '_all_in'

    X, y = fr1_extract_X_y(model_df_x, model_df_y, normalize)
    X_train, X_test, y_train, y_test = fr2_train_test_split(X, y, seed, test_size)

    if n_ensamble != 0:

        total_predictions = 0
        tmp_seed = seed
        accuracy, auc = 0, 0
        for i in range(n_ensamble):
            print(f'Iteración {i+1} de ensamble de {n_ensamble}')
            # The offset accumulates (seed, seed+1, seed+3, seed+6, ...);
            # kept as is so previously generated ensembles stay reproducible.
            tmp_seed = tmp_seed + i
            model = model_function(X_train, y_train, tmp_seed)
            accuracy_tmp, auc_tmp = fr4_metric_score(X_test, y_test, model, model_name)
            accuracy += accuracy_tmp
            auc += auc_tmp
            if all_in: model = model_function(X, y, tmp_seed)
            X_to_predict, predictions = fr5_extract_X_to_predict(model_df_x, model_df_y, model)
            total_predictions = total_predictions + predictions

        predictions = total_predictions / n_ensamble
        accuracy /= n_ensamble
        auc /= n_ensamble

        # Clamp any negative averaged prediction to zero, reporting each one.
        for i in np.flatnonzero(predictions < 0):
            print("WARNING: prediction[{}] = {}".format(i, predictions[i]))
            predictions[i] = 0

        if submit:
            return _submit_and_report(model, auc, X_to_predict, predictions, model_name, columns)

        return model, auc

    model = model_function(X_train, y_train, seed)
    accuracy, auc = fr4_metric_score(X_test, y_test, model, model_name)

    if all_in:
        model = model_function(X, y, seed)

    X_to_predict, predictions = fr5_extract_X_to_predict(model_df_x, model_df_y, model)

    if verbosity >= 1:
        print('Model: {} - Columns: {} (AUC: {:.4f})'.format(model_name, columns, auc))
    elif verbosity >= 0:
        print('Model: {} - Accuracy: {:.4f} - AUC: {:.4f}'.format(model_name, accuracy, auc))

    if submit:
        return _submit_and_report(model, auc, X_to_predict, predictions, model_name, columns)

    return model, auc

In [None]:
def get_feature_importances(model_name, model_function, 
                            columns=None, normalize=False,
                            test_size=0.34, seed=42):
    """Train one model on the training split and return its feature importances.

    Parameters
    ----------
    model_name : str
        Forwarded to ``fr4_metric_score``.
    model_function : callable
        Called as ``model_function(X_train, y_train, seed)``; the returned
        model must expose ``feature_importances_``.
    columns : list, optional
        Columns of ``df_users`` to use as features. ``None`` (the default)
        means every column of ``df_users`` at call time.
    normalize, test_size, seed
        Forwarded to ``fr1_extract_X_y`` / ``fr2_train_test_split``.

    Returns
    -------
    The DataFrame produced by ``fr6_print_information``: one ``importance``
    column indexed by feature name, sorted in descending order.
    """
    # Resolve the default at call time, so a reloaded `df_users` is picked up
    # rather than a column list frozen when the function was defined.
    if columns is None:
        columns = df_users.columns.tolist()

    model_df_x = df_users[columns]
    model_df_y = df_y

    X, y = fr1_extract_X_y(model_df_x, model_df_y, normalize)
    X_train, X_test, y_train, y_test = fr2_train_test_split(X, y, seed, test_size)
    model = model_function(X_train, y_train, seed)
    # The scores are not used below; the scoring and prediction steps are
    # kept so the same sanity checks run as in the full pipeline.
    accuracy, auc = fr4_metric_score(X_test, y_test, model, model_name)
    X_to_predict, predictions = fr5_extract_X_to_predict(model_df_x, model_df_y, model)

    return fr6_print_information(model_df_x, model, X_to_predict, True)

def get_full_features():
    """Return the names of all columns of the module-level ``df_users`` as a list."""
    feature_index = df_users.columns
    return feature_index.tolist()