# TP 2 Organización de Datos

***

**Imports**

In [1]:
%matplotlib inline

from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier
import xgboost as xgb

**Carga de datos**

In [2]:
train_values = pd.read_csv('data/train_values.csv', index_col='building_id')
train_labels = pd.read_csv('data/train_labels.csv', index_col='building_id')
test_values = pd.read_csv('data/test_values.csv', index_col='building_id')
submission_format  = pd.read_csv('data/submission_format.csv', index_col='building_id')
#SACO EL FEATURE QUE NO APORTA 'land_surface_condition' y/o 'has_secondary_use'
#train_values.drop(columns=['land_surface_condition'],inplace=True)
#test_values.drop(columns=['land_surface_condition'],inplace=True)

In [3]:
# INFO PREVIA AL FE
train_values.info()
#test_values.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 802906 to 747594
Data columns (total 38 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   geo_level_1_id                          260601 non-null  int64 
 1   geo_level_2_id                          260601 non-null  int64 
 2   geo_level_3_id                          260601 non-null  int64 
 3   count_floors_pre_eq                     260601 non-null  int64 
 4   age                                     260601 non-null  int64 
 5   area_percentage                         260601 non-null  int64 
 6   height_percentage                       260601 non-null  int64 
 7   land_surface_condition                  260601 non-null  object
 8   foundation_type                         260601 non-null  object
 9   roof_type                               260601 non-null  object
 10  ground_floor_type                       260601 non-

**Funciones**

In [4]:
def submitear(predictions):
    my_submission = pd.DataFrame(data=predictions,
                                 columns=submission_format.columns,
                                 index=submission_format.index)
    my_submission.to_csv('submission.csv')

In [5]:
# FILTRO DE AGE 995
def filtrarAge(df_values):
    
    values = df_values
    #REEMPLAZO POR UN VALOR NO TAN GRANDE (no cambia mucho)
    values.loc[values['age']==995,'age'] = (df_values['age'].max() + 50)
    
    return values

In [6]:
def calcularF1(predictor,X_test,y_test):
    y_pred = predictor.predict(X_test)
    return f1_score(y_test, y_pred, average='micro')

In [7]:
#d1,d2=exp_dato_cat('foundation_type',['h','i','r','u','w'])
#d1,d2

**Feature Engineering**

In [91]:
values_onehot = pd.get_dummies(train_values.iloc[:,:14])
values_onehot.info() #.sum().nsmallest(25).to_frame().rename(columns={0:"Sumatoria"})
#values_onehot.head(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 802906 to 747594
Data columns (total 41 columns):
 #   Column                    Non-Null Count   Dtype
---  ------                    --------------   -----
 0   geo_level_1_id            260601 non-null  int64
 1   geo_level_2_id            260601 non-null  int64
 2   geo_level_3_id            260601 non-null  int64
 3   count_floors_pre_eq       260601 non-null  int64
 4   age                       260601 non-null  int64
 5   area_percentage           260601 non-null  int64
 6   height_percentage         260601 non-null  int64
 7   land_surface_condition_n  260601 non-null  uint8
 8   land_surface_condition_o  260601 non-null  uint8
 9   land_surface_condition_t  260601 non-null  uint8
 10  foundation_type_h         260601 non-null  uint8
 11  foundation_type_i         260601 non-null  uint8
 12  foundation_type_r         260601 non-null  uint8
 13  foundation_type_u         260601 non-null  uint8
 14  foundation_type

In [9]:
# FEATURE VULNERABILIDAD ESTRUCTURA CIMIENTO
# nota: usar después del encoding
MULT = 10
CF_CIM_I = -0.35 * MULT #* np.log10(221) 
CF_CIM_W = -0.25 * MULT #* np.log10(1495) 
CF_CIM_U = -0.12 * MULT #* np.log10(2030) 
CF_CIM_H = 0.02 * MULT #* np.log10(511) 
CF_CIM_R = 0.37 * MULT #* np.log10(82961) 


def calcularVulCim(df, nombre):
    df[nombre] = 1 + df['foundation_type_h'] * CF_CIM_H   \
            + df['foundation_type_i'] * CF_CIM_I  \
            + df['foundation_type_r'] * CF_CIM_R  \
            + df['foundation_type_u'] * CF_CIM_U  \
            + df['foundation_type_w'] * CF_CIM_W 

In [10]:
# FEATURE ESTUDIO DE LA CONFIGURACION ANTE SISMOS 

CF_PLAN_CONFIGURATION_M = 0.10 * MULT
CF_PLAN_CONFIGURATION_C = 0.10 * MULT
CF_PLAN_CONFIGURATION_A = 0.13 * MULT
CF_PLAN_CONFIGURATION_U = 0.14 * MULT
CF_PLAN_CONFIGURATION_O = 0.15 * MULT
CF_PLAN_CONFIGURATION_S = 0.20 * MULT
CF_PLAN_CONFIGURATION_F = 0.25 * MULT
CF_PLAN_CONFIGURATION_N = 0.30 * MULT
CF_PLAN_CONFIGURATION_D = 0.32 * MULT
CF_PLAN_CONFIGURATION_Q = 0.45 * MULT

def calcularVulPlan(df, nombre):
    df[nombre] = df['plan_configuration_c'] * CF_PLAN_CONFIGURATION_C   \
            + df['plan_configuration_d'] * CF_PLAN_CONFIGURATION_D  \
            + df['plan_configuration_f'] * CF_PLAN_CONFIGURATION_F  \
            + df['plan_configuration_m'] * CF_PLAN_CONFIGURATION_M   \
            + df['plan_configuration_a'] * CF_PLAN_CONFIGURATION_A   \
            + df['plan_configuration_n'] * CF_PLAN_CONFIGURATION_N   \
            + df['plan_configuration_o'] * CF_PLAN_CONFIGURATION_O \
             + df['plan_configuration_q'] * CF_PLAN_CONFIGURATION_Q   \
            + df['plan_configuration_s'] * CF_PLAN_CONFIGURATION_S   \
            + df['plan_configuration_u'] * CF_PLAN_CONFIGURATION_U


In [11]:
# FEATURE ESTUDIO DE LA CONDICIÓN DE LA TIERRA DONDE ES EDIFICADO EL EDIFICO (NO APORTA)
# nota: usar después del encoding
CF_LAND_SURFACE_CONDITION_N = 30
CF_LAND_SURFACE_CONDITION_O = 30
CF_LAND_SURFACE_CONDITION_T = 30


def calcularVulTierra(df, nombre):
    df[nombre] = df['land_surface_condition_n'] * CF_LAND_SURFACE_CONDITION_N   \
            + df['land_surface_condition_o'] * CF_LAND_SURFACE_CONDITION_O   \
            + df['land_surface_condition_t'] * CF_LAND_SURFACE_CONDITION_T

In [12]:
#VULNERABILIDAD USOS 
U_AGRICULTURA = 0.33 * MULT
U_POLICIA = 0.32 * MULT
U_INDUSTRIA = 0.25 * MULT
U_OTROS = 0.22 * MULT
U_ESCUELA = 0.10 * MULT
U_HOTEL = -0.10 * MULT
U_GUBERNAMENTAL = -0.18 * MULT
U_SALUD = -0.25 * MULT
U_RENTA = -0.32 * MULT
U_INSTITUCION = -0.34 * MULT

def calcularVulUso(df,nombre):
    df[nombre] = 1 +  df['has_secondary_use_agriculture'] * U_AGRICULTURA  \
    + df['has_secondary_use_hotel'] * U_HOTEL  \
    + df['has_secondary_use_rental'] * U_RENTA  \
    + df['has_secondary_use_institution'] * U_INSTITUCION  \
    + df['has_secondary_use_school'] * U_ESCUELA  \
    + df['has_secondary_use_industry'] * U_INDUSTRIA  \
    + df['has_secondary_use_health_post'] * U_SALUD  \
    + df['has_secondary_use_gov_office'] * U_GUBERNAMENTAL \
    + df['has_secondary_use_use_police'] * U_POLICIA  \
    + df['has_secondary_use_other'] * U_OTROS

In [13]:
# FEATURE VULNERABILIDAD ESTADO PROPIEDAD
# nota: usar después del encoding
CF_ESTADO_A = 0.18 * MULT
CF_ESTADO_V = 0.35 * MULT
CF_ESTADO_R = 0.38 * MULT
CF_ESTADO_W = 0.50 * MULT


def calcularVulEstado(df, nombre):
    df[nombre] = df['legal_ownership_status_a'] * CF_ESTADO_A   \
            + df['legal_ownership_status_v'] * CF_ESTADO_V  \
            + df['legal_ownership_status_r'] * CF_ESTADO_R  \
            + df['legal_ownership_status_w'] * CF_ESTADO_W 

In [14]:
# FEATURE VULNERABILIDAD ESTRUCTURA OTROS PISOS
# nota: usar después del encoding
CF_O_S = 0.08 * MULT
CF_O_J = 0.28 * MULT
CF_O_Q = 0.36 * MULT
CF_O_X = 0.38 * MULT


def calcularVulOtrosPisos(df, nombre):
    df[nombre] = df['other_floor_type_s'] * CF_O_S   \
            + df['other_floor_type_j'] * CF_O_J  \
            + df['other_floor_type_q'] * CF_O_Q  \
            + df['other_floor_type_x'] * CF_O_X  

In [15]:
# FEATURE VULNERABILIDAD ESTRUCTURA PB
# nota: usar después del encoding
CF_V = -0.28 * MULT
CF_M = -0.15 * MULT
CF_Z = -0.08 * MULT
CF_X = 0.25 * MULT
CF_F = 0.38 * MULT


def calcularVulPiso(df, nombre):
    df[nombre] = df['ground_floor_type_f'] * CF_F   \
            + df['ground_floor_type_m'] * CF_M  \
            + df['ground_floor_type_v'] * CF_V  \
            + df['ground_floor_type_x'] * CF_X  \
            + df['ground_floor_type_z'] * CF_Z 

In [16]:
# FEATURE VULNERABILIDAD ESTRUCTURA TECHO
# nota: usar después del encoding
CF_ROOF_Q = 0.38 * MULT
CF_ROOF_N = 0.35 * MULT
CF_ROOF_X = -0.25 * MULT


def calcularVulTecho(df, nombre):
    df[nombre] = df['roof_type_n'] * CF_ROOF_N   \
            + df['roof_type_q'] * CF_ROOF_Q  \
            + df['roof_type_x'] * CF_ROOF_X 

In [17]:
#FEATURE VULNERABILIDAD MATERIALES
CF_LAJA = 0.48 * MULT
CF_BARRO_PIEDRA = 0.38 * MULT
CF_ADOBE = 0.38 * MULT
CF_BARRO_LADRILLO = 0.28 * MULT
CF_MADERA = 0.14 * MULT
CF_OTROS = 0.8 * MULT
CF_BAMBOO = -0.8 * MULT
CF_CEMENTO_PIEDRA = -0.16 * MULT
CF_HORMIGON_NO = -0.24 * MULT
CF_CEMENTO_LADRILLO = -0.32 * MULT
CF_HORMIGON_SI = -0.36 * MULT

def calcularVulMat(df, nombre):
    df[nombre] = 1 + df['has_superstructure_adobe_mud'] * CF_ADOBE  \
            + df['has_superstructure_mud_mortar_stone'] * CF_BARRO_PIEDRA  \
            + df['has_superstructure_stone_flag'] * CF_LAJA  \
            + df['has_superstructure_cement_mortar_stone'] * CF_CEMENTO_PIEDRA  \
            + df['has_superstructure_mud_mortar_brick'] * CF_BARRO_LADRILLO   \
            + df['has_superstructure_cement_mortar_brick'] * CF_CEMENTO_LADRILLO  \
            + df['has_superstructure_timber'] * CF_MADERA  \
            + df['has_superstructure_bamboo'] * CF_BAMBOO  \
            + df['has_superstructure_rc_non_engineered'] * CF_HORMIGON_NO  \
            + df['has_superstructure_rc_engineered'] * CF_HORMIGON_SI  \
            + df['has_superstructure_other'] * CF_OTROS  


In [18]:
def agregarMateriales(features): 
    lista = features
    i = train_values.columns.get_loc("has_superstructure_adobe_mud")
    mats = train_values.columns[i:i+11]    
    lista.extend(mats)
    
    return lista

In [19]:
def agregarUsos(features): 
    lista = features
    i = train_values.columns.get_loc("has_secondary_use_agriculture")
    usos = train_values.columns[i:]
    lista.extend(usos)
    
    return lista

In [20]:
#
VULNERABS = ['vul_mat', 'vul_uso','vul_piso','vul_cim','vul_techo'] # vul_estado vul_plan
#VULNERABS = ['vul_mat','vul_uso','vul_cim']

datosMean = ['geo_level_1_id_mean','geo_level_2_id_mean','geo_level_3_id_mean','age_mean','count_floors_pre_eq_mean']
    

datosCat = ['foundation_type_h','foundation_type_i','foundation_type_r','foundation_type_u','foundation_type_w',  \
            'roof_type_n','roof_type_q','roof_type_x',  \
            'ground_floor_type_f','ground_floor_type_m','ground_floor_type_v','ground_floor_type_x','ground_floor_type_z', \
            'position_j','position_o','position_s','position_t',  \
           'other_floor_type_s','other_floor_type_j','other_floor_type_q','other_floor_type_x']
#SAMPLE_SIZE = len(train_values.index)

    


In [21]:
def agregarDatosMean(features): 
    lista = features
    lista.extend(datosMean)
    
    return lista

In [22]:
def agregarDatosCat(features): 
    lista = features
    lista.extend(datosCat)
    
    return lista

In [23]:
def aplicarIC(df):    
    calcularVulMat(df, 'vul_mat')
    calcularVulUso(df, 'vul_uso')
    
    if 'ground_floor_type_f' in df.columns:
        calcularVulPiso(df, 'vul_piso')
    if 'roof_type_n' in df.columns:
        calcularVulTecho(df, 'vul_techo')
    if 'foundation_type_h' in df.columns:
        calcularVulCim(df, 'vul_cim')
    #if 'plan_configuration_c' in df.columns:
    #    calcularVulPlan(df, 'vul_plan')
    #if 'other_floor_type_s' in df.columns:
    #    calcularVulOtrosPisos(df, 'vul_otros_pisos')
    #if 'legal_ownership_status_a' in df.columns:
    #    calcularVulEstado(df, 'vul_estado')
    #calcularVulTierra(df, 'vul_tierra')
    

In [24]:
def agregarFeatures(features): 
    lista = features
    #vulnerabs = ['vul_mat','vul_uso','vul_piso','vul_techo','vul_cim','vul_plan','vul_otros_pisos','vul_estado']
    
    lista.extend(VULNERABS)
    
    return lista
    

In [25]:
# MEAN ENCODING REEMPLAZO
def meanDato(data, target, dato):
    data_extended = train_values.join(target,how='inner')
    encoding = data_extended.groupby([dato])['damage_grade'].mean().to_dict()
    data[dato] =  data[dato].map(encoding)

In [26]:
# MEAN ENCODING SIN REEMPLAZO
def meanDatoAparte(data, target, dato):
    data_extended = train_values.join(target,how='inner')
    encoding = data_extended.groupby([dato])['damage_grade'].mean().to_dict()
    data[(dato + "_mean")] =  data[dato].map(encoding)

In [27]:

def aplicarMeanEncoding(data, labels):    
    #print("Nada.\n")
    
    meanDatoAparte(data, labels, 'count_floors_pre_eq')
    meanDatoAparte(data, labels, 'age')
    meanDatoAparte(data, labels, 'geo_level_1_id')
    meanDatoAparte(data, labels, 'geo_level_2_id')
    meanDatoAparte(data, labels, 'geo_level_3_id')
    #meanDato(data, labels, 'other_floor_type')
    meanDato(data, labels, 'plan_configuration')

In [28]:
def makeSet(df_data, df_labels, feats=[]):
    
    data = df_data.copy()
    labels = df_labels.copy()
    
    #if len(feats)>0:
    #    data = data[feats]

    #CODIFICACION/MANIPULACION DE LA DATA
    aplicarMeanEncoding(data, labels)
    data = pd.get_dummies(data)
    #FEATURE ENGINEERING
    aplicarIC(data)
      
    #if len(feats)>0:
    #    data = data[agregarFeatures(feats)]
        
    data = data.join(labels, how='inner')
    
    return data.iloc[:,:-1],data.iloc[:,-1]

In [29]:
#X, y = makeSet(train_values, train_labels)
#X
#train_values.head(10)

In [30]:
#train_values.info()
#test_values.loc[test_values['age']==995,:].describe()

In [31]:
train_values_alt = train_values.drop(columns=['land_surface_condition', 'has_secondary_use'])

***

**Random Forest**

In [32]:
#X.info()

In [33]:
pipe = make_pipeline(StandardScaler(), 
                     RandomForestClassifier(random_state=0,n_estimators=10,min_samples_leaf=5))

features_forest = [
                    'geo_level_1_id',
                    'geo_level_2_id',
                    'geo_level_3_id',
                    'count_floors_pre_eq',
                    'age'
                      ]


#train_values_alt = train_values #filtrarAge(train_values)

feats = features_forest
#feats = agregarMateriales(feats)
#feats = agregarUsos(feats)
feats = agregarFeatures(feats)
feats = agregarDatosMean(feats)


# ALL FEATURES ON / OFF
#feats = []

#feats = mejores_feats_rf.index.tolist()
X, y = makeSet(train_values_alt, train_labels)

#Recorte de features
X = X[feats]

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.15, random_state=2018) #=2018)

pipe.fit(X_train,y_train)
print("Self score: "+ str(calcularF1(pipe,X_train,y_train)) +  \
               "\nTest score: " + str(calcularF1(pipe,X_test,y_test)) )

Self score: 0.8064240892059049
Test score: 0.7465912869970069


In [34]:
#calcularF1(pipe,X_test,y_test)
# 0.6974022485706611 rndSt 0
# 0.7044242354476037 rndSt 2018
#X['vul_uso'].nsmallest(10)

***

**XGBoost**

In [98]:
GAMMA = 2
N_EST = 36
TOL = 0
ETA = 0.15
MAX_DEPTH = 32
MIN_WEIGHT = 3

In [None]:
# FEATURE IMPORTANCE

xgb_model = xgb.XGBClassifier(objective='multi:softmax', eval_metric='auc',
                         use_label_encoder=False,
                         verbosity=0,
                         sampling_method='uniform', #'gradient_based'
                           n_estimators=N_EST,
                         #subsample=1,
                         eta=ETA,
                         max_depth=MAX_DEPTH,
                         min_child_weight=MIN_WEIGHT,
                         gamma=GAMMA,
                         colsample_bytree=0.75,
                         tree_method='exact', #'gpu_hist'
                           num_class=3)



feats =  [
            'geo_level_1_id',
            'geo_level_2_id',
            'geo_level_3_id',
            'age',
            #'count_families',
            'area_percentage',
            'height_percentage',
            #'has_secondary_use',
            #'foundation_type_r',
            #'foundation_type_i',
            #'ground_floor_type_v',
            #'has_superstructure_mud_mortar_stone',
            #'has_superstructure_stone_flag',
            #'has_superstructure_rc_non_engineered',
            #'other_floor_type',
            #'ground_floor_type',
            #'legal_ownership_status',
            #'plan_configuration',
            #'roof_type',
            'count_floors_pre_eq'
            ]


feats = agregarMateriales(feats)
feats = agregarUsos(feats)
feats = agregarDatosCat(feats)
feats = agregarFeatures(feats)
feats = agregarDatosMean(feats)


#CORRECCION PARA EL XGBC (LABEL ENCODING)
train_labels_alt = train_labels.copy()
train_labels_alt['damage_grade']-=1

X, y = makeSet(train_values_alt, train_labels_alt)

#Recorte de features
X = X[feats]
feature_select = X.columns

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.10, random_state=2018) #np.random.randint(1111,9999))

# HIST
# 0.7451075695172802 n 12, drop, vul plan
# 0.7457726842495715 n 12, drop, vul plan estado 

xgb_model.fit(X_train,y_train)
print("Con todos los features:\nSelf score: "+ str(calcularF1(xgb_model,X_train,y_train)) +  \
               "\nTest score: " + str(calcularF1(xgb_model,X_test,y_test)) )

In [None]:
#X, y = makeSet(train_values_alt, train_labels)
featsXGB = pd.DataFrame(xgb_model.feature_importances_)
featsXGB['feature'] = feature_select
mejores_feats = featsXGB.loc[featsXGB[0]>TOL,:].set_index('feature') 

remover = []
for feat in mejores_feats.index:
    if feat in VULNERABS:
        remover.append(feat)

mejores_feats = mejores_feats.drop(remover)
featsXGB.nsmallest(10,0).rename(columns={0:"Importancia"})

In [None]:


xgbc_main = xgb.XGBClassifier(objective='multi:softmax', eval_metric='auc',
                         use_label_encoder=False,
                         verbosity=0,
                         sampling_method='uniform', #'gradient_based'
                           n_estimators=N_EST,
                         #subsample=1,
                         eta=ETA,
                         max_depth=MAX_DEPTH,
                         min_child_weight=MIN_WEIGHT,
                         gamma=GAMMA,
                         colsample_bytree=0.75,
                         tree_method='exact', #'gpu_hist'
                           num_class=3)



feats = mejores_feats.index.tolist()
X_xgb, y_xgb = makeSet(train_values_alt, train_labels_alt)

#Recorte de features
#X_xgb = X_xgb[feats]

X_train, X_test, y_train, y_test = \
    train_test_split(X_xgb, y_xgb, test_size=0.10, random_state=2018) #np.random.randint(1111,9999))


xgbc_main.fit(X_train,y_train)

print("Con features seleccionados:\nSelf score: " + str(calcularF1(xgbc_main,X_train,y_train)) +  \
               "\nTest score: " + str(calcularF1(xgbc_main,X_test,y_test)) )

In [39]:
#calcularF1(xgbc,X_test,y_test)
# OLD 0.7257971681823414
# 0.731245923026745 rdnSt 0  -> n_estimators=10,eta=0.05,max_depth=15,min_child_weight=3,gamma=0.4,colsample_bytree=0.7
# 0.7323424829244584 rndSt 2018
# 0.7345168964723338 rndSt 0 -> n_estimators=10,eta=0.1,max_depth=15,min_child_weight=1,gamma=0.4,colsample_bytree=0.7
# 0.7401447903609525 rndSt 0 -> n_estimators=10,eta=0.3,max_depth=15,min_child_weight=1,gamma=0.4,colsample_bytree=0.7
# 0.7408866491008161 rndSt 0 -> n_estimators=50,eta=0.05,max_depth=15,min_child_weight=3,gamma=0.4,colsample_bytree=0.7

# 0.7409539158129005 CON 100
# 0.7451747822416638 CON 100

# 0.7458238469212862 rndSt 0 -> n_estimators=50,eta=0.15,max_depth=15,min_child_weight=3,gamma=0.4,colsample_bytree=0.7
# 0.7462075669591466 rndSt 0 -> n_estimators=50,eta=0.2,max_depth=15,min_child_weight=3,gamma=0.4,colsample_bytree=0.7
# 0.7479726791333043 rndSt 0 -> n_estimators=60,eta=0.2,max_depth=15,min_child_weight=3,gamma=0.4,colsample_bytree=0.7

# 0.7475122150878719 elegidos, n_estimators=30,#subsample=1,eta=0.15,max_depth=24,min_child_weight=7,gamma=2,colsample_bytree=0.75
# 0.7495075592847459 igual que arriba con 60 
# 0.7492773272620297 con 120 (empeoró)

# 0.7470517510424395 todos los feats, n=30, gamma=2
# 0.7478191911181602 feats>5e-3, n=30, gamma=2
# 0.7478191911181602 todos, n=60, gamma=2
# 0.7485610498580236 feats>3e-3, n=60, gamma=2

# 0.7501215113453225 feats>2e-3, n=60, gamma=2, max_depth=32, min_child_weight=3,
# 0.750735463405899 con el drop, todos los feats, n 60
# 0.7508377887493285 feats>0

# 0.7464889616535776 gpu_hist, max depth 15
# 0.7480494231408764 con 120 
# 0.7468215190197232 method='gradient_based',n_estimators=120,eta=0.15,max_depth=15,min_child_weight=7,gamma=0,colsample_bytree=0.75,tree_method='gpu_hist'

***

**Submission**

In [96]:
# TEST SET
def predecir(model, dataset):
    
    feats = mejores_feats.index.tolist()
    test_values_subset = dataset.drop(columns=['land_surface_condition', 'has_secondary_use'])
    #CODIFICACION/MANIPULACION DE LA DATA
    #CORRECCION PARA EL XGBC (LABEL ENCODING)
    labels_alt = train_labels.copy()
    labels_alt['damage_grade']-=1
    aplicarMeanEncoding(test_values_subset, train_labels_alt)
    test_values_subset = pd.get_dummies(test_values_subset)
    #FEATURE ENGINEERING
    aplicarIC(test_values_subset)
    data = test_values_subset #[feats]
    preds = model.predict(data)
    #CORRECION XGBC (LABEL DECODING)
    preds+=1
    my_submission = pd.DataFrame(data=preds,
                                 columns=submission_format.columns,
                                 index=dataset.index)
    return my_submission
    #test_values_subset.info()

In [97]:
# Para la submission conviene entrenar con todo el train_values para mejor resultado (sin hacer el split)
# Acordarse de corregir para el xgb (para knn no)
xgbc_main.fit(X_xgb, y_xgb)
preds=predecir(xgbc_main, test_values)
submitear(preds)

In [42]:
#predecir(xgbc, test_values_subset)

In [43]:
#my_submission.index

In [44]:
# INFO DESPUES DEL FE
X.head(10)

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,age,area_percentage,height_percentage,count_floors_pre_eq,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,vul_mat,vul_uso,vul_piso,vul_cim,vul_techo,geo_level_1_id_mean,geo_level_2_id_mean,geo_level_3_id_mean,age_mean,count_floors_pre_eq_mean
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,30,6,5,2,1,1,0,...,8.6,1.0,3.8,4.7,3.5,1.161724,1.740741,1.837838,1.349678,1.239231
28830,8,900,2812,10,8,7,2,0,1,0,...,4.8,1.0,2.5,4.7,3.5,1.485273,1.487437,1.0625,1.200689,1.239231
94947,21,363,8973,10,5,5,2,0,1,0,...,4.8,1.0,3.8,4.7,3.5,1.563369,1.51875,1.580882,1.200689,1.239231
590882,22,418,10694,10,6,5,2,0,1,0,...,-1.8,1.0,3.8,4.7,3.5,1.00096,1.107317,1.096774,1.200689,1.239231
201944,11,131,1488,30,8,9,3,1,0,0,...,4.8,1.0,3.8,4.7,3.5,1.337713,1.348748,1.368852,1.349678,1.396713
333020,8,558,6089,10,9,5,2,0,1,0,...,4.8,4.3,3.8,4.7,3.5,1.485273,1.546539,1.368421,1.200689,1.239231
728451,9,475,12066,25,3,4,2,0,1,0,...,4.8,1.0,2.5,4.7,3.5,1.026023,1.137681,1.555556,1.329599,1.239231
475515,20,323,12236,0,8,6,2,0,0,0,...,-0.8,1.0,-2.8,-1.5,3.8,0.926464,0.282869,0.1,0.957529,1.239231
441126,0,757,7219,15,8,6,2,0,1,0,...,6.2,1.0,3.8,4.7,3.8,1.06532,1.105085,1.189189,1.263455,1.239231
989500,26,886,994,0,13,4,1,0,0,0,...,-2.2,1.0,-2.8,-2.5,3.5,0.730887,0.545109,0.551402,0.957529,1.04216


In [45]:
#historico.nlargest(45,0)

In [46]:
#infoDF.nsmallest(25,0)

In [47]:
## Hyper Parameter Optimization

def entrenarXGB():
    feats = mejores_feats.index.tolist()
    X, y = makeSet(train_values_alt, train_labels_alt)

    #Recorte de features
    #X = X[feats]

    X_train, X_test, y_train, y_test = \
        train_test_split(X_xgb, y_xgb, test_size=0.15, random_state=2018) #np.random.randint(1111,9999))


    grid_cv.fit(X_train,y_train)

    return grid_cv.cv_results_

In [48]:
#ESTO SE PUEDE MODIFICAR (PROBAR BUCLE MANUAL)
params_grid={  "eta"              : [ 0.05, 0.2 ],
                 "max_depth"        : [ 12 ],
                 "min_child_weight" : [ 3, 5, 7 ],
                 "gamma"            : [ 2 ],
                 "colsample_bytree" : [ 0.7 ]
                }





model = xgb.XGBClassifier(objective='multi:softmax', eval_metric='auc',
                         use_label_encoder=False,
                         verbosity=0,
                         sampling_method='uniform', #'gradient_based'
                           n_estimators=N_EST*4,
                         #subsample=1,                          
                         tree_method='exact', #'gpu_hist'
                           num_class=3)


grid_cv = GridSearchCV(model, params_grid, cv=5)

#entrenarXGB()

In [49]:
knn = KNeighborsClassifier(n_neighbors = 8,metric = "minkowski",p = 1)
features_knn = [
                    'geo_level_1_id',
                    'geo_level_2_id',
                    'geo_level_3_id',
                    #'age',
                    # 'count_families',
                    #'area_percentage',
                    #'height_percentage',
                    #'has_secondary_use',
                    'count_floors_pre_eq'
                      ]

#train_values_knn = train_values #filtrarAge(train_values)

feats = features_knn
feats = agregarMateriales(feats)
#feats = agregarUsos(feats)
# ALL FEATURES ON / OFF
#feats = []
#feats = mejores_feats.tolist()
X_knn, y_knn = makeSet(train_values, train_labels, feats) 

X_train, X_test, y_train, y_test = \
    train_test_split(X_knn, y_knn, test_size=0.15, random_state=0) #np.random.randint(1111,9999))


knn.fit(X_train,y_train)

#calcularF1(knn,X_test,y_test)

KNeighborsClassifier(n_neighbors=8, p=1)

In [50]:
#calcularF1(knn,X_test,y_test)

In [51]:
#y_pred = classifier.predict(test_values_subset)
#my_submission = pd.DataFrame(data=y_pred,
#                             columns=submission_format.columns,
#                             index=submission_format.index)
#my_submission.to_csv('submission.csv')

In [52]:
#my_submission.head()

In [53]:
#preds = xgbc.predict(test_values_subset)
#CORRECION XGBC
#preds+=1
#submitear(preds)

In [54]:
#predictions = xg_reg.predict(test_values_subset)
#predictions=predictions+1

In [55]:
params={
 "learning_rate"    : [ ],
 "max_depth"        : [ ],
 "min_child_weight" : [ ],
 "gamma"            : [ ],
 "colsample_bytree" : [ ]
    
}

In [56]:
def iterarXGB_eta(time):
    
    xgbc_graph = xgb.XGBClassifier(objective='multi:softmax', eval_metric='auc',
                             use_label_encoder=False,
                             verbosity=0,
                             sampling_method='uniform', #'gradient_based'
                               n_estimators=36,
                             #subsample=1,
                             eta=time,
                             max_depth=MAX_DEPTH,
                             min_child_weight=9,
                             gamma=GAMMA,
                             colsample_bytree=0.75,
                             tree_method='exact', #'gpu_hist'
                               num_class=3)



    feats = mejores_feats.index.tolist()
    X, y = makeSet(train_values_alt, train_labels_alt)

    #Recorte de features
    #X_xgb = X_xgb[feats]

    X_train, X_test, y_train, y_test = \
        train_test_split(X_xgb, y_xgb, test_size=0.10, random_state=2018) #np.random.randint(1111,9999))


    xgbc_graph.fit(X_train,y_train)

    return calcularF1(xgbc_graph,X_train,y_train),calcularF1(xgbc_graph,X_test,y_test)

In [57]:
def iterarXGB_minw(weight):
    
    xgbc_graph = xgb.XGBClassifier(objective='multi:softmax', eval_metric='auc',
                             use_label_encoder=False,
                             verbosity=0,
                             sampling_method='uniform', #'gradient_based'
                               n_estimators=36,
                             #subsample=1,
                             eta=ETA,
                             max_depth=MAX_DEPTH,
                             min_child_weight=weight,
                             gamma=GAMMA,
                             colsample_bytree=0.75,
                             tree_method='exact', #'gpu_hist'
                               num_class=3)



    feats = mejores_feats.index.tolist()
    X, y = makeSet(train_values_alt, train_labels_alt)

    #Recorte de features
    #X_xgb = X_xgb[feats]

    X_train, X_test, y_train, y_test = \
        train_test_split(X_xgb, y_xgb, test_size=0.10, random_state=2018) #np.random.randint(1111,9999))


    xgbc_graph.fit(X_train,y_train)

    return calcularF1(xgbc_graph,X_train,y_train),calcularF1(xgbc_graph,X_test,y_test)

In [58]:
def iterarXGB_est(n):
    
    xgbc_graph = xgb.XGBClassifier(objective='multi:softmax', eval_metric='auc',
                             use_label_encoder=False,
                             verbosity=0,
                             sampling_method='uniform', #'gradient_based'
                               n_estimators=n,
                             #subsample=1,
                             eta=ETA,
                             max_depth=MAX_DEPTH,
                             min_child_weight=MIN_WEIGHT,
                             gamma=GAMMA,
                             colsample_bytree=0.75,
                             tree_method='exact', #'gpu_hist'
                               num_class=3)



    feats = mejores_feats.index.tolist()
    X, y = makeSet(train_values_alt, train_labels_alt)

    #Recorte de features
    #X_xgb = X_xgb[feats]

    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.10, random_state=2018) #np.random.randint(1111,9999))


    xgbc_graph.fit(X_train,y_train)

    return calcularF1(xgbc_graph,X_train,y_train),calcularF1(xgbc_graph,X_test,y_test)

In [61]:
#x=[]
#y1_list=[]
#y2_list=[]
#for i in range(12,96,12):
#    y1,y2 = iterarXGB_est(i)
#    y1_list.append(y1)
#    y2_list.append(y2)
#    x.append(i)
    
#plt.title("XGBC: F1 score vs n_estimators")
#plt.ylabel("F1 score")
#plt.xlabel("n_estimators")
#plt.grid()
#plt.ylim(0.6,0.9) 
#plt.plot(x,y1_list,label='self score')
#plt.plot(x,y2_list,label='test score')
    

In [None]:
#valores = [1,5,9]
#x_minw=[]
#y1_list_minw=[]
#y2_list_minw=[]
#for valor in valores:
#    y1,y2 = iterarXGB_minw(valor)
#    y1_list_minw.append(y1)
#    y2_list_minw.append(y2)
#    x_minw.append(valor)
    
#plt.title("XGBC: F1 score vs min_weight")
#plt.ylabel("F1 score")
#plt.xlabel("min_weight")
#plt.grid()
#plt.ylim(0.6,0.9) 
#plt.plot(x_minw,y1_list_minw,label='self score')
#plt.plot(x_minw,y2_list_minw,label='test score')
    

In [None]:
#valores = [0.05,0.15,0.3]
#x_eta=[]
#y1_list_eta=[]
#y2_list_eta=[]
#for valor in valores:
#    y1,y2 = iterarXGB_eta(valor)
#    y1_list_eta.append(y1)
#    y2_list_eta.append(y2)
#    x_eta.append(valor)
    
#plt.title("XGBC: F1 score vs eta")
#plt.ylabel("F1 score")
#plt.xlabel("eta")
#plt.grid()
#plt.ylim(0.6,0.9) 
#plt.plot(x_eta,y1_list_eta,label='self score')
#plt.plot(x_eta,y2_list_eta,label='test score')
    

In [60]:
sum_feats = X.copy()
#sum_feats.sum().nsmallest(25).to_frame().rename(columns={0:"Sumatoria"})
sum_feats.info()
#values_onehot.head(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 802906 to 747594
Data columns (total 59 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   geo_level_1_id                          260601 non-null  int64  
 1   geo_level_2_id                          260601 non-null  int64  
 2   geo_level_3_id                          260601 non-null  int64  
 3   age                                     260601 non-null  int64  
 4   area_percentage                         260601 non-null  int64  
 5   height_percentage                       260601 non-null  int64  
 6   count_floors_pre_eq                     260601 non-null  int64  
 7   has_superstructure_adobe_mud            260601 non-null  int64  
 8   has_superstructure_mud_mortar_stone     260601 non-null  int64  
 9   has_superstructure_stone_flag           260601 non-null  int64  
 10  has_superstructure_cement_mortar_stone 

In [66]:
coef_techos = [CF_ROOF_Q,CF_ROOF_N,CF_ROOF_X]
coef_usos=[U_AGRICULTURA,U_POLICIA ,U_INDUSTRIA ,U_OTROS ,U_ESCUELA,U_HOTEL ,U_GUBERNAMENTAL ,U_SALUD ,U_RENTA,U_INSTITUCION]
coef_mats=[CF_LAJA,CF_BARRO_PIEDRA ,CF_ADOBE ,CF_BARRO_LADRILLO ,CF_MADERA ,CF_OTROS ,CF_BAMBOO ,CF_CEMENTO_PIEDRA,CF_HORMIGON_NO,CF_CEMENTO_LADRILLO,CF_HORMIGON_SI]
coef_cim=[CF_CIM_I ,CF_CIM_W ,CF_CIM_U ,CF_CIM_H ,CF_CIM_R]
coef_pisos=[CF_V ,CF_M,CF_Z ,CF_X ,CF_F]
tipos_techos = sum_feats.columns[33:36]
tipos_usos = sum_feats.columns[18:28]
tipos_mats = sum_feats.columns[7:18]
tipos_cim = sum_feats.columns[28:33]
tipos_pisos = sum_feats.columns[36:41]
tabla_techos = pd.DataFrame(data=coef_techos,index=tipos_techos).rename(columns={0:'peso'})
tabla_techos

Unnamed: 0,peso
roof_type_n,3.8
roof_type_q,3.5
roof_type_x,-2.5


In [78]:
tabla_usos = pd.DataFrame(data=coef_usos,index=tipos_usos).rename(columns={0:'peso'})
#tabla_usos.index=tabla_usos.index.rename('feature')
tabla_usos

Unnamed: 0,peso
has_secondary_use_agriculture,3.3
has_secondary_use_hotel,3.2
has_secondary_use_rental,2.5
has_secondary_use_institution,2.2
has_secondary_use_school,1.0
has_secondary_use_industry,-1.0
has_secondary_use_health_post,-1.8
has_secondary_use_gov_office,-2.5
has_secondary_use_use_police,-3.2
has_secondary_use_other,-3.4


In [79]:
tabla_cim = pd.DataFrame(data=coef_cim,index=tipos_cim).rename(columns={0:'peso'})
#tabla_usos.index=tabla_usos.index.rename('feature')
tabla_cim

Unnamed: 0,peso
foundation_type_h,-3.5
foundation_type_i,-2.5
foundation_type_r,-1.2
foundation_type_u,0.2
foundation_type_w,3.7


In [80]:
tabla_pisos = pd.DataFrame(data=coef_pisos,index=tipos_pisos).rename(columns={0:'peso'})
#tabla_usos.index=tabla_usos.index.rename('feature')
tabla_pisos

Unnamed: 0,peso
ground_floor_type_f,-2.8
ground_floor_type_m,-1.5
ground_floor_type_v,-0.8
ground_floor_type_x,2.5
ground_floor_type_z,3.8


In [81]:
tabla_mats = pd.DataFrame(data=coef_mats,index=tipos_mats).rename(columns={0:'peso'})
#tabla_usos.index=tabla_usos.index.rename('feature')
tabla_mats

Unnamed: 0,peso
has_superstructure_adobe_mud,4.8
has_superstructure_mud_mortar_stone,3.8
has_superstructure_stone_flag,3.8
has_superstructure_cement_mortar_stone,2.8
has_superstructure_mud_mortar_brick,1.4
has_superstructure_cement_mortar_brick,8.0
has_superstructure_timber,-8.0
has_superstructure_bamboo,-1.6
has_superstructure_rc_non_engineered,-2.4
has_superstructure_rc_engineered,-3.2
