# Encoding personalizado

Vamos a reemplazar las variables categóricas por un valor definido por nosotros. Nuestra idea es hacer algo parecido al encoding, es decir, reemplazar el valor categórico por un valor numérico, pero agregándole un poco más de información, para ver si esto tiene alguna influencia positiva en nuestros modelos.

Vamos a reemplazar el valor categórico por el promedio de daño que sufrieron los edificios que tienen ese valor. En cada paso verificaremos que todos los promedios sean distintos, para que dos categorías diferentes no queden codificadas con el mismo número. En caso de que se produzca una colisión, tendremos que utilizar algún método para diferenciarlas.

Nuestras variables categóricas son:
- land_surface_condition
- foundation_type
- roof_type
- ground_floor_type
- other_floor_type
- position
- plan_configuration
- legal_ownership_status

In [173]:
import pandas as pd
import numpy as np
import seaborn as sns
import squarify # pip install squarify
import matplotlib.pyplot as plt
%matplotlib inline

# Load the training features, the training labels and the test features;
# the test frame is indexed by building_id.
valuesDF = pd.read_csv('train_engineered_features.csv')
labelsDF = pd.read_csv('train_labels.csv')
test_values = pd.read_csv('test_engineered_features.csv', index_col='building_id')

# Render floats as fixed-point with two decimals instead of scientific notation.
pd.set_option('display.float_format', '{:20,.2f}'.format)

# Inner-join the training features with their labels on building_id.
valuesJoined = pd.merge(valuesDF, labelsDF, how='inner', on='building_id')

## land_surface_condition

In [174]:
# Mean damage_grade of the training rows in each land_surface_condition category.
group = valuesJoined[['land_surface_condition','damage_grade']].groupby('land_surface_condition').mean()
# Kept for any cell that still inspects `val`; the function below no longer reads it.
val = group.index, group.values
# Snapshot the means under a column-specific name so that later sections,
# which rebind `group` and `val`, cannot change what this encoder returns.
land_surface_condition_means = group['damage_grade'].to_dict()


def mean_land_surface_condition(x):
    """Return the mean damage_grade for the land_surface_condition of row ``x``.

    Args:
        x: A row (or any mapping) whose 'land_surface_condition' entry holds
            the category to encode.

    Returns:
        The mean damage_grade computed above for that category, or None when
        the category is not one of the grouped categories.
    """
    return land_surface_condition_means.get(x['land_surface_condition'])

In [175]:
# Display the mean damage_grade per land_surface_condition category.
group

Unnamed: 0_level_0,damage_grade
land_surface_condition,Unnamed: 1_level_1
n,2.25
o,2.29
t,2.23


In [176]:
# Replace each category with its mean damage_grade from `group` (computed
# above) using a vectorised Series.map instead of a Python call per row.
# Categories that are not in `group` become NaN.
valuesJoined['land_surface_condition'] = valuesJoined['land_surface_condition'].map(group['damage_grade'])
test_values['land_surface_condition'] = test_values['land_surface_condition'].map(group['damage_grade'])

## foundation_type

In [177]:
# Mean damage_grade of the training rows in each foundation_type category.
group = valuesJoined[['foundation_type','damage_grade']].groupby('foundation_type').mean()
# Kept for any cell that still inspects `val`; the function below no longer reads it.
val = group.index, group.values
# Snapshot the means under a column-specific name so that later sections,
# which rebind `group` and `val`, cannot change what this encoder returns.
foundation_type_means = group['damage_grade'].to_dict()


def mean_foundation_type(x):
    """Return the mean damage_grade for the foundation_type of row ``x``.

    Args:
        x: A row (or any mapping) whose 'foundation_type' entry holds the
            category to encode.

    Returns:
        The mean damage_grade computed above for that category, or None when
        the category is not one of the grouped categories.
    """
    return foundation_type_means.get(x['foundation_type'])

In [178]:
# Display the mean damage_grade per foundation_type category.
group

Unnamed: 0_level_0,damage_grade
foundation_type,Unnamed: 1_level_1
h,2.11
i,1.45
r,2.33
u,1.88
w,1.81


In [179]:
# Replace each category with its mean damage_grade from `group` (computed
# above) using a vectorised Series.map instead of a Python call per row.
# Categories that are not in `group` become NaN.
valuesJoined['foundation_type'] = valuesJoined['foundation_type'].map(group['damage_grade'])
test_values['foundation_type'] = test_values['foundation_type'].map(group['damage_grade'])

## roof_type

In [180]:
# Mean damage_grade of the training rows in each roof_type category.
group = valuesJoined[['roof_type','damage_grade']].groupby('roof_type').mean()
# Kept for any cell that still inspects `val`; the function below no longer reads it.
val = group.index, group.values
# Snapshot the means under a column-specific name so that later sections,
# which rebind `group` and `val`, cannot change what this encoder returns.
roof_type_means = group['damage_grade'].to_dict()


def mean_roof_type(x):
    """Return the mean damage_grade for the roof_type of row ``x``.

    Args:
        x: A row (or any mapping) whose 'roof_type' entry holds the category
            to encode.

    Returns:
        The mean damage_grade computed above for that category, or None when
        the category is not one of the grouped categories.
    """
    return roof_type_means.get(x['roof_type'])

In [181]:
# Display the mean damage_grade per roof_type category.
group

Unnamed: 0_level_0,damage_grade
roof_type,Unnamed: 1_level_1
n,2.27
q,2.32
x,1.57


In [182]:
# Replace each category with its mean damage_grade from `group` (computed
# above) using a vectorised Series.map instead of a Python call per row.
# Categories that are not in `group` become NaN.
valuesJoined['roof_type'] = valuesJoined['roof_type'].map(group['damage_grade'])
test_values['roof_type'] = test_values['roof_type'].map(group['damage_grade'])

## ground_floor_type

In [183]:
# Mean damage_grade of the training rows in each ground_floor_type category.
group = valuesJoined[['ground_floor_type','damage_grade']].groupby('ground_floor_type').mean()
# Kept for any cell that still inspects `val`; the function below no longer reads it.
val = group.index, group.values
# Snapshot the means under a column-specific name so that later sections,
# which rebind `group` and `val`, cannot change what this encoder returns.
ground_floor_type_means = group['damage_grade'].to_dict()


def mean_ground_floor_type(x):
    """Return the mean damage_grade for the ground_floor_type of row ``x``.

    Args:
        x: A row (or any mapping) whose 'ground_floor_type' entry holds the
            category to encode.

    Returns:
        The mean damage_grade computed above for that category, or None when
        the category is not one of the grouped categories.
    """
    return ground_floor_type_means.get(x['ground_floor_type'])

In [184]:
# Display the mean damage_grade per ground_floor_type category.
group

Unnamed: 0_level_0,damage_grade
ground_floor_type,Unnamed: 1_level_1
f,2.31
m,1.97
v,1.63
x,2.25
z,2.07


In [185]:
# Replace each category with its mean damage_grade from `group` (computed
# above) using a vectorised Series.map instead of a Python call per row.
# Categories that are not in `group` become NaN.
valuesJoined['ground_floor_type'] = valuesJoined['ground_floor_type'].map(group['damage_grade'])
test_values['ground_floor_type'] = test_values['ground_floor_type'].map(group['damage_grade'])

## other_floor_type

In [186]:
# Mean damage_grade of the training rows in each other_floor_type category.
group = valuesJoined[['other_floor_type','damage_grade']].groupby('other_floor_type').mean()
# Kept for any cell that still inspects `val`; the function below no longer reads it.
val = group.index, group.values
# Snapshot the means under a column-specific name so that later sections,
# which rebind `group` and `val`, cannot change what this encoder returns.
other_floor_type_means = group['damage_grade'].to_dict()


def mean_other_floor_type(x):
    """Return the mean damage_grade for the other_floor_type of row ``x``.

    Args:
        x: A row (or any mapping) whose 'other_floor_type' entry holds the
            category to encode.

    Returns:
        The mean damage_grade computed above for that category, or None when
        the category is not one of the grouped categories.
    """
    return other_floor_type_means.get(x['other_floor_type'])

In [187]:
# Display the mean damage_grade per other_floor_type category.
group

Unnamed: 0_level_0,damage_grade
other_floor_type,Unnamed: 1_level_1
j,2.04
q,2.32
s,1.61
x,2.3


In [188]:
# Replace each category with its mean damage_grade from `group` (computed
# above) using a vectorised Series.map instead of a Python call per row.
# Categories that are not in `group` become NaN.
valuesJoined['other_floor_type'] = valuesJoined['other_floor_type'].map(group['damage_grade'])
test_values['other_floor_type'] = test_values['other_floor_type'].map(group['damage_grade'])

## position

In [189]:
# Mean damage_grade of the training rows in each position category.
group = valuesJoined[['position','damage_grade']].groupby('position').mean()
# Kept for any cell that still inspects `val`; the function below no longer reads it.
val = group.index, group.values
# Snapshot the means under a column-specific name so that later sections,
# which rebind `group` and `val`, cannot change what this encoder returns.
position_means = group['damage_grade'].to_dict()


def mean_position(x):
    """Return the mean damage_grade for the position of row ``x``.

    Args:
        x: A row (or any mapping) whose 'position' entry holds the category
            to encode.

    Returns:
        The mean damage_grade computed above for that category, or None when
        the category is not one of the grouped categories.
    """
    return position_means.get(x['position'])

In [190]:
# Display the mean damage_grade per position category.
group

Unnamed: 0_level_0,damage_grade
position,Unnamed: 1_level_1
j,2.15
o,2.21
s,2.23
t,2.31


In [191]:
# Replace each category with its mean damage_grade from `group` (computed
# above) using a vectorised Series.map instead of a Python call per row.
# Categories that are not in `group` become NaN.
valuesJoined['position'] = valuesJoined['position'].map(group['damage_grade'])
test_values['position'] = test_values['position'].map(group['damage_grade'])

## plan_configuration

In [192]:
# Mean damage_grade of the training rows in each plan_configuration category.
group = valuesJoined[['plan_configuration','damage_grade']].groupby('plan_configuration').mean()
# Kept for any cell that still inspects `val`; the function below no longer reads it.
val = group.index, group.values
# Snapshot the means under a column-specific name so that later sections,
# which rebind `group` and `val`, cannot change what this encoder returns.
plan_configuration_means = group['damage_grade'].to_dict()


def mean_plan_configuration(x):
    """Return the mean damage_grade for the plan_configuration of row ``x``.

    Args:
        x: A row (or any mapping) whose 'plan_configuration' entry holds the
            category to encode.

    Returns:
        The mean damage_grade computed above for that category, or None when
        the category is not one of the grouped categories.
    """
    return plan_configuration_means.get(x['plan_configuration'])

In [193]:
# Display the mean damage_grade per plan_configuration category.
group

Unnamed: 0_level_0,damage_grade
plan_configuration,Unnamed: 1_level_1
a,1.85
c,1.84
d,2.24
f,2.27
m,1.91
n,2.16
o,1.89
q,2.27
s,2.05
u,1.92


In [194]:
# Replace each category with its mean damage_grade from `group` (computed
# above) using a vectorised Series.map instead of a Python call per row.
# Categories that are not in `group` become NaN.
valuesJoined['plan_configuration'] = valuesJoined['plan_configuration'].map(group['damage_grade'])
test_values['plan_configuration'] = test_values['plan_configuration'].map(group['damage_grade'])

## legal_ownership_status

In [195]:
# Mean damage_grade of the training rows in each legal_ownership_status category.
group = valuesJoined[['legal_ownership_status','damage_grade']].groupby('legal_ownership_status').mean()
# Kept for any cell that still inspects `val`; the function below no longer reads it.
val = group.index, group.values
# Snapshot the means under a column-specific name so that later sections,
# which rebind `group` and `val`, cannot change what this encoder returns.
legal_ownership_status_means = group['damage_grade'].to_dict()


def mean_legal_ownership_status(x):
    """Return the mean damage_grade for the legal_ownership_status of row ``x``.

    Args:
        x: A row (or any mapping) whose 'legal_ownership_status' entry holds
            the category to encode.

    Returns:
        The mean damage_grade computed above for that category, or None when
        the category is not one of the grouped categories.
    """
    return legal_ownership_status_means.get(x['legal_ownership_status'])

In [196]:
# Display the mean damage_grade per legal_ownership_status category.
group

Unnamed: 0_level_0,damage_grade
legal_ownership_status,Unnamed: 1_level_1
a,1.89
r,2.22
v,2.24
w,2.42


In [197]:
# Replace each category with its mean damage_grade from `group` (computed
# above) using a vectorised Series.map instead of a Python call per row.
# Categories that are not in `group` become NaN.
valuesJoined['legal_ownership_status'] = valuesJoined['legal_ownership_status'].map(group['damage_grade'])
test_values['legal_ownership_status'] = test_values['legal_ownership_status'].map(group['damage_grade'])

# Descargamos los csv
Esta celda siempre va al final

In [None]:
# Write the encoded datasets to disk.
# DataFrame.drop returns a new frame rather than modifying valuesJoined, so it
# is chained into to_csv to keep the damage_grade label out of the training
# features file. valuesJoined itself is left intact, so this cell can be re-run.
valuesJoined.drop(columns='damage_grade').to_csv('train_encoded_features.csv')
test_values.to_csv('test_encoded_features.csv')