In [23]:
# encoding=utf8

# importacion general de librerias y de visualizacion (matplotlib y seaborn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

plt.style.use('default') # haciendo los graficos un poco mas bonitos en matplotlib
#plt.rcParams['figure.figsize'] = (20, 10)

sns.set(style="whitegrid") # seteando tipo de grid en seaborn

pd.options.display.float_format = '{:20,.2f}'.format # suprimimos la notacion cientifica en los outputs

import warnings
warnings.filterwarnings('ignore')

In [24]:
#Carga de datos
data_values = pd.read_csv('data/train_values.csv')

data_labels = pd.read_csv('data/train_labels.csv')

In [25]:
#Analisis de 'data'
data_values.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,0


In [26]:
data_values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 39 columns):
building_id                               260601 non-null int64
geo_level_1_id                            260601 non-null int64
geo_level_2_id                            260601 non-null int64
geo_level_3_id                            260601 non-null int64
count_floors_pre_eq                       260601 non-null int64
age                                       260601 non-null int64
area_percentage                           260601 non-null int64
height_percentage                         260601 non-null int64
land_surface_condition                    260601 non-null object
foundation_type                           260601 non-null object
roof_type                                 260601 non-null object
ground_floor_type                         260601 non-null object
other_floor_type                          260601 non-null object
position                                  260601 non

In [27]:
data_values.shape

(260601, 39)

In [28]:
data_values.size

10163439

In [29]:
data_values_object_columns=data_values.loc[:, data_values.dtypes == object]
data_values_object_columns.head()

Unnamed: 0,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,legal_ownership_status
0,t,r,n,f,q,t,d,v
1,o,r,n,x,q,s,d,v
2,t,r,n,f,x,t,d,v
3,t,r,n,f,x,s,d,v
4,t,r,n,f,x,s,d,v


In [30]:
data_values_object_columns.describe(include='all').loc['unique', :]

land_surface_condition     3
foundation_type            5
roof_type                  3
ground_floor_type          5
other_floor_type           4
position                   4
plan_configuration        10
legal_ownership_status     4
Name: unique, dtype: object

In [31]:
#Mostrar todos los valores unicos del tipo objecto
for col in list(data_values_object_columns):
    print(col)
    print(np.sort(data_values_object_columns[col].unique()))
    print("-----------")

land_surface_condition
['n' 'o' 't']
-----------
foundation_type
['h' 'i' 'r' 'u' 'w']
-----------
roof_type
['n' 'q' 'x']
-----------
ground_floor_type
['f' 'm' 'v' 'x' 'z']
-----------
other_floor_type
['j' 'q' 's' 'x']
-----------
position
['j' 'o' 's' 't']
-----------
plan_configuration
['a' 'c' 'd' 'f' 'm' 'n' 'o' 'q' 's' 'u']
-----------
legal_ownership_status
['a' 'r' 'v' 'w']
-----------


In [32]:
#Mostrar todos los tipos internos del tipo objeto
for col in list(data_values_object_columns):
    print(col)
    print(data_values_object_columns[col].map(lambda x:  type(x)).unique())
    print("-----------")


land_surface_condition
[<class 'str'>]
-----------
foundation_type
[<class 'str'>]
-----------
roof_type
[<class 'str'>]
-----------
ground_floor_type
[<class 'str'>]
-----------
other_floor_type
[<class 'str'>]
-----------
position
[<class 'str'>]
-----------
plan_configuration
[<class 'str'>]
-----------
legal_ownership_status
[<class 'str'>]
-----------


In [33]:
#Cantidad de Nan en los datos
data_values.isna().sum().sum()

0

In [37]:
#Transformamos los tipos int64 a booleanos para reducir espacio y aplicar el tipo correspondiente
boolean_columns = [col for col in data_values if col.startswith('has_')]

for col in list(boolean_columns):
    data_values[col] = data_values[col].astype(bool)

data_values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 39 columns):
building_id                               260601 non-null int64
geo_level_1_id                            260601 non-null int64
geo_level_2_id                            260601 non-null int64
geo_level_3_id                            260601 non-null int64
count_floors_pre_eq                       260601 non-null int64
age                                       260601 non-null int64
area_percentage                           260601 non-null int64
height_percentage                         260601 non-null int64
land_surface_condition                    260601 non-null object
foundation_type                           260601 non-null object
roof_type                                 260601 non-null object
ground_floor_type                         260601 non-null object
other_floor_type                          260601 non-null object
position                                  260601 non

In [None]:
#NOTA: Uso de memoria reducida a la mitad

In [11]:
#Analisis de 'data_labels'
data_labels = pd.read_csv('data/train_labels.csv')
data_labels.head()

Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


In [12]:
data_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 2 columns):
building_id     260601 non-null int64
damage_grade    260601 non-null int64
dtypes: int64(2)
memory usage: 4.0 MB


In [15]:
data_labels.shape

(260601, 2)

In [16]:
data_labels.size

521202

In [17]:
#Cantidad de Nan en los datos
data_labels.isna().sum().sum()

0

In [None]:
data = data_values.merge(left=data_values, right=data_labels, how='outer', left_on='building_id', right_on='building_id')

In [39]:
data.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,...,False,False,False,False,False,False,False,False,False,False
1,28830,8,900,2812,2,10,8,7,o,r,...,False,False,False,False,False,False,False,False,False,False
2,94947,21,363,8973,2,10,5,5,t,r,...,False,False,False,False,False,False,False,False,False,False
3,590882,22,418,10694,2,10,6,5,t,r,...,False,False,False,False,False,False,False,False,False,False
4,201944,11,131,1488,3,30,8,9,t,r,...,False,False,False,False,False,False,False,False,False,False


In [40]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 39 columns):
building_id                               260601 non-null int64
geo_level_1_id                            260601 non-null int64
geo_level_2_id                            260601 non-null int64
geo_level_3_id                            260601 non-null int64
count_floors_pre_eq                       260601 non-null int64
age                                       260601 non-null int64
area_percentage                           260601 non-null int64
height_percentage                         260601 non-null int64
land_surface_condition                    260601 non-null object
foundation_type                           260601 non-null object
roof_type                                 260601 non-null object
ground_floor_type                         260601 non-null object
other_floor_type                          260601 non-null object
position                                  260601 non

In [41]:
#Cantidad de Nan en los datos
data.isna().sum().sum()

0

In [None]:
#Correlaciones entre columnas

