In [1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import seaborn as sns
# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt

In [2]:
# Training data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

building_structure = pd.read_csv('Building_Structure.csv')
building_ownership = pd.read_csv('Building_Ownership_Use.csv')
additional_info = pd.merge(building_structure, building_ownership, on='building_id')

app_train = pd.merge(train,additional_info,on="building_id")
app_test = pd.merge(test,additional_info,on="building_id")

submissionId = pd.DataFrame({'building_id' : app_test['building_id']})

app_train = app_train.drop("building_id", axis=1)
app_test = app_test.drop("building_id", axis=1)

print('Training data shape: ', app_train.shape)
app_train.head()

Training data shape:  (631761, 57)


Unnamed: 0,area_assesed,damage_grade,district_id,has_geotechnical_risk,has_geotechnical_risk_fault_crack,has_geotechnical_risk_flood,has_geotechnical_risk_land_settlement,has_geotechnical_risk_landslide,has_geotechnical_risk_liquefaction,has_geotechnical_risk_other,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,Both,Grade 4,24,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Both,Grade 2,44,0.0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,Both,Grade 1,36,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Building removed,Grade 5,30,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Both,Grade 3,36,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:

print('Testing data shape: ', app_test.shape)
app_test.head()

Testing data shape:  (421175, 56)


Unnamed: 0,area_assesed,district_id,has_geotechnical_risk,has_geotechnical_risk_fault_crack,has_geotechnical_risk_flood,has_geotechnical_risk_land_settlement,has_geotechnical_risk_landslide,has_geotechnical_risk_liquefaction,has_geotechnical_risk_other,has_geotechnical_risk_rock_fall,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,Both,7,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Both,7,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Building removed,7,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Both,7,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Building removed,7,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
app_train['damage_grade'].value_counts()

Grade 5    210825
Grade 4    152244
Grade 3    122288
Grade 2     85084
Grade 1     61320
Name: damage_grade, dtype: int64

In [8]:
# Function to calculate missing values by column# Funct 
def missing_values_table(df):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        
        # Return the dataframe with missing information
        return mis_val_table_ren_columns

In [9]:
missing_values = missing_values_table(app_train)
missing_values.head(20)

Your selected dataframe has 57 columns.
There are 2 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
has_repair_started,33417,5.3
count_families,1,0.0


In [10]:
app_train = app_train.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [11]:
missing_values = missing_values_table(app_train)
missing_values.head(20)

Your selected dataframe has 57 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [12]:
missing_values = missing_values_table(app_test)
missing_values.head(20)

Your selected dataframe has 56 columns.
There are 1 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
has_repair_started,21922,5.2


In [13]:
app_test = app_test.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [14]:
missing_values = missing_values_table(app_test)
missing_values.head(20)

Your selected dataframe has 56 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [15]:
print('Training data shape: ', app_train.shape)
print('Testing data shape: ', app_test.shape)

Training data shape:  (631761, 57)
Testing data shape:  (421175, 56)


In [16]:
le = LabelEncoder()
le_count = 0

Enc_ohe = OneHotEncoder()
ohe_count = 0

col_list = []

# Iterate through the columns
for col in app_train:
    if app_train[col].dtype == 'object':
        # If 2 or fewer unique categories
        if len(list(app_train[col].unique())) <= 2:
            # Train on the training data
            le.fit(app_train[col])
            # Transform both training and testing data
            app_train[col] = le.transform(app_train[col])
            app_test[col] = le.transform(app_test[col])
            
            # Keep track of how many columns were label encoded
            le_count += 1
            
        elif col!='damage_grade' and col!='building_id':
            print(col)
            col_list.append(col)
            ohe_count += 1
            
print('%d columns were label encoded.' % le_count)
print('%d columns were one hot encoded.' % ohe_count)

area_assesed
land_surface_condition
foundation_type
roof_type
ground_floor_type
other_floor_type
position
plan_configuration
condition_post_eq
legal_ownership_status
0 columns were label encoded.
10 columns were one hot encoded.


In [17]:
app_train["damage_grade"] = le.fit_transform(app_train["damage_grade"])
df_dummy = pd.DataFrame(Enc_ohe.fit_transform(app_train[["damage_grade"]]).todense(), columns = le.classes_)
#app_train = pd.concat([app_train, df_dummy], axis=1)

In [18]:
print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

Training Features shape:  (631761, 57)
Testing Features shape:  (421175, 56)


In [19]:
app_train.head()

Unnamed: 0,area_assesed,damage_grade,district_id,has_geotechnical_risk,has_geotechnical_risk_fault_crack,has_geotechnical_risk_flood,has_geotechnical_risk_land_settlement,has_geotechnical_risk_landslide,has_geotechnical_risk_liquefaction,has_geotechnical_risk_other,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,Both,3,24,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Both,1,44,0.0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,Both,0,36,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Building removed,4,30,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Both,2,36,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
app_test.head()

Unnamed: 0,area_assesed,district_id,has_geotechnical_risk,has_geotechnical_risk_fault_crack,has_geotechnical_risk_flood,has_geotechnical_risk_land_settlement,has_geotechnical_risk_landslide,has_geotechnical_risk_liquefaction,has_geotechnical_risk_other,has_geotechnical_risk_rock_fall,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,Both,7,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Both,7,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Building removed,7,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Both,7,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Building removed,7,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
app_train.dtypes.value_counts()

int64      43
object     10
float64     4
dtype: int64

In [27]:
# Find correlations with the target and sort
correlations = app_train.corr()

correlations['damage_grade']


damage_grade                              1.000000
district_id                              -0.074476
has_geotechnical_risk                     0.086293
has_geotechnical_risk_fault_crack         0.098152
has_geotechnical_risk_flood              -0.021316
has_geotechnical_risk_land_settlement     0.057551
has_geotechnical_risk_landslide           0.063065
has_geotechnical_risk_liquefaction        0.001484
has_geotechnical_risk_other              -0.002173
has_geotechnical_risk_rock_fall           0.058279
has_repair_started                       -0.151237
vdcmun_id                                -0.073473
district_id_x                            -0.074476
vdcmun_id_x                              -0.073473
ward_id_x                                -0.073472
count_floors_pre_eq                       0.173777
count_floors_post_eq                     -0.592553
age_building                              0.037243
plinth_area_sq_ft                        -0.144655
height_ft_pre_eq               

In [28]:
# Find correlations with the target and sort-OneHotEncoding
correlations = app_train.corr()['damage_grade'].sort_values()

# Display correlations
print('Most Positive Correlations:\n', correlations.tail(25))
print('\nMost Negative Correlations:\n', correlations.head(25))

Most Positive Correlations:
 has_superstructure_other                -0.031685
has_secondary_use_other                 -0.022153
has_geotechnical_risk_flood             -0.021316
has_secondary_use_school                -0.018036
has_secondary_use_industry              -0.010701
has_secondary_use_gov_office            -0.009534
has_secondary_use_health_post           -0.008811
has_secondary_use_use_police            -0.002591
has_geotechnical_risk_other             -0.002173
has_geotechnical_risk_liquefaction       0.001484
has_secondary_use_agriculture            0.026161
age_building                             0.037243
has_superstructure_mud_mortar_brick      0.049749
has_superstructure_stone_flag            0.056319
has_geotechnical_risk_land_settlement    0.057551
has_geotechnical_risk_rock_fall          0.058279
has_geotechnical_risk_landslide          0.063065
count_families                           0.066489
height_ft_pre_eq                         0.079180
has_geotechnical_risk

In [31]:
train_labels = df_dummy

# Align the training and testing data, keep only columns present in both dataframes
app_train, app_test = app_train.align(app_test, join = 'inner', axis = 1)

# Add the target back in
#app_train['damage_grade'] = train_labels

In [32]:
def onehot(df, col_list):
    for col in col_list:
       dummies = pd.get_dummies(df[col], prefix)
       df = df.join(dummies)
       df = df.drop([col], axis=1)
       print(dummies.head())
       return df

In [33]:
features_test = app_test[['area_assesed', 'has_superstructure_mud_mortar_stone', 'other_floor_type', 'count_floors_pre_eq',
                         'has_geotechnical_risk_fault_crack', 'has_superstructure_adobe_mud', 'has_geotechnical_risk',
                         'height_ft_pre_eq', 'has_geotechnical_risk_landslide', 'has_geotechnical_risk_rock_fall',
                         'has_geotechnical_risk_land_settlement', 'plan_configuration', 'age_building', 'land_surface_condition',
                         'foundation_type', 'height_ft_post_eq', 'count_floors_post_eq', 'ground_floor_type', 
                         'has_superstructure_cement_mortar_brick', 'has_repair_started', 'roof_type', 'condition_post_eq']]
                         
features_train, features_test = app_train.align(features_test, join = 'inner', axis = 1)
#features_train['damage_grade'] = train_labels
target = list(set(col_list).intersection(set(list(features_train.columns.values))))
#features_train = onehot(features_train, target)
#features_test = onehot(features_test, target)

features_train = pd.get_dummies(features_train, prefix=target, columns=target)
features_test = pd.get_dummies(features_test, prefix=target, columns=target)

print(features_train.shape)

(631761, 57)


In [34]:
len(target)

8

In [35]:
target

['area_assesed',
 'ground_floor_type',
 'roof_type',
 'other_floor_type',
 'foundation_type',
 'plan_configuration',
 'condition_post_eq',
 'land_surface_condition']

In [36]:
list(features_train.columns.values)

['has_geotechnical_risk',
 'has_geotechnical_risk_fault_crack',
 'has_geotechnical_risk_land_settlement',
 'has_geotechnical_risk_landslide',
 'has_geotechnical_risk_rock_fall',
 'has_repair_started',
 'count_floors_pre_eq',
 'count_floors_post_eq',
 'age_building',
 'height_ft_pre_eq',
 'height_ft_post_eq',
 'has_superstructure_adobe_mud',
 'has_superstructure_mud_mortar_stone',
 'has_superstructure_cement_mortar_brick',
 'area_assesed_Both',
 'area_assesed_Building removed',
 'area_assesed_Exterior',
 'area_assesed_Interior',
 'area_assesed_Not able to inspect',
 'ground_floor_type_Brick/Stone',
 'ground_floor_type_Mud',
 'ground_floor_type_Other',
 'ground_floor_type_RC',
 'ground_floor_type_Timber',
 'roof_type_Bamboo/Timber-Heavy roof',
 'roof_type_Bamboo/Timber-Light roof',
 'roof_type_RCC/RB/RBC',
 'other_floor_type_Not applicable',
 'other_floor_type_RCC/RB/RBC',
 'other_floor_type_TImber/Bamboo-Mud',
 'other_floor_type_Timber-Planck',
 'foundation_type_Bamboo/Timber',
 'founda

In [37]:
features_train.head()

Unnamed: 0,has_geotechnical_risk,has_geotechnical_risk_fault_crack,has_geotechnical_risk_land_settlement,has_geotechnical_risk_landslide,has_geotechnical_risk_rock_fall,has_repair_started,count_floors_pre_eq,count_floors_post_eq,age_building,height_ft_pre_eq,...,condition_post_eq_Damaged-Not used,condition_post_eq_Damaged-Repaired and used,condition_post_eq_Damaged-Rubble Clear-New building built,condition_post_eq_Damaged-Rubble clear,condition_post_eq_Damaged-Rubble unclear,condition_post_eq_Damaged-Used in risk,condition_post_eq_Not damaged,land_surface_condition_Flat,land_surface_condition_Moderate slope,land_surface_condition_Steep slope
0,0.0,0,0,0,0,0.0,3,3,21,20,...,1,0,0,0,0,0,0,1,0,0
1,0.0,0,0,0,0,1.0,3,3,33,24,...,0,1,0,0,0,0,0,1,0,0
2,0.0,0,0,0,0,0.0,3,3,2,21,...,0,0,0,0,0,0,1,0,0,1
3,0.0,0,0,0,0,0.0,3,0,3,20,...,0,0,0,1,0,0,0,1,0,0
4,0.0,0,0,0,0,0.0,2,2,22,18,...,0,0,0,0,0,1,0,1,0,0


In [38]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD

Using TensorFlow backend.


In [39]:
from sklearn.model_selection import train_test_split

In [40]:
#train - validation split
x_train, x_validation, y_train, y_validation = train_test_split(features_train, train_labels, test_size=0.2, random_state=1)

In [41]:
model = Sequential()
# Dense(64) is a fully-connected layer with 64 hidden units.
# in the first layer, you must specify the expected input data shape:
# here, 22-dimensional vectors.
model.add(Dense(64, activation='sigmoid', input_dim=57))
model.add(Dropout(0.1))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(5, activation='softmax'))

sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          epochs=100,
          batch_size=128)
score = model.evaluate(x_validation, y_validation, batch_size=128)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [42]:
score 

[0.62521523359312414, 0.71418961164592865]

In [46]:
#predict model on testing data set
y_prediction = model.predict(features_test, verbose=1)



In [47]:
y_prediction

array([[  9.30388197e-02,   3.73551548e-01,   3.88114244e-01,
          1.43318906e-01,   1.97652471e-03],
       [  2.10991758e-26,   1.05806629e-21,   3.17018934e-19,
          3.02242958e-13,   1.00000000e+00],
       [  1.40554710e-24,   5.45448419e-22,   2.01978170e-19,
          5.90616206e-13,   1.00000000e+00],
       ..., 
       [  9.99847293e-01,   1.52661160e-04,   3.27133911e-08,
          1.06427723e-11,   3.39492115e-11],
       [  1.67910199e-26,   7.52261697e-22,   5.21377472e-19,
          5.70824692e-13,   1.00000000e+00],
       [  1.18937754e-25,   4.71520768e-22,   2.73751701e-19,
          4.66785681e-13,   1.00000000e+00]], dtype=float32)

In [48]:
labels = np.argmax(y_prediction, axis=-1)    
print(labels)

[2 4 4 ..., 0 4 4]


In [49]:
labels_decoded = le.inverse_transform(labels)

In [50]:

submission = pd.concat([submissionId, pd.DataFrame(labels_decoded, columns = ['damage_grade'])], axis=1)
submission.to_csv('submission.csv', index=False)

In [51]:
subm1 = pd.read_csv('submission.csv')
subm2 = pd.read_csv('sample_submission.csv')
print(subm1.shape)
print(subm2.shape)

(421175, 2)
(421175, 2)
