In [214]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.optim import SGD
import torch.utils.data as Data
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm
import matplotlib.pyplot as plt

# **Read the Data**

In [215]:
train_y = pd.read_csv("train_labels.csv", index_col='building_id')
train_x = pd.read_csv("train_values.csv", index_col='building_id')
test_x = pd.read_csv("test_values.csv", index_col='building_id')

# **Feature Engineering- ANN**

In [216]:
geo1 = np.array(pd.get_dummies(pd.concat([train_x["geo_level_1_id"], test_x["geo_level_1_id"]])))
geo2 = np.array(pd.get_dummies(pd.concat([train_x["geo_level_2_id"], test_x["geo_level_2_id"]])))
geo3 = pd.concat([train_x["geo_level_3_id"], test_x["geo_level_3_id"]]).to_numpy()

In [217]:
geo1 = torch.tensor(geo1, dtype = torch.float)
geo2 = torch.tensor(geo2, dtype = torch.float)
geo3 = torch.tensor(geo3, dtype = torch.long)

In [218]:
geo1_train, geo1_valid, geo2_train, geo2_valid, geo3_train, geo3_valid = train_test_split(geo1, geo2, geo3, test_size = 0.2)

In [219]:
train_dataset = Data.TensorDataset(geo1_train, geo2_train, geo3_train)
train_loader = Data.DataLoader(dataset = train_dataset, batch_size = 64, shuffle = True)

**Training**

In [220]:
class EmbedNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(12568, 16)
        self.layer1 = nn.Sequential(nn.Linear(16, 31), nn.Sigmoid())
        self.layer2 = nn.Sequential(nn.Linear(16, 1418), nn.Sigmoid())
        
    def forward(self, inputValue):
        mid = self.embed(inputValue)
        out1 = self.layer1(mid)
        out2 = self.layer2(mid)
        return out1, out2
    
    def get_feature(self, inputValue):
        return self.embed(inputValue.long())

In [221]:
model = EmbedNet()
loss_func = nn.BCEWithLogitsLoss() # binary CrossEntropy
optimizer = torch.optim.Adam(model.parameters(), lr=0.1) #ADAM

In [222]:
training_loss_hist = []
validation_loss_hist = []

In [224]:
for epoch in range(10):
    epoch_loss = []
    
    model.train()
    for batch_geo1, batch_geo2, batch_geo3 in tqdm(train_loader):
        output1, output2 = model(batch_geo3)
        loss1 = loss_func(output1, batch_geo1)
        loss2 = loss_func(output2, batch_geo2)
        loss = loss1 + loss2
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss.append(float(loss))
        
    model.eval()
    y_pred1, y_pred2 = model(geo3_valid)
    
    acc1 = (torch.argmax(y_pred1, 1) == torch.argmax(geo1_valid, 1)).float().mean()
    acc2 = (torch.argmax(y_pred2, 1) == torch.argmax(geo2_valid, 1)).float().mean()
    print(f'epoch: {epoch} acc1: {acc1}, acc2: {acc2}')
    
    training_loss_hist.append(np.mean(epoch_loss))
    validation_loss_hist.append(float(loss_func(y_pred1, geo1_valid) + loss_func(y_pred2, geo2_valid)))

100%|█████████████████████████████████████████████████████████████████████████████| 4344/4344 [00:27<00:00, 157.28it/s]


epoch: 0 acc1: 0.039240796118974686, acc2: 1.4389731404662598e-05


100%|█████████████████████████████████████████████████████████████████████████████| 4344/4344 [00:27<00:00, 156.15it/s]


epoch: 1 acc1: 0.05745819956064224, acc2: 0.0007482660585083067


100%|█████████████████████████████████████████████████████████████████████████████| 4344/4344 [00:28<00:00, 154.71it/s]


epoch: 2 acc1: 0.07484099268913269, acc2: 0.00047486115363426507


100%|█████████████████████████████████████████████████████████████████████████████| 4344/4344 [00:29<00:00, 147.60it/s]


epoch: 3 acc1: 0.07973350584506989, acc2: 0.0005468098097480834


100%|█████████████████████████████████████████████████████████████████████████████| 4344/4344 [00:30<00:00, 144.61it/s]


epoch: 4 acc1: 0.05106915533542633, acc2: 0.0005468098097480834


100%|█████████████████████████████████████████████████████████████████████████████| 4344/4344 [00:30<00:00, 140.40it/s]


epoch: 5 acc1: 0.05098281800746918, acc2: 0.0005468098097480834


100%|█████████████████████████████████████████████████████████████████████████████| 4344/4344 [00:30<00:00, 144.33it/s]


epoch: 6 acc1: 0.05158718675374985, acc2: 0.0005611995002254844


100%|█████████████████████████████████████████████████████████████████████████████| 4344/4344 [00:31<00:00, 139.44it/s]


epoch: 7 acc1: 0.05301177129149437, acc2: 0.0006187584367580712


100%|█████████████████████████████████████████████████████████████████████████████| 4344/4344 [00:30<00:00, 141.94it/s]


epoch: 8 acc1: 0.013555127196013927, acc2: 0.0006187584367580712


100%|█████████████████████████████████████████████████████████████████████████████| 4344/4344 [00:30<00:00, 142.91it/s]


epoch: 9 acc1: 0.01352634746581316, acc2: 0.0007626557489857078


In [225]:
torch.save(model, "embed3")

In [226]:
embedModel = torch.load("embed3")
embedModel.eval()

EmbedNet(
  (embed): Embedding(12568, 16)
  (layer1): Sequential(
    (0): Linear(in_features=16, out_features=31, bias=True)
    (1): Sigmoid()
  )
  (layer2): Sequential(
    (0): Linear(in_features=16, out_features=1418, bias=True)
    (1): Sigmoid()
  )
)

Add new features to training dataset

In [227]:
geo3 = train_x["geo_level_3_id"].to_numpy()
geo3 = torch.tensor(geo3, dtype = torch.long)
feature = embedModel.get_feature(geo3).detach().numpy()

In [228]:
a = pd.DataFrame(feature)
a.columns = ['geo_feat1',
       'geo_feat2', 'geo_feat3', 'geo_feat4', 'geo_feat5', 'geo_feat6',
       'geo_feat7', 'geo_feat8', 'geo_feat9', 'geo_feat10', 'geo_feat11',
       'geo_feat12', 'geo_feat13', 'geo_feat14', 'geo_feat15', 'geo_feat16']
a

Unnamed: 0,geo_feat1,geo_feat2,geo_feat3,geo_feat4,geo_feat5,geo_feat6,geo_feat7,geo_feat8,geo_feat9,geo_feat10,geo_feat11,geo_feat12,geo_feat13,geo_feat14,geo_feat15,geo_feat16
0,1.859609,-2.464615,2.333843,-2.766396,-2.544667,0.006577,1.614740,0.310817,1.294207,2.312045,1.099191,-2.694324,-3.038431,-0.395711,-2.098679,2.379443
1,1.256276,-1.346431,1.864876,-1.247013,-1.164752,-0.183126,1.319128,1.210872,0.418101,0.085968,3.220839,-2.756840,-0.014923,-1.524271,1.819106,-0.398256
2,3.349025,-2.026550,2.095108,-0.456444,-3.202587,2.447638,1.132275,-0.123751,1.817610,1.783720,0.844055,-0.183501,-2.643142,-3.025988,-2.238920,1.748637
3,0.799300,-1.857670,2.678128,-0.564379,-0.338822,0.552539,1.172541,-2.127022,-0.426082,0.068310,1.287670,-0.564273,0.733162,-1.630967,-0.493488,-0.772391
4,3.551296,-1.381974,0.881864,-2.512261,0.349712,0.376940,1.559291,-0.273731,2.073217,2.099913,2.866739,-1.711959,-1.696779,-1.269688,0.910058,-1.436769
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,2.148874,-2.001442,2.245711,-1.050846,-2.390093,2.242368,3.085013,0.736553,2.588622,2.770101,0.798208,-0.344219,-2.675036,-1.896107,-2.149732,1.634088
260597,3.366335,-0.736915,2.416538,0.168565,-0.939973,1.158304,1.931297,2.721289,3.253176,0.848019,1.506234,-0.499194,-2.390478,0.277845,0.590935,0.323059
260598,2.024516,-0.775802,-0.138778,-0.885149,-1.246012,0.184863,0.822100,-2.962217,0.678390,2.210722,2.411864,-3.053804,-2.476892,-3.256753,0.190099,2.023416
260599,1.692824,-2.603646,1.491476,0.699806,-0.472868,-0.188209,1.991306,0.114091,-0.922991,1.797249,2.552944,-1.262327,-0.455509,-1.317010,0.762622,1.236359


In [229]:
train_data = train_x
train_data = train_data.reset_index(drop=True)
train_data = pd.concat([train_data, a],axis=1)
train_data

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,geo_feat7,geo_feat8,geo_feat9,geo_feat10,geo_feat11,geo_feat12,geo_feat13,geo_feat14,geo_feat15,geo_feat16
0,6,487,12198,2,30,6,5,t,r,n,...,1.614740,0.310817,1.294207,2.312045,1.099191,-2.694324,-3.038431,-0.395711,-2.098679,2.379443
1,8,900,2812,2,10,8,7,o,r,n,...,1.319128,1.210872,0.418101,0.085968,3.220839,-2.756840,-0.014923,-1.524271,1.819106,-0.398256
2,21,363,8973,2,10,5,5,t,r,n,...,1.132275,-0.123751,1.817610,1.783720,0.844055,-0.183501,-2.643142,-3.025988,-2.238920,1.748637
3,22,418,10694,2,10,6,5,t,r,n,...,1.172541,-2.127022,-0.426082,0.068310,1.287670,-0.564273,0.733162,-1.630967,-0.493488,-0.772391
4,11,131,1488,3,30,8,9,t,r,n,...,1.559291,-0.273731,2.073217,2.099913,2.866739,-1.711959,-1.696779,-1.269688,0.910058,-1.436769
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,25,1335,1621,1,55,6,3,n,r,n,...,3.085013,0.736553,2.588622,2.770101,0.798208,-0.344219,-2.675036,-1.896107,-2.149732,1.634088
260597,17,715,2060,2,0,6,5,t,r,n,...,1.931297,2.721289,3.253176,0.848019,1.506234,-0.499194,-2.390478,0.277845,0.590935,0.323059
260598,17,51,8163,3,55,6,7,t,r,q,...,0.822100,-2.962217,0.678390,2.210722,2.411864,-3.053804,-2.476892,-3.256753,0.190099,2.023416
260599,26,39,1851,2,10,14,6,t,r,x,...,1.991306,0.114091,-0.922991,1.797249,2.552944,-1.262327,-0.455509,-1.317010,0.762622,1.236359


In [230]:
train_datai = train_data
train_yi = train_y

Add new features to test dataset

In [231]:
geo3_test = test_x["geo_level_3_id"].to_numpy()
geo3_test = torch.tensor(geo3_test, dtype = torch.long)
test_feature = embedModel.get_feature(geo3_test).detach().numpy()

In [232]:
b = pd.DataFrame(test_feature)
b.columns = ['geo_feat1',
       'geo_feat2', 'geo_feat3', 'geo_feat4', 'geo_feat5', 'geo_feat6',
       'geo_feat7', 'geo_feat8', 'geo_feat9', 'geo_feat10', 'geo_feat11',
       'geo_feat12', 'geo_feat13', 'geo_feat14', 'geo_feat15', 'geo_feat16']
b

Unnamed: 0,geo_feat1,geo_feat2,geo_feat3,geo_feat4,geo_feat5,geo_feat6,geo_feat7,geo_feat8,geo_feat9,geo_feat10,geo_feat11,geo_feat12,geo_feat13,geo_feat14,geo_feat15,geo_feat16
0,0.154209,-1.855160,1.253630,-0.789279,-1.896688,2.249731,1.419226,1.257861,1.160597,1.269671,0.353483,-1.042975,0.555544,-1.673433,-1.057855,1.113611
1,0.121295,-0.415216,1.014652,-2.486992,-2.975242,0.373346,0.895863,1.330957,0.251000,1.640625,2.393336,-1.281636,-0.325237,-1.758530,0.154713,-0.751193
2,2.987561,-0.780578,2.619226,-2.322473,-1.810810,1.877750,1.436114,-2.764751,3.437391,0.452544,1.102489,-0.453919,-3.045758,-1.204425,-0.880961,1.352055
3,0.456954,-2.972200,1.303150,-0.638250,-1.560437,-0.000236,1.775547,1.017376,-0.760987,1.239240,2.199309,-3.075414,0.700020,-2.607163,-0.114532,0.055895
4,-1.038869,-0.778381,0.718419,-2.200065,-1.932974,3.544711,2.211908,0.376664,1.395510,0.647183,2.215909,-0.905522,-0.988486,-2.308328,-0.729297,-0.070338
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86863,-0.863541,0.128134,0.200570,-0.551044,-0.542065,1.886126,1.508831,-0.851051,0.060478,0.946763,5.134702,-0.521289,0.122116,-1.009386,0.520732,-0.149688
86864,1.352502,-3.381880,0.936152,-1.600487,0.048578,-0.013355,3.104919,-0.716980,0.639247,0.094971,0.621013,-1.540782,-0.095112,1.185019,2.525274,0.647403
86865,2.767267,-2.820536,1.210562,-3.086769,-3.763468,3.135728,1.954569,-0.181075,2.969114,1.719081,2.619495,-1.152444,-2.102620,-2.333096,-3.045663,2.381864
86866,2.787802,0.583902,0.547729,-2.081654,-3.033055,1.247005,1.409003,1.109057,1.141922,1.679662,3.395210,0.321554,-0.880071,-0.629335,0.099519,1.869352


In [233]:
test_data = test_x
test_data = test_data.reset_index(drop=True)
test_data = pd.concat([test_data, b],axis=1)
test_data

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,geo_feat7,geo_feat8,geo_feat9,geo_feat10,geo_feat11,geo_feat12,geo_feat13,geo_feat14,geo_feat15,geo_feat16
0,17,596,11307,3,20,7,6,t,r,n,...,1.419226,1.257861,1.160597,1.269671,0.353483,-1.042975,0.555544,-1.673433,-1.057855,1.113611
1,6,141,11987,2,25,13,5,t,r,n,...,0.895863,1.330957,0.251000,1.640625,2.393336,-1.281636,-0.325237,-1.758530,0.154713,-0.751193
2,22,19,10044,2,5,4,5,t,r,n,...,1.436114,-2.764751,3.437391,0.452544,1.102489,-0.453919,-3.045758,-1.204425,-0.880961,1.352055
3,26,39,633,1,0,19,3,t,r,x,...,1.775547,1.017376,-0.760987,1.239240,2.199309,-3.075414,0.700020,-2.607163,-0.114532,0.055895
4,17,289,7970,3,15,8,7,t,r,q,...,2.211908,0.376664,1.395510,0.647183,2.215909,-0.905522,-0.988486,-2.308328,-0.729297,-0.070338
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86863,4,605,3623,3,70,20,6,t,r,q,...,1.508831,-0.851051,0.060478,0.946763,5.134702,-0.521289,0.122116,-1.009386,0.520732,-0.149688
86864,10,1407,11907,3,25,6,7,n,r,n,...,3.104919,-0.716980,0.639247,0.094971,0.621013,-1.540782,-0.095112,1.185019,2.525274,0.647403
86865,22,1136,7712,1,50,3,3,t,r,n,...,1.954569,-0.181075,2.969114,1.719081,2.619495,-1.152444,-2.102620,-2.333096,-3.045663,2.381864
86866,6,1041,912,2,5,9,5,t,r,n,...,1.409003,1.109057,1.141922,1.679662,3.395210,0.321554,-0.880071,-0.629335,0.099519,1.869352


In [234]:
test_datai = test_data

Change categorical variable to numeric

In [235]:
cleanup_letters = {"land_surface_condition": {"n": 1, "o": 2, "t":3},
                "foundation_type": {"h": 1, "i": 2, "r": 3, "u": 4, "w": 5},               
                "roof_type": {"n": 1, "q": 2, "x": 3},
                "ground_floor_type": {"f": 1, "m": 2, "v": 3, "x": 4, "z": 5},
                "other_floor_type": {"j": 1, "q": 2, "s": 3, "x": 4},
                "position": {"j": 1, "o": 2, "s": 3, "t": 4},
                "plan_configuration": {"a": 1, "c": 2, "d": 3, "f": 4, "m": 5, "n": 6, "o": 7, "q": 8, "s": 9, "u":10},
                "legal_ownership_status": {"a": 1, "r": 2, "v": 3, "w": 4}             
               }
train_datai.replace(cleanup_letters, inplace=True)
test_datai.replace(cleanup_letters, inplace=True)

In [236]:
def threshold_arr(array):
    # Get major confidence-scored predicted value.
    new_arr = []
    for ix, val in enumerate(array):
        loc = np.array(val).argmax(axis=0)
        k = list(np.zeros((len(val))))
        k[loc]=1
        new_arr.append(k)
        
    return np.array(new_arr)

In [237]:
SEED = 1881

In [238]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import f1_score 
from sklearn.model_selection import KFold

# **Random Forest**

Training

In [239]:
from joblib import dump, load

# Assuming your features are in x and target variable in y
y = np.array(train_y["damage_grade"]-1)
x = np.array(train_datai)

# Create the stratified K-fold object
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the random forest classifier
rf = RandomForestClassifier()

# Loop through the splits and train and evaluate the model
# for train_index, test_index in skf.split(x, y):
#     x_train, x_test = x[train_index], x[test_index]
#     y_train, y_test = y[train_index], y[test_index]
for ix, (train_index, test_index) in enumerate(skf.split(x,y)):
    x_train, x_test, y_train, y_test= x[train_index], x[test_index], y[train_index], y[test_index]
   
    # Train the random forest classifier
    rf.fit(x_train, y_train)

    # Make predictions
    y_pred = rf.predict(x_test)
    #y_pred = np.array(pd.get_dummies(y_pred))
    
    print("F1-MICRO SCORE: ", f1_score(np.array(pd.get_dummies(y_test)), np.array(pd.get_dummies(y_pred)), average='micro'))
    dump(rf, f'models/rfc_model{ix}.joblib')

F1-MICRO SCORE:  0.7251203929318317
F1-MICRO SCORE:  0.7277628549501152
F1-MICRO SCORE:  0.7263814274750575
F1-MICRO SCORE:  0.7268994627782043
F1-MICRO SCORE:  0.725748273215656


Submission


In [240]:
test_datai

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,geo_feat7,geo_feat8,geo_feat9,geo_feat10,geo_feat11,geo_feat12,geo_feat13,geo_feat14,geo_feat15,geo_feat16
0,17,596,11307,3,20,7,6,3,3,1,...,1.419226,1.257861,1.160597,1.269671,0.353483,-1.042975,0.555544,-1.673433,-1.057855,1.113611
1,6,141,11987,2,25,13,5,3,3,1,...,0.895863,1.330957,0.251000,1.640625,2.393336,-1.281636,-0.325237,-1.758530,0.154713,-0.751193
2,22,19,10044,2,5,4,5,3,3,1,...,1.436114,-2.764751,3.437391,0.452544,1.102489,-0.453919,-3.045758,-1.204425,-0.880961,1.352055
3,26,39,633,1,0,19,3,3,3,3,...,1.775547,1.017376,-0.760987,1.239240,2.199309,-3.075414,0.700020,-2.607163,-0.114532,0.055895
4,17,289,7970,3,15,8,7,3,3,2,...,2.211908,0.376664,1.395510,0.647183,2.215909,-0.905522,-0.988486,-2.308328,-0.729297,-0.070338
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86863,4,605,3623,3,70,20,6,3,3,2,...,1.508831,-0.851051,0.060478,0.946763,5.134702,-0.521289,0.122116,-1.009386,0.520732,-0.149688
86864,10,1407,11907,3,25,6,7,1,3,1,...,3.104919,-0.716980,0.639247,0.094971,0.621013,-1.540782,-0.095112,1.185019,2.525274,0.647403
86865,22,1136,7712,1,50,3,3,3,3,1,...,1.954569,-0.181075,2.969114,1.719081,2.619495,-1.152444,-2.102620,-2.333096,-3.045663,2.381864
86866,6,1041,912,2,5,9,5,3,3,1,...,1.409003,1.109057,1.141922,1.679662,3.395210,0.321554,-0.880071,-0.629335,0.099519,1.869352


In [241]:
# Make predictions
df = test_datai
x = np.array(df)
y_pred = rf.predict(x)
y_pred+=1

In [242]:
sub_csv = pd.read_csv("submission_format.csv")
sub_csv["damage_grade"] = y_pred
sub_csv.to_csv("submission.csv", index=False)

# **XGBoost**

In [243]:
def ensemble(models, x):
    # Ensemble K-Fold CV models with adding all confidence score by class.
    y_preds = []
    
    for model in models:
        y_pred = model.predict(x)
        y_preds.append(y_pred)
        
    init_y_pred = y_preds[0]
    for ypred in y_preds[1:]:
        init_y_pred += ypred
        
    y_pred = threshold_arr(init_y_pred)
    
    return y_pred

Training

In [244]:
y = np.array(train_y["damage_grade"]-1)
x = np.array(train_datai)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
for ix, (train_index, test_index) in enumerate(skf.split(x,y)):
    x_train, x_valid, y_train, y_valid= x[train_index], x[test_index], y[train_index], y[test_index]
    
    model = XGBClassifier(max_depth = 10, 
                          learning_rate = 0.05,
                          n_estimators = 2000,
                          min_child_weight = 2,
                          objective = 'multiclass',
                          use_label_encoder = False)

    eval_set = [(x_valid, y_valid)]
    model.fit(x_train, y_train, eval_set = eval_set, verbose = 1, early_stopping_rounds = 150)

    y_pred = model.predict(x_valid)
    y_pred = np.array(pd.get_dummies(y_pred))
    
    print("F1-MICRO SCORE: ", f1_score(np.array(pd.get_dummies(y_valid)), y_pred, average='micro'))
    model.save_model(f'models/XGBoost{ix}.txt')



[0]	validation_0-mlogloss:1.07076
[1]	validation_0-mlogloss:1.04510
[2]	validation_0-mlogloss:1.02124
[3]	validation_0-mlogloss:0.99915
[4]	validation_0-mlogloss:0.97849
[5]	validation_0-mlogloss:0.95939
[6]	validation_0-mlogloss:0.94132
[7]	validation_0-mlogloss:0.92439
[8]	validation_0-mlogloss:0.90890
[9]	validation_0-mlogloss:0.89428
[10]	validation_0-mlogloss:0.88088
[11]	validation_0-mlogloss:0.86835
[12]	validation_0-mlogloss:0.85661
[13]	validation_0-mlogloss:0.84557
[14]	validation_0-mlogloss:0.83520
[15]	validation_0-mlogloss:0.82545
[16]	validation_0-mlogloss:0.81624
[17]	validation_0-mlogloss:0.80753
[18]	validation_0-mlogloss:0.79939
[19]	validation_0-mlogloss:0.79169
[20]	validation_0-mlogloss:0.78444
[21]	validation_0-mlogloss:0.77711
[22]	validation_0-mlogloss:0.77035
[23]	validation_0-mlogloss:0.76391
[24]	validation_0-mlogloss:0.75787
[25]	validation_0-mlogloss:0.75209
[26]	validation_0-mlogloss:0.74657
[27]	validation_0-mlogloss:0.74107
[28]	validation_0-mlogloss:0.7



[0]	validation_0-mlogloss:1.07093
[1]	validation_0-mlogloss:1.04545
[2]	validation_0-mlogloss:1.02192
[3]	validation_0-mlogloss:1.00010
[4]	validation_0-mlogloss:0.97983
[5]	validation_0-mlogloss:0.96080
[6]	validation_0-mlogloss:0.94330
[7]	validation_0-mlogloss:0.92669
[8]	validation_0-mlogloss:0.91125
[9]	validation_0-mlogloss:0.89682
[10]	validation_0-mlogloss:0.88344
[11]	validation_0-mlogloss:0.87087
[12]	validation_0-mlogloss:0.85929
[13]	validation_0-mlogloss:0.84826
[14]	validation_0-mlogloss:0.83786
[15]	validation_0-mlogloss:0.82807
[16]	validation_0-mlogloss:0.81884
[17]	validation_0-mlogloss:0.81016
[18]	validation_0-mlogloss:0.80195
[19]	validation_0-mlogloss:0.79424
[20]	validation_0-mlogloss:0.78700
[21]	validation_0-mlogloss:0.77976
[22]	validation_0-mlogloss:0.77326
[23]	validation_0-mlogloss:0.76681
[24]	validation_0-mlogloss:0.76070
[25]	validation_0-mlogloss:0.75496
[26]	validation_0-mlogloss:0.74938
[27]	validation_0-mlogloss:0.74412
[28]	validation_0-mlogloss:0.7



[0]	validation_0-mlogloss:1.07061
[1]	validation_0-mlogloss:1.04482
[2]	validation_0-mlogloss:1.02091
[3]	validation_0-mlogloss:0.99883
[4]	validation_0-mlogloss:0.97834
[5]	validation_0-mlogloss:0.95917
[6]	validation_0-mlogloss:0.94123
[7]	validation_0-mlogloss:0.92457
[8]	validation_0-mlogloss:0.90896
[9]	validation_0-mlogloss:0.89447
[10]	validation_0-mlogloss:0.88114
[11]	validation_0-mlogloss:0.86861
[12]	validation_0-mlogloss:0.85689
[13]	validation_0-mlogloss:0.84581
[14]	validation_0-mlogloss:0.83551
[15]	validation_0-mlogloss:0.82575
[16]	validation_0-mlogloss:0.81654
[17]	validation_0-mlogloss:0.80786
[18]	validation_0-mlogloss:0.79963
[19]	validation_0-mlogloss:0.79192
[20]	validation_0-mlogloss:0.78465
[21]	validation_0-mlogloss:0.77779
[22]	validation_0-mlogloss:0.77132
[23]	validation_0-mlogloss:0.76502
[24]	validation_0-mlogloss:0.75887
[25]	validation_0-mlogloss:0.75299
[26]	validation_0-mlogloss:0.74752
[27]	validation_0-mlogloss:0.74224
[28]	validation_0-mlogloss:0.7



[0]	validation_0-mlogloss:1.07089
[1]	validation_0-mlogloss:1.04530
[2]	validation_0-mlogloss:1.02164
[3]	validation_0-mlogloss:0.99974
[4]	validation_0-mlogloss:0.97944
[5]	validation_0-mlogloss:0.96047
[6]	validation_0-mlogloss:0.94258
[7]	validation_0-mlogloss:0.92592
[8]	validation_0-mlogloss:0.91051
[9]	validation_0-mlogloss:0.89610
[10]	validation_0-mlogloss:0.88258
[11]	validation_0-mlogloss:0.86999
[12]	validation_0-mlogloss:0.85808
[13]	validation_0-mlogloss:0.84690
[14]	validation_0-mlogloss:0.83641
[15]	validation_0-mlogloss:0.82654
[16]	validation_0-mlogloss:0.81714
[17]	validation_0-mlogloss:0.80843
[18]	validation_0-mlogloss:0.80008
[19]	validation_0-mlogloss:0.79228
[20]	validation_0-mlogloss:0.78479
[21]	validation_0-mlogloss:0.77783
[22]	validation_0-mlogloss:0.77112
[23]	validation_0-mlogloss:0.76457
[24]	validation_0-mlogloss:0.75838
[25]	validation_0-mlogloss:0.75250
[26]	validation_0-mlogloss:0.74696
[27]	validation_0-mlogloss:0.74188
[28]	validation_0-mlogloss:0.7



[0]	validation_0-mlogloss:1.07074
[1]	validation_0-mlogloss:1.04510
[2]	validation_0-mlogloss:1.02130
[3]	validation_0-mlogloss:0.99925
[4]	validation_0-mlogloss:0.97866
[5]	validation_0-mlogloss:0.95965
[6]	validation_0-mlogloss:0.94176
[7]	validation_0-mlogloss:0.92504
[8]	validation_0-mlogloss:0.90950
[9]	validation_0-mlogloss:0.89517
[10]	validation_0-mlogloss:0.88175
[11]	validation_0-mlogloss:0.86918
[12]	validation_0-mlogloss:0.85740
[13]	validation_0-mlogloss:0.84633
[14]	validation_0-mlogloss:0.83590
[15]	validation_0-mlogloss:0.82620
[16]	validation_0-mlogloss:0.81697
[17]	validation_0-mlogloss:0.80806
[18]	validation_0-mlogloss:0.79990
[19]	validation_0-mlogloss:0.79214
[20]	validation_0-mlogloss:0.78481
[21]	validation_0-mlogloss:0.77774
[22]	validation_0-mlogloss:0.77117
[23]	validation_0-mlogloss:0.76490
[24]	validation_0-mlogloss:0.75906
[25]	validation_0-mlogloss:0.75320
[26]	validation_0-mlogloss:0.74768
[27]	validation_0-mlogloss:0.74241
[28]	validation_0-mlogloss:0.7

In [245]:
x = np.array(train_datai)

models = []
for i in range(5):
    model = xgb.Booster()
    model.load_model(f'models/XGBoost{i}.txt')

    dtrain = xgb.DMatrix(x)
    
    y_pred = model.predict(dtrain)
    
    score  = f1_score(np.array(pd.get_dummies(y)), threshold_arr(y_pred), average='micro')
    print("F1-MICRO SCORE: ", score)
    models.append(model)

F1-MICRO SCORE:  0.8357335543608811
F1-MICRO SCORE:  0.8376828945399287
F1-MICRO SCORE:  0.8398202616260106
F1-MICRO SCORE:  0.8353843615335321
F1-MICRO SCORE:  0.8384695377224186


submission


In [246]:
df = test_datai
x = np.array(df)
dtest = xgb.DMatrix(x)
y_pred = ensemble(models, dtest)
y_pred = y_pred.argmax(axis=1)+1
sub_csv = pd.read_csv("submission_format.csv")
sub_csv["damage_grade"] = y_pred
sub_csv.to_csv("submission.csv", index=False)

In [247]:
y_pred

array([3, 2, 2, ..., 2, 2, 1], dtype=int64)

# **Catboost**

training


In [None]:
# from catboost import CatBoostClassifier


In [None]:
# y = np.array(train_y["damage_grade"] - 1)
# x = np.array(train_datai)

# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
# for ix, (train_index, test_index) in enumerate(skf.split(x, y)):
#     x_train, x_valid, y_train, y_valid = x[train_index], x[test_index], y[train_index], y[test_index]

#     model = CatBoostClassifier(iterations=10000,
#                                learning_rate=0.1,
#                                depth=10,
#                                loss_function='MultiClass',
#                                task_type="CPU",
#                                random_seed=SEED,
#                                od_type='Iter',
#                                od_wait=1000,
#                                verbose=1000)

#     model.fit(x_train, y_train,
#               eval_set=(x_valid, y_valid),
#               early_stopping_rounds=1000)

#     y_pred = model.predict(x_valid)
#     print("F1-MICRO SCORE: ", f1_score(y_valid, y_pred, average='micro'))
#     model.save_model(f'models/CatBoost{ix}.cbm')


In [None]:
# x = np.array(train_datai)
# models = []
# for i in range(5):
#     model = CatBoostClassifier()
#     model.load_model(f'models/CatBoost{i}.cbm')

#     y_pred = model.predict(x)
#     score = f1_score(y, y_pred, average='micro')
#     print("F1-MICRO SCORE: ", score)
#     models.append(model)


submission

In [None]:
# test_datai

In [None]:
# df = test_datai
# x = np.array(df)
# model = CatBoostClassifier()
# model.load_model(f'models/CatBoost{0}.cbm')
# y_pred = model.predict(x)
# #y_pred = ensemble(models, x)
# y_pred = y_pred.flatten()+1
# sub_csv = pd.read_csv("submission_format.csv")
# sub_csv["damage_grade"] = y_pred
# sub_csv.to_csv("submission.csv", index=False)

In [None]:
# y_pred

# **LightGBM**

Training


In [248]:
y = np.array(train_y["damage_grade"]-1)
x = np.array(train_datai)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
for ix, (train_index, test_index) in enumerate(skf.split(x,y)):
    lgb_params = {
        "objective" : "multiclass",
        "num_class":3,
        "metric" : "multi_error",
        "boosting": 'gbdt',
        "max_depth" : -1,
        "num_leaves" : 30,
        "learning_rate" : 0.1,
        "feature_fraction" : 0.5,
        "min_sum_hessian_in_leaf" : 0.1,
        "max_bin":8192,
        "verbosity" : 1,
        "num_threads":6,
        "seed": SEED
    }
    
    x_train, x_valid, y_train, y_valid= x[train_index], x[test_index], y[train_index], y[test_index]

    train_data = lgb.Dataset(x_train, label=y_train)
    valid_data = lgb.Dataset(x_valid, label=y_valid)

    model = lgb.train(lgb_params,
                      train_data,
                      10000,
                      valid_sets = [valid_data],
                      early_stopping_rounds=1000,
                      verbose_eval = 1000)

    y_pred = model.predict(x_valid)
    print("F1-MICRO SCORE: ", f1_score(np.array(pd.get_dummies(y_valid)), threshold_arr(y_pred), average='micro'))
    model.save_model(f'models/LGBM{ix}.txt')



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 129695
[LightGBM] [Info] Number of data points in the train set: 208480, number of used features: 53
[LightGBM] [Info] Start training from score -2.339173
[LightGBM] [Info] Start training from score -0.564028
[LightGBM] [Info] Start training from score -1.094582
Training until validation scores don't improve for 1000 rounds
[1000]	valid_0's multi_error: 0.249957
[2000]	valid_0's multi_error: 0.248921
Early stopping, best iteration is:
[1689]	valid_0's multi_error: 0.248479
F1-MICRO SCORE:  0.7515205003741295




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 129656
[LightGBM] [Info] Number of data points in the train set: 208481, number of used features: 54
[LightGBM] [Info] Start training from score -2.339128
[LightGBM] [Info] Start training from score -0.564032
[LightGBM] [Info] Start training from score -1.094586
Training until validation scores don't improve for 1000 rounds
[1000]	valid_0's multi_error: 0.25282
[2000]	valid_0's multi_error: 0.25213
Early stopping, best iteration is:
[1936]	valid_0's multi_error: 0.251592
F1-MICRO SCORE:  0.748407521105142




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 129789
[LightGBM] [Info] Number of data points in the train set: 208481, number of used features: 53
[LightGBM] [Info] Start training from score -2.339178
[LightGBM] [Info] Start training from score -0.564024
[LightGBM] [Info] Start training from score -1.094586
Training until validation scores don't improve for 1000 rounds
[1000]	valid_0's multi_error: 0.248791
[2000]	valid_0's multi_error: 0.248062
Early stopping, best iteration is:
[1754]	valid_0's multi_error: 0.247467
F1-MICRO SCORE:  0.7525326170376055




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 129643
[LightGBM] [Info] Number of data points in the train set: 208481, number of used features: 54
[LightGBM] [Info] Start training from score -2.339178
[LightGBM] [Info] Start training from score -0.564032
[LightGBM] [Info] Start training from score -1.094572
Training until validation scores don't improve for 1000 rounds
[1000]	valid_0's multi_error: 0.24906
[2000]	valid_0's multi_error: 0.248676
Early stopping, best iteration is:
[1570]	valid_0's multi_error: 0.248101
F1-MICRO SCORE:  0.7518994627782042




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 129728
[LightGBM] [Info] Number of data points in the train set: 208481, number of used features: 53
[LightGBM] [Info] Start training from score -2.339178
[LightGBM] [Info] Start training from score -0.564032
[LightGBM] [Info] Start training from score -1.094572
Training until validation scores don't improve for 1000 rounds
[1000]	valid_0's multi_error: 0.251074
[2000]	valid_0's multi_error: 0.250748
Early stopping, best iteration is:
[1343]	valid_0's multi_error: 0.249482
F1-MICRO SCORE:  0.7505180353031464


In [249]:
models = []
for i in range(5):
    model = lgb.Booster(model_file=f'models/LGBM{i}.txt')

    y_pred = model.predict(x)
    score  = f1_score(np.array(pd.get_dummies(y)), threshold_arr(y_pred), average='micro')
    print("F1-MICRO SCORE: ", score)
    models.append(model)

F1-MICRO SCORE:  0.8131127662595309
F1-MICRO SCORE:  0.8199085958994785
F1-MICRO SCORE:  0.8157451429580087
F1-MICRO SCORE:  0.8102117796938616
F1-MICRO SCORE:  0.8040989865733439


Submission

In [250]:
df = test_datai
x = np.array(df)
y_pred = ensemble(models, x)
y_pred = y_pred.argmax(axis=1)+1
sub_csv = pd.read_csv("submission_format.csv")
sub_csv["damage_grade"] = y_pred
sub_csv.to_csv("submission.csv", index=False)

In [251]:
y_pred

array([3, 2, 2, ..., 2, 2, 1], dtype=int64)

# **Ensemble Method**

In [253]:
from joblib import dump, load
x = np.array(test_datai)

sum_y_pred = np.zeros((86868, 3))
    
for i in range(5):
    # emsemble XGBooster
    model = xgb.Booster()
    model.load_model(f'models/XGBoost{i}.txt')
    dtrain = xgb.DMatrix(x)
    y_pred = model.predict(dtrain)
    sum_y_pred += y_pred*0.2

    # ensemble random forest
    model =  RandomForestClassifier()
    model = load(f'models/rfc_model{i}.joblib')
    y_pred = model.predict_proba(x)
    sum_y_pred += y_pred*0.1    
    
    # emsemble LightGBM
    model = lgb.Booster(model_file=f'models/LGBM{i}.txt')
    y_pred = model.predict(x)
    sum_y_pred += y_pred*0.7
    
    print(sum_y_pred)

    
y_pred = threshold_arr(sum_y_pred)
y_pred = y_pred.argmax(axis=1)+1

[[0.00222482 0.32459972 0.67317545]
 [0.00243059 0.91600619 0.08156321]
 [0.01977066 0.57681565 0.40341371]
 ...
 [0.03644223 0.87300147 0.09055629]
 [0.01058904 0.8828498  0.10656116]
 [0.41290142 0.55438049 0.0327181 ]]
[[0.00334643 0.48679637 1.5098572 ]
 [0.00495091 1.79910431 0.19594478]
 [0.04257378 1.26491594 0.69251031]
 ...
 [0.05333405 1.77194489 0.17472106]
 [0.01317076 1.73508126 0.25174798]
 [0.9105141  1.03773982 0.0517461 ]]
[[0.00413543 0.70239189 2.29347267]
 [0.00710838 2.71157138 0.28132023]
 [0.06315915 1.79153972 1.14530117]
 ...
 [0.08801278 2.63127844 0.28070879]
 [0.02039129 2.60311201 0.37649671]
 [1.59241842 1.34260683 0.06497478]]
[[0.00583199 0.9902181  3.0039499 ]
 [0.00981942 3.5922104  0.39797016]
 [0.08825576 2.34290371 1.56884056]
 ...
 [0.11388536 3.50048411 0.38563054]
 [0.0275291  3.41327711 0.55919382]
 [2.28552719 1.62771437 0.08675846]]
[[0.00702012 1.24860255 3.74437732]
 [0.01150886 4.47475282 0.51373831]
 [0.11626061 2.91004982 1.9736896 ]
 ...

In [254]:
y_pred

array([3, 2, 2, ..., 2, 2, 1], dtype=int64)

In [255]:
sub_csv = pd.read_csv("submission_format.csv")
sub_csv["damage_grade"] = y_pred
sub_csv.to_csv("submission.csv", index=False)