In [23]:
import warnings
# Suppress every warning for the rest of the session. This also hides
# deprecation and convergence notices emitted by the libraries used below.
warnings.filterwarnings("ignore")

In [24]:
# ha-ha classics

import numpy as np 

import pandas as pd

import matplotlib.pyplot as plt

import sklearn

import os 

# sklearn

from sklearn.metrics import f1_score

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import precision_recall_curve, roc_curve

# 2nd category

from xgboost import XGBClassifier

from lightgbm import LGBMClassifier as lgbm

# 3rd category

import torch

import torch.nn as nn
import torch.optim as optim

# misc
from tqdm import tqdm



#### Data Load


In [63]:
# Load the train-values, train-labels and test-values CSV files, in that order.
train, labels, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train_values (1).csv',
        'data/train_labels (1).csv',
        'data/test_values (1).csv',
    )
)

#### Preprocessing

In [64]:
# Column groups, picked purely by position in the raw training frame.
b_ids = train.columns[0]          # single label: the first column
geo_levels = train.columns[1:4]   # positions 1, 2, 3
numeric = train.columns[4:8]      # positions 4..7
# Positions 8..14 plus the isolated column at position 26.
categorical = [*train.columns[8:15], train.columns[26]]
# NOTE(review): this slice starts at 15 and therefore also contains position 26,
# which is already listed in `categorical` — confirm whether that is intended.
flags = train.columns[15:]

In [65]:
def fe(dataset):
    '''
    Add engineered features to ``dataset`` in place and return it.

    Parameters
    ----------
    dataset : pd.DataFrame
        Train or test frame. It must contain the columns
        ``height_percentage``, ``area_percentage`` and ``age``.

    Returns
    -------
    pd.DataFrame
        The very same object that was passed in, now carrying two extra
        columns: ``height_area_ratio`` (height_percentage divided by
        area_percentage) and ``is_old`` (True where ``age`` > 10).
    '''
    # Rows whose area_percentage is 0 produce inf/NaN here; pandas does not raise.
    dataset['height_area_ratio'] = dataset['height_percentage'] / dataset['area_percentage']
    dataset['is_old'] = dataset['age'] > 10
    # Returning the frame makes `train = fe(train)` safe; previously that
    # pattern would have rebound the name to None.
    return dataset

#### K-means clustering for geolevels

In [66]:
# Names of the one-hot cluster indicator columns (one per cluster id 0..16).
enc_cols = ['cluster_' + str(i) for i in range(17)]
print(enc_cols)

def append_clusters_to_dataset(main_set, clustering):
    """
    Append one-hot cluster indicator columns to ``main_set``.

    Parameters
    ----------
    main_set : pd.DataFrame
        Frame to extend; it is not modified.
    clustering : array-like of int
        Cluster id of every row of ``main_set``, in row order. Ids must lie
        in ``range(len(enc_cols))``.

    Returns
    -------
    pd.DataFrame
        A new frame: ``main_set`` followed by one float column per entry of
        ``enc_cols`` (1.0 where the row belongs to that cluster, else 0.0).

    Raises
    ------
    ValueError
        If a cluster id falls outside ``range(len(enc_cols))`` or the number
        of ids differs from the number of rows.
    """
    cluster_ids = np.asarray(clustering).ravel()
    n_clusters = len(enc_cols)
    if cluster_ids.size and (cluster_ids.min() < 0 or cluster_ids.max() >= n_clusters):
        raise ValueError(
            'cluster ids must lie in [0, {}), got range [{}, {}]'.format(
                n_clusters, cluster_ids.min(), cluster_ids.max()))

    # Compare against the fixed id range instead of fitting an encoder on the
    # labels: the output always has exactly len(enc_cols) columns, even when
    # some cluster id does not occur in this particular set of rows.
    indicators = (cluster_ids[:, None] == np.arange(n_clusters)).astype(np.float64)

    # Reuse main_set's index so the column-wise concat pairs rows by position
    # rather than by index label (a non-default index would otherwise
    # misalign the two frames and introduce NaN rows).
    add_on = pd.DataFrame(indicators, columns=enc_cols, index=main_set.index)

    X = pd.concat([main_set, add_on], axis=1)
    return X

['cluster_0', 'cluster_1', 'cluster_2', 'cluster_3', 'cluster_4', 'cluster_5', 'cluster_6', 'cluster_7', 'cluster_8', 'cluster_9', 'cluster_10', 'cluster_11', 'cluster_12', 'cluster_13', 'cluster_14', 'cluster_15', 'cluster_16']


In [67]:
from sklearn.cluster import KMeans

def create_clustering_data(dataset):
    """
    Return an independent copy of columns 1, 2 and 3 of a 2-D array.

    dataset - np.ndarray; it is indexed as ``dataset[:, 1:4]`` and left untouched.
    """
    geo_columns = slice(1, 4)
    return dataset[:, geo_columns].copy()

def perform_clustering(subdata, col_weights=(100, 10, 1), n_clusters=17, random_state=None):
    """
    Weight the leading columns of ``subdata`` and cluster the rows with KMeans.

    Parameters
    ----------
    subdata : array-like, 2-D
        One row per sample. The caller's array is NOT modified.
    col_weights : sequence of numbers
        ``col_weights[i]`` multiplies column ``i``; columns beyond
        ``len(col_weights)`` are left unweighted.
    n_clusters : int
        Number of clusters to fit (17 by default, as before).
    random_state : int or None
        Forwarded to ``KMeans``; pass an integer for repeatable cluster ids.

    Returns
    -------
    np.ndarray
        ``labels_`` of the fitted KMeans: one cluster id per row.

    Notes
    -----
    Cluster ids are arbitrary per fit: labels returned by two separate calls
    (e.g. one on train and one on test) are not comparable with each other.
    """
    # Work on a float copy. The original scaled the caller's array in place,
    # which silently changed the input and fails for integer arrays combined
    # with non-integer weights.
    weighted = np.array(subdata, dtype=np.float64)
    weights = np.asarray(col_weights, dtype=np.float64)
    weighted[:, :len(weights)] *= weights

    print('...clustering in process...')
    clustering = KMeans(n_clusters=n_clusters, random_state=random_state).fit(weighted)
    print('...clustering done.')

    return clustering.labels_


In [68]:
# Take columns 1..3 of the training frame (the positions named `geo_levels`)
# and assign every training row to a cluster.
sub_train = create_clustering_data(np.array(train))
clustering_train = perform_clustering(sub_train)

...clustering in process...
...clustering done.


In [69]:
# Append the one-hot cluster columns to the training frame.
# `train` is rebound here, so re-running this cell appends the columns again.
train = append_clusters_to_dataset(train, clustering_train)

#### Processing check

In [70]:
# Same column extraction and clustering as for train, applied to the test frame.
# NOTE(review): perform_clustering fits a fresh KMeans on each call, so the
# cluster ids assigned here come from a different fit than the ones assigned
# to `train` and need not correspond to them — confirm this is intended.
sub_test = create_clustering_data(np.array(test))
clustering_test = perform_clustering(sub_test)

...clustering in process...
...clustering done.


In [71]:
# Append the one-hot cluster columns to the test frame and show the new shape.
test = append_clusters_to_dataset(test, clustering_test)
print(test.shape)

(86868, 56)


#### Standard scaling

In [72]:
# Standardise the numeric columns. The scaler is fitted on the training frame
# only; the test frame is transformed with the statistics learned from train
# so that both frames share the scale the model is trained on. Re-fitting on
# test would shift and rescale the test features relative to the training ones.
enc = StandardScaler()

train[numeric] = enc.fit_transform(train[numeric])
test[numeric] = enc.transform(test[numeric])

In [73]:
def ohe(data, categorical, encoder=None):
    """
    Replace the ``categorical`` columns of ``data`` with one-hot columns.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to encode; it is not modified.
    categorical : list of str
        Names of the columns to encode.
    encoder : OneHotEncoder, optional
        Encoder to use. If it is already fitted (has ``categories_``) it is
        only applied, which lets train and test share one set of output
        columns. If omitted, a fresh encoder is fitted on ``data`` itself,
        as before. The column names below are derived from ``categories_``,
        so the encoder is expected not to drop any level.

    Returns
    -------
    pd.DataFrame
        ``data`` without the ``categorical`` columns, followed by one
        ``<column>_<level>`` indicator column per category level.
    """
    categorical = list(categorical)

    if encoder is None:
        encoder = OneHotEncoder()
    if not hasattr(encoder, 'categories_'):
        encoder.fit(data[categorical])

    one_hot = encoder.transform(data[categorical])
    # The default encoder returns a sparse matrix; a dense-output encoder
    # already yields an ndarray.
    if hasattr(one_hot, 'toarray'):
        one_hot = one_hot.toarray()

    # format() also copes with non-string category levels, where plain
    # string concatenation would raise TypeError.
    new_column_names = [
        '{}_{}'.format(col, level)
        for col, levels in zip(categorical, encoder.categories_)
        for level in levels
    ]

    # Share data's index so the column-wise concat pairs rows by position
    # even when the frame does not carry a default RangeIndex.
    categorical_dataframe = pd.DataFrame(one_hot, columns=new_column_names, index=data.index)
    new_dataset = pd.concat([data.drop(categorical, axis=1), categorical_dataframe], axis=1)
    return new_dataset

# Fit a single encoder on the training frame and reuse it for the test frame,
# so both end up with identical one-hot columns in identical order. Levels
# that only occur in test are encoded as all-zero rows instead of raising.
cat_encoder = OneHotEncoder(handle_unknown='ignore').fit(train[categorical])
train = ohe(train, categorical, encoder=cat_encoder)
test = ohe(test, categorical, encoder=cat_encoder)

In [74]:
def drop_geo(dataset):
    """
    Remove, in place, every column of ``dataset`` whose name contains 'geo'.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame to modify.

    Returns
    -------
    pd.DataFrame
        The same object that was passed in (returned for convenience).
    """
    # Non-string column labels cannot contain 'geo'; skip them instead of
    # letting the substring test raise TypeError.
    geo_cols = [col for col in dataset.columns if isinstance(col, str) and 'geo' in col]
    if geo_cols:
        # One drop for all matches instead of one in-place drop per column.
        dataset.drop(columns=geo_cols, inplace=True)
    return dataset
#drop_geo(train)
#drop_geo(test)

In [75]:
# Keep the test ids for the submission file, then remove the id column from
# both frames so that it is not used as a feature.
building_ids = test['building_id'].copy()
train.drop(columns='building_id', inplace=True)
test.drop(columns='building_id', inplace=True)

In [76]:
# From here on `train` and `test` are plain ndarrays, not DataFrames.
train, test = np.array(train), np.array(test)

#### Train-test-split

In [77]:
# Flat target vector.
# NOTE(review): labels are paired with the rows of `train` by position only —
# presumably both files list the buildings in the same order; verify.
y = np.array(labels['damage_grade']).reshape(-1)
np.unique(y)

array([1, 2, 3], dtype=int64)

In [78]:
# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train, y, test_size=0.2, random_state=1003
)

#### Model

In [41]:
from xgboost import XGBClassifier

In [58]:
# Boosted-tree classifier configuration.
xgb = XGBClassifier(
    booster='gbtree',
    n_estimators=100,
    num_parallel_tree=4,
    max_depth=12,
    learning_rate=0.5,
    subsample=0.9,
    n_jobs=4,
    verbosity=1,
)

# Train with early stopping on validation mlogloss (patience: 5 rounds),
# logging every second round.
# NOTE(review): the hold-out split used for early stopping here is the same
# one scored afterwards, so that score is likely optimistic — confirm.
# NOTE(review): depending on the installed xgboost version,
# `early_stopping_rounds` / `eval_metric` may have to be passed to the
# constructor instead of fit() — verify against the pinned version.
xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="mlogloss",
    early_stopping_rounds=5,
    verbose=2,
)

[0]	validation_0-mlogloss:0.85956
Will train until validation_0-mlogloss hasn't improved in 5 rounds.
[2]	validation_0-mlogloss:0.70237
[4]	validation_0-mlogloss:0.64999
[6]	validation_0-mlogloss:0.62732
[8]	validation_0-mlogloss:0.61442
[10]	validation_0-mlogloss:0.60741
[12]	validation_0-mlogloss:0.60068
[14]	validation_0-mlogloss:0.59671
[16]	validation_0-mlogloss:0.59400
[18]	validation_0-mlogloss:0.59093
[20]	validation_0-mlogloss:0.58808
[22]	validation_0-mlogloss:0.58634
[24]	validation_0-mlogloss:0.58464
[26]	validation_0-mlogloss:0.58233
[28]	validation_0-mlogloss:0.58114
[30]	validation_0-mlogloss:0.57956
[32]	validation_0-mlogloss:0.57824
[34]	validation_0-mlogloss:0.57725
[36]	validation_0-mlogloss:0.57669
[38]	validation_0-mlogloss:0.57565
[40]	validation_0-mlogloss:0.57510
[42]	validation_0-mlogloss:0.57469
[44]	validation_0-mlogloss:0.57421
[46]	validation_0-mlogloss:0.57381
[48]	validation_0-mlogloss:0.57390
[50]	validation_0-mlogloss:0.57366
[52]	validation_0-mlogloss:

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.5, max_delta_step=0, max_depth=12,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=4,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=0.9,
              tree_method='exact', validate_parameters=1, verbosity=1)

In [59]:
# Predict labels for the hold-out split produced by train_test_split.
y_pred = xgb.predict(X_test)

In [60]:
from sklearn.metrics import f1_score

# f1_score expects (y_true, y_pred): ground truth first, predictions second.
# Micro-averaged F1 is symmetric, so the printed number is unchanged, but any
# other averaging mode would have been computed against the wrong reference.
print('f1: {}'.format(f1_score(y_test, y_pred, average='micro')))

f1: 0.74522745150707


#### Outputting

In [79]:
# Predict labels for the full test matrix. `test` was already converted to an
# ndarray earlier, so np.array() only makes a copy here.
y_all_pred = xgb.predict(np.array(test))

In [80]:
# Display the shape of the prediction vector as a quick sanity check.
y_all_pred.shape

(86868,)

In [81]:
# Re-read the raw test file to recover the building ids; this rebinds
# `building_ids`, which was previously taken from `test` before its id column
# was dropped.
test_r = pd.read_csv('data/test_values (1).csv')
building_ids = test_r['building_id']

In [87]:
def create_submission(y_all_pred, building_ids, name):
    """
    Write a submission CSV to ``subs/<name>.csv``.

    Parameters
    ----------
    y_all_pred : array-like
        Predicted damage grade for every building, in row order.
    building_ids : array-like
        Building id for every prediction, in the same order.
    name : str
        File name without the ``.csv`` extension.

    Raises
    ------
    ValueError
        If the number of ids differs from the number of predictions.

    The file has ``building_id`` as its index column and a single
    ``damage_grade`` column. The shape of the written frame is printed.
    """
    ids = np.asarray(building_ids).ravel()
    grades = np.asarray(y_all_pred).ravel()
    if len(ids) != len(grades):
        raise ValueError(
            'got {} building ids but {} predictions'.format(len(ids), len(grades)))

    # Build the frame from plain arrays so ids and predictions are paired by
    # position, independent of any pandas index either input may carry.
    sub = pd.DataFrame({'damage_grade': grades},
                       index=pd.Index(ids, name='building_id'))
    print(sub.shape)

    # to_csv does not create missing directories.
    os.makedirs('subs', exist_ok=True)
    sub.to_csv('subs/' + name + '.csv')

In [88]:
# Write the predictions for the test set to subs/sub_xgb_2.csv.
name = 'sub_xgb_2'
create_submission(y_all_pred,building_ids,name)

(86868, 1)
