In [1]:
import pandas as pd
import numpy as np
import os
import gc
import math
import random
import shutil
from tqdm import tqdm
from glob import glob
import librosa
import warnings
from scipy.stats import skew

import xgboost as xgb
import tensorflow as tf

from sklearn.utils import shuffle
from sklearn.model_selection import StratifiedKFold

import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")
path = './data/'

# 데이터 로드

In [2]:
# africa_train_paths = glob(path + "train/africa/*.wav")
# australia_train_paths = glob(path + "train/australia/*.wav")
# canada_train_paths = glob(path + "train/canada/*.wav")
# england_train_paths = glob(path + "train/england/*.wav")
# hongkong_train_paths = glob(path + "train/hongkong/*.wav")
# us_train_paths = glob(path + "train/us/*.wav")

# path_list = [africa_train_paths, australia_train_paths, canada_train_paths,
#              england_train_paths, hongkong_train_paths, us_train_paths]

In [3]:
# glob로 test data의 path를 불러올때 순서대로 로드되지 않을 경우를 주의해야 합니다.
# test_ 데이터 프레임을 만들어서 나중에 sample_submission과 id를 기준으로 merge시킬 준비를 합니다.

def get_id(data):
    return np.int(data.split("\\")[1].split(".")[0])

test_ = pd.DataFrame(index = range(0, 6100), columns = ["path", "id"])
test_["path"] = glob(path + "test/*.wav")
test_["id"] = test_["path"].apply(lambda x : get_id(x))

test_.head()

Unnamed: 0,path,id
0,./data/test\1.wav,1
1,./data/test\10.wav,10
2,./data/test\100.wav,100
3,./data/test\1000.wav,1000
4,./data/test\1001.wav,1001


In [4]:
# def load_data(paths):

#     result = []
#     for path in tqdm(paths):
#         # sr = 16000이 의미하는 것은 1초당 16000개의 데이터를 샘플링 한다는 것입니다.
#         data, sr = librosa.load(path, sr = 16000)
#         result.append(data)
#     result = np.array(result) 
#     # 메모리가 부족할 때는 데이터 타입을 변경해 주세요 ex) np.array(data, dtype = np.float32)

#     return result

In [5]:
# # train 데이터를 로드하기 위해서는 많은 시간이 소모 됩니다.
# # 따라서 추출된 정보를 npy파일로 저장하여 필요 할 때마다 불러올 수 있게 준비합니다.

# os.mkdir("./npy_data")

# africa_train_data = load_data(africa_train_paths)
# np.save("./npy_data/africa_npy", africa_train_data)

# australia_train_data = load_data(australia_train_paths)
# np.save("./npy_data/australia_npy", australia_train_data)

# canada_train_data = load_data(canada_train_paths)
# np.save("./npy_data/canada_npy", canada_train_data)

# england_train_data = load_data(england_train_paths)
# np.save("./npy_data/england_npy", england_train_data)

# hongkong_train_data = load_data(hongkong_train_paths)
# np.save("./npy_data/hongkong_npy", hongkong_train_data)

# us_train_data = load_data(us_train_paths)
# np.save("./npy_data/us_npy", us_train_data)

# test_data = load_data(test_["path"])
# np.save("./npy_data/test_npy", test_data)

In [6]:
def get_mini(data):

    mini = 9999999
    for i in data:
        if len(i) < mini:
            mini = len(i)

    return mini


def set_length(data, d_mini):

    result = []
    for i in data:
        result.append(i[:d_mini])
    result = np.array(result)

    return result


def get_feature(data, sr = 16000, n_fft = 2048, win_length = 2048, hop_length = 1024, n_mels = 32):
    mel = []
    for i in tqdm(data):
        mel_ = librosa.feature.melspectrogram(i, sr = sr, n_fft = n_fft, win_length = win_length, hop_length = hop_length, n_mels = n_mels)
        mel.append(mel_)
    mel = np.array(mel)
    mel = librosa.amplitude_to_db(mel, ref = np.max)

    mel_mean = mel.mean()
    mel_std = mel.std()
    mel = (mel - mel_mean) / mel_std

    return mel

In [7]:
def adding_white_noise(data, noise_rate=0.1):
    wn = np.random.randn(len(data))
    data_wn = data + noise_rate*wn
    
    return data_wn


def shifting_sound(data, roll_rate, mini=501):
    data_roll = np.roll(data, int(mini * roll_rate))
    
    return data_roll

In [8]:
# xgboost

def build_xgb(split_num, train, target, test):
    
    params = {
                'max_depth': 8,
                'eval_metric':'mlogloss',
                'objective':'multi:softprob',
                'num_class':6,
                'tree_method':'gpu_hist'
                }
    
    test_pred = np.zeros((test.shape[0], 6))
    n_iter = 1
    
    skf = StratifiedKFold(n_splits=split_num, shuffle=True, random_state=2021)
    for train_idx, val_idx in skf.split(train, target):

        print(f'\n -------------------------------- {n_iter} 번째 fold -------------------------------- \n')
        
        X = train[train_idx]
        y = target[train_idx]
        valid_x = train[val_idx]
        valid_y = target[val_idx]

        d_train = xgb.DMatrix(X, y)
        d_valid = xgb.DMatrix(valid_x, valid_y)
        
        d_test = xgb.DMatrix(test)
        
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        
        model = xgb.train(params, d_train, 500, watchlist, 
                        early_stopping_rounds=10,
                        verbose_eval=20)

        test_pred += model.predict(d_test)/split_num
        n_iter += 1
            
    return test_pred

In [9]:
def train_xgb(train_data_list, test_data, train_y):
    
    # data preprocessing & augmentation & TTA
    
    train_x = np.concatenate(train_data_list, axis= 0)
    test_x = np.array(test_data)
    
    train_mini = get_mini(train_x)
    test_mini = get_mini(test_x)
    mini = np.min([train_mini, test_mini])
    
    roll_rate_list = np.arange(0, 200, 20)
    pred_list = []
    tta_pred_list = []
    
    for roll_rate in roll_rate_list:
        
        print(f'\n\n -------------------------------- data preprocessing - roll point: {roll_rate} -------------------------------- \n\n')
        
        train_roll = shifting_sound(train_x, roll_rate=roll_rate)
        test_roll = shifting_sound(test_x, roll_rate=roll_rate)
        
        train_roll = set_length(train_roll, mini)
        test_roll = set_length(test_roll, mini)
        
        print("make MFCC")
        train_mfcc = get_feature(data = train_roll)
        test_mfcc = get_feature(data = test_roll)
        
        train_mfcc = np.concatenate((np.max(train_mfcc, axis=2),
                                    np.min(train_mfcc, axis=2),
                                    np.mean(train_mfcc, axis=2),
                                    np.var(train_mfcc, axis=2),
                                    skew(train_mfcc, axis=2),
                                    np.std(train_mfcc, axis=2)), axis=1)


        test_mfcc = np.concatenate((np.max(test_mfcc, axis=2),
                                   np.min(test_mfcc, axis=2),
                                   np.mean(test_mfcc, axis=2),
                                   np.var(test_mfcc, axis=2),
                                   skew(test_mfcc, axis=2),
                                   np.std(test_mfcc, axis=2)), axis=1)

        

        print("train shape :", train_mfcc.shape)
        print("test shape :", test_mfcc.shape)
        
        print(f'\n\n -------------------------------- train for roll point {roll_rate} -------------------------------- \n\n')
        
        pred = build_xgb(5, train_mfcc, train_y, test_mfcc)
        pred_list.append(pred)
        
        gc.collect()
        
    return pred_list

In [10]:
%%time

africa_train_data = np.load("./npy_data/africa_npy.npy", allow_pickle = True)
australia_train_data = np.load("./npy_data/australia_npy.npy", allow_pickle = True)
canada_train_data = np.load("./npy_data/canada_npy.npy", allow_pickle = True)
england_train_data = np.load("./npy_data/england_npy.npy", allow_pickle = True)
hongkong_train_data = np.load("./npy_data/hongkong_npy.npy", allow_pickle = True)
us_train_data = np.load("./npy_data/us_npy.npy", allow_pickle = True)

test_data = np.load("./npy_data/test_npy.npy", allow_pickle = True)

train_data_list = [africa_train_data, australia_train_data, canada_train_data, england_train_data, hongkong_train_data, us_train_data]

Wall time: 13.1 s


In [11]:
train_y = np.concatenate((np.zeros(len(africa_train_data), dtype = np.int),
                         np.ones(len(australia_train_data), dtype = np.int),
                         np.ones(len(canada_train_data), dtype = np.int) * 2,
                         np.ones(len(england_train_data), dtype = np.int) * 3,
                         np.ones(len(hongkong_train_data), dtype = np.int) * 4,
                         np.ones(len(us_train_data), dtype = np.int) * 5), axis = 0)

In [12]:
pred_list = train_xgb(train_data_list, test_data, train_y)



 -------------------------------- data preprocessing - roll point: 0 -------------------------------- 




  0%|                                                                                | 4/25520 [00:00<12:00, 35.40it/s]

make MFCC


100%|███████████████████████████████████████████████████████████████████████████| 25520/25520 [03:30<00:00, 121.03it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 6100/6100 [01:04<00:00, 94.24it/s]


train shape : (25520, 192)
test shape : (6100, 192)


 -------------------------------- train for roll point 0 -------------------------------- 



 -------------------------------- 1 번째 fold -------------------------------- 

[0]	train-mlogloss:1.51512	valid-mlogloss:1.56665
[20]	train-mlogloss:0.55719	valid-mlogloss:0.99399
[40]	train-mlogloss:0.36362	valid-mlogloss:0.96475
[60]	train-mlogloss:0.25688	valid-mlogloss:0.96083
[73]	train-mlogloss:0.20794	valid-mlogloss:0.96365

 -------------------------------- 2 번째 fold -------------------------------- 

[0]	train-mlogloss:1.51234	valid-mlogloss:1.56872
[20]	train-mlogloss:0.54255	valid-mlogloss:1.01438
[40]	train-mlogloss:0.35290	valid-mlogloss:0.99264
[50]	train-mlogloss:0.29663	valid-mlogloss:0.99430

 -------------------------------- 3 번째 fold -------------------------------- 

[0]	train-mlogloss:1.50727	valid-mlogloss:1.56723
[20]	train-mlogloss:0.54201	valid-mlogloss:0.99767
[40]	train-mlogloss:0.35113	valid-mlogloss:0.97595
[60]

  0%|                                                                               | 10/25520 [00:00<04:22, 97.09it/s]

make MFCC


100%|███████████████████████████████████████████████████████████████████████████| 25520/25520 [03:59<00:00, 106.69it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 6100/6100 [00:58<00:00, 104.07it/s]


train shape : (25520, 192)
test shape : (6100, 192)


 -------------------------------- train for roll point 20 -------------------------------- 



 -------------------------------- 1 번째 fold -------------------------------- 

[0]	train-mlogloss:1.52854	valid-mlogloss:1.58126
[20]	train-mlogloss:0.62625	valid-mlogloss:1.08892
[40]	train-mlogloss:0.41007	valid-mlogloss:1.06349
[60]	train-mlogloss:0.28627	valid-mlogloss:1.06209

 -------------------------------- 2 번째 fold -------------------------------- 

[0]	train-mlogloss:1.52988	valid-mlogloss:1.58369
[20]	train-mlogloss:0.62294	valid-mlogloss:1.08234
[40]	train-mlogloss:0.40536	valid-mlogloss:1.05760
[60]	train-mlogloss:0.28024	valid-mlogloss:1.04884
[65]	train-mlogloss:0.25929	valid-mlogloss:1.05088

 -------------------------------- 3 번째 fold -------------------------------- 

[0]	train-mlogloss:1.52918	valid-mlogloss:1.57899
[20]	train-mlogloss:0.63815	valid-mlogloss:1.08712
[40]	train-mlogloss:0.42673	valid-mlogloss:1.05832
[59

  0%|                                                                              | 11/25520 [00:00<04:03, 104.76it/s]

make MFCC


100%|███████████████████████████████████████████████████████████████████████████| 25520/25520 [04:03<00:00, 104.88it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 6100/6100 [00:58<00:00, 104.62it/s]


train shape : (25520, 192)
test shape : (6100, 192)


 -------------------------------- train for roll point 40 -------------------------------- 



 -------------------------------- 1 번째 fold -------------------------------- 

[0]	train-mlogloss:1.53106	valid-mlogloss:1.58014
[20]	train-mlogloss:0.61555	valid-mlogloss:1.04572
[40]	train-mlogloss:0.41892	valid-mlogloss:1.02377
[60]	train-mlogloss:0.29287	valid-mlogloss:1.01804
[69]	train-mlogloss:0.25083	valid-mlogloss:1.01900

 -------------------------------- 2 번째 fold -------------------------------- 

[0]	train-mlogloss:1.53083	valid-mlogloss:1.58324
[20]	train-mlogloss:0.60631	valid-mlogloss:1.05976
[40]	train-mlogloss:0.40897	valid-mlogloss:1.05047
[60]	train-mlogloss:0.28523	valid-mlogloss:1.04569
[67]	train-mlogloss:0.25209	valid-mlogloss:1.04681

 -------------------------------- 3 번째 fold -------------------------------- 

[0]	train-mlogloss:1.52973	valid-mlogloss:1.58090
[20]	train-mlogloss:0.62067	valid-mlogloss:1.06342
[40

  0%|                                                                               | 10/25520 [00:00<04:21, 97.52it/s]

make MFCC


100%|███████████████████████████████████████████████████████████████████████████| 25520/25520 [04:06<00:00, 103.48it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 6100/6100 [00:57<00:00, 105.91it/s]


train shape : (25520, 192)
test shape : (6100, 192)


 -------------------------------- train for roll point 60 -------------------------------- 



 -------------------------------- 1 번째 fold -------------------------------- 

[0]	train-mlogloss:1.52781	valid-mlogloss:1.58200
[20]	train-mlogloss:0.60559	valid-mlogloss:1.08384
[40]	train-mlogloss:0.40503	valid-mlogloss:1.06538
[60]	train-mlogloss:0.27501	valid-mlogloss:1.06593

 -------------------------------- 2 번째 fold -------------------------------- 

[0]	train-mlogloss:1.52774	valid-mlogloss:1.58085
[20]	train-mlogloss:0.59799	valid-mlogloss:1.07803
[40]	train-mlogloss:0.40657	valid-mlogloss:1.05550
[60]	train-mlogloss:0.28782	valid-mlogloss:1.05370
[67]	train-mlogloss:0.25557	valid-mlogloss:1.05331

 -------------------------------- 3 번째 fold -------------------------------- 

[0]	train-mlogloss:1.53381	valid-mlogloss:1.58529
[20]	train-mlogloss:0.61713	valid-mlogloss:1.06499
[40]	train-mlogloss:0.41382	valid-mlogloss:1.03959
[60

  0%|                                                                              | 11/25520 [00:00<04:02, 105.25it/s]

make MFCC


100%|███████████████████████████████████████████████████████████████████████████| 25520/25520 [04:02<00:00, 105.44it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 6100/6100 [00:57<00:00, 106.88it/s]


train shape : (25520, 192)
test shape : (6100, 192)


 -------------------------------- train for roll point 80 -------------------------------- 



 -------------------------------- 1 번째 fold -------------------------------- 

[0]	train-mlogloss:1.52394	valid-mlogloss:1.57854
[20]	train-mlogloss:0.56123	valid-mlogloss:1.03554
[40]	train-mlogloss:0.37514	valid-mlogloss:1.01267
[60]	train-mlogloss:0.26304	valid-mlogloss:1.00734
[68]	train-mlogloss:0.23330	valid-mlogloss:1.00948

 -------------------------------- 2 번째 fold -------------------------------- 

[0]	train-mlogloss:1.52247	valid-mlogloss:1.58051
[20]	train-mlogloss:0.56751	valid-mlogloss:1.03858
[40]	train-mlogloss:0.37470	valid-mlogloss:1.01279
[60]	train-mlogloss:0.25996	valid-mlogloss:1.00757
[66]	train-mlogloss:0.23641	valid-mlogloss:1.01029

 -------------------------------- 3 번째 fold -------------------------------- 

[0]	train-mlogloss:1.52243	valid-mlogloss:1.57805
[20]	train-mlogloss:0.57838	valid-mlogloss:1.03839
[40

  0%|                                                                              | 11/25520 [00:00<04:01, 105.77it/s]

make MFCC


100%|███████████████████████████████████████████████████████████████████████████| 25520/25520 [04:02<00:00, 105.19it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 6100/6100 [00:56<00:00, 108.16it/s]


train shape : (25520, 192)
test shape : (6100, 192)


 -------------------------------- train for roll point 100 -------------------------------- 



 -------------------------------- 1 번째 fold -------------------------------- 

[0]	train-mlogloss:1.52909	valid-mlogloss:1.58440
[20]	train-mlogloss:0.58529	valid-mlogloss:1.05476
[40]	train-mlogloss:0.38253	valid-mlogloss:1.03254
[60]	train-mlogloss:0.26095	valid-mlogloss:1.02846
[61]	train-mlogloss:0.25702	valid-mlogloss:1.02897

 -------------------------------- 2 번째 fold -------------------------------- 

[0]	train-mlogloss:1.52190	valid-mlogloss:1.58036
[20]	train-mlogloss:0.58511	valid-mlogloss:1.05740
[40]	train-mlogloss:0.39582	valid-mlogloss:1.03855
[55]	train-mlogloss:0.29930	valid-mlogloss:1.03681

 -------------------------------- 3 번째 fold -------------------------------- 

[0]	train-mlogloss:1.52826	valid-mlogloss:1.58133
[20]	train-mlogloss:0.59367	valid-mlogloss:1.05032
[40]	train-mlogloss:0.38939	valid-mlogloss:1.03242
[6

  0%|                                                                              | 11/25520 [00:00<03:56, 107.84it/s]

make MFCC


100%|███████████████████████████████████████████████████████████████████████████| 25520/25520 [04:01<00:00, 105.50it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 6100/6100 [00:57<00:00, 106.43it/s]


train shape : (25520, 192)
test shape : (6100, 192)


 -------------------------------- train for roll point 120 -------------------------------- 



 -------------------------------- 1 번째 fold -------------------------------- 

[0]	train-mlogloss:1.53686	valid-mlogloss:1.59157
[20]	train-mlogloss:0.64531	valid-mlogloss:1.11835
[40]	train-mlogloss:0.42780	valid-mlogloss:1.10012
[60]	train-mlogloss:0.29674	valid-mlogloss:1.10315

 -------------------------------- 2 번째 fold -------------------------------- 

[0]	train-mlogloss:1.53685	valid-mlogloss:1.58473
[20]	train-mlogloss:0.66188	valid-mlogloss:1.09992
[40]	train-mlogloss:0.44991	valid-mlogloss:1.07516
[59]	train-mlogloss:0.31184	valid-mlogloss:1.06964

 -------------------------------- 3 번째 fold -------------------------------- 

[0]	train-mlogloss:1.53882	valid-mlogloss:1.59048
[20]	train-mlogloss:0.64007	valid-mlogloss:1.10366
[40]	train-mlogloss:0.42125	valid-mlogloss:1.07556
[60]	train-mlogloss:0.29587	valid-mlogloss:1.07199
[6

  0%|                                                                              | 11/25520 [00:00<04:05, 103.77it/s]

make MFCC


100%|███████████████████████████████████████████████████████████████████████████| 25520/25520 [04:06<00:00, 103.69it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 6100/6100 [00:56<00:00, 107.68it/s]


train shape : (25520, 192)
test shape : (6100, 192)


 -------------------------------- train for roll point 140 -------------------------------- 



 -------------------------------- 1 번째 fold -------------------------------- 

[0]	train-mlogloss:1.52864	valid-mlogloss:1.57858
[20]	train-mlogloss:0.60560	valid-mlogloss:1.06377
[40]	train-mlogloss:0.41704	valid-mlogloss:1.04842
[60]	train-mlogloss:0.29127	valid-mlogloss:1.04656
[62]	train-mlogloss:0.28140	valid-mlogloss:1.04787

 -------------------------------- 2 번째 fold -------------------------------- 

[0]	train-mlogloss:1.52974	valid-mlogloss:1.58310
[20]	train-mlogloss:0.60714	valid-mlogloss:1.05617
[40]	train-mlogloss:0.40561	valid-mlogloss:1.04063
[60]	train-mlogloss:0.28383	valid-mlogloss:1.03786
[61]	train-mlogloss:0.27959	valid-mlogloss:1.03906

 -------------------------------- 3 번째 fold -------------------------------- 

[0]	train-mlogloss:1.52603	valid-mlogloss:1.57982
[20]	train-mlogloss:0.60236	valid-mlogloss:1.06587
[4

  0%|                                                                              | 11/25520 [00:00<04:03, 104.76it/s]

make MFCC


100%|███████████████████████████████████████████████████████████████████████████| 25520/25520 [04:05<00:00, 104.06it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 6100/6100 [00:56<00:00, 107.94it/s]


train shape : (25520, 192)
test shape : (6100, 192)


 -------------------------------- train for roll point 160 -------------------------------- 



 -------------------------------- 1 번째 fold -------------------------------- 

[0]	train-mlogloss:1.53257	valid-mlogloss:1.58519
[20]	train-mlogloss:0.62425	valid-mlogloss:1.08472
[40]	train-mlogloss:0.42051	valid-mlogloss:1.06421
[60]	train-mlogloss:0.29212	valid-mlogloss:1.05971
[73]	train-mlogloss:0.23967	valid-mlogloss:1.06297

 -------------------------------- 2 번째 fold -------------------------------- 

[0]	train-mlogloss:1.52976	valid-mlogloss:1.58179
[20]	train-mlogloss:0.61078	valid-mlogloss:1.06778
[40]	train-mlogloss:0.40015	valid-mlogloss:1.04506
[60]	train-mlogloss:0.27520	valid-mlogloss:1.04277
[63]	train-mlogloss:0.26111	valid-mlogloss:1.04591

 -------------------------------- 3 번째 fold -------------------------------- 

[0]	train-mlogloss:1.52972	valid-mlogloss:1.58267
[20]	train-mlogloss:0.61376	valid-mlogloss:1.07795
[4

  0%|                                                                              | 11/25520 [00:00<04:02, 105.21it/s]

make MFCC


100%|███████████████████████████████████████████████████████████████████████████| 25520/25520 [04:02<00:00, 105.42it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 6100/6100 [00:56<00:00, 107.56it/s]


train shape : (25520, 192)
test shape : (6100, 192)


 -------------------------------- train for roll point 180 -------------------------------- 



 -------------------------------- 1 번째 fold -------------------------------- 

[0]	train-mlogloss:1.51697	valid-mlogloss:1.57177
[20]	train-mlogloss:0.56020	valid-mlogloss:1.01905
[40]	train-mlogloss:0.36610	valid-mlogloss:0.99335
[49]	train-mlogloss:0.31476	valid-mlogloss:0.99489

 -------------------------------- 2 번째 fold -------------------------------- 

[0]	train-mlogloss:1.52470	valid-mlogloss:1.57378
[20]	train-mlogloss:0.54893	valid-mlogloss:1.01816
[40]	train-mlogloss:0.35380	valid-mlogloss:0.98694
[60]	train-mlogloss:0.25313	valid-mlogloss:0.98722
[65]	train-mlogloss:0.23197	valid-mlogloss:0.98795

 -------------------------------- 3 번째 fold -------------------------------- 

[0]	train-mlogloss:1.52492	valid-mlogloss:1.58193
[20]	train-mlogloss:0.56331	valid-mlogloss:1.03537
[40]	train-mlogloss:0.37429	valid-mlogloss:1.00884
[5

In [13]:
np.save("./npy_data/pred_list", pred_list)
pred_list = np.load("./npy_data/pred_list.npy", allow_pickle = True)

In [14]:
def cov_type(data):
    return np.int(data)

# 처음에 살펴본 것처럼 glob로 test data의 path는 sample_submission의 id와 같이 1,2,3,4,5.....으로 정렬 되어있지 않습니다.
# 만들어둔 test_ 데이터프레임을 이용하여 sample_submission과 predict값의 id를 맞춰줍니다.

sample_submission = pd.read_csv(path + "sample_submission.csv")

result = pd.concat([test_, pd.DataFrame(np.mean(pred_list, axis = 0))], axis = 1).iloc[:, 1:]
result["id"] = result["id"].apply(lambda x : cov_type(x))

result = pd.merge(sample_submission["id"], result)
result.columns = sample_submission.columns

In [15]:
result.to_csv("submission_feat.csv", index = False)
result

Unnamed: 0,id,africa,australia,canada,england,hongkong,us
0,1,0.064139,0.047025,0.025030,0.438492,0.010508,0.414806
1,2,0.086843,0.020491,0.013309,0.383695,0.047297,0.448365
2,3,0.066066,0.034864,0.010287,0.451211,0.017426,0.420147
3,4,0.062990,0.009682,0.011738,0.425696,0.020099,0.469794
4,5,0.052997,0.007163,0.011789,0.477477,0.023594,0.426980
...,...,...,...,...,...,...,...
6095,6096,0.116724,0.017868,0.040320,0.319887,0.013645,0.491556
6096,6097,0.047700,0.059884,0.009217,0.371258,0.010543,0.501399
6097,6098,0.145700,0.016355,0.012443,0.431675,0.024375,0.369452
6098,6099,0.061985,0.013048,0.022722,0.554238,0.017532,0.330475


In [16]:
# https://dacon.io/competitions/official/235689/codeshare/2399?page=2&dtype=recent