In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool, cv
import multiprocessing as mp
from sklearn.model_selection import train_test_split, KFold
from tqdm.auto import tqdm
import librosa

In [2]:
# Both CSVs are read without a header row, so the two columns are named
# explicitly right after each file is loaded.
train_data = pd.read_csv('/kaggle/input/tinkoff/train_gt.csv', header=None)
train_data.columns = ['id', 'label']

test_data = pd.read_csv('/kaggle/input/tinkoff/test-10.csv', header=None)
test_data.columns = ['id', 'label']

In [3]:
# Prefix every value of the `id` column with the directory of its split and
# store the result in a new `path` column.
for frame, audio_dir in (
    (train_data, '/kaggle/input/tinkoff/train-4/train/'),
    (test_data, '/kaggle/input/tinkoff/test-5/test/'),
):
    frame['path'] = audio_dir + frame['id']

In [32]:
def get_mlcc_feats(path, n_mfcc=256, hop_length=512):
    """Summarise one audio file as a fixed-length vector of MFCC statistics.

    The file is decoded with ``librosa.load`` using its default settings,
    MFCCs are computed on the decoded signal, and every coefficient is
    reduced along axis 1 to its mean and its standard deviation.

    Parameters
    ----------
    path : str or path-like
        Location of the audio file to load.
    n_mfcc : int, default 256
        Number of MFCC coefficients requested from librosa.
        NOTE(review): librosa derives MFCCs from a mel spectrogram whose
        default band count is believed to be 128, so requesting 256 may
        yield only 128 coefficients - confirm against the width of the
        resulting feature matrix.
    hop_length : int, default 512
        Hop length, in samples, forwarded to ``librosa.feature.mfcc``.

    Returns
    -------
    numpy.ndarray
        1-D array holding the per-coefficient means followed by the
        per-coefficient standard deviations.
    """
    signal, sample_rate = librosa.load(path)
    mfccs = librosa.feature.mfcc(
        y=signal,
        sr=sample_rate,
        n_mfcc=n_mfcc,
        hop_length=hop_length,
    )
    # Collapsing axis 1 gives one value per coefficient row regardless of
    # how long the clip is, so every file maps to a vector of equal length.
    return np.concatenate([mfccs.mean(axis=1), mfccs.std(axis=1)])

In [33]:
# Extract one feature vector per audio path (with a progress bar) and stack
# the vectors row-wise into a single matrix for each split.
train_feats = np.stack(list(map(get_mlcc_feats, tqdm(train_data['path']))))
test_feats = np.stack(list(map(get_mlcc_feats, tqdm(test_data['path']))))

  0%|          | 0/8803 [00:00<?, ?it/s]

  0%|          | 0/2870 [00:00<?, ?it/s]

In [34]:
# Hold out 20% of the labelled rows for evaluation; the split is shuffled
# with a fixed seed so it is the same on every run.
train_feat, eval_feat, train_label, eval_label = train_test_split(
    train_feats,
    train_data['label'],
    test_size=0.2,
    random_state=56,
    shuffle=True,
)

# Wrap each split in a CatBoost Pool. The test pool carries no labels.
train_pool = Pool(data=train_feat, label=train_label)
eval_pool = Pool(data=eval_feat, label=eval_label)
test_pool = Pool(data=test_feats, label=None)

In [37]:
# Keyword arguments later unpacked into CatBoostClassifier.
params = dict(
    iterations=1000,
    learning_rate=0.005,
    loss_function='Logloss',
    class_weights=[0.367375, 0.632625],
    max_depth=4,
    random_seed=56,
    eval_metric='F1',
)

In [38]:
# Build the classifier from `params` and train it on the training pool,
# scoring against the held-out pool and logging every 100 iterations.
# The fit call stays the last expression so the cell still shows the model.
cbm = CatBoostClassifier(**params)
cbm.fit(
    train_pool,
    eval_set=eval_pool,
    verbose=100,
)

0:	learn: 0.4412792	test: 0.4401581	best: 0.4401581 (0)	total: 18.8ms	remaining: 18.8s
100:	learn: 0.5829154	test: 0.5800773	best: 0.5822762 (98)	total: 1.92s	remaining: 17.1s
200:	learn: 0.5881596	test: 0.5813864	best: 0.5885688 (178)	total: 3.81s	remaining: 15.1s
300:	learn: 0.5938411	test: 0.5813333	best: 0.5885688 (178)	total: 5.68s	remaining: 13.2s
400:	learn: 0.6024689	test: 0.5842416	best: 0.5885688 (178)	total: 7.57s	remaining: 11.3s
500:	learn: 0.6083424	test: 0.5835678	best: 0.5885688 (178)	total: 9.42s	remaining: 9.39s
600:	learn: 0.6170961	test: 0.5815436	best: 0.5885688 (178)	total: 11.4s	remaining: 7.54s
700:	learn: 0.6238101	test: 0.5836920	best: 0.5885688 (178)	total: 13.5s	remaining: 5.78s
800:	learn: 0.6326315	test: 0.5813816	best: 0.5885688 (178)	total: 15.4s	remaining: 3.82s
900:	learn: 0.6388903	test: 0.5801901	best: 0.5885688 (178)	total: 17.2s	remaining: 1.89s
999:	learn: 0.6447443	test: 0.5804030	best: 0.5885688 (178)	total: 19.1s	remaining: 0us

bestTest = 0.58

<catboost.core.CatBoostClassifier at 0x7ab2fc483820>

In [29]:
# Tabulate per-feature importances of the fitted model (a table with
# "Feature Id" and "Importances" columns) and display it.
feature_importance = cbm.get_feature_importance(prettified=True)
feature_importance

Unnamed: 0,Feature Id,Importances
0,0,28.636214
1,102,9.936913
2,99,6.526650
3,35,3.954156
4,14,3.859519
...,...,...
123,113,0.000000
124,119,0.000000
125,121,0.000000
126,122,0.000000


In [30]:
# Predict on the test feature matrix and overwrite the `label` column
# that was read from the test CSV with the model's output.
test_predictions = cbm.predict(test_feats)
test_data['label'] = test_predictions

In [31]:
# Write the submission file: only the `id` and `label` columns, with no
# index column and no header row. `header=False` is the documented way to
# omit the header (the parameter accepts a bool or a list of names).
test_data[['id', 'label']].to_csv('simple_mccn_meanV2.csv', index=False, header=False)