In [23]:
import librosa
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:

class Config:
    SR = 32000
    N_MFCC = 13
    N_CLASSES = 2
    SEED = 42

CONFIG = Config()

In [12]:

def get_mfcc_feature(df, train_mode=True):
    features = []
    labels = []
    for _, row in tqdm(df.iterrows()):
        y, sr = librosa.load(row['path'], sr=CONFIG.SR)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CONFIG.N_MFCC)
        mfcc = np.mean(mfcc.T, axis=0)
        features.append(mfcc)

        if train_mode:
            label = row['label']
            labels.append(label)

    if train_mode:
        return np.array(features), np.array(labels)
    return np.array(features)


In [16]:
# 데이터 로드
df = pd.read_csv('./train.csv')
train, val = train_test_split(df, test_size=0.2, random_state=CONFIG.SEED)
train_mfcc, train_labels = get_mfcc_feature(train, True)
val_mfcc, val_labels = get_mfcc_feature(val, True)

44350it [09:10, 80.59it/s] 
11088it [02:15, 81.84it/s]


In [14]:
train_mfcc = np.load('./mfcc/train_mfcc.npy')
train_labels = np.load('./mfcc/train_labels.npy')
val_labels = np.load('./mfcc/val_labels.npy')
val_mfcc = np.load('./mfcc/val_mfcc.npy')

In [18]:
if not os.path.exists('./mfcc_multilabel'):
    os.makedirs('./mfcc_multilabel', exist_ok=True)

np.save('./mfcc_multilabel/train_mfcc.npy', train_mfcc)
np.save('./mfcc_multilabel/train_labels.npy', train_labels)
np.save('./mfcc_multilabel/val_mfcc.npy', val_mfcc)
np.save('./mfcc_multilabel/val_labels.npy', val_labels)

In [17]:
train_labels_multilabel = np.array([[1, 1] if lbl == 'both' else [1, 0] if lbl == 'real' else [0, 1] if lbl == 'fake' else [0, 0] for lbl in train_labels])
val_labels_multilabel = np.array([[1, 1] if lbl == 'both' else [1, 0] if lbl == 'real' else [0, 1] if lbl == 'fake' else [0, 0] for lbl in val_labels])

print(train_labels_multilabel)
print(val_labels_multilabel)

[[0 1]
 [0 1]
 [1 0]
 ...
 [0 1]
 [1 0]
 [0 1]]
[[1 0]
 [0 1]
 [0 1]
 ...
 [1 0]
 [0 1]
 [1 0]]


In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV

params = {
    'estimator__n_estimators': [10, 50, 90],
    'estimator__max_depth': [15, 16],
    'estimator__min_samples_leaf': [6, 7, 8],
    'estimator__min_samples_split': [16, 17, 18]
}

# 랜덤 포레스트 모델을 멀티 아웃풋 클래시파이어로 감쌉니다
rf_clf = RandomForestClassifier(random_state=CONFIG.SEED)
multi_target_rf = MultiOutputClassifier(rf_clf, n_jobs=-1)

# 하이퍼파라미터 튜닝
grid_cv = GridSearchCV(multi_target_rf, param_grid=params, cv=5, verbose=2)
grid_cv.fit(train_mfcc, train_labels_multilabel)

print('최적 하이퍼파라미터: ', grid_cv.best_params_)
print('최고 CV 점수: {:.4f}'.format(grid_cv.best_score_))

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV] END estimator__max_depth=15, estimator__min_samples_leaf=6, estimator__min_samples_split=16, estimator__n_estimators=10; total time=   2.7s
[CV] END estimator__max_depth=15, estimator__min_samples_leaf=6, estimator__min_samples_split=16, estimator__n_estimators=10; total time=   1.4s
[CV] END estimator__max_depth=15, estimator__min_samples_leaf=6, estimator__min_samples_split=16, estimator__n_estimators=10; total time=   1.0s
[CV] END estimator__max_depth=15, estimator__min_samples_leaf=6, estimator__min_samples_split=16, estimator__n_estimators=10; total time=   1.2s
[CV] END estimator__max_depth=15, estimator__min_samples_leaf=6, estimator__min_samples_split=16, estimator__n_estimators=10; total time=   1.1s
[CV] END estimator__max_depth=15, estimator__min_samples_leaf=6, estimator__min_samples_split=16, estimator__n_estimators=50; total time=   4.0s
[CV] END estimator__max_depth=15, estimator__min_samples_leaf=6, est

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV

params = {
    'estimator__n_estimators': [90, 130],
    'estimator__max_depth': [16,17],
    'estimator__min_samples_leaf': [5,6],
    'estimator__min_samples_split': [15,16]
}

# 랜덤 포레스트 모델을 멀티 아웃풋 클래시파이어로 감쌉니다
rf_clf = RandomForestClassifier(random_state=CONFIG.SEED)
multi_target_rf = MultiOutputClassifier(rf_clf, n_jobs=-1)

# 하이퍼파라미터 튜닝
grid_cv = GridSearchCV(multi_target_rf, param_grid=params, cv=5, verbose=2)
grid_cv.fit(train_mfcc, train_labels_multilabel)

print('최적 하이퍼파라미터: ', grid_cv.best_params_)
print('최고 CV 점수: {:.4f}'.format(grid_cv.best_score_))

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END estimator__max_depth=16, estimator__min_samples_leaf=5, estimator__min_samples_split=15, estimator__n_estimators=90; total time=   7.5s
[CV] END estimator__max_depth=16, estimator__min_samples_leaf=5, estimator__min_samples_split=15, estimator__n_estimators=90; total time=   7.0s
[CV] END estimator__max_depth=16, estimator__min_samples_leaf=5, estimator__min_samples_split=15, estimator__n_estimators=90; total time=   7.1s
[CV] END estimator__max_depth=16, estimator__min_samples_leaf=5, estimator__min_samples_split=15, estimator__n_estimators=90; total time=   7.1s
[CV] END estimator__max_depth=16, estimator__min_samples_leaf=5, estimator__min_samples_split=15, estimator__n_estimators=90; total time=   8.2s
[CV] END estimator__max_depth=16, estimator__min_samples_leaf=5, estimator__min_samples_split=15, estimator__n_estimators=130; total time=  10.6s
[CV] END estimator__max_depth=16, estimator__min_samples_leaf=5, est

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV

params = {
    'estimator__n_estimators': [130,150],
    'estimator__max_depth': [17, 18],
    'estimator__min_samples_leaf': [4,5,6],
    'estimator__min_samples_split': [16,17]
}

# 랜덤 포레스트 모델을 멀티 아웃풋 클래시파이어로 감쌉니다
rf_clf = RandomForestClassifier(random_state=CONFIG.SEED)
multi_target_rf = MultiOutputClassifier(rf_clf, n_jobs=-1)

# 하이퍼파라미터 튜닝
grid_cv = GridSearchCV(multi_target_rf, param_grid=params, cv=5, verbose=2)
grid_cv.fit(train_mfcc, train_labels_multilabel)

print('최적 하이퍼파라미터: ', grid_cv.best_params_)
print('최고 CV 점수: {:.4f}'.format(grid_cv.best_score_))

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END estimator__max_depth=17, estimator__min_samples_leaf=4, estimator__min_samples_split=16, estimator__n_estimators=130; total time=  12.9s
[CV] END estimator__max_depth=17, estimator__min_samples_leaf=4, estimator__min_samples_split=16, estimator__n_estimators=130; total time=  10.8s
[CV] END estimator__max_depth=17, estimator__min_samples_leaf=4, estimator__min_samples_split=16, estimator__n_estimators=130; total time=  10.8s
[CV] END estimator__max_depth=17, estimator__min_samples_leaf=4, estimator__min_samples_split=16, estimator__n_estimators=130; total time=  10.8s
[CV] END estimator__max_depth=17, estimator__min_samples_leaf=4, estimator__min_samples_split=16, estimator__n_estimators=130; total time=  10.3s
[CV] END estimator__max_depth=17, estimator__min_samples_leaf=4, estimator__min_samples_split=16, estimator__n_estimators=150; total time=  13.0s
[CV] END estimator__max_depth=17, estimator__min_samples_leaf=

In [34]:
test = pd.read_csv('./test.csv')
test_mfcc = get_mfcc_feature(test, False)

50000it [07:13, 115.39it/s]


In [35]:
best_rf_clf = grid_cv.best_estimator_

# 각 클래스의 확률을 예측
probs = best_rf_clf.predict_proba(test_mfcc)

# 예측 클래스 (0 또는 1) 계산
pred = best_rf_clf.predict(test_mfcc)


In [28]:
results_df = pd.DataFrame({
    'prob_fake_0': probs[0][:, 0],
    'prob_fake_1': probs[0][:, 1],
    'prob_real_0': probs[1][:, 0],
    'prob_real_1': probs[1][:, 1],
    'pred_fake': pred[:, 0],
    'pred_real': pred[:, 1],
    'actual_fake': val_labels_multilabel[:, 0],
    'actual_real': val_labels_multilabel[:, 1]
})
results_df

Unnamed: 0,prob_fake_0,prob_fake_1,prob_real_0,prob_real_1,pred_fake,pred_real,actual_fake,actual_real
0,0.061198,0.938802,0.938802,0.061198,1,0,1,0
1,0.987392,0.012608,0.012608,0.987392,0,1,0,1
2,0.484532,0.515468,0.515468,0.484532,1,0,0,1
3,0.041398,0.958602,0.958602,0.041398,1,0,1,0
4,0.569209,0.430791,0.430791,0.569209,0,1,1,0
...,...,...,...,...,...,...,...,...
11083,0.858594,0.141406,0.141406,0.858594,0,1,0,1
11084,0.952692,0.047308,0.047308,0.952692,0,1,0,1
11085,0.324467,0.675533,0.675533,0.324467,1,0,1,0
11086,0.982907,0.017093,0.017093,0.982907,0,1,0,1


In [42]:
probs[0]

array([[0.57625626, 0.42374374],
       [0.54465857, 0.45534143],
       [0.66238039, 0.33761961],
       ...,
       [0.34047623, 0.65952377],
       [0.12328127, 0.87671873],
       [0.71907992, 0.28092008]])

In [43]:
submit = pd.read_csv('./sample_submission.csv')
submit.iloc[:, 1:] = np.hstack((probs[0][:, 1].reshape(-1, 1), probs[1][:, 1].reshape(-1, 1)))
submit.head()

Unnamed: 0,id,fake,real
0,TEST_00000,0.423744,0.576256
1,TEST_00001,0.455341,0.544659
2,TEST_00002,0.33762,0.66238
3,TEST_00003,0.929272,0.070728
4,TEST_00004,0.878313,0.121687


In [44]:
submit.to_csv('./RandomForest_MultiLabel_submit.csv', index=False)