In [2]:
import librosa

from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import random

from torch import nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier

import torch
import torchmetrics
import os

import pickle

In [3]:
import torch
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
device = device = torch.device("mps")

In [6]:
class Config:
    SR = 32000
    N_MFCC = 13
    # Dataset
    ROOT_FOLDER = './'
    # Training
    N_CLASSES = 2
    BATCH_SIZE = 96
    N_EPOCHS = 5
    LR = 3e-4
    # Others
    SEED = 42

CONFIG = Config()

In [7]:
df = pd.read_csv('./train.csv')
df.head()

Unnamed: 0,id,path,label
0,RUNQPNJF,./train/RUNQPNJF.ogg,real
1,JFAWUOGJ,./train/JFAWUOGJ.ogg,fake
2,RDKEKEVX,./train/RDKEKEVX.ogg,real
3,QYHJDOFK,./train/QYHJDOFK.ogg,real
4,RSPQNHAO,./train/RSPQNHAO.ogg,real


In [8]:
train, val, _, _ = train_test_split(df, df['label'], test_size=0.2, random_state=CONFIG.SEED)

In [9]:
def get_mfcc_feature(df, train_mode=True):
    features = []
    labels = []
    for _, row in tqdm(df.iterrows()):
        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(row['path'], sr=CONFIG.SR)
        # librosa패키지를 사용하여 mfcc 추출
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CONFIG.N_MFCC)
        mfcc = np.mean(mfcc.T, axis=0)
        features.append(mfcc)

        if train_mode:
            label = row['label']
            label_vector = 0 if label == 'fake' else 1
            #label_vector[0 if label == 'fake' else 1] = 1
            labels.append(label_vector)

    if train_mode:
        return features, labels
    return features

In [10]:
train_mfcc, train_labels = get_mfcc_feature(train, True)
val_mfcc, val_labels = get_mfcc_feature(val, True)

44350it [06:29, 113.88it/s]
11088it [01:29, 124.34it/s]


In [11]:
if not os.path.exists('./mfcc'):
    os.makedirs('./mfcc', exist_ok=True)

np.save('./mfcc/train_mfcc.npy', train_mfcc)
np.save('./mfcc/train_labels.npy', train_labels)
np.save('./mfcc/val_mfcc.npy', val_mfcc)
np.save('./mfcc/val_labels.npy', val_labels)

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

model = RandomForestClassifier(n_estimators=100, random_state=CONFIG.SEED)
kf = KFold(n_splits=5, random_state=CONFIG.SEED, shuffle=True)

print(model)
print(kf.get_n_splits(train_mfcc))

RandomForestClassifier(random_state=42)
5


In [29]:
print(pd.DataFrame(train_mfcc))

               0           1          2          3          4          5   \
0     -248.033279  150.792999 -35.178806  59.451466  10.164450   9.027971   
1     -323.560699  136.979355 -45.662029  23.387531  15.381805  -6.980533   
2     -311.442993  135.454391 -23.391869  22.338968 -10.303095 -23.955645   
3     -217.945770  168.685394 -24.134439  60.136875  12.705692  15.668672   
4     -246.821243  173.011795 -23.041840  52.638878   2.757993  11.924505   
...           ...         ...        ...        ...        ...        ...   
44345 -310.470673  217.360565 -19.847445  -7.339335  23.072023 -14.942649   
44346 -281.431183  127.880753 -34.482655  36.695858 -13.862648 -21.928667   
44347 -272.382874  140.753525  -8.920831  53.356968  11.156157  11.615736   
44348 -297.979004  194.519455 -27.022598  42.725582  -1.917439 -13.344779   
44349 -223.302826  168.091888  -9.389204  35.624546 -12.929810  -1.581608   

              6          7          8          9          10         11  \


In [31]:
import time
from sklearn.metrics import accuracy_score
start = time.time()
acc = []

for train_index, test_index in kf.split(train_mfcc):
    X_train, X_test = np.array(train_mfcc)[train_index], np.array(train_mfcc)[test_index]
    y_train, y_test = np.array(train_labels)[train_index], np.array(train_labels)[test_index]

    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    print('Acc : ', accuracy_score(y_test, pred))
    acc.append(accuracy_score(y_test, pred))

end = time.time()
timeTaken = (end - start)
print("Model trained in: " + str( round(timeTaken, 2) ) + " seconds.")


Acc :  0.9613303269447576
Acc :  0.9591882750845547
Acc :  0.9556933483652762
Acc :  0.9595264937993235
Acc :  0.9620067643742953
Model trained in: 36.29 seconds.


## Parameter tuning

In [32]:
from sklearn.model_selection import GridSearchCV

params = { 'n_estimators' : [10, 50,100],
           'max_depth' : [6, 8, 10, 12],
           'min_samples_leaf' : [8, 12, 18],
           'min_samples_split' : [8, 16, 20]
            }

# RandomForestClassifier 객체 생성 후 GridSearchCV 수행
rf_clf = RandomForestClassifier(random_state = CONFIG.SEED)
grid_cv = GridSearchCV(rf_clf, param_grid = params, cv = 5)
grid_cv.fit(train_mfcc, train_labels )

print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:  {'max_depth': 12, 'min_samples_leaf': 8, 'min_samples_split': 20, 'n_estimators': 100}
최고 예측 정확도: 0.9430


In [33]:
from sklearn.model_selection import GridSearchCV

params = { 'n_estimators' : [75,100, 125, 150],
           'max_depth' : [11,12,13,14],
           'min_samples_leaf' : [6,7,8,9],
           'min_samples_split' : [18,20,22,24]
            }

# RandomForestClassifier 객체 생성 후 GridSearchCV 수행
rf_clf = RandomForestClassifier(random_state = CONFIG.SEED)
grid_cv = GridSearchCV(rf_clf, param_grid = params, cv = 5, n_jobs=-1)
grid_cv.fit(train_mfcc, train_labels )

print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:  {'max_depth': 14, 'min_samples_leaf': 8, 'min_samples_split': 18, 'n_estimators': 150}
최고 예측 정확도: 0.9489


In [None]:
from sklearn.model_selection import GridSearchCV

params = { 'n_estimators' : [150, 180],
           'max_depth' : [14,15],
           'min_samples_leaf' : [7,8],
           'min_samples_split' : [17,18]
            }

# RandomForestClassifier 객체 생성 후 GridSearchCV 수행
rf_clf = RandomForestClassifier(random_state = CONFIG.SEED)
grid_cv = GridSearchCV(rf_clf, param_grid = params, cv = 5, verbose=2)
grid_cv.fit(train_mfcc, train_labels )

print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))
'''
최적 하이퍼 파라미터:  {'max_depth': 15, 'min_samples_leaf': 7, 'min_samples_split': 17, 'n_estimators': 150}
최고 예측 정확도: 0.9511
'''

In [42]:
from sklearn.model_selection import GridSearchCV

params = { 'n_estimators' : [148,150,152],
           'max_depth' : [15,16],
           'min_samples_leaf' : [6,7,8],
           'min_samples_split' : [16,17,18]
            }

# RandomForestClassifier 객체 생성 후 GridSearchCV 수행
rf_clf = RandomForestClassifier(random_state = CONFIG.SEED)
grid_cv = GridSearchCV(rf_clf, param_grid = params, cv = 5, verbose=2)
grid_cv.fit(train_mfcc, train_labels )

print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))
'''
최적 하이퍼 파라미터:  {'max_depth': 16, 'min_samples_leaf': 6, 'min_samples_split': 16, 'n_estimators': 150}
최고 예측 정확도: 0.9526
'''

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV] END max_depth=15, min_samples_leaf=6, min_samples_split=16, n_estimators=148; total time=   9.7s
[CV] END max_depth=15, min_samples_leaf=6, min_samples_split=16, n_estimators=148; total time=   9.7s
[CV] END max_depth=15, min_samples_leaf=6, min_samples_split=16, n_estimators=148; total time=   9.7s
[CV] END max_depth=15, min_samples_leaf=6, min_samples_split=16, n_estimators=148; total time=   9.9s
[CV] END max_depth=15, min_samples_leaf=6, min_samples_split=16, n_estimators=148; total time=   9.8s
[CV] END max_depth=15, min_samples_leaf=6, min_samples_split=16, n_estimators=150; total time=  10.0s
[CV] END max_depth=15, min_samples_leaf=6, min_samples_split=16, n_estimators=150; total time=   9.9s
[CV] END max_depth=15, min_samples_leaf=6, min_samples_split=16, n_estimators=150; total time=   9.9s
[CV] END max_depth=15, min_samples_leaf=6, min_samples_split=16, n_estimators=150; total time=   9.9s
[CV] END max_depth=1

"\n최적 하이퍼 파라미터:  {'max_depth': 15, 'min_samples_leaf': 7, 'min_samples_split': 17, 'n_estimators': 150}\n최고 예측 정확도: 0.9511\n"

In [59]:
len(train_mfcc)

44350

In [4]:
'''
최적 하이퍼 파라미터:  {'max_depth': 16, 'min_samples_leaf': 6, 'min_samples_split': 16, 'n_estimators': 150}
최고 예측 정확도: 0.9526
'''

rf_clf1 = RandomForestClassifier(n_estimators = 150, 
                                max_depth = 16,
                                min_samples_leaf = 6,
                                min_samples_split = 16,
                                random_state =CONFIG.SEED,
                                n_jobs = -1)
rf_clf1.fit(train_mfcc, train_labels)
pred = rf_clf1.predict(X_test)
print('예측 정확도: {:.4f}'.format(accuracy_score(y_test,pred)))# 이상함 수정해야함

NameError: name 'CONFIG' is not defined

In [None]:
#pred = rf_clf1.predict(val_mfcc)
test = pd.read_csv('./test.csv')
test_mfcc = get_mfcc_feature(test, False)
print(test_mfcc)
'''
best_rf_clf = grid_cv.best_estimator_

# 각 클래스의 확률을 예측
probs = best_rf_clf.predict_proba(val_mfcc)

print((probs))
#print('예측 정확도: {:.4f}'.format(accuracy_score(val_labels,probs)))
'''

In [63]:
best_rf_clf = grid_cv.best_estimator_

probs = best_rf_clf.predict_proba(test_mfcc)

print(probs)

[[0.55894033 0.44105967]
 [0.56468315 0.43531685]
 [0.71519872 0.28480128]
 ...
 [0.35262822 0.64737178]
 [0.0727997  0.9272003 ]
 [0.68620256 0.31379744]]


In [64]:
submit = pd.read_csv('./sample_submission.csv')
submit.iloc[:, 1:] = probs
submit.head()

Unnamed: 0,id,fake,real
0,TEST_00000,0.55894,0.44106
1,TEST_00001,0.564683,0.435317
2,TEST_00002,0.715199,0.284801
3,TEST_00003,0.048644,0.951356
4,TEST_00004,0.100908,0.899092


In [65]:
submit.to_csv('./RandomForest_submit.csv', index=False)