# Monthly DACON Machine Fault Diagnosis
## -  Anomaly Detection

## Library Import

In [2]:
# One-time dependency install (kept commented out). Prefer the kernel-aware magic
# with a pinned version, e.g.:  %pip install -q librosa==0.9.2

Collecting librosa
  Downloading librosa-0.9.2-py3-none-any.whl (214 kB)
Collecting soundfile>=0.10.2
  Downloading soundfile-0.11.0-py2.py3-none-win_amd64.whl (1.0 MB)
Collecting resampy>=0.2.2
  Downloading resampy-0.4.2-py3-none-any.whl (3.1 MB)
Collecting pooch>=1.0
  Downloading pooch-1.6.0-py3-none-any.whl (56 kB)
Collecting audioread>=2.1.9
  Downloading audioread-3.0.0.tar.gz (377 kB)
Building wheels for collected packages: audioread
  Building wheel for audioread (setup.py): started
  Building wheel for audioread (setup.py): finished with status 'done'
  Created wheel for audioread: filename=audioread-3.0.0-py3-none-any.whl size=23706 sha256=1cfca924a3e5b22fd27cdeccda3afac08ae58f0500800251bcf29ef5a6d363eb
  Stored in directory: c:\users\user\appdata\local\pip\cache\wheels\e4\76\a4\cfb55573167a1f5bde7d7a348e95e509c64b2c3e8f921932c3
Successfully built audioread
Installing collected packages: soundfile, resampy, pooch, audioread, librosa
Successfully installed audioread-3.0.0 lib

In [3]:
import random
import pandas as pd
import numpy as np
import os
import librosa
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from tqdm.auto import tqdm

import warnings
warnings.filterwarnings(action='ignore') 

In [4]:
# Notebook-wide settings, looked up by key in the cells below.
CFG = dict(
    SR=16000,     # passed as `sr` to librosa.load
    N_MFCC=128,   # number of MFCC vectors to extract (<=128)
    SEED=41,      # handed to seed_everything and used as the model's random_state
)

In [5]:
def seed_everything(seed):
    """Seed the global RNGs used in this notebook.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``; its
            ``str()`` form is also stored in ``os.environ['PYTHONHASHSEED']``.
    """
    # Standard-library RNG first.
    random.seed(seed)
    # Record the seed in the environment as text.
    hash_seed = str(seed)
    os.environ['PYTHONHASHSEED'] = hash_seed
    # NumPy's legacy global RNG.
    np.random.seed(seed)

seed_everything(CFG['SEED']) # Fix the seed

In [6]:
# Load the train / test metadata tables from the working directory.
train_df = pd.read_csv('./train.csv') # all normal samples (original author's note)
test_df = pd.read_csv('./test.csv')

In [34]:
test_df

Unnamed: 0,SAMPLE_ID,SAMPLE_PATH,FAN_TYPE
0,TEST_0000,./test/TEST_0000.wav,2
1,TEST_0001,./test/TEST_0001.wav,2
2,TEST_0002,./test/TEST_0002.wav,0
3,TEST_0003,./test/TEST_0003.wav,0
4,TEST_0004,./test/TEST_0004.wav,0
...,...,...,...
1509,TEST_1509,./test/TEST_1509.wav,0
1510,TEST_1510,./test/TEST_1510.wav,2
1511,TEST_1511,./test/TEST_1511.wav,0
1512,TEST_1512,./test/TEST_1512.wav,0


In [47]:
def get_mfcc_feature(df):
    """Build one mean-MFCC feature vector per audio file listed in ``df``.

    Args:
        df: Table indexable by ``'SAMPLE_PATH'``, yielding audio file paths.

    Returns:
        A list with one entry per path, in iteration order. Each entry is a
        list holding the mean of every row of the array returned by
        ``librosa.feature.mfcc`` (requested with ``n_mfcc=CFG['N_MFCC']``).
    """
    def mean_mfcc(wav_path):
        # Load the wav file with librosa at the configured sampling rate.
        signal, rate = librosa.load(wav_path, sr=CFG['SR'])
        # Extract the MFCCs with librosa.
        coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=CFG['N_MFCC'])
        # The mean of each extracted MFCC row is used as a feature.
        return [np.mean(row) for row in coeffs]

    return [mean_mfcc(wav_path) for wav_path in tqdm(df['SAMPLE_PATH'])]

In [48]:
# Compute the mean-MFCC feature vectors for every train and test file.
train_features = get_mfcc_feature(train_df)
test_features = get_mfcc_feature(test_df)

  0%|          | 0/1279 [00:00<?, ?it/s]

  0%|          | 0/1514 [00:00<?, ?it/s]

In [49]:
# Isolation forest fitted on the training features only (no labels are passed).
model = IsolationForest(
    n_estimators=200,
    max_samples=256,
    contamination='auto',
    random_state=CFG['SEED'],  # same seed as the rest of the notebook
    verbose=0,
)
model.fit(train_features)

In [50]:
# Score the test features with the fitted forest. Despite the variable name, this
# holds the output of decision_function, not a predict_proba-style probability.
test_pred_proba = model.decision_function(test_features)

In [51]:
# Cut-off applied to the decision_function scores.
threshold = 0.05
# Label 1 where the score is strictly below the cut-off, otherwise 0.
pred = (test_pred_proba < threshold).astype(int)

In [52]:
pred.mean()

0.6453104359313078

In [53]:
# Load the sample submission file and display it.
submit = pd.read_csv('./sample_submission.csv')
submit

Unnamed: 0,SAMPLE_ID,LABEL
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,0
3,TEST_0003,0
4,TEST_0004,0
...,...,...
1509,TEST_1509,0
1510,TEST_1510,0
1511,TEST_1511,0
1512,TEST_1512,0


In [54]:
pd.DataFrame(pred)

Unnamed: 0,0
0,1
1,0
2,1
3,1
4,1
...,...
1509,1
1510,1
1511,0
1512,0


In [55]:
# Fill the LABEL column with the thresholded 0/1 predictions.
submit['LABEL'] = pred

In [56]:
# Write the submission without the index column. The f-prefix is required so the
# threshold value becomes part of the filename (e.g. './baseline-0.05.csv');
# without it the file is literally named 'baseline-{threshold}.csv'.
submit.to_csv(f'./baseline-{threshold}.csv', index=False)