In [None]:
import pyforest
from glob import glob

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization, AveragePooling1D, LSTM, Bidirectional
from keras.utils import np_utils, to_categorical
from tensorflow.keras.callbacks import LearningRateScheduler, ReduceLROnPlateau, EarlyStopping, CSVLogger, ModelCheckpoint
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import RMSprop

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, make_scorer, roc_auc_score
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier
import xgboost as xgb

from imblearn.combine import SMOTETomek
from sklearn.datasets import make_classification

import os
from keras.utils.vis_utils import plot_model

# librosa is a Python library for audio and music analysis; it is used below to load the audio files and extract features from them.
import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt

# to play the audio files and show the audio files
from IPython.display import Audio
import IPython.display as ipd

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the label table; later cells read its `file_id`, `split` and `abnormal` columns.
label = pd.read_csv('label.csv')

def get_sound_file_path(file_id, folder_path='sounds', file_extension='.wav'):
    """Build the path of the audio file that belongs to ``file_id``.

    Args:
        file_id: Identifier of the recording; converted with ``str`` to form
            the file name.
        folder_path: Directory that holds the audio files.
        file_extension: Extension (including the leading dot) appended to
            ``file_id``.

    Returns:
        ``folder_path`` joined with ``<file_id><file_extension>`` using the
        path separator of the current operating system.
    """
    file_name = str(file_id) + file_extension
    return os.path.join(folder_path, file_name)

# Add a column holding the path of each recording.
label['file_path'] = label['file_id'].apply(get_sound_file_path)

# Normalise Windows separators to forward slashes. `regex=False` makes the
# backslash a literal character regardless of the pandas version's default
# (a lone backslash is not a valid regular expression).
label['file_path'] = label['file_path'].str.replace('\\', '/', regex=False)

In [None]:
def create_waveplot(data, sr):
    """Draw the time-domain waveform of an audio signal and show it.

    Args:
        data: Audio samples, forwarded to ``librosa.display.waveshow``.
        sr: Sampling rate of ``data``.
    """
    _, axis = plt.subplots(figsize=(6, 3))
    axis.set_title('Waveplot for audio with emotion', size=15)
    librosa.display.waveshow(data, sr=sr, ax=axis)
    plt.show()

def create_spectrogram(data, sr):
    """Plot the magnitude spectrogram (in dB) of an audio signal and show it.

    Args:
        data: Audio samples, forwarded to ``librosa.stft``.
        sr: Sampling rate of ``data``; used to scale the plot axes.
    """
    # The short-time Fourier transform is complex-valued; its magnitude is
    # converted to decibels for display.
    stft_matrix = librosa.stft(data)
    magnitude_db = librosa.amplitude_to_db(abs(stft_matrix))

    plt.figure(figsize=(6, 3))
    plt.title('Spectrogram for audio with emotion', size=15)
    # Linear frequency axis; pass y_axis='log' for a logarithmic one.
    mesh = librosa.display.specshow(magnitude_db, sr=sr, x_axis='time', y_axis='hz')
    plt.ylabel("Frequency")
    # Tie the colorbar to the spectrogram explicitly instead of relying on the
    # "current image" of the pyplot state machine.
    plt.colorbar(mesh)
    # Render immediately, consistent with create_waveplot and create_spectrum.
    plt.show()

def create_spectrum(data, sr):
    """Plot the one-sided magnitude spectrum of an audio signal and show it.

    Args:
        data: Audio samples (expected to be a 1-D array).
        sr: Sampling rate of ``data``; determines the frequency axis.
    """
    n_samples = len(data)
    magnitude = np.abs(np.fft.fft(data))  # fast Fourier transform
    # fftfreq yields the exact bin frequencies k * sr / n. A linspace from 0 to
    # sr that includes its endpoint would space the bins by sr / (n - 1).
    frequencies = np.fft.fftfreq(n_samples, d=1.0 / sr)

    # Keep only the first half of the bins (the non-negative frequencies).
    half = n_samples // 2

    plt.figure(figsize=(6, 3))
    plt.plot(frequencies[:half], magnitude[:half])
    plt.title('Spectrum for audio with emotion', size=15)
    plt.xlabel("Frequency")
    plt.ylabel("Magnitude")
    plt.show()

In [None]:
# Pick the 11th recording of the validation split and inspect it visually and audibly.
path = label.loc[label['split'] == 'valid', 'file_path'].iloc[10]
data, sampling_rate = librosa.load(path)
for render in (create_waveplot, create_spectrogram, create_spectrum):
    render(data, sampling_rate)
Audio(path)

In [None]:
# Same inspection for the recording at position 358 of the validation split.
path = label.loc[label['split'] == 'valid', 'file_path'].iloc[358]
data, sampling_rate = librosa.load(path)
for render in (create_waveplot, create_spectrogram, create_spectrum):
    render(data, sampling_rate)
Audio(path)

In [None]:
# Rows marked 'train' or 'valid' are pooled for model fitting; 'test' rows are kept apart.
label_train = label[label['split'].isin(['train', 'valid'])]
label_test = label[label['split'].eq('test')]

In [None]:
# Class balance of the pooled train/valid rows (count per `abnormal` value).
label_train['abnormal'].value_counts()

In [None]:
# Bar chart of how many pooled train/valid recordings fall in each `abnormal` class.
count_fig, count_ax = plt.subplots()
sns.countplot(data=label_train, x="abnormal", ax=count_ax)
count_ax.set_title('Count of Anomalies in Manufacturing Plants', size=16)
count_ax.set_ylabel('Count', size=12)
count_ax.set_xlabel('Anomalies', size=12)
sns.despine(ax=count_ax, top=True, right=True, left=False, bottom=False)
plt.show()

In [None]:
def noise(data):
    """Return ``data`` with zero-mean Gaussian noise added.

    The noise amplitude is a random fraction (at most 4%) of the maximum
    sample value. The input array is not modified.

    Args:
        data: NumPy array of audio samples, of any shape.

    Returns:
        A new array with the same shape as ``data``.
    """
    noise_amp = 0.04 * np.random.uniform() * np.amax(data)
    # Draw one noise value per sample; using the full shape (rather than only
    # the first dimension) keeps this valid for multi-dimensional input.
    return data + noise_amp * np.random.normal(size=data.shape)

def stretch(data, rate=0.70):
    """Time-stretch an audio signal with ``librosa.effects.time_stretch``.

    Args:
        data: Audio samples.
        rate: Stretch factor forwarded to librosa (default 0.70).

    Returns:
        The time-stretched signal returned by librosa.
    """
    # `rate` must be passed by keyword: it is keyword-only in librosa >= 0.10.
    return librosa.effects.time_stretch(data, rate=rate)

def shift(data):
    """Circularly roll ``data`` by a random number of samples.

    The offset is drawn uniformly from [-5, 5), scaled by 1000 and truncated
    to an integer, then applied with ``np.roll``.
    """
    offset = int(np.random.uniform(low=-5, high=5) * 1000)
    rolled = np.roll(data, offset)
    return rolled

def pitch(data, sampling_rate, pitch_factor=1):
    """Pitch-shift an audio signal with ``librosa.effects.pitch_shift``.

    Args:
        data: Audio samples.
        sampling_rate: Sampling rate of ``data``.
        pitch_factor: Number of steps to shift, forwarded as ``n_steps``.

    Returns:
        The pitch-shifted signal returned by librosa.
    """
    # `sr` and `n_steps` must be passed by keyword: they are keyword-only in
    # librosa >= 0.10.
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)

def higher_speed(data, speed_factor = 1.25):
    """Speed up an audio signal with ``librosa.effects.time_stretch``.

    Args:
        data: Audio samples.
        speed_factor: Stretch rate forwarded to librosa (default 1.25).

    Returns:
        The time-stretched signal returned by librosa.
    """
    # `rate` must be passed by keyword: it is keyword-only in librosa >= 0.10.
    return librosa.effects.time_stretch(data, rate=speed_factor)

In [None]:
# Extract features with MFCC
def extract_features(data, sr=22050, n_mfcc=58):
    """Summarise an audio signal as the per-coefficient mean of its MFCCs.

    Args:
        data: Audio samples.
        sr: Sampling rate passed to ``librosa.feature.mfcc`` (default 22050).
        n_mfcc: Number of MFCC coefficients to compute (default 58).

    Returns:
        The MFCC matrix returned by librosa, transposed and averaged over its
        first axis (per librosa's documentation this yields one mean value
        per coefficient).
    """
    mfccs = librosa.feature.mfcc(y=data, sr=sr, n_mfcc=n_mfcc)
    return np.mean(mfccs.T, axis=0)

In [None]:
def get_features(path):
    """Load one audio file and compute features for it and two augmented copies.

    The file is read with ``librosa.load`` (9.5 s starting at 0.5 s,
    ``kaiser_fast`` resampling). Features are extracted, in this order, from
    the loaded signal, its ``noise`` variant and its ``shift`` variant.

    Args:
        path: Path of the audio file.

    Returns:
        A NumPy array built from the three ``extract_features`` results, in
        the order listed above.
    """
    signal, _ = librosa.load(path, duration=9.5, offset=0.5, res_type='kaiser_fast')
    variants = (signal, noise(signal), shift(signal))
    return np.array([extract_features(variant) for variant in variants])


## Test

In [None]:
# Compute the feature arrays for every row of a DataFrame.
def augment_data(df, times):
    """Return ``times`` stacked copies of ``df`` with a ``features`` column added.

    Args:
        df: DataFrame with a ``file_path`` column; it is not modified.
        times: Number of passes. Each pass calls ``get_features`` once per row.

    Returns:
        The concatenation of all passes with a fresh integer index.

    Raises:
        ValueError: Raised by ``pd.concat`` when ``times`` is 0.
    """
    augmented_dfs = []

    for _ in range(times):
        # Work on a copy of the DataFrame that was passed in, not on a
        # notebook global, so the function honours its argument.
        augmented_df = df.copy()
        augmented_df['features'] = augmented_df['file_path'].apply(get_features)
        augmented_dfs.append(augmented_df)

    return pd.concat(augmented_dfs, ignore_index=True)

# Number of feature-extraction passes; change as needed.
augmentation_times = 1

# Build the feature table for the test rows.
augmented_df_test = augment_data(df=label_test, times=augmentation_times)

# Show the number of rows in the result.
print(augmented_df_test.shape[0])

In [None]:
# Expand the feature vectors into one column per value (col_1, col_2, ...).
# Only element 0 of each `features` entry is used, i.e. the features of the
# unmodified signal as ordered in get_features.
base_features = np.stack([entry[0] for entry in augmented_df_test['features']])
num_elements = base_features.shape[1]  # number of values per feature vector

for i in range(num_elements):
    augmented_df_test[f'col_{i+1}'] = base_features[:, i]

In [None]:
# Drop the helper columns and persist the test feature table for the modelling cells.
df_test = augmented_df_test.drop(columns=['split', 'features', 'file_path'])
df_test.to_csv('DfTestFix.csv', index=False)

## Train

In [None]:
# Compute the feature arrays for every row of a DataFrame.
def augment_data(df, times):
    """Return ``times`` stacked copies of ``df`` with a ``features`` column added.

    Args:
        df: DataFrame with a ``file_path`` column; it is not modified.
        times: Number of passes. Each pass calls ``get_features`` once per row.

    Returns:
        The concatenation of all passes with a fresh integer index.

    Raises:
        ValueError: Raised by ``pd.concat`` when ``times`` is 0.
    """
    augmented_dfs = []

    for _ in range(times):
        # Work on a copy of the DataFrame that was passed in, not on a
        # notebook global, so the function honours its argument.
        augmented_df = df.copy()
        augmented_df['features'] = augmented_df['file_path'].apply(get_features)
        augmented_dfs.append(augmented_df)

    return pd.concat(augmented_dfs, ignore_index=True)

# Number of feature-extraction passes; change as needed.
augmentation_times = 1

# Build the feature table for the pooled train/valid rows.
augmented_df = augment_data(df=label_train, times=augmentation_times)

# Show the number of rows in the result.
print(augmented_df.shape[0])

In [None]:
# Expand the feature vectors into one column per value (col_1, col_2, ...).
# Only element 0 of each `features` entry is used, i.e. the features of the
# unmodified signal as ordered in get_features.
base_features = np.stack([entry[0] for entry in augmented_df['features']])
num_elements = base_features.shape[1]  # number of values per feature vector

for i in range(num_elements):
    augmented_df[f'col_{i+1}'] = base_features[:, i]

In [None]:
# Drop the helper columns, encode the boolean label as 0/1 and persist the
# training feature table for the modelling cells.
df_train = (
    augmented_df
    .drop(columns=['split', 'features', 'file_path'])
    .assign(abnormal=lambda frame: frame['abnormal'].map({False: 0, True: 1}))
)
df_train.to_csv('DfTrainFix.csv', index=False)

# Modelling

In [None]:
# Reload the feature tables written by the feature-extraction cells.
train, test = (pd.read_csv(csv_name) for csv_name in ('DfTrainFix.csv', 'DfTestFix.csv'))

# Separate the target (Y) from the predictors (X). X takes every column from
# position 2 onwards, so the first two columns are treated as non-features.
Y = train['abnormal']
X = train.iloc[:, 2:]

In [None]:
# Apply SMOTETomek to X and Y with a fixed seed.
# NOTE(review): this resamples the full table before the later train_test_split
# cells run, so synthetic rows may be derived from rows that end up in the
# hold-out split — confirm this is intended.
smt = SMOTETomek(random_state = 0)
X_resampled, y_resampled = smt.fit_resample(X, Y)

## Machine Learning (XGBoost)

In [None]:
# Hold out 20% of the ORIGINAL rows (stratified by class), then balance only
# the training part. Resampling before the split would let synthetic samples
# interpolated from hold-out rows leak into training and inflate the scores.
x_train_ml, x_test_ml, y_train_ml, y_test_ml = train_test_split(
    X, Y, random_state=42, test_size=0.20, shuffle=True, stratify=Y
)
x_train_ml, y_train_ml = SMOTETomek(random_state=0).fit_resample(x_train_ml, y_train_ml)
x_train_ml.shape, y_train_ml.shape, x_test_ml.shape, y_test_ml.shape

In [None]:
# Standardise the features: the statistics are learned on the training part
# only and then applied to both parts.
scaler_ml = StandardScaler().fit(x_train_ml)
x_train_ml, x_test_ml = (scaler_ml.transform(part) for part in (x_train_ml, x_test_ml))

In [None]:
# Candidate values sampled by the randomized hyperparameter search.
params = dict(
    learning_rate=[0.05, 0.10, 0.15],
    max_depth=[3, 5, 7, 9],
    min_child_weight=[5, 7, 9],
    gamma=[0.5, 0.1, 0.3],
    colsample_bytree=[0.3, 0.5, 0.7],
    random_state=[1970, 2021, 2020],
)

In [None]:
# XGBoost classifier using the histogram-based tree builder.
classifier = xgb.XGBClassifier(tree_method='hist')

# Use scikit-learn's built-in 'roc_auc' scorer: it feeds predicted
# probabilities / decision scores to roc_auc_score. A default
# make_scorer(roc_auc_score) would score the hard labels from predict().
roc_auc_scorer = 'roc_auc'

# Randomized search: 5 sampled candidates, 5-fold cross-validation, AUC-ROC as
# the selection metric. random_state makes the sampled candidates reproducible.
random_search = RandomizedSearchCV(classifier, param_distributions=params, n_iter=5,
                                   scoring=roc_auc_scorer, cv=5, verbose=3,
                                   random_state=42)

# Run the hyperparameter search on the training part.
random_search.fit(x_train_ml, y_train_ml)

In [None]:
# Hard-label predictions on the hold-out part.
y_pred = random_search.predict(x_test_ml)

# AUC-ROC is a ranking metric, so it is computed from the predicted
# probability of the positive class rather than from thresholded labels.
y_score = random_search.predict_proba(x_test_ml)[:, 1]
auc_roc = roc_auc_score(y_test_ml, y_score)

print("AUC-ROC:", auc_roc)

In [None]:
# Evaluate hard-label predictions on the hold-out part: accuracy, confusion
# matrix and per-class report.
Y_pred = random_search.predict(x_test_ml)
cnf_matrix = confusion_matrix(y_test_ml, Y_pred)
test_accuracy = accuracy_score(y_test_ml, Y_pred)

print('Akurasi test model:', test_accuracy)
print()
print("Confusion Matrix")
print(cnf_matrix)
print()
print(classification_report(y_test_ml, Y_pred))

## Submission ML Method

In [None]:
# Scale the test features (columns from position 2 onwards) with the scaler
# fitted on the training part.
XTest = scaler_ml.transform(test.iloc[:, 2:])

In [None]:
# Predict the test rows and write the submission file.
YTest = random_search.predict(XTest)
# Build the submission without adding a column to `test`: later cells select
# the feature columns of `test` by position, and an extra `score` column
# would be picked up as a feature.
sub = test[['file_id']].assign(score=YTest)
sub.to_csv('Sub3.csv', index=False)

## Deep Learning (LSTM)

In [None]:
def build_model(in_shape):
    """Build and compile the BiLSTM + 1-D CNN binary classifier.

    Architecture: a bidirectional LSTM over the feature sequence, four
    Conv1D/pooling stages (256, 128, 128 and 64 filters), a 32-unit dense
    layer and a single sigmoid output.

    Args:
        in_shape: Length of the input sequence; inputs have shape
            ``(in_shape, 1)``.

    Returns:
        A compiled ``Sequential`` model (RMSprop, binary cross-entropy, AUC).
    """
    weight_decay = 1e-4
    L2 = tf.keras.regularizers.l2(weight_decay)

    model = Sequential()
    model.add(Bidirectional(LSTM(128, dropout=0.05, recurrent_dropout=0.20, return_sequences=True), input_shape=(in_shape, 1)))

    # Three convolution stages, each followed by average pooling.
    for n_filters in (256, 128, 128):
        model.add(Conv1D(n_filters, kernel_size=6, strides=1, padding='same', activation='relu', kernel_regularizer=L2))
        model.add(AveragePooling1D(pool_size=4, strides=2, padding='same'))
    model.add(Dropout(0.2))

    model.add(Conv1D(64, kernel_size=6, strides=1, padding='same', activation='relu', kernel_regularizer=L2))
    model.add(MaxPooling1D(pool_size=4, strides=2, padding='same'))

    model.add(Flatten())
    model.add(Dense(units=32, activation='relu', kernel_regularizer=L2))
    model.add(Dropout(0.3))

    model.add(Dense(units=1, activation='sigmoid'))

    # `learning_rate` replaces the deprecated `lr` alias, and the legacy
    # `decay` argument is not accepted by current Keras optimizers. A plain
    # float rate (not a schedule object) is kept so that a ReduceLROnPlateau
    # callback can still adjust it during training.
    opt = RMSprop(learning_rate=0.001)
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['AUC'])
    return model

In [None]:
# Halve the learning rate after 5 epochs without improvement in validation loss.
reduce_learning_rate = ReduceLROnPlateau(
    monitor='val_loss',
    mode='auto',
    factor=0.5,
    patience=5,
    min_delta=1e-10,
    cooldown=0,
    min_lr=0,
    verbose=1,
)

# Abort training after 12 epochs without improvement in validation loss; the
# weights of the last epoch are kept (restore_best_weights is off).
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='auto',
    min_delta=0,
    patience=12,
    baseline=None,
    restore_best_weights=False,
    verbose=1,
)

In [None]:
# Write the per-epoch metrics to a '|'-separated file, overwriting any previous run.
csv_logger = CSVLogger('da.epoch.results.csv', separator='|', append=False)

# Save the model weights (weights only) after every epoch.
model_checkpoint = ModelCheckpoint(
    "da.partial.hdf5",
    save_weights_only=True,
    save_freq='epoch',
    mode='auto',
    verbose=0,
)

In [None]:
# Hold out 20% of the ORIGINAL rows (stratified by class), then balance only
# the training part. Resampling before the split would let synthetic samples
# interpolated from hold-out rows leak into training and inflate the
# validation metrics.
x_train_dl, x_test_dl, y_train_dl, y_test_dl = train_test_split(
    X, Y, random_state=42, test_size=0.20, shuffle=True, stratify=Y
)
x_train_dl, y_train_dl = SMOTETomek(random_state=0).fit_resample(x_train_dl, y_train_dl)
x_train_dl.shape, y_train_dl.shape, x_test_dl.shape, y_test_dl.shape

In [None]:
# Standardise the features with statistics learned on the training part, then
# append a trailing axis of length 1 so each sample has shape (n_features, 1).
scaler_dl = StandardScaler().fit(x_train_dl)

x_train_dl = scaler_dl.transform(x_train_dl)[..., np.newaxis]
x_test_dl = scaler_dl.transform(x_test_dl)[..., np.newaxis]
x_train_dl.shape, y_train_dl.shape, x_test_dl.shape, y_test_dl.shape

In [None]:
# Build the network for the number of features per sample (axis 1 of the
# training array) and print its layer summary.
model = build_model(x_train_dl.shape[1])
model.summary()

In [None]:
epochs = 25
batch_size = 32
# Fit the BiLSTM/CNN model. `batch_size` is passed to fit() so that changing
# it actually changes the mini-batch size; Keras then derives the number of
# steps per epoch itself and also processes the final partial batch.
history = model.fit(
    x_train_dl, y_train_dl,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test_dl, y_test_dl),
    callbacks=[csv_logger, reduce_learning_rate, early_stopping, model_checkpoint],
    verbose=1
)

## Submission DL Method

In [None]:
# Select exactly the columns the scaler was fitted on, by name. A positional
# slice of `test` would also pick up any column added to it after loading
# (for example a `score` column from an earlier submission cell).
XTest_dl = scaler_dl.transform(test[X.columns])
# Add the trailing channel axis that the training arrays were given.
XTest_dl = np.expand_dims(XTest_dl, axis=2)

In [None]:
# Predict the test rows and write the submission file.
YTest_dl = model.predict(XTest_dl)
# The sigmoid output has shape (n_samples, 1); flatten it to one score per
# row. The submission is built without adding a column to `test`, so the
# feature table stays unchanged when cells are re-run.
sub_dl = test[['file_id']].assign(score=YTest_dl.ravel())
sub_dl.to_csv('Sub2.csv', index=False)