In [1]:
import sys
import json
import pandas as pd
import numpy as np
sys.path.insert(1, 'C:/Users/Admin/Projects/voice-anomaly-detector')

from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report

from models.baseline import Baseline
from models.lightgbm_interface import LightGBMInterface

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Feature columns used for training. Only a subset of the "MFCC mean"
# coefficients is listed (4, 6, 7, 9 and 11 are left out), while the
# "MFCC max_min" features cover coefficients 1 through 12.
columns = (
    ['Zero crossings']
    + [f'MFCC mean {k}' for k in (1, 2, 3, 5, 8, 10, 12, 13)]
    + [f'MFCC max_min {k}' for k in range(1, 13)]
    + ['HNR', 'f_0', 'Jitter']
)

In [6]:
def train_model(df, params, random_state=None):
    """Evaluate a RandomForestClassifier with 5-fold stratified cross-validation.

    The "Label" column of ``df`` is label-encoded and used as the target; the
    module-level ``columns`` list selects the feature columns. For every fold
    a fresh model is fitted, a classification report for the held-out part is
    printed, and the F1 scores (``f1_score`` with its default settings) on the
    held-out and training parts are recorded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Label" column and every name listed in ``columns``.
    params : dict
        Keyword arguments forwarded to ``RandomForestClassifier``.
    random_state : int or None, optional
        Seed for the shuffled fold assignment. The default ``None`` keeps the
        previous behaviour (a different split on every call); pass an int to
        make the folds reproducible.

    Returns
    -------
    tuple of (list, list)
        ``(test_result, train_result)``: per-fold F1 scores on the held-out
        and training data, in fold order.
    """
    k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

    encode_labels = preprocessing.LabelEncoder()
    labels = encode_labels.fit_transform(df["Label"])

    # Select the feature columns once instead of on every fold, and without
    # rebinding the ``df`` parameter.
    features = df[columns]

    test_result = []
    train_result = []

    for train_index, test_index in k_fold.split(df, df["Label"]):
        # split() yields positional indices, so rows must be taken with
        # ``iloc``; label-based ``loc`` only works when ``df`` happens to have
        # a default 0..n-1 index and otherwise picks wrong rows or raises.
        train_x = features.iloc[train_index]
        test_x = features.iloc[test_index]
        labels_train = labels[train_index]
        labels_test = labels[test_index]

        # Alternative estimators tried in this notebook:
        # model = Baseline()
        # model = LogisticRegression(C=30, solver='newton-cg')
        # model = SVC(**params)
        # model = LightGBMInterface(params=params)
        model = RandomForestClassifier(**params)

        model.fit(train_x, labels_train)
        y_predict_test = model.predict(test_x)
        y_predict_train = model.predict(train_x)

        print(classification_report(labels_test, y_predict_test))
        test_result.append(f1_score(labels_test, y_predict_test))
        train_result.append(f1_score(labels_train, y_predict_train))

    return test_result, train_result

In [7]:
if __name__ == '__main__':
    # Load the extracted features; the path is relative to the notebook's
    # working directory.
    df = pd.read_csv('../data/features.csv')

    # Random Forest params
    params = {
        "n_estimators": 150,
        "max_depth": 30,
        "min_samples_split": 2,
        "min_samples_leaf": 10,
        "max_samples": 100,
    }
    # params = {'num_iterations': 40, 'num_leaves': 50, 'min_data_in_leaf': 40, 'objective': 'binary'}  # LightGBM params
    # params = {'C': 5, 'kernel': 'sigmoid', 'gamma': 'auto'} # SVM params

    # Per-fold F1 scores on the held-out and the training parts.
    test_result, train_result = train_model(df, params)

    # Summarise the held-out scores across folds.
    result_test_mean, result_test_std = np.mean(test_result), np.std(test_result)
    print(f'Test score: {result_test_mean}')
    print(f'STD: {result_test_std}')

    # Summarise the training scores across folds.
    result_train_mean, result_train_std = np.mean(train_result), np.std(train_result)
    print('Train score: ', result_train_mean)
    print('STD: ', result_train_std)

              precision    recall  f1-score   support

           0       0.65      0.24      0.35       138
           1       0.71      0.93      0.80       271

    accuracy                           0.70       409
   macro avg       0.68      0.59      0.58       409
weighted avg       0.69      0.70      0.65       409

              precision    recall  f1-score   support

           0       0.84      0.23      0.36       138
           1       0.71      0.98      0.83       270

    accuracy                           0.73       408
   macro avg       0.78      0.60      0.59       408
weighted avg       0.76      0.73      0.67       408

              precision    recall  f1-score   support

           0       0.67      0.12      0.20       137
           1       0.68      0.97      0.80       271

    accuracy                           0.68       408
   macro avg       0.68      0.54      0.50       408
weighted avg       0.68      0.68      0.60       408

              preci