# Data processing

In [137]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [138]:
dataset = pd.read_csv('D:\VoiceClone\Feature Extraction\data.csv')
X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1].values

In [139]:
rows_with_nan_or_zero = []
for i in range(X.shape[0]):
    if np.isnan(X[i, :]).any():
        rows_with_nan_or_zero.append(i)
        continue
rows_with_nan_or_zero

[100,
 417,
 441,
 669,
 925,
 1364,
 1843,
 2012,
 2166,
 2259,
 2302,
 2352,
 2911,
 3079,
 3157,
 3208,
 3230,
 3313,
 3377,
 3792,
 3915,
 3926,
 4137,
 4534,
 4556,
 4788,
 4841,
 5093,
 5186,
 5944]

In [140]:
X = np.delete(X, rows_with_nan_or_zero, axis=0)
y = np.delete(y, rows_with_nan_or_zero, axis=0)

In [141]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [142]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [143]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [144]:
y_train[1:20]

array([0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0])

In [145]:
for i in range (len(X_train)):
    for j in range (len(X_train[i])):
        if X_train[i][j] == "NaN":
            print(i,j)

In [146]:
X_train[18:22]

array([[ 197.93082293,  172.35996358,  208.85526661,   36.49530303,
        1380.52805136,   29.55084132],
       [ 178.54923012,  164.81652795,  187.68884053,   22.87231258,
        1578.73449724,   17.77418613],
       [ 110.25922598,  100.28947374,  115.20235351,   14.91287977,
        1882.81038501,   16.15456542],
       [ 118.4173323 ,  108.73654224,  123.1152555 ,   14.37871326,
        1392.03289815,   15.5778053 ]])

# Training

### Random forest

In [147]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=33, criterion="entropy", min_samples_split=2, min_samples_leaf=3, max_features="sqrt")
find_nan = 21
classifier.fit(X_train,y_train)

#### Predict the test

In [148]:
y_pred = classifier.predict(X_test)

## Making the Confusion Matrix

In [149]:
from sklearn.metrics import accuracy_score, confusion_matrix
cm = confusion_matrix(y_test, y_pred)
acc_score =accuracy_score(y_test, y_pred)
print(cm)
print(acc_score)

[[390  73]
 [112 618]]
0.8449287510477788


### Feature important

In [150]:
column_names = dataset.columns.tolist()
column_names

['Unnamed: 0',
 'f0',
 'q1',
 'q3',
 'iqr',
 'spectral_centroid',
 'f0_standard_deviation',
 'label']

In [151]:
feature_importances = classifier.feature_importances_

for i in range(len(feature_importances)):
    print(f"{column_names[i+1]}: {feature_importances[i]}")

f0: 0.24281094278786644
q1: 0.2742808858923657
q3: 0.22437766900746992
iqr: 0.08947430885491486
spectral_centroid: 0.06987074863510538
f0_standard_deviation: 0.09918544482227744
