In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# zscore
from scipy.stats import zscore
# scaling, normalization
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import *
# partitioning
from sklearn.model_selection import train_test_split
# naive bayes
from sklearn.naive_bayes import GaussianNB, CategoricalNB
# plot things
from scikitplot.metrics import plot_roc
from scikitplot.metrics import plot_precision_recall
from scikitplot.metrics import plot_cumulative_gain, plot_lift_curve

ModuleNotFoundError: No module named 'scikitplot'

In [None]:
# Load the feature table; the path is resolved relative to the working directory.
features_csv = "ravdess_features.csv"
df = pd.read_csv(features_csv)

## Classification

### Choice of attributes

Each of us selects a classification method and then two target variables: one to be classified with our own method only, and one to be classified by every method so that the results can be compared, as in the clustering part.
Classification methods:

* Naive Bayes - Claudio
* kNN - Andrea
* Decision trees - Steffania

Possible variables:

* Categorical: emotion, sex
* Numerical: intensity

## Partitioning

In [None]:
# Columns that are removed before modelling; all other columns are kept.
dropped_columns = [
    "modality",
    "vocal_channel",
    "emotional_intensity",
    "statement",
    "repetition",
    "actor",
    "channels",
    "frame_width",
]

# Drop those columns first, then discard every row that still has a missing value.
df_reduced = df.drop(columns=dropped_columns)
df_num = df_reduced.dropna()
df_num.T

In [None]:
# Target vector: the emotion label of every row.
y = np.array(df_num["emotion"]) # <--- this is the variable prediction parameter

# One-hot encode the categorical columns.
# NOTE: df_num is overwritten here, so this cell cannot be re-run without
# re-running the cell that builds df_num first.
categorical_cols = ["emotion", "sex"]
df_num = pd.get_dummies(df_num, columns=categorical_cols)

# The "emotion_*" dummy columns are the target itself in one-hot form.
# Leaving them in the feature matrix would leak the label into X, so only the
# remaining columns are used as features.
target_dummy_prefix = "emotion_"
feature_cols = [
    col for col in df_num.columns if not str(col).startswith(target_dummy_prefix)
]
X = df_num[feature_cols].values

# 70/30 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

In [None]:
# without stratify
# Relative class frequencies of the full label vector, then of the train and
# test labels, printed in that order.
for labels in (y, y_train, y_test):
    _, class_counts = np.unique(labels, return_counts=True)
    print(class_counts / len(labels))

In [None]:
# with stratify
# Re-create the partition with stratify=y before printing. Without this
# re-split the cell would only repeat the proportions of the unstratified
# partition shown in the previous cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=0
)

# Relative class frequencies: full data, train split, test split.
print(np.unique(y, return_counts=True)[1] / len(y))
print(np.unique(y_train, return_counts=True)[1] / len(y_train))
print(np.unique(y_test, return_counts=True)[1] / len(y_test))

In [None]:
# Sanity check: shapes of the four arrays produced by the split, on one line.
split_parts = (X_train, X_test, y_train, y_test)
print(*(part.shape for part in split_parts))

### Naive Bayes

In [None]:
# Fit Gaussian Naive Bayes on the training split (fit returns the estimator),
# predict the held-out split and print the classification report.
clf = GaussianNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

The model is not reliable at predicting _emotion_.

In [None]:
# Per-class probabilities predicted for every test sample (displayed as the cell output).
test_probabilities = clf.predict_proba(X_test)
test_probabilities

In [None]:
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
# Compute the class probabilities once and reuse them for both the ROC plot
# and the macro-averaged one-vs-rest AUC.
y_score = clf.predict_proba(X_test)
plot_roc(y_test, y_score)
plt.show()
macro_ovr_auc = roc_auc_score(y_test, y_score, multi_class="ovr", average="macro")
print(macro_ovr_auc)

#### Categorical

In [None]:
# Continuous columns that are discretised for CategoricalNB.
# The commented-out names are not used as features.
non_cat_columns = [
    "intensity",
    "zero_crossings_sum",
    "length_ms",
    "frame_count",
    #"sample_width",
    #"frame_rate",
    "mfcc_mean",
    "mfcc_min",
    "mfcc_max",
    "mfcc_std",
    "sc_mean",
    #"sc_min",
    "sc_max",
    "sc_std",
    "sc_kur",
    "sc_skew",
    "stft_mean",
    #"stft_min",
    #"stft_max",
    "stft_std",
    "stft_kur",
    "stft_skew",
    "mean",
    "std",
    "min",
    "max",
    "skew",
    "kur"
]

# Number of quantile bins per feature (4 = quartiles).
N_QUANTILE_BINS = 4

X_noncat = df_num[non_cat_columns].values

X_train_noncat, X_test_noncat, y_train_noncat, y_test_noncat = train_test_split(
    X_noncat, y, test_size=0.3, stratify=y, random_state=0
)

# The bin edges are learned on the training split only and then applied,
# unchanged, to both splits. Binning the test split with its own quantiles
# would make the same integer code stand for different value ranges in train
# and test, and would use test-set statistics during preprocessing.
X_train_cat = np.empty(X_train_noncat.shape, dtype=int)
X_test_cat = np.empty(X_test_noncat.shape, dtype=int)
for column_idx in range(X_train_noncat.shape[1]):
    train_values = X_train_noncat[:, column_idx]
    test_values = X_test_noncat[:, column_idx]

    # duplicates="drop" merges identical edges instead of raising when a
    # column has many tied values.
    _, edges = pd.qcut(
        train_values, q=N_QUANTILE_BINS, retbins=True, duplicates="drop"
    )

    # Only the inner edges are needed: with side="left" a value v gets code i
    # when inner_edges[i-1] < v <= inner_edges[i], i.e. right-closed bins as
    # in qcut. Test values outside the training range fall into the first or
    # last bin.
    inner_edges = edges[1:-1]
    X_train_cat[:, column_idx] = np.searchsorted(inner_edges, train_values, side="left")
    X_test_cat[:, column_idx] = np.searchsorted(inner_edges, test_values, side="left")

print(X_train_cat.shape, X_test_cat.shape)

In [None]:
# Fit Categorical Naive Bayes on the binned training features.
clf = CategoricalNB()
clf.fit(X_train_cat, y_train_noncat)
y_pred = clf.predict(X_test_cat)

# Score against y_test_noncat: it comes from the same split as X_test_cat.
# y_test belongs to the earlier split and does not describe the same rows.
print(classification_report(y_test_noncat, y_pred))

Very low reliability in predicting _emotion_ with the Categorical Naive Bayes model.