In [None]:
#! pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip 
%matplotlib ipympl
import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import nlpaug.augmenter.audio as naa
import numpy as np
import pandas as pd, pandas_profiling
import seaborn as sns
from librosa import power_to_db
from librosa.effects import trim
from librosa.feature import melspectrogram
from numpy import asarray
from numpy import loadtxt
from numpy import savetxt
from scipy import stats
from scipy.stats import zscore
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, ParameterGrid, StratifiedKFold, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MaxAbsScaler

In [None]:
# Metadata tables: the labelled development set and the evaluation set to predict.
dev = pd.read_csv("Materiale/development.csv")
# NOTE(review): this name shadows Python's built-in eval() for the rest of the notebook.
eval = pd.read_csv("Materiale/evaluation.csv")

## ANALYSIS OF THE LABELED FILES

In [None]:
#dev.profile_report()

In [None]:
print(dev.isna().any(axis=0))
print(dev.count())

### The dataframe is complete, so there is no need to handle missing values

In [None]:
dev['action'].value_counts().sort_values().plot(kind="barh")

In [None]:
dev['object'].value_counts().sort_values().plot(kind="barh")

In [None]:
for action in dev['action'].unique():
    print("[", action, "]:", dev.query("action == @action")['object'].unique()) 

#### Let's create the label that will be used for the prediction 

In [None]:
y = dev['action'] + dev['object']
dev['Predicted'] = y

In [None]:
sns.set()
dev['Predicted'].value_counts().sort_values().plot(kind="barh")

In [None]:
dev['Predicted'].value_counts()

#### Let's change the path using the one that we have 

In [None]:
dst = "Materiale"
dev['path'] = dev['path'].str.replace("dsl_data", dst)

In [None]:
dev.head()

In [None]:
np.unique(dev['Predicted'], return_counts=True)

### Now load the audio files (WAV format) into the dataframe

In [None]:
# Some of the files (300) have a sample rate of 22050 Hz, while all the others
# are at 16000 Hz. For this reason librosa.load is used instead of
# scipy.io.wavfile.read: it converts files whose rate differs from the requested
# one (16000, the most frequent rate across the development and evaluation
# sets) to that rate.
AudioFile, rateArr = [], []

for wav_path in dev['path']:
    samples, sr = librosa.load(wav_path, sr=16000, dtype='float32')
    AudioFile.append(samples)
    rateArr.append(sr)

In [None]:
# Check whether any recording is stereo. A mono signal is a 1-D array, while a
# multi-channel one has an extra leading channel axis; testing `shape[0] == 2`
# would instead match a mono clip that happens to hold exactly two samples.
stereo = any(audio.ndim > 1 for audio in AudioFile)
if stereo:
    print(f"Stereo = {stereo}")

In [None]:
if stereo:
    # Give every mono recording two identical channels so that all signals share
    # the (2, samples) layout. np.stack adds the channel axis, whereas
    # np.concatenate would only double the length of the 1-D clip.
    # NOTE(review): librosa.load is called with its default arguments, which
    # presumably down-mix to mono, so this branch is not expected to run — confirm.
    for i, audio in enumerate(AudioFile):
        if audio.ndim == 1:
            AudioFile[i] = np.stack([audio, audio])

In [None]:
dev['Audio File'] = AudioFile
#dev['Rate'] = rateArr

In [None]:
dev.head()

In [None]:
dev['Current language used for work/school'].value_counts().sort_values().plot(kind="barh")
np.unique(dev['Current language used for work/school'], return_counts=True)

In [None]:
sns.set()
dev['First Language spoken'].value_counts().sort_values().plot(kind="barh")

In [None]:
np.unique(dev['First Language spoken'], return_counts=True)

In [None]:
# Keep only the rows whose first language is US English. The explicit copy makes
# `dev` an independent frame, so the column assignments made in later cells do
# not trigger SettingWithCopyWarning.
dev = dev[dev['First Language spoken'] == 'English (United States)'].copy()

In [None]:
dev.head()

In [None]:
dev['Current language used for work/school'].value_counts().sort_values().plot(kind="barh")
np.unique(dev['Current language used for work/school'], return_counts=True)

In [None]:
dev['Self-reported fluency level '].value_counts().sort_values().plot(kind="barh")
np.unique(dev['Self-reported fluency level '], return_counts=True)

In [None]:
# Number of samples of every recording, then the longest and the shortest one.
x_lenghts = [len(signal) for signal in dev["Audio File"]]
max(x_lenghts), min(x_lenghts)

#### The largest difference in length between the files is about 19 seconds (300,000 samples at a sampling rate of 16,000 Hz)

In [None]:
sr = 16000
def show_length_distribution(signals, rate=sr):
    """Plot the distribution of the signal durations and return the durations.

    A box plot is drawn above a histogram (with KDE); both share the x axis.

    Parameters:
        signals: iterable of audio signals; each one must support len().
        rate: sampling rate in Hz used to convert sample counts into seconds.
    Returns:
        List with the duration, in seconds, of every signal.
    """
    # Divide by the `rate` argument: using the global `sr` here would silently
    # ignore the rate supplied by the caller.
    sample_times = [len(x) / rate for x in signals]

    sns.set()
    fig, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.20, .80)})

    # One graph per panel
    sns.boxplot(x=sample_times, ax=ax_box, linewidth=0.9, color='#9af772')
    sns.histplot(x=sample_times, ax=ax_hist, bins='fd', kde=True)

    # The shared x axis is labelled on the histogram only
    ax_box.set(xlabel='')

    fig.suptitle('Audio signal lengths')
    ax_hist.set_xlabel('duration (seconds)')
    ax_hist.set_ylabel('count')
    plt.show()
    return sample_times


lengths = show_length_distribution(dev["Audio File"])

In [None]:
dev.head()

In [None]:
q = 95
np.percentile(lengths, q)

In [None]:
# Recordings longer than the q-th percentile of the durations are counted as
# outliers; they will be treated differently later on. The threshold is computed
# once instead of once per recording.
length_threshold = np.percentile(lengths, q)
tot_outliers = sum(length > length_threshold for length in lengths)
tot_outliers

In [None]:
percentile_of_1s = stats.percentileofscore(x_lenghts, sr)
print(percentile_of_1s)

In [None]:
# np.argmax returns a position, so the signal is fetched with .iloc: once rows
# of `dev` have been filtered out, its index labels no longer match positions.
Longest_audio = np.argmax([len(x) for x in dev["Audio File"]])
longest_signal = dev["Audio File"].iloc[Longest_audio]
plt.plot(longest_signal)

ipd.Audio(longest_signal, rate=sr)

In [None]:
# np.argmin returns a position, so the signal is fetched with .iloc: once rows
# of `dev` have been filtered out, its index labels no longer match positions.
Shortest_audio = np.argmin([len(x) for x in dev["Audio File"]])
shortest_signal = dev["Audio File"].iloc[Shortest_audio]
plt.plot(shortest_signal)
plt.title("Shortest audio signal")

ipd.Audio(shortest_signal, rate=sr)

#### We will remove the leading and trailing silence from signals to see if we get different distribution of length.

In [None]:
# With the default top_db, anything more than 10 dB below the signal's peak is
# treated as silence.
def remove_silence(sample, sr=sr, top_db=10):
    """Strip the leading and trailing silence from an audio signal.

    Parameters:
        sample: audio samples (array-like); converted to a float64 array.
        sr: kept in the signature for callers; not used by the function body.
        top_db: threshold in dB below the reference under which audio counts
            as silence; forwarded to librosa.effects.trim.
    Returns:
        The trimmed signal.
    """
    signal = np.array(sample, dtype=np.float64)
    trimmed = trim(signal, top_db=top_db)[0]
    return trimmed

In [None]:
dev['Audio File'] = [remove_silence(x) for x in dev["Audio File"]]

In [None]:
x_lenghts = list(map(lambda x: len(x), dev["Audio File"]))
#print(x_lenghts)
#np.unique(x_lenghts, return_counts=True)
print(max(x_lenghts), min(x_lenghts))
lengths = show_length_distribution(dev['Audio File'])

In [None]:
np.percentile(lengths, q)

In [None]:
# Recordings longer than the q-th percentile of the durations are counted as
# outliers; they will be treated differently later on. The threshold is computed
# once instead of once per recording.
length_threshold = np.percentile(lengths, q)
tot_outliers = sum(length > length_threshold for length in lengths)
tot_outliers

#### We considered audio files shorter than 0.2 seconds to be useless. The 0.2 s threshold is an arbitrary choice, though: we found that many recordings between 0.2 and 0.3 seconds are utterances of 'play' or 'heat up', so we decided to keep those.

In [None]:
dev['Duration'] = lengths

In [None]:
df1 = dev.sort_values('Duration', ascending=False, ignore_index=True)

In [None]:
ipd.Audio(df1['Audio File'][10], rate=sr)

In [None]:
# np.argmax returns a position, so the signal is fetched with .iloc: once rows
# of `dev` have been filtered out, its index labels no longer match positions.
Longest_audio = np.argmax([len(x) for x in dev['Audio File']])
longest_signal = dev['Audio File'].iloc[Longest_audio]
plt.plot(longest_signal)

ipd.Audio(longest_signal, rate=sr)

In [None]:
# Drop, in a single vectorised call, every recording whose Duration is at most
# 0.2 s (a row-by-row loop would also rebind the built-in name `id`).
too_short = dev.index[dev['Duration'] <= 0.2]
dev.drop(too_short, axis=0, inplace=True)

In [None]:
# Drop, in a single vectorised call, every recording whose Duration reaches the
# q-th percentile of `lengths` (a row-by-row loop would also rebind the
# built-in name `id`).
too_long = dev.index[dev['Duration'] >= np.percentile(lengths, q)]
dev.drop(too_long, axis=0, inplace=True)

In [None]:
# np.argmax returns a position, so the signal is fetched with .iloc: rows have
# been dropped from `dev`, hence its index labels no longer match positions.
Longest_audio = np.argmax([len(x) for x in dev['Audio File']])
longest_signal = dev['Audio File'].iloc[Longest_audio]
plt.plot(longest_signal)

ipd.Audio(longest_signal, rate=sr)

In [None]:
def pad_audio(v, l):
    """Return `v` forced to exactly `l` samples.

    Signals shorter than (or as long as) `l` are zero-padded at the end;
    longer ones are truncated to their first `l` samples.
    """
    missing = l - len(v)
    if missing < 0:
        return v[:l]
    return np.pad(v, (0, missing), constant_values=0.0)

## ANALYSIS OF THE FILES THAT HAVE TO BE PREDICTED

In [None]:
# Point the evaluation paths at the local data folder, then load every
# recording at 16000 Hz, keeping the rate reported by librosa for each file.
eval['path'] = eval['path'].str.replace("dsl_data", dst)

AudioFile2, rateArr2 = [], []
for wav_path in eval['path']:
    samples, sr = librosa.load(wav_path, sr=16000, dtype='float32')
    AudioFile2.append(samples)
    rateArr2.append(sr)

eval['Audio File'] = AudioFile2

In [None]:
# Distinct sampling rates reported for the evaluation recordings, and how many
# recordings report 16000 Hz and 22050 Hz respectively.
# NOTE(review): the files were loaded with sr=16000, so librosa has presumably
# already resampled them and every entry should be 16000 — confirm.
print(np.unique(rateArr2))
print(rateArr2.count(16000))
print(rateArr2.count(22050))

In [None]:
#eval.profile_report()

In [None]:
# For every evaluation speaker, print how many development rows carry the same
# speakerId.
for speaker in eval['speakerId'].unique():
    print((dev['speakerId'] == speaker).sum())

The speakers are not the same as those in the previous (development) file

In [None]:
lenghts2 = show_length_distribution(eval['Audio File'])
eval['Duration'] = lenghts2

#### These audio files have different lengths from the labelled ones

In [None]:
eval['Audio File'] = [remove_silence(x) for x in eval["Audio File"]]

In [None]:
# Common target length: the longest recording across both sets. Longest_audio
# and Longest_audio_eval are positions returned by np.argmax, so the signals
# are read with .iloc rather than by index label.
Longest_audio_eval = np.argmax([len(x) for x in eval['Audio File']])
l = max(len(dev['Audio File'].iloc[Longest_audio]), len(eval['Audio File'].iloc[Longest_audio_eval]))

In [None]:
sns.set()

# Longest_audio is a position (np.argmax), hence .iloc instead of label lookup.
longest_signal = dev['Audio File'].iloc[Longest_audio]

# Magnitude mel spectrogram, shown both in log-amplitude (dB) and after
# per-channel energy normalisation (PCEN).
S = librosa.feature.melspectrogram(y=longest_signal, sr=sr, power=1)
log_S = librosa.amplitude_to_db(S, ref=np.max)
pcen_S = librosa.pcen(S * (2**31))
fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
img = librosa.display.specshow(log_S, x_axis='time', y_axis='mel', ax=ax[0])
ax[0].set(title='log amplitude (dB)', xlabel=None)
ax[0].label_outer()
imgpcen = librosa.display.specshow(pcen_S, x_axis='time', y_axis='mel', ax=ax[1])
ax[1].set(title='Per-channel energy normalisation')
fig.colorbar(img, ax=ax[0], format="%+2.0f dB")
fig.colorbar(imgpcen, ax=ax[1])

In [None]:
dev['Audio File'] = [pad_audio(x, l) for x in dev['Audio File']]

In [None]:
eval['Audio File'] = [pad_audio(x, l) for x in eval['Audio File']]

In [None]:
pd.Series([len(x) for x in dev['Audio File']]).unique()

In [None]:
pd.Series([len(x) for x in eval['Audio File']]).unique()

### Data augmentation

In [None]:
rs = 42
y_dev = pd.DataFrame(dev['Predicted'], columns=['Predicted'])
X_dev = dev[['Audio File', 'Predicted']]
X_eval = eval[['Audio File']]

In [None]:
def data_augmentation(X: pd.DataFrame, y: pd.DataFrame) -> tuple:
    """Oversample the minority classes with speed- and pitch-perturbed copies.

    Every class other than the most frequent one receives as many synthetic
    recordings as it lacks with respect to the most frequent class: half of
    them come from nlpaug's SpeedAug and the rest from PitchAug. Rows are
    sampled with replacement only when more synthetic recordings are needed
    than the class has originals. Every synthetic signal is brought to the
    notebook-level length `l` with pad_audio.

    Parameters:
        X: frame with an 'Audio File' column (one signal per row) and a
            'Predicted' column holding the class label.
        y: label frame; only its column names are used, to shape the returned
            label frame.
    Returns:
        (augmented_X, augmented_y): frames holding only the synthetic rows,
        with the columns of X and of y respectively.

    Relies on the notebook globals `rs`, `sr`, `l`, `naa` and `pad_audio`.
    """
    class_counts = X['Predicted'].value_counts()
    highest_pred = class_counts.index[0]
    # .iloc: the counts are indexed by class label, so [0] must be positional.
    highest_num = class_counts.iloc[0]

    speed_aug = naa.SpeedAug(zone=(0, 1), factor=(0.9, 1.1))
    pitch_aug = naa.PitchAug(sampling_rate=sr, zone=(0, 1), factor=(0.9, 1.1))

    # Collected in plain lists and turned into frames once at the end, instead
    # of growing the frames row by row with pd.concat.
    signals, labels = [], []
    for pred in X.query('Predicted != @highest_pred')['Predicted'].unique():
        x = X.query('Predicted == @pred')
        n_available = x.shape[0]
        n = int(highest_num - n_available)
        n1 = n // 2
        n2 = n - n1
        replace = n > n_available
        # NOTE: both draws use the same random_state, as before.
        for augmenter, n_samples in ((speed_aug, n1), (pitch_aug, n2)):
            sample = x.sample(n=n_samples, random_state=rs, replace=replace)
            for audio in sample['Audio File']:
                aug_data = np.array(augmenter.augment(audio)).reshape(-1)
                signals.append(pad_audio(aug_data, l))
                labels.append(pred)

    augmented_X = pd.DataFrame({'Audio File': signals, 'Predicted': labels}, columns=X.columns)
    augmented_y = pd.DataFrame({'Predicted': labels}, columns=y.columns)
    return augmented_X, augmented_y

In [None]:
# X_aug_tmp, y_aug_tmp = data_augmentation(X_dev.head(10), y_dev.head(10))

In [None]:
# X_tmp = pd.concat([X_dev.head(10), X_aug_tmp])
# y_tmp = pd.concat([y_dev.head(10), y_aug_tmp])
# X_tmp['Predicted'].value_counts()

In [None]:
# y_tmp['Predicted'].value_counts()

## Feature extraction

In [None]:
def _grid_stats(matrix, n_rows, n_cols):
    """Return [mean, std, mean, std, ...] over an n_rows x n_cols grid of `matrix`.

    The 2-d `matrix` is cut into `n_rows` bands along axis 0 and every band
    into `n_cols` blocks along axis 1. A block is empty when its band has
    fewer than `n_cols` columns; it is then described by the median and the
    standard deviation of the whole band. Values are rounded to 4 decimals.
    """
    values = []
    for band in np.array_split(matrix, n_rows, axis=0):
        for block in np.array_split(band, n_cols, axis=1):
            if block.size == 0:
                centre, spread = np.median(band), np.std(band)
            else:
                centre, spread = np.mean(block), np.std(block)
            values.extend([round(centre, 4), round(spread, 4)])
    return values


def ft_mean_std(X, n, frame_size = 2048, hop_length = 512, n_mfcc = 13):
    """Build a fixed-length feature vector for every signal in X.

    For each signal several librosa features are computed and summarised by
    the mean and standard deviation of blocks, in this order:
      * PCEN of the 40-band magnitude mel spectrogram: n x n grid;
      * MFCCs, their deltas and second-order deltas: one band per
        coefficient, n blocks along axis 1;
      * RMS, zero-crossing rate, spectral centroid and spectral bandwidth:
        n blocks each.
    Empty blocks (fewer columns than n) fall back to the median and standard
    deviation of their band, so the vector never contains NaN for that reason.

    Parameters:
        X: iterable of audio signals
        n: number of blocks each feature is split into along an axis
        frame_size: n_fft used for the mel spectrogram, spectral centroid and
            spectral bandwidth (the other features use librosa's defaults)
        hop_length: hop length used for those same three features
        n_mfcc: number of MFCCs
    Returns:
        A 2-d numpy array with one row per signal and
        2*n*n + 6*n_mfcc*n + 8*n columns.

    Relies on the notebook global `sr` for the sampling rate.
    """
    X_sp = [] #feature matrix
    for signal in X:
        S = librosa.feature.melspectrogram(y=signal, sr=sr, n_fft=frame_size, hop_length=hop_length, n_mels=40, power=1)
        pcen_S = librosa.pcen(S=S*(2**31), sr=sr)
        mfccs = librosa.feature.mfcc(y=signal, n_mfcc=n_mfcc, sr=sr)
        delta_mfccs = librosa.feature.delta(mfccs)
        delta2_mfccs = librosa.feature.delta(mfccs, order=2)
        rms = librosa.feature.rms(y=signal)
        zcr = librosa.feature.zero_crossing_rate(y=signal)
        sp_cen = librosa.feature.spectral_centroid(y=signal, sr=sr, n_fft=frame_size, hop_length=hop_length)
        sp_bw = librosa.feature.spectral_bandwidth(y=signal, sr=sr, n_fft=frame_size, hop_length=hop_length)

        x_sp = _grid_stats(pcen_S, n, n) #current feature set
        for coefficients in (mfccs, delta_mfccs, delta2_mfccs):
            x_sp.extend(_grid_stats(coefficients, n_mfcc, n))
        # Frame-level descriptors are handled as a single band.
        for track in (rms, zcr, sp_cen, sp_bw):
            x_sp.extend(_grid_stats(track.reshape(1, -1), 1, n))

        X_sp.append(x_sp)

    return np.array(X_sp)

In [None]:
# signal = X_dev['Audio File'].iloc[0]
# n_mfcc = 13

In [None]:
# y, sr1 = librosa.load(librosa.ex('robin'))
# S = librosa.feature.melspectrogram(y=signal, sr=sr, power=1, n_mels=40)
# log_S = librosa.amplitude_to_db(S, ref=np.max)
# pcen_S = librosa.pcen(S * (2**31), sr=sr)

In [None]:
# S.shape

In [None]:
# pcen_S

In [None]:
# ft_mean_std(X_dev['Audio File'].tail(100), 18)

### Data extraction 

In [None]:
#Code used to save the information of the data divided in bins 

#for n in range(4,25,2):
#    X_ft = ft_mean_std(dev['Audio File'], n)
#    savetxt('savedData/' + str(n) + '.csv', X_ft, delimiter=',')
    
#data = loadtxt('data.csv', delimiter=',')

In [None]:
#Used to test that saving the features to a CSV file works

#n = 4
#X_ft1 = loadtxt('savedData/' + str(n) + '.csv', delimiter=',')
#X_ft = ft_mean_std(dev['Audio File'], n)

# Selecting the number of bins

In [None]:
# Compare a random forest and a scaled SVM on the features extracted with
# different numbers of bins, scoring each with the weighted F1 on a hold-out split.
models = {
    "rfc": RandomForestClassifier(random_state=rs),
    "svm": Pipeline([('scaler', StandardScaler()), ('SVM', svm.SVC())])
}
scores = {}
for n in range(6,27,4):
    cache_path = 'savedData/X_ft' + str(n) + '.csv'
    try:
        X_ft = loadtxt(cache_path, delimiter=',')
    except OSError:
        # No cached features for this bin count yet: extract and cache them, so
        # the cell also runs on a fresh checkout.
        X_ft = ft_mean_std(X_dev['Audio File'], n)
        savetxt(cache_path, X_ft, delimiter=',')
    X_train, X_test, y_train, y_test = train_test_split(X_ft, y_dev, test_size=0.20, random_state=rs)
    score = []
    for model, clf in models.items():
        clf.fit(X_train, y_train['Predicted'].to_numpy())
        y_pred = clf.predict(X_test)
        f1 = f1_score(y_test, y_pred, average='weighted')
        score.append((model, f1))
    scores[n] = score

In [None]:
# Every value of `scores` is [(rfc, f1), (svm, f1)]: pull out one curve per model.
x = list(scores.keys())
rf_scores = [per_model[0][1] for per_model in scores.values()]
svm_scores = [per_model[1][1] for per_model in scores.values()]

sns.set()
fig, ax = plt.subplots(figsize=(7,7))
for curve, curve_label in ((rf_scores, 'RF'), (svm_scores, 'SVM')):
    ax.plot(x, curve, label=curve_label)

ax.legend(loc='lower right')
fig.suptitle("Model evaluation on different n. of bins")
ax.set_xlabel("n. of bins")
ax.set_ylabel('mean f1 score')

## Final evaluation

## RANDOM FOREST 

In [None]:
n = 22
X_ft = ft_mean_std(X_dev['Audio File'], n)
# X_ft = loadtxt('savedData/X_ft' + str(n) + '.csv', delimiter=',')

In [None]:
savetxt('savedData/X_ft' + str(n) + '.csv', X_ft, delimiter=',')

In [None]:
n = 26
X_ft = ft_mean_std(X_dev['Audio File'], n)
savetxt('savedData/X_ft' + str(n) + '.csv', X_ft, delimiter=',')

In [None]:
plt.cla()

sns.set()
fig, ax = plt.subplots(figsize=(10,5))
pca = PCA().fit(X_ft)
ax.plot(np.cumsum(pca.explained_variance_ratio_))
ax.set_xlabel('number of components')
ax.set_ylabel('cumulative explained variance')

In [None]:
n_bins = 18
pca_value = 60

# The features that are split and augmented below must all use n_bins bins.
# X_ft was last built with n = 26, and concatenating it with augmented
# features extracted with n_bins would fail because the widths differ.
cache_path = 'savedData/X_ft' + str(n_bins) + '.csv'
try:
    X_ft = loadtxt(cache_path, delimiter=',')
except OSError:
    X_ft = ft_mean_std(X_dev['Audio File'], n_bins)
    savetxt(cache_path, X_ft, delimiter=',')

In [None]:
# One stratified split applied to the raw signals, the extracted features and
# the labels at once, so the three stay row-aligned.
X_train, X_test, X_ft_train_val, X_ft_test, y_train, y_test = train_test_split(
    X_dev, X_ft, y_dev, test_size=0.2, random_state=rs, stratify=y_dev)

In [None]:
# Hyper-parameter grid explored for the random forest; `pg` lists every
# combination.
# NOTE(review): per the scikit-learn docs 'entropy' and 'log_loss' both select
# the Shannon information gain, so those settings presumably train identical
# forests — confirm. The grid is left as is because a later cell picks entries
# of `pg` by position.
param_grid = {
    'n_estimators': [150, 200, 250],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [False]
}
pg = list(ParameterGrid(param_grid))

In [None]:
pca = PCA(n_components=pca_value)
maxabs_s = MaxAbsScaler()
accuracies1_rf = {}

# The hold-out split, the augmentation and the scaling + PCA projection do not
# depend on the forest's hyper-parameters: they are built once and every
# candidate is then fitted on the same matrices.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=rs)
train_index, val_index = next(sss.split(X_train, y_train))

X = X_train.iloc[train_index, :]
y = y_train.iloc[train_index]
y_val = y_train.iloc[val_index]['Predicted']

# Only the training part is augmented, so no synthetic copy leaks into validation.
X_aug, y_aug = data_augmentation(X, y)
X_aug_ft = ft_mean_std(X_aug['Audio File'], n_bins)

# `y` is a one-column frame: take the column so that both label arrays are 1-D
# (concatenating a 2-D frame with a 1-D series raises a ValueError).
y = np.concatenate([y['Predicted'], y_aug['Predicted']])
X_ft_train = np.concatenate([X_ft_train_val[train_index, :], X_aug_ft])
X_ft_val = X_ft_train_val[val_index, :]

X_proj = pca.fit_transform(maxabs_s.fit_transform(X_ft_train))
X_val_proj = pca.transform(maxabs_s.transform(X_ft_val))

for i, params in enumerate(pg):
    print(i)
    clf = RandomForestClassifier(**params, n_jobs=-1, random_state=rs)
    clf.fit(X_proj, y)
    y_pred = clf.predict(X_val_proj)
    accuracies1_rf[i] = accuracy_score(y_val, y_pred)

In [None]:
accuracies1_rf = dict(sorted(accuracies1_rf.items(), key=lambda item: item[1], reverse=True))

for i, (k, v) in enumerate(accuracies1_rf.items()):
    print(f"{i}) index_rf: {k}, parameters: {pg[k]}:")
    print(v)

In [None]:
cv = 5
pca = PCA(n_components=pca_value)
maxabs_s = MaxAbsScaler()
accuracies5_rf = {}

# The folds, their augmentation and their scaling + PCA projection do not
# depend on the forest's hyper-parameters: each fold is prepared once and then
# reused for every candidate.
skf = StratifiedKFold(n_splits=cv, random_state=rs, shuffle=True)
folds = []
for train_index, val_index in skf.split(X_train, y_train):
    X = X_train.iloc[train_index, :]
    y = y_train.iloc[train_index]
    y_val = y_train.iloc[val_index]['Predicted']

    # Only the training part of the fold is augmented.
    X_aug, y_aug = data_augmentation(X, y)
    X_aug_ft = ft_mean_std(X_aug['Audio File'], n_bins)

    # `y` is a one-column frame: take the column so both label arrays are 1-D.
    y = np.concatenate([y['Predicted'], y_aug['Predicted']])
    X_ft_train = np.concatenate([X_ft_train_val[train_index, :], X_aug_ft])
    X_ft_val = X_ft_train_val[val_index, :]

    X_proj = pca.fit_transform(maxabs_s.fit_transform(X_ft_train))
    X_val_proj = pca.transform(maxabs_s.transform(X_ft_val))
    folds.append((X_proj, y, X_val_proj, y_val))

for i, params in enumerate(pg):
    print(i)
    clf = RandomForestClassifier(**params, n_jobs=-1, random_state=rs)
    accuracy = []
    for X_proj, y, X_val_proj, y_val in folds:
        clf.fit(X_proj, y)
        y_pred = clf.predict(X_val_proj)
        accuracy.append(accuracy_score(y_val, y_pred))
    accuracies5_rf[i] = np.mean(accuracy)

In [None]:
accuracies5_rf = dict(sorted(accuracies5_rf.items(), key=lambda item: item[1], reverse=True))
for i, (k, v) in enumerate(accuracies5_rf.items()):
    print(f"{i}) index_rf: {k} parameters: {pg[k]}:")
    print(v)

In [None]:
cv = 5
pca = PCA(n_components=pca_value)
maxabs_s = MaxAbsScaler()
scores5_rf = {}

# The folds, their augmentation and their scaling + PCA projection do not
# depend on the forest's hyper-parameters: each fold is prepared once and then
# reused for every candidate.
skf = StratifiedKFold(n_splits=cv, random_state=rs, shuffle=True)
folds = []
for train_index, val_index in skf.split(X_train, y_train):
    X = X_train.iloc[train_index, :]
    y = y_train.iloc[train_index]
    y_val = y_train.iloc[val_index]['Predicted']

    # Only the training part of the fold is augmented.
    X_aug, y_aug = data_augmentation(X, y)
    X_aug_ft = ft_mean_std(X_aug['Audio File'], n_bins)

    # `y` is a one-column frame: take the column so both label arrays are 1-D.
    y = np.concatenate([y['Predicted'], y_aug['Predicted']])
    X_ft_train = np.concatenate([X_ft_train_val[train_index, :], X_aug_ft])
    X_ft_val = X_ft_train_val[val_index, :]

    X_proj = pca.fit_transform(maxabs_s.fit_transform(X_ft_train))
    X_val_proj = pca.transform(maxabs_s.transform(X_ft_val))
    folds.append((X_proj, y, X_val_proj, y_val))

for i, params in enumerate(pg):
    print(i)
    clf = RandomForestClassifier(**params, n_jobs=-1, random_state=rs)
    f1score = []
    accuracy = []
    for X_proj, y, X_val_proj, y_val in folds:
        clf.fit(X_proj, y)
        y_pred = clf.predict(X_val_proj)
        f1score.append(f1_score(y_val, y_pred, average='weighted'))
        accuracy.append(accuracy_score(y_val, y_pred))
    # [mean weighted F1, mean accuracy] over the folds
    scores5_rf[i] = [np.mean(f1score), np.mean(accuracy)]

In [None]:
scores5_rf = dict(sorted(scores5_rf.items(), key=lambda item: item[1][0], reverse=True))

for i, (k, v) in enumerate(scores5_rf.items()):
    print(f"{i}) index_rf: {k}, parameters: {pg[k]}:")
    print(v)

# Now we will use the whole train_val dataset as the training set to evaluate on the test set

In [None]:
# Candidates to re-evaluate on the test split, picked by position in `pg`.
# NOTE(review): the indices are hard-coded, presumably copied from the rankings
# printed by the search cells — re-check them whenever the grid or the data change.
pg_rf = [pg[8], pg[13], pg[14], pg[12], pg[6], pg[7]]

In [None]:
pca = PCA(n_components=pca_value)
maxabs_s = MaxAbsScaler()
scores1_rf = {}

# The training matrix does not depend on the candidate, so it is built once.
X_aug, y_aug = data_augmentation(X_train, y_train)
X_aug_ft = ft_mean_std(X_aug['Audio File'], n_bins)

# Label column of the one-column frame, so that both label arrays are 1-D.
y = np.concatenate([y_train['Predicted'], y_aug['Predicted']])

# X_ft_train_val is the untouched train+validation feature matrix. Extending
# X_ft_train instead would start from the last cross-validation fold and add
# the augmented rows again on every iteration.
X_ft_train = np.concatenate([X_ft_train_val, X_aug_ft])

X_proj = pca.fit_transform(maxabs_s.fit_transform(X_ft_train))
X_test_proj = pca.transform(maxabs_s.transform(X_ft_test))
y_true = y_test['Predicted']

for i, params in enumerate(pg_rf):
    print(i)
    clf = RandomForestClassifier(**params, n_jobs=-1, random_state=rs)
    clf.fit(X_proj, y)
    y_pred = clf.predict(X_test_proj)

    f1score = f1_score(y_true, y_pred, average='weighted')
    acc = accuracy_score(y_true, y_pred)
    # [weighted F1, accuracy] on the test split
    scores1_rf[i] = [f1score, acc]

In [None]:
scores1_rf = dict(sorted(scores1_rf.items(), key=lambda item: item[1][0], reverse=True))
for i, (k, v) in enumerate(scores1_rf.items()):
    print(f"{i}) index: {k}, parameters: {pg_rf[k]}:")
    print(v)

## Get results from Eval (RF)

In [None]:
eval_to_use = eval.loc[:, ['Audio File']]

In [None]:
#eval_ft = ft_mean_std(eval_to_use['Audio File'], n_bins)
#savetxt('savedData/eval' + str(n_bins) + '.csv', eval_ft, delimiter=',')
    
#data = loadtxt('data.csv', delimiter=',')

In [None]:
# Random Forest settings used for the final fit on the whole development set.
best_param = dict(
    bootstrap=False,
    criterion='gini',
    max_features='sqrt',
    n_estimators=250,
)

In [None]:
# Read the evaluation-set feature matrix saved for the current n_bins, and echo
# n_bins so it is visible which file was loaded.
eval_ft = loadtxt(f'savedData/eval{n_bins}.csv', delimiter=',')
print(n_bins)

In [None]:
# Final Random Forest: fit on the whole development set plus its augmented
# copies, then predict the labels of the evaluation set.
pca = PCA(n_components=pca_value)
maxabs_s = MaxAbsScaler()

clf = RandomForestClassifier(n_estimators=best_param['n_estimators'], criterion=best_param['criterion'],
                             max_features=best_param['max_features'], bootstrap=best_param['bootstrap'],
                             n_jobs=-1, random_state=rs)

X_aug, y_aug = data_augmentation(X_dev, y_dev)
X_aug_ft = ft_mean_std(X_aug['Audio File'], n_bins)

y = np.concatenate([y_dev['Predicted'], y_aug['Predicted']])

# Build the augmented matrix under its own name instead of overwriting X_ft:
# re-running this cell would otherwise append the augmented rows again and the
# features would no longer line up with the labels.
X_ft_dev_aug = np.concatenate([X_ft, X_aug_ft])
assert len(X_ft_dev_aug) == len(y), "X_ft must hold exactly one feature row per y_dev label"

# Scaler and PCA are fitted on the development data and only applied to the evaluation data.
X_ft_norm = maxabs_s.fit_transform(X_ft_dev_aug)
eval_ft_norm = maxabs_s.transform(eval_ft)

X_proj = pca.fit_transform(X_ft_norm)
eval_ft_proj = pca.transform(eval_ft_norm)
clf.fit(X_proj, y)

y_pred = clf.predict(eval_ft_proj)

In [None]:
# Write the RF predictions as CSV: a header row, then one "<row number>,<label>" line per prediction.
with open('savedData/res_rf_aug_no_dum_corrected_1.csv', 'w') as f:
    f.write("Id,Predicted\n")
    for i, label in enumerate(y_pred):
        f.write("%i,%s\n" % (i, label))

## SVM 

Let's use PCA to see how many components to keep for training

In [None]:
# Load the saved feature matrix for n bins
# (to recompute it instead: X_ft = ft_mean_std(X_dev['Audio File'], n)).
n = 18
X_ft = loadtxt(f'savedData/X_ft{n}.csv', delimiter=',')

In [None]:
# Cumulative explained variance of a full PCA on X_ft, used to decide how many
# principal components to keep.
# NOTE(review): the training cells apply MaxAbsScaler before PCA, while this
# curve is computed on the unscaled X_ft - confirm that is intended.
sns.set()
fig, ax = plt.subplots(figsize=(10, 5))
pca = PCA().fit(X_ft)
cum_var = np.cumsum(pca.explained_variance_ratio_)
# Plot against 1..n so the x value really is the number of components kept
# (the implicit x axis of ax.plot starts at 0).
ax.plot(np.arange(1, cum_var.size + 1), cum_var)
ax.set_title('PCA cumulative explained variance')
ax.set_xlabel('number of components')
ax.set_ylabel('cumulative explained variance');

In [None]:
# Feature-extraction bin count and number of PCA components used by the SVM cells.
n_bins, pca_value = 18, 60

In [None]:
# Stratified 80/20 split of the development data. One call splits the raw frame,
# the precomputed feature matrix and the labels with the same indices, so the
# rows stay aligned by construction rather than by repeating the same seed in
# two separate calls. It also avoids assigning throwaway `_` names, which in
# IPython would shadow the last-output variable.
X_train, X_test, X_ft_train_val, X_ft_test, y_train, y_test = train_test_split(
    X_dev, X_ft, y_dev, test_size=0.2, random_state=rs, stratify=y_dev)

In [None]:
# Search space for the RBF-kernel SVM; ParameterGrid expands it into one dict
# per combination, which the following cells address by list index.
param_grid = dict(
    C=[1, 10, 100, 1000],
    gamma=['scale', 1, 0.1],
    kernel=['rbf'],
    class_weight=[None, 'balanced'],
)
pg = list(ParameterGrid(param_grid))

In [None]:
# Hold-out search: fit every SVM parameter set on one stratified 80/20 split of
# the training data (augmented on the training side only) and record the
# validation accuracy per grid index.
maxabs_s = MaxAbsScaler()
pca = PCA(n_components=pca_value)
accuracies1_svm = {}
for i, params in enumerate(pg):
    print(i)
    # The grid keys (C, kernel, gamma, class_weight) are passed straight through as SVC keywords.
    clf = svm.SVC(**params)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=rs)
    for j, (train_index, val_index) in enumerate(sss.split(X_train, y_train)):
        # Raw rows (needed for augmentation) and labels of this split.
        X, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

        # Matching rows of the precomputed feature matrix.
        X_ft_train = X_ft_train_val[train_index]
        X_ft_val = X_ft_train_val[val_index]

        # Augment the training part only, then append its features and labels.
        X_aug, y_aug = data_augmentation(X, y)
        X_aug_ft = ft_mean_std(X_aug['Audio File'], n_bins)
        X_ft_train = np.concatenate((X_ft_train, X_aug_ft))
        y = pd.concat([y, y_aug])

        # Scaler and PCA are fitted on training rows and only applied to validation rows.
        X_ft_norm = maxabs_s.fit_transform(X_ft_train)
        X_proj = pca.fit_transform(X_ft_norm)
        X_ft_val_norm = maxabs_s.transform(X_ft_val)
        X_val_proj = pca.transform(X_ft_val_norm)

        clf.fit(X_proj, y)
        y_pred = clf.predict(X_val_proj)
        accuracies1_svm[i] = accuracy_score(y_val, y_pred)
        print(accuracies1_svm[i])

In [None]:
# Rank the hold-out accuracies, best first, and show them next to their parameter sets.
accuracies1_svm = {
    k: accuracies1_svm[k]
    for k in sorted(accuracies1_svm, key=accuracies1_svm.get, reverse=True)
}

for i, (k, v) in enumerate(accuracies1_svm.items()):
    print(f"{i}) index_svm: {k}, parameters: {pg[k]}:", v, sep="\n")

In [None]:
# 5-fold stratified search: for every SVM parameter set, average the validation
# accuracy over the folds; augmentation, scaling and PCA are redone inside each
# fold using its training rows only.
cv = 5
maxabs_s = MaxAbsScaler()
pca = PCA(n_components=pca_value)
accuracies5_svm = {}
for i, params in enumerate(pg):
    print(i)
    # The grid keys (C, kernel, gamma, class_weight) are passed straight through as SVC keywords.
    clf = svm.SVC(**params)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=rs)
    accuracy = []
    for j, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
        # Raw rows (needed for augmentation) and labels of this fold.
        X, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

        # Matching rows of the precomputed feature matrix.
        X_ft_train = X_ft_train_val[train_index]
        X_ft_val = X_ft_train_val[val_index]

        # Augment the training fold only, then append its features and labels.
        X_aug, y_aug = data_augmentation(X, y)
        X_aug_ft = ft_mean_std(X_aug['Audio File'], n_bins)
        X_ft_train = np.concatenate((X_ft_train, X_aug_ft))
        y = np.concatenate((y, y_aug))

        # Fit the transforms on training rows; validation rows are only transformed.
        X_ft_norm = maxabs_s.fit_transform(X_ft_train)
        X_proj = pca.fit_transform(X_ft_norm)
        X_ft_val_norm = maxabs_s.transform(X_ft_val)
        X_val_proj = pca.transform(X_ft_val_norm)

        clf.fit(X_proj, y)
        y_pred = clf.predict(X_val_proj)
        accuracy.append(accuracy_score(y_val, y_pred))
    accuracies5_svm[i] = np.mean(accuracy)

In [None]:
# Rank the mean 5-fold accuracies, best first, and show them next to their parameter sets.
accuracies5_svm = {
    k: accuracies5_svm[k]
    for k in sorted(accuracies5_svm, key=accuracies5_svm.get, reverse=True)
}
for i, (k, v) in enumerate(accuracies5_svm.items()):
    print(f"{i}) index_svm: {k} parameters: {pg[k]}:", v, sep="\n")

##### Let's also evaluate the weighted F1-score

In [None]:
# Hold-out search again, this time storing [weighted F1, accuracy] per grid
# index, on one stratified 80/20 split of the training data.
maxabs_s = MaxAbsScaler()
pca = PCA(n_components=pca_value)
scores1_svm = {}
for i, params in enumerate(pg):
    print(i)
    # The grid keys (C, kernel, gamma, class_weight) are passed straight through as SVC keywords.
    clf = svm.SVC(**params)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=rs)
    for j, (train_index, val_index) in enumerate(sss.split(X_train, y_train)):
        # Raw rows (needed for augmentation) and labels of this split.
        X, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

        # Matching rows of the precomputed feature matrix.
        X_ft_train = X_ft_train_val[train_index]
        X_ft_val = X_ft_train_val[val_index]

        # Augment the training part only, then append its features and labels.
        X_aug, y_aug = data_augmentation(X, y)
        X_aug_ft = ft_mean_std(X_aug['Audio File'], n_bins)
        X_ft_train = np.concatenate((X_ft_train, X_aug_ft))
        y = np.concatenate((y, y_aug))

        # Fit the transforms on training rows; validation rows are only transformed.
        X_ft_norm = maxabs_s.fit_transform(X_ft_train)
        X_proj = pca.fit_transform(X_ft_norm)
        X_ft_val_norm = maxabs_s.transform(X_ft_val)
        X_val_proj = pca.transform(X_ft_val_norm)

        clf.fit(X_proj, y)
        y_pred = clf.predict(X_val_proj)

        f1score = f1_score(y_val, y_pred, average='weighted')
        acc = accuracy_score(y_val, y_pred)
        scores1_svm[i] = scores = [f1score, acc]

In [None]:
# Rank the hold-out results by weighted F1 (first element of each [f1, accuracy]
# pair), best first; the ranking is stored under a separate name.
fscore1_svm = {
    k: scores1_svm[k]
    for k in sorted(scores1_svm, key=lambda idx: scores1_svm[idx][0], reverse=True)
}

for i, (k, v) in enumerate(fscore1_svm.items()):
    print(f"{i}) index_svm: {k}, parameters: {pg[k]}:", v, sep="\n")

In [None]:
# 5-fold stratified search storing [mean weighted F1, mean accuracy] per grid
# index; augmentation, scaling and PCA are redone inside each fold using its
# training rows only.
cv = 5
maxabs_s = MaxAbsScaler()
pca = PCA(n_components=pca_value)
scores5_svm = {}
for i, params in enumerate(pg):
    print(i)
    # The grid keys (C, kernel, gamma, class_weight) are passed straight through as SVC keywords.
    clf = svm.SVC(**params)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=rs)
    f1score, accuracy = [], []
    for j, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
        # Raw rows (needed for augmentation) and labels of this fold.
        X, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

        # Matching rows of the precomputed feature matrix.
        X_ft_train = X_ft_train_val[train_index]
        X_ft_val = X_ft_train_val[val_index]

        # Augment the training fold only, then append its features and labels.
        X_aug, y_aug = data_augmentation(X, y)
        X_aug_ft = ft_mean_std(X_aug['Audio File'], n_bins)
        X_ft_train = np.concatenate((X_ft_train, X_aug_ft))
        y = np.concatenate((y, y_aug))

        # Fit the transforms on training rows; validation rows are only transformed.
        X_ft_norm = maxabs_s.fit_transform(X_ft_train)
        X_proj = pca.fit_transform(X_ft_norm)
        X_ft_val_norm = maxabs_s.transform(X_ft_val)
        X_val_proj = pca.transform(X_ft_val_norm)

        clf.fit(X_proj, y)
        y_pred = clf.predict(X_val_proj)
        f1score.append(f1_score(y_val, y_pred, average='weighted'))
        accuracy.append(accuracy_score(y_val, y_pred))
    scores5_svm[i] = [np.mean(f1score), np.mean(accuracy)]

In [None]:
# Rank the 5-fold results by mean weighted F1 (first element of each pair), best first.
scores5_svm = {
    k: scores5_svm[k]
    for k in sorted(scores5_svm, key=lambda idx: scores5_svm[idx][0], reverse=True)
}

for i, (k, v) in enumerate(scores5_svm.items()):
    print(f"{i}) index_svm: {k}, parameters: {pg[k]}:", v, sep="\n")

# Now we will use the whole train_val dataset as the training set and evaluate on the test set

In [None]:
# Short-list of SVM grid entries (indices into `pg`) that the next cell scores on the test split.
svm_pg = [pg[idx] for idx in (11, 8, 17, 23, 14, 20)]

In [None]:
# Hold-out check of the short-listed SVM configurations: each one is trained on
# the whole train/validation split plus its augmented copies and then scored on
# the test split with weighted F1 and accuracy.
pca = PCA(n_components=pca_value)
maxabs_s = MaxAbsScaler()
scores1_svm = {}
for i, params in enumerate(svm_pg):
    print(i)
    clf = svm.SVC(C=params['C'], kernel=params['kernel'],
                  gamma=params['gamma'], class_weight=params['class_weight'])

    X_aug, y_aug = data_augmentation(X_train, y_train)
    X_aug_ft = ft_mean_std(X_aug['Audio File'], n_bins)

    # 1-D labels for the original rows followed by the augmented rows, in the
    # same order as the feature rows below.
    y = np.concatenate([y_train['Predicted'], y_aug['Predicted']])

    # Rebuild the training matrix from the untouched train/validation features on
    # every pass. Appending to X_ft_train itself would start from the subset left
    # behind by the last cross-validation fold and grow on each iteration.
    X_ft_train = np.concatenate([X_ft_train_val, X_aug_ft])
    assert len(X_ft_train) == len(y), "feature rows and labels are misaligned"

    # Scaler and PCA are fitted on training rows only and merely applied to the test rows.
    X_ft_norm = maxabs_s.fit_transform(X_ft_train)
    X_ft_test_norm = maxabs_s.transform(X_ft_test)

    X_proj = pca.fit_transform(X_ft_norm)
    X_test_proj = pca.transform(X_ft_test_norm)
    # Fit on the combined labels: X_proj contains the augmented rows too, so
    # y_train alone has fewer entries than X_proj has rows.
    clf.fit(X_proj, y)
    y_pred = clf.predict(X_test_proj)

    f1score = f1_score(y_test, y_pred, average='weighted')
    acc = accuracy_score(y_test, y_pred)
    scores = [f1score, acc]
    scores1_svm[i] = scores

In [None]:
# Rank the test-split SVM results by weighted F1 (first element of each pair), best first.
scores1_svm = {
    k: scores1_svm[k]
    for k in sorted(scores1_svm, key=lambda idx: scores1_svm[idx][0], reverse=True)
}
for i, (k, v) in enumerate(scores1_svm.items()):
    print(f"{i}) index_svm: {k} parameters: {svm_pg[k]}:", v, sep="\n")

## Get results from Eval 

In [None]:
# eval_to_use = eval.loc[:, ['Audio File']]
# eval_to_use.head, eval_dum.head

In [None]:
#eval_ft = ft_mean_std(eval_to_use['Audio File'], n_bins)
#savetxt('savedData/eval' + str(n_bins) + '.csv', eval_ft, delimiter=',')
    
#data = loadtxt('data.csv', delimiter=',')

In [None]:
# savetxt('savedData/eval' + str(18) + '.csv', eval_ft, delimiter=',')

In [None]:
# SVM settings used for the final fit on the whole development set.
best_param = dict(
    C=10,
    class_weight='balanced',
    gamma=0.1,
    kernel='rbf',
)

In [None]:
# Read the evaluation-set feature matrix saved for the current n_bins, and echo
# n_bins so it is visible which file was loaded.
eval_ft = loadtxt(f'savedData/eval{n_bins}.csv', delimiter=',')
print(n_bins)

In [None]:
# Final SVM: fit on the whole development set plus its augmented copies, then
# predict the labels of the evaluation set.
pca = PCA(n_components=pca_value)
maxabs_s = MaxAbsScaler()

clf = svm.SVC(C=best_param['C'], kernel=best_param['kernel'],
              gamma=best_param['gamma'], class_weight=best_param['class_weight'])

X_aug, y_aug = data_augmentation(X_dev, y_dev)
X_aug_ft = ft_mean_std(X_aug['Audio File'], n_bins)

y = np.concatenate([y_dev['Predicted'], y_aug['Predicted']])

# Build the augmented matrix under its own name instead of overwriting X_ft:
# re-running this cell would otherwise append the augmented rows again and the
# features would no longer line up with the labels.
X_ft_dev_aug = np.concatenate([X_ft, X_aug_ft])
assert len(X_ft_dev_aug) == len(y), "X_ft must hold exactly one feature row per y_dev label"

# Scaler and PCA are fitted on the development data and only applied to the evaluation data.
X_ft_norm = maxabs_s.fit_transform(X_ft_dev_aug)
eval_ft_norm = maxabs_s.transform(eval_ft)

X_proj = pca.fit_transform(X_ft_norm)
eval_ft_proj = pca.transform(eval_ft_norm)
clf.fit(X_proj, y)

y_pred = clf.predict(eval_ft_proj)

In [None]:
# Write the SVM predictions as CSV: a header row, then one "<row number>,<label>" line per prediction.
with open('savedData/res_svm_aug_no_dummies_1.csv', 'w') as f:
    f.write("Id,Predicted\n")
    for i, label in enumerate(y_pred):
        f.write("%i,%s\n" % (i, label))