In [15]:
import librosa
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

### preprocess data

In [16]:
df = pd.read_csv("data/work.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,path,accent,label,folder
0,0,common_voice_en_19687170.mp3,us,3,2
1,1,common_voice_en_19687171.mp3,us,3,2
2,2,common_voice_en_19687172.mp3,us,3,2
3,3,common_voice_en_19687173.mp3,us,3,2
4,4,common_voice_en_19687174.mp3,us,3,2


In [17]:
print(f"null values:\n{df.isnull().sum()}\n")
print(f"unique accents:\n{df['accent'].unique()}\n")

# fill null values with the most frequent value
df["accent"] = df["accent"].fillna(df["accent"].mode()[0])

null values:
Unnamed: 0        0
path              0
accent        16603
label             0
folder            0
dtype: int64

unique accents:
['us' 'wales' 'australia' nan 'england' 'indian' 'african' 'canada'
 'scotland' 'ireland' 'philippines' 'malaysia' 'other' 'singapore'
 'newzealand' 'bermuda' 'hongkong' 'southatlandtic']



In [18]:
le = LabelEncoder()
df["accent"] = le.fit_transform(df["accent"])

print(f"unique accents:\n{(df['accent'].unique())}")

unique accents:
[15 16  1  4  6  0  3 12  7 11  8 10 13  9  2  5 14]


### Use balanced subset of data
***just for the sake of testing the pipeline***

In [19]:
def create_balanced_subset(df, samples_per_class=100):
    return df.groupby("label").sample(n=samples_per_class, replace=False)

In [20]:
working_df = create_balanced_subset(df)
working_df = working_df.reset_index(drop=True)
working_df.head(400)

Unnamed: 0.1,Unnamed: 0,path,accent,label,folder
0,41700,common_voice_en_18590846.mp3,15,0,62
1,199641,common_voice_en_18302802.mp3,15,0,70
2,21965,common_voice_en_18620151.mp3,15,0,73
3,63865,common_voice_en_19991164.mp3,15,0,66
4,175345,common_voice_en_17390974.mp3,15,0,66
...,...,...,...,...,...
395,81848,common_voice_en_17576850.mp3,2,3,24
396,140668,common_voice_en_660267.mp3,4,3,39
397,176573,common_voice_en_142095.mp3,4,3,50
398,23753,common_voice_en_19607705.mp3,1,3,8


### Extract features

In [21]:
def extract_features(audio):
    mfccs = librosa.feature.mfcc(y=audio, sr=22050, n_mfcc=13)
    return np.mean(mfccs.T, axis=0)

### Load audio files

In [23]:
base_dir = "data/work"
batch_size = 20
sample_rate = 22050

X = []
y = []

total_batches = len(working_df) // batch_size

for i in range(0, len(working_df), batch_size):
    batch = working_df.loc[i : i + batch_size]

    for _, row in batch.iterrows():
        folder = str(row["folder"])
        path = row["path"]
        file_path = os.path.join(base_dir, folder, path)

        try:
            audio, _ = librosa.load(file_path, sr=sample_rate)

            # extract features
            mfccs = extract_features(audio)
            X.append(np.concatenate([mfccs, [row["accent"]]]))
            y.append(row["label"])
            # ---------------

            del audio
        except Exception as e:
            print(f"skipping {file_path}: {e}")

    del batch
    print(f"processed batch: {i//batch_size + 1}/{total_batches}")


X = np.array(X)
y = np.array(y)

  audio, _ = librosa.load(file_path, sr=sample_rate)


skipping data/work/64/common_voice_en_678905.mp3: 
processed batch: 1/20
skipping data/work/63/common_voice_en_73485.mp3: 
processed batch: 2/20
skipping data/work/68/common_voice_en_91949.mp3: 
processed batch: 3/20
processed batch: 4/20
skipping data/work/69/common_voice_en_92077.mp3: 
processed batch: 5/20
skipping data/work/50/common_voice_en_93916.mp3: 
processed batch: 6/20
skipping data/work/50/common_voice_en_93916.mp3: 
processed batch: 7/20
processed batch: 8/20
skipping data/work/25/common_voice_en_686315.mp3: 
skipping data/work/50/common_voice_en_93770.mp3: 
processed batch: 9/20
processed batch: 10/20
skipping data/work/54/common_voice_en_668064.mp3: 
processed batch: 11/20
processed batch: 12/20
skipping data/work/54/common_voice_en_668378.mp3: 
skipping data/work/34/common_voice_en_678742.mp3: 
processed batch: 13/20
skipping data/work/34/common_voice_en_678742.mp3: 
skipping data/work/11/common_voice_en_89704.mp3: 
processed batch: 14/20
processed batch: 15/20
skipping

### Split data

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Train a classifier

In [28]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

### Show results

In [29]:
accuracy = accuracy_score(y_test, y_pred)
print(f"\nOverall Accuracy: {accuracy:.2f}")


Overall Accuracy: 0.53
