In [7]:
# Import base libraries
import os;
import pandas as pd; 
import numpy as np;
import librosa;
import IPython.display as ipd
import matplotlib.pyplot as plt
%matplotlib inline

# Models
import sklearn.model_selection # train_test_split
import sklearn.discriminant_analysis # LinearDiscriminantAnalysis
import sklearn.naive_bayes  # GaussianNB


# Random seed.
# NOTE(review): no cell visible in this notebook references `seed` - confirm it is still needed.
seed = 42

In [8]:
# SETUP

In [9]:
## Ingestion Functions
def _process_wave_file(wave_file, labels_list, filename_list, duration_list, sr_list, data_list, interval_time = 2):
    '''Split one wave file into fixed-length sections and append them to the lists.

    Each section of ``interval_time`` seconds contributes exactly one entry to
    every one of the five lists, so the lists stay aligned row by row.

    Args:
        wave_file: path of the audio file to read.
        labels_list: receives one label per section. The labels are the first
            four characters of the file name (without extension), in order;
            sections beyond the fourth are labelled "?".
        filename_list: receives the file name (without extension) per section.
        duration_list: receives the section length in seconds.
        sr_list: receives the sampling rate reported by librosa.
        data_list: receives the section samples as a ``pd.Series``.
        interval_time: section length in seconds.

    Returns:
        None. The five lists are modified in place.

    Raises:
        ValueError: if ``interval_time`` does not yield at least one frame
            per section.
    '''
    # `sr` has to be passed by keyword; sr=None asks librosa to keep the
    # file's own sampling rate instead of resampling.
    audio_data, sampling_rate = librosa.load(wave_file, sr=None)
    original_filename = os.path.splitext(os.path.basename(wave_file))[0]

    # range() needs an integer step, so a fractional interval_time is truncated
    # to a whole number of frames.
    frames_per_audio = int(sampling_rate * interval_time)
    if frames_per_audio <= 0:
        raise ValueError("interval_time must yield at least one frame per section")

    # Separate the wave file in interval_time sections.
    # assumes audio_data is 1-D (one value per frame) - TODO confirm for multi-channel files
    rows_processed = 0
    for ini in range(0, audio_data.shape[0], frames_per_audio):
        this_audio = pd.Series(audio_data[ini:(ini + frames_per_audio)])
        filename_list.append(original_filename)
        duration_list.append(this_audio.shape[0] / sampling_rate)
        sr_list.append(sampling_rate)
        data_list.append(this_audio)
        rows_processed += 1

    # One label per section: each file name starts with its 4 letters.
    labels = list(original_filename)[0:4]
    # A recording shorter than four sections must not contribute more labels
    # than rows, otherwise labels_list gets out of step with the other lists.
    labels = labels[:rows_processed]
    # If we processed more sections than the file name predicts, label them "?".
    labels.extend(["?"] * (rows_processed - len(labels)))

    # Update the labels list.
    labels_list.extend(labels)

def _load_wavs_from_dir(directory, verbose=False):
    '''Read every ".wav" file in ``directory`` into a single DataFrame.

    Each row is one section produced by ``_process_wave_file``. The first four
    columns are 'label', 'duracao' (section duration), 'sr' (sampling rate) and
    'original_file'; the remaining columns are the ones created by
    ``pd.DataFrame(data_list)`` from the per-section sample Series.

    Args:
        directory: folder to scan (not recursive).
        verbose: when True, print progress messages.

    Returns:
        The assembled ``pd.DataFrame``.
    '''
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the resulting DataFrame reproducible between runs and machines.
    wave_files = sorted(
        os.path.join(directory, f)
        for f in os.listdir(directory)
        if f.endswith(".wav") and os.path.isfile(os.path.join(directory, f))
    )

    # Lists that will store the data, one entry per audio section
    labels_list = []
    filename_list = []
    duration_list = []
    sr_list = []
    data_list = []

    to_be_processed = len(wave_files)  # For verbose output
    for processed, file in enumerate(wave_files, start=1):
        if verbose:
            print(f"{file}: Processing {processed} of {to_be_processed}.")
        _process_wave_file(file, labels_list, filename_list, duration_list, sr_list, data_list)

    # After processing all the files, create the DataFrame
    if verbose:
        print("Creating DataFrame")
    df = pd.DataFrame(data_list)

    # Metadata columns go in front of the sample columns, in this fixed order.
    if verbose:
        print("Inserting Labels...")
    df.insert(loc=0, column='label', value=labels_list)
    if verbose:
        print("Inserting Duração...")
    df.insert(loc=1, column='duracao', value=duration_list)
    if verbose:
        print("Inserting Sampling Rates(sr)...")
    df.insert(loc=2, column='sr', value=sr_list)
    if verbose:
        print("Inserting Original Filename...")
    df.insert(loc=3, column="original_file", value=filename_list)
    if verbose:
        print("DataFrame Created. Returning")
    return df

In [10]:
## Pipeline Functions 
def load_data(data_directory, output_pickle_file = None, reuse_if_exists=True, verbose=False):
    '''Load the wave files of a directory, optionally cached in a pickle file.

    Args:
        data_directory: folder containing the ".wav" files.
        output_pickle_file: optional path ending in ".pickle". When given, the
            DataFrame is read from it if it exists (and ``reuse_if_exists`` is
            True); otherwise the directory is processed and the result is
            written to this path.
        reuse_if_exists: set to False to ignore an existing pickle and rebuild it.
        verbose: forwarded to ``_load_wavs_from_dir`` to print progress.

    Returns:
        The ``pd.DataFrame`` with one row per audio section.

    Raises:
        ValueError: if ``output_pickle_file`` does not end with ".pickle".
    '''
    if not output_pickle_file:
        # No cache requested: always read the wave files.
        return _load_wavs_from_dir(data_directory, verbose=verbose)

    output_extension = os.path.splitext(output_pickle_file)[1]
    if output_extension != ".pickle":
        # Raising a plain string is itself a TypeError; use a real exception.
        raise ValueError("Output must be a file ended with .pickle")

    if reuse_if_exists and os.path.isfile(output_pickle_file):
        # The user wants to reuse the existing pickle file and it exists.
        # NOTE: unpickling can execute arbitrary code - only reuse pickle
        # files that were produced by this notebook.
        return pd.read_pickle(output_pickle_file)

    # The user does not want to reuse the file, or it does not exist yet.
    df = _load_wavs_from_dir(data_directory, verbose=verbose)
    df.to_pickle(output_pickle_file)
    return df

def preprocess_data(df):
    '''Filter and impute the data.

    Rows whose label is "?" are dropped and every missing value in the
    remaining rows is replaced by 0. A new DataFrame is returned; ``df`` is
    left as it was.
    '''
    has_known_label = df["label"] != "?"  # "?" marks sections with no known letter
    labelled_rows = df.loc[has_known_label]
    return labelled_rows.fillna(0)

def extract_features(df):
    '''Return the feature columns of ``df``.

    The features are every column from the one named ``0`` (integer) up to the
    last column, in their existing order; the columns before it are left out.
    '''
    first_feature_position = df.columns.get_loc(0)
    feature_columns = slice(first_feature_position, None)
    return df.iloc[:, feature_columns]

def extract_labels(df):
    '''Return the "label" column of ``df`` encoded with a LabelEncoder.

    A new encoder is fitted on every call, so the codes depend on the set of
    labels present in ``df``.
    NOTE(review): codes from two different frames are presumably only
    comparable when both frames contain the same set of labels - confirm
    before comparing train and validation encodings.

    Args:
        df: DataFrame with a "label" column.

    Returns:
        The array produced by ``LabelEncoder.fit_transform``, one code per row.
    '''
    # Import the submodule explicitly instead of relying on some other sklearn
    # import having loaded it as a side effect.
    import sklearn.preprocessing

    label_encoder = sklearn.preprocessing.LabelEncoder()
    return label_encoder.fit_transform(df.loc[:, "label"])

def score_classifier(df, y_pred):
    '''Print a report comparing predictions with the labels stored in ``df``.

    ``df`` is passed through ``preprocess_data`` and ``extract_labels`` to
    obtain the real labels, so ``y_pred`` must hold one prediction per row that
    survives preprocessing, in the same row order.

    The report contains the confusion matrix, the number of correctly predicted
    letters, how many words (``original_file`` values) had 4, 3, 2, 1 or 0
    letters right, and the hit counts per word and per letter.

    Args:
        df: DataFrame with at least the "label" and "original_file" columns.
        y_pred: encoded predictions, one per labelled row of ``df``.

    Returns:
        None. Everything is printed.

    Raises:
        ValueError: if ``y_pred`` and the labelled rows differ in length.
    '''
    # Import the submodule explicitly instead of relying on some other sklearn
    # import having loaded it as a side effect.
    import sklearn.metrics

    prepared_data = preprocess_data(df)
    y_real = extract_labels(prepared_data)
    y_pred = np.asarray(y_pred)

    # Comparing arrays of different length does not give an element-wise
    # result, so fail early with a clear message.
    if len(y_pred) != len(y_real):
        raise ValueError(
            f"y_pred has {len(y_pred)} entries but df has {len(y_real)} labelled rows")

    print("Confusion Matrix:")
    print(sklearn.metrics.confusion_matrix(y_true=y_real, y_pred=y_pred))

    print("\n Other Observations:")

    trues = y_real == y_pred
    hits = int(trues.sum())
    total = len(y_pred)
    hit_rate = 100*hits/total if total else 0.0  # avoid dividing by zero on empty input
    print(f"It got right: {hits} from {total} letters: {hit_rate :.2f}%")

    # Number of correctly predicted letters per word; words without any hit
    # do not appear in word_hits.
    word_hits = prepared_data[trues].original_file.value_counts()
    unique_words = len(df.original_file.unique())
    print(f"It received {unique_words} words. ")

    per_count_lines = [
        f"It got right {letters} letters of: {int((word_hits == letters).sum())} words.\n"
        for letters in (4, 3, 2, 1)
    ]
    print("".join(per_count_lines) +
          f"It got right 0 letters of: {unique_words - len(word_hits)} words.\n")

    print("Those are the words and hit count:")
    print(word_hits)
    print("Those are the letters:")
    print(prepared_data[trues].label.value_counts())
    


In [11]:
## Main functions
def process_data(training_data, validation_data, algorithm):
    '''Fit ``algorithm`` on the training data and predict both data sets.

    Both frames are preprocessed, the model is fitted on the training features
    and encoded training labels, and a score report for the validation
    predictions is printed through ``score_classifier``.

    NOTE(review): ``extract_labels`` fits a fresh encoder on each frame, so the
    validation report presumably relies on both frames containing the same set
    of labels - confirm.

    Args:
        training_data: DataFrame used to fit the model.
        validation_data: DataFrame used to evaluate the model.
        algorithm: estimator exposing ``fit(x, y)`` and ``predict(x)``; it is
            fitted in place.

    Returns:
        Tuple ``(predict_train, predict_test)`` with the predictions for the
        preprocessed training and validation rows.
    '''
    # Preprocess - filter and impute
    train_data = preprocess_data(training_data)
    test_data  = preprocess_data(validation_data)

    # Extracting information. The validation labels are not needed here:
    # score_classifier derives them again from validation_data.
    x_train = extract_features(train_data)
    y_train = extract_labels(train_data)
    x_test = extract_features(test_data)

    # Fit model
    algorithm.fit(x_train, y_train)

    # Predict
    predict_train = algorithm.predict(x_train)
    predict_test  = algorithm.predict(x_test)

    score_classifier(validation_data, predict_test)

    return (predict_train, predict_test)
    
def process_folder(training_folder, validation_folder, algorithm):
    '''Load the wave files of both folders and run ``process_data`` on them.

    Args:
        training_folder: directory with the training ".wav" files.
        validation_folder: directory with the validation ".wav" files.
        algorithm: estimator handed to ``process_data``.

    Returns:
        The ``(predict_train, predict_test)`` tuple returned by ``process_data``.
    '''
    # Read the data. Without a pickle path load_data simply delegates to
    # _load_wavs_from_dir, so the loader is called directly to get its
    # verbose progress output.
    training_data = _load_wavs_from_dir(training_folder, verbose=True)
    validation_data = _load_wavs_from_dir(validation_folder, verbose=True)

    return process_data(training_data, validation_data, algorithm)
    

In [12]:
# RUNNING THE MODEL
## Inputs
### In-paths
# Paths are built with os.path.join so they also resolve on systems where
# "\\" is not the path separator.
data_dir = os.path.join(".", "dados")
train_path = os.path.join(data_dir, "TREINAMENTO")
test_path  = os.path.join(data_dir, "VALIDACAO")

### Out-paths -> Will be used to avoid having to reload all data
train_pickle = os.path.join(data_dir, "treina_1752.pickle")
test_pickle = os.path.join(data_dir, "valida_1752.pickle")

## Algorithms
# Both estimators are constructed without arguments, i.e. with the library defaults.
lda = sklearn.discriminant_analysis.LinearDiscriminantAnalysis()
nb = sklearn.naive_bayes.GaussianNB()


In [13]:
%%time
## Loading
# Each call reads its pickle file when it already exists; otherwise it
# processes the directory and writes the pickle.
training_data = load_data(data_directory=train_path, output_pickle_file=train_pickle)
validation_data = load_data(data_directory=test_path, output_pickle_file=test_pickle)

Wall time: 15.7 s


In [17]:
%%time 
## LDA: fit on the training data, predict both sets and print the validation score
train_predict_lda, test_predict_lda = process_data(
    training_data=training_data,
    validation_data=validation_data,
    algorithm=lda,
)



Confusion Matrix:
[[11  8  5  9 13 13  0 11 13 17]
 [ 9 16  9  6 19  8  5 15  9 18]
 [ 9  7  3  8 22 20  5  9  5  9]
 [ 9  4  5  8 19 18  1 17 12 13]
 [14  5  8 15 21 10  3  8 11 12]
 [11  2  7 12 21 17  1 12 13 14]
 [13  8  9 12 14 22  7 15  7 15]
 [ 9  7 10 12 15 15  2  7 12 15]
 [ 9  6  5 15 19 16  6 15 10 12]
 [ 7  3 10  6 14 14  2 12 10 17]]

 Other Observations:
It got right: 117 from 1068 letters: 10.96%
It received 267 words. 
It got right 4 letters of: 0 words.
It got right 3 letters of: 1 words.
It got right 2 letters of: 16 words.
It got right 1 letters of: 82 words.
It got right 0 letters of: 168 words.

Those are the words and hit count:
c7dc    3
bxx6    2
xmm7    2
6dbb    2
bn66    2
       ..
7adc    1
mxxx    1
7n66    1
hbam    1
hbd6    1
Name: original_file, Length: 99, dtype: int64
Those are the letters:
c    21
x    17
d    17
7    16
6    11
n    10
b     8
h     7
m     7
a     3
Name: label, dtype: int64
Wall time: 4min 14s


In [19]:
%%time
## LDA
### Score report for the predictions made on the training data
score_classifier(df=training_data, y_pred=train_predict_lda)

Confusion Matrix:
[[148   0   1   0   1   0   0   0   0   0]
 [  1 141   0   0   1   0   1   0   0   0]
 [  0   0 131   0   4   0   1   0   0   0]
 [  0   0   1 155   0   0   0   0   0   0]
 [  0   0   2   0 136   0   1   0   0   0]
 [  0   0   1   0   0 139   0   0   0   0]
 [  0   0   1   0   4   0 132   0   0   0]
 [  0   0   0   0   1   0   0 147   0   0]
 [  0   0   0   0   1   0   0   0 135   0]
 [  1   0   0   0   0   0   0   0   0 137]]

 Other Observations:
It got right: 1401 from 1424 letters: 98.38%
It received 356 words. 
It got right 4 letters of: 333 words.
It got right 3 letters of: 23 words.
It got right 2 letters of: 0 words.
It got right 1 letters of: 0 words.
It got right 0 letters of: 0 words.

Those are the words and hit count:
mdn6    4
cdma    4
d776    4
c6dn    4
ahbn    4
       ..
hnxb    3
hann    3
hhcn    3
hhha    3
abhm    3
Name: original_file, Length: 356, dtype: int64
Those are the letters:
b    155
6    148
m    147
7    141
d    139
x    137
c    13

In [14]:
%%time 
## NB
### Fit on the training data, predict both sets and print the validation score
train_predict_nb, test_predict_nb = process_data(
    training_data=training_data,
    validation_data=validation_data,
    algorithm=nb,
)

Confusion Matrix:
[[ 6  6  2 48  2  3  0  6 16 11]
 [15  7  0 51  1  2  0  7 12 19]
 [11  5  0 47  1  5  1  8  7 12]
 [ 5  4  2 61  3  6  0  3 13  9]
 [11  8  0 59  4  2  1  2  9 11]
 [ 8  8  1 58  3  4  0  6 14  8]
 [12  3  0 55  2  5  2 11 16 16]
 [13  6  4 56  3  0  0  4  5 13]
 [10  4  2 72  0  1  0  4  9 11]
 [ 7  4  0 54  1  4  0  5  6 14]]

 Other Observations:
It got right: 111 from 1068 letters: 10.39%
It received 267 words. 
It got right 4 letters of: 0 words.
It got right 3 letters of: 1 words.
It got right 2 letters of: 11 words.
It got right 1 letters of: 86 words.
It got right 0 letters of: 169 words.

Those are the words and hit count:
dbbb    3
bxx6    2
bnbx    2
mxxx    2
7bnm    2
       ..
7bah    1
xdbc    1
bn66    1
b76b    1
xbha    1
Name: original_file, Length: 98, dtype: int64
Those are the letters:
b    61
x    14
n     9
7     7
6     6
d     4
c     4
m     4
h     2
Name: label, dtype: int64
Wall time: 1min 1s


In [16]:
%%time
### NB: score report for the predictions made on the training data
score_classifier(df=training_data, y_pred=train_predict_nb)

Confusion Matrix:
[[35  3  6 73  1  2  0  9  6 15]
 [14 24  1 73  1  2  2  4  8 15]
 [12  6 15 79  0  2  0  4  9  9]
 [14 10  2 97  2  0  0  2 10 19]
 [ 7 11  1 72 15  2  0  7  8 16]
 [13  8  1 74  1 17  0  2 13 11]
 [13  2  1 76  3  1 13 10  6 12]
 [13  5  3 72  0  2  0 25  9 19]
 [12  8  0 73  2  2  0  2 21 16]
 [14  6  2 65  0  2  0  5  8 36]]

 Other Observations:
It got right: 298 from 1424 letters: 20.93%
It received 356 words. 
It got right 4 letters of: 0 words.
It got right 3 letters of: 13 words.
It got right 2 letters of: 59 words.
It got right 1 letters of: 141 words.
It got right 0 letters of: 143 words.

Those are the words and hit count:
6ndn    3
bmdb    3
7bbb    3
abx7    3
mnbc    3
       ..
nam6    1
7h6h    1
bx66    1
6h7d    1
7dmb    1
Name: original_file, Length: 213, dtype: int64
Those are the letters:
b    97
x    36
6    35
m    25
7    24
n    21
d    17
a    15
c    15
h    13
Name: label, dtype: int64
Wall time: 2.4 s
