## <div align="center">Data Preprocessing</div>

In [9]:
import os
import numpy as np
import pandas as pd
from tables import *
import h5py
import matplotlib.pyplot as plt
import seaborn as sns

### INTRODUCTION: In this Jupyter Notebook I will load, preprocess, and save data for SVM and K-Means.

### Part 1. Start working with FILE data
### Part 2. Prepare functions
### Part 3. Run all the functions and prepare data for SVM and K-Means
### Part 4. Conclusion

## <div align="center">Part 1. Start working with FILE data</div>

#### Read CSV. Change .wav to .hdf5

In [10]:
# Path to the ESC-50 metadata CSV -- adjust to YOUR local layout
csv_path = os.path.join('ESC50_Home', 'ESC-50-HumanClassification.csv')

# Read the CSV file into a DataFrame
df = pd.read_csv(csv_path)

# Check whether every entry in the 'filename' column ends with '.wav'
result = df['filename'].str.endswith('.wav').all()

# Print the result
print(result)

# Replace '.wav' with '.hdf5' in the 'filename' column.
# regex=False forces a literal substring match: when the pattern is treated as
# a regular expression, the '.' in '.wav' matches ANY character, and pandas
# versions that default to regex=True warn about this call.
df['filename'] = df['filename'].str.replace('.wav', '.hdf5', regex=False)

True


  df['filename'] = df['filename'].str.replace('.wav', '.hdf5')


#### Read categories from csv file

In [11]:
# Collect the distinct category names and how many of them there are
unique_categories = df['category'].unique()
count_of_unique_categories = df['category'].nunique()

# Show the names, the count, a blank separator line and a completion marker
for shown in (unique_categories, count_of_unique_categories, "", "Completed"):
    print(shown)

['dog' 'chirping_birds' 'vacuum_cleaner' 'thunderstorm' 'door_wood_knock'
 'can_opening' 'crow' 'clapping' 'fireworks' 'chainsaw' 'airplane'
 'mouse_click' 'pouring_water' 'train' 'sheep' 'water_drops'
 'church_bells' 'clock_alarm' 'keyboard_typing' 'wind' 'footsteps' 'frog'
 'cow' 'brushing_teeth' 'car_horn' 'crackling_fire' 'helicopter'
 'drinking_sipping' 'rain' 'insects' 'laughing' 'hen' 'engine' 'breathing'
 'crying_baby' 'hand_saw' 'coughing' 'glass_breaking' 'snoring'
 'toilet_flush' 'pig' 'washing_machine' 'clock_tick' 'sneezing' 'rooster'
 'sea_waves' 'siren' 'cat' 'door_wood_creaks' 'crickets']
50

Completed


In [12]:
# list of categories taken from a blackboard
categories = [
    'Breathing',
    'Coughing',
    'Footsteps',
    'Laughing',
    'Sneezing',
    'Snoring',
    'Toilet_flush',
    'Vacuum_cleaner',
    'Washing_machine',
]

# Lower-case the names; the category values printed from the CSV above are
# all lowercase, so the lookups below need the same spelling.
categories = [category.lower() for category in categories]

# print the modified list
print(categories)
print("")
print("Completed")

['breathing', 'coughing', 'footsteps', 'laughing', 'sneezing', 'snoring', 'toilet_flush', 'vacuum_cleaner', 'washing_machine']

Completed


#### Make a dictionary with keys=categories and values=category_element

In [13]:
# Build a dict: category name -> list of file names listed for it in the CSV
filenames_by_category = {
    category: df.loc[df['category'].isin([category]), 'filename'].values.tolist()
    for category in categories
}

# Total number of files over all selected categories (sanity check)
number_of_files = sum(len(filenames_by_category[category]) for category in categories)

print("The number of files %f" %number_of_files)
print("Completed")

The number of files 360.000000
Completed


#### Make a dictionary with keys=categories and values=path_to_element

In [14]:
# Build a dict: category name -> list of paths to that category's HDF5 files
hdf5_paths_per_category = {}

# Running totals, printed at the end to verify that nothing was skipped
counter_files = 0
counter_categories = 0

# Path component placed between 'ESC50_Home' and each file name
filename = "ptne"

for category in categories:
    # One fresh list per category
    # CHANGE TO YOUR PATH
    category_paths = [
        os.path.join('ESC50_Home', filename, hdf5_name)
        for hdf5_name in filenames_by_category[category]
    ]
    hdf5_paths_per_category[category] = category_paths

    counter_categories += 1
    counter_files += len(category_paths)


# Spot-check one category
print(type(hdf5_paths_per_category["snoring"]))
print(len(hdf5_paths_per_category["snoring"]))

print("The number of categories %f" %counter_categories)
print("The number of files %f" %counter_files)
print("Completed")

<class 'list'>
40
The number of categories 9.000000
The number of files 360.000000
Completed


In [15]:
# Open the first "snoring" file with h5py only to discover its top-level keys.
# The context manager guarantees the file handle is closed again afterwards.
with h5py.File(hdf5_paths_per_category["snoring"][0], 'r') as tract_file:
    # Print out the raw keys view
    print("Tract Labels: %s" % tract_file.keys())

    # Save the keys as a plain list while the file is still open
    # (the saved run of this notebook lists four keys)
    TRACT_Labels = list(tract_file.keys())

# Keys are listed out
print("Tract Labels: %s" % TRACT_Labels)
print("Check one Label: %s" % TRACT_Labels[1])



Tract Labels: <KeysViewHDF5 ['energy', 'noise', 'pulse', 'tone']>
Tract Labels: ['energy', 'noise', 'pulse', 'tone']
Check one Label: noise


In [16]:
# Bind the first three key names to individual variables, then echo them
tr_label_1, tr_label_2, tr_label_3 = (TRACT_Labels[k] for k in range(3))

for tr_label in (tr_label_1, tr_label_2, tr_label_3):
    print(tr_label)

energy
noise
pulse


## <div align="center">Part 2. Prepare functions</div>

In [17]:
def create_arr_all_files_of_category(category_name, labels):
    """Load one dataset from every HDF5 file of a category.

    Parameters
    ----------
    category_name : str
        Key into the module-level ``hdf5_paths_per_category`` dict.
    labels : str
        Name of the dataset to read from each file (used as ``f[labels]``).

    Returns
    -------
    list
        One entry per file, in the order of the stored paths. Each entry is
        the complete content of the dataset (``f[labels][()]``); nothing is
        trimmed or filtered here.
    """
    list_of_data = list()  # all arrays for this one category

    for hdf5_path in hdf5_paths_per_category[category_name]:
        # Each file is opened read-only and closed again as soon as it is read
        with h5py.File(hdf5_path, 'r') as f:
            list_of_data.append(f[labels][()])

    return list_of_data

In [18]:
def split_arrat_func(file_2D_data, sep_number):
    """Cut the data into ``sep_number`` equally long pieces along its first axis.

    Trailing rows that do not fit into an even split are dropped before
    splitting. Returns the list of sub-arrays produced by ``np.split``.
    """
    # Largest multiple of sep_number that does not exceed the data length
    usable_rows = (len(file_2D_data) // sep_number) * sep_number

    # Discard the remainder so that np.split can divide evenly
    trimmed = np.array(file_2D_data[:usable_rows])

    return np.split(trimmed, sep_number)
    

Here I extract the features.
I use bins=4 for the histogram, which gives 4 features per sub-array.

Based on the histograms observed in file 1.0, I decided not to keep values above 100.
This way I can keep temporal alignment.

In [19]:
def get_feature(splited_array, max_value=100, bins=4):
    """Turn a list of sub-arrays into one flat vector of histogram counts.

    Each sub-array is flattened, values greater than ``max_value`` are
    discarded, and the remaining values are binned with ``np.histogram``.
    The per-sub-array counts are concatenated in order.

    Parameters
    ----------
    splited_array : iterable of array-like
        The sub-arrays of one file (e.g. the output of ``split_arrat_func``).
    max_value : number, optional
        Upper cut-off; only values ``<= max_value`` are kept. Default 100.
    bins : int, optional
        Number of histogram bins per sub-array. Default 4.

    Returns
    -------
    np.ndarray
        1-D float array of length ``len(splited_array) * bins``
        (empty when ``splited_array`` is empty).

    NOTE(review): no ``range`` is passed to ``np.histogram``, so the bin edges
    are derived from each sub-array's own values and may differ between
    sub-arrays and files -- confirm this is intended.
    """
    # Seeding with an empty float array keeps the float result dtype and makes
    # the final concatenate valid even when there are no sub-arrays at all.
    per_chunk_counts = [np.array([])]

    for chunk in splited_array:
        # The exact 2-D shape is irrelevant because the data is flattened
        flat_values = np.asarray(chunk).flatten()

        # Drop everything above the cut-off before binning
        flat_values = flat_values[flat_values <= max_value]

        counts, _ = np.histogram(flat_values, bins=bins)
        per_chunk_counts.append(counts)

    # Single concatenation instead of growing the array inside the loop
    return np.concatenate(per_chunk_counts)

## The core function, where we link all the functions above and build the labels and features.

In [20]:
def prepare_data_for_SVM(array_with_catigories):
    """Build the feature matrix and label vector for the given categories.

    For every dataset name in the module-level ``TRACT_Labels`` and every
    category, each file is loaded with ``create_arr_all_files_of_category``,
    passed through ``split_arrat_func(..., 1)`` and ``get_feature``, and the
    resulting feature vectors are stacked. Categories are numbered 0, 1, 2, ...
    in the order they appear in ``array_with_catigories``.

    Parameters
    ----------
    array_with_catigories : sequence of str
        Category names (keys of ``hdf5_paths_per_category``).

    Returns
    -------
    final_features_done : np.ndarray
        2-D array, shape ``(n_datasets * n_files, n_features)``.
    final_label_done : np.ndarray
        1-D integer array with one class index per row of
        ``final_features_done``.
    final_features : np.ndarray
        The same features before flattening, shape
        ``(n_datasets, n_files, n_features)``.

    Raises
    ------
    ValueError
        If ``array_with_catigories`` is empty.

    NOTE(review): the feature sets of the different datasets are stacked as
    extra ROWS, not extra columns, so every file appears once per dataset with
    the same class label -- confirm this is the intended layout.
    """
    if len(array_with_catigories) == 0:
        raise ValueError("array_with_catigories must contain at least one category")

    final_label = list()
    final_features = list()

    # Iterate over the dataset names found in the HDF5 files
    for label in TRACT_Labels:

        all_categories_features = list()  # one 2-D feature block per category
        label_array = list()              # one 1-D label block per category

        for num_label, category in enumerate(array_with_catigories):
            print("")
            print("Category : ", category)

            # All files of this category for the current dataset
            arr_all_files_one_category = create_arr_all_files_of_category(category, label)

            # One feature vector per file; the file is "split" into a single
            # piece, so get_feature sees the whole file at once.
            one_category_features = [
                get_feature(split_arrat_func(file_data, 1))
                for file_data in arr_all_files_one_category
            ]
            all_categories_features.append(np.array(one_category_features))

            # Every file of this category gets the same integer class index
            label_array.append(
                np.zeros(len(arr_all_files_one_category), dtype=int) + num_label
            )

        print("Saving...")

        # Stack the per-category blocks into 2-D features and 1-D labels.
        # Concatenating (rather than reshaping a 3-D array) does not depend on
        # variables left over from the last loop iteration and also works when
        # categories contain different numbers of files.
        final_features.append(np.concatenate(all_categories_features, axis=0))
        final_label.append(np.concatenate(label_array))

    final_features = np.array(final_features)
    final_label = np.array(final_label)

    # Prepare the dimensions for flattening the per-dataset axis into rows
    num_rows_2 = int(len(final_features) * len(final_features[0]))
    num_of_col_2 = int(len(final_features[0][0]))

    print(final_features.shape)
    print(type(num_rows_2))
    print(num_rows_2)
    print("")

    # Reshaping: merge the dataset axis and the file axis
    final_features_done = final_features.reshape(num_rows_2, num_of_col_2)
    final_label_done = final_label.reshape(num_rows_2)

    print("")
    print("Completed prepare_data_for_SVM")

    return final_features_done, final_label_done, final_features

## <div align="center">Part 3. Run all the functions and prepare data for SVM  and K-Means</div>

In [21]:
# test_category = ['breathing', 'footsteps']
# features_array,label_array, test_hist = prepare_data_for_SVM(test_category) # "QUICKTEST"

In [22]:
# Full run over all selected categories: 2-D features, 1-D labels, and the
# un-flattened per-dataset features.
features_array, label_array, test_hist = prepare_data_for_SVM(categories)  # "FULLTEST"


Category :  breathing

Category :  coughing

Category :  footsteps

Category :  laughing

Category :  sneezing

Category :  snoring

Category :  toilet_flush

Category :  vacuum_cleaner

Category :  washing_machine
Saving...

Category :  breathing

Category :  coughing

Category :  footsteps

Category :  laughing

Category :  sneezing

Category :  snoring

Category :  toilet_flush

Category :  vacuum_cleaner

Category :  washing_machine
Saving...

Category :  breathing

Category :  coughing

Category :  footsteps

Category :  laughing

Category :  sneezing

Category :  snoring

Category :  toilet_flush

Category :  vacuum_cleaner

Category :  washing_machine
Saving...

Category :  breathing

Category :  coughing

Category :  footsteps

Category :  laughing

Category :  sneezing

Category :  snoring

Category :  toilet_flush

Category :  vacuum_cleaner

Category :  washing_machine
Saving...
(4, 360, 4)
<class 'int'>
1440


Completed prepare_data_for_SVM


Save the data for easy access in the future

In [23]:
# Persist the prepared features and labels as .npy files
for out_name, out_array in (
    ('features_array_ptne_1.0.npy', features_array),
    ('label_array_ptne_1.0.npy', label_array),
):
    np.save(out_name, out_array)

In [24]:
# Show type and shape of both arrays to confirm they are ready for the SVM
for prepared in (features_array, label_array):
    print(type(prepared))
    print(prepared.shape)

print("Cool")

<class 'numpy.ndarray'>
(1440, 4)
<class 'numpy.ndarray'>
(1440,)
Cool


## <div align="center">Part 4. Conclusion</div>

1. Temporal alignment is applied 
2. Multiple feature sets are used 
3. Data is prepared for the SVM and K-Means