# Real-time audio preprocessing notebook

* Start by importing necessary preprocessing libraries
* visualize how much of each type of data we have 
* split the data into train test and validation sets 
* set aside a certain amount of the 2 classes of data in proportion to how large our splits are 
* save the dataframes so that they can be loaded into our training notebook later down the line

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa 

In [14]:
from enum import Enum

# import the differnt types of data
class DataClass(Enum):
    SAFE = "safe"
    DANGEROUS = "dangerous"

dangerous = pd.DataFrame({
    'name': pd.Series(dtype='str'),
    'length (minutes)': pd.Series(dtype='int'),
    'path': pd.Series(dtype='str'),
    'class': pd.Series(dtype='category')
})

safe = pd.DataFrame({
    'name': pd.Series(dtype='str'),
    'length (minutes)': pd.Series(dtype='int'),
    'path': pd.Series(dtype='str'),
    'class': pd.Series(dtype='category')
})

In [15]:
import os

# get the data from the files
def load_dangerous_data():
    global dangerous
    for file in os.listdir(r"C:\Users\2005e\OneDrive\Documents\GitHub\TTC-Listen-2\models\data\dangerous"):
        if file.endswith(".wav"):
            y, sr = librosa.load(f"data/dangerous/{file}", sr=44100)
            duration = librosa.get_duration(y=y, sr=sr)
            dangerous = pd.concat([dangerous, pd.DataFrame({
                'name': [file],
                'length (minutes)': [duration / 60],
                'path': [f"data/dangerous/{file}"],
                'class': [DataClass.DANGEROUS]
            })], ignore_index=True)

def load_safe_data():
    global safe
    for file in os.listdir(r"C:\Users\2005e\OneDrive\Documents\GitHub\TTC-Listen-2\models\data\safe"):
        if file.endswith(".wav"):
            y, sr = librosa.load(f"data/safe/{file}")
            duration = librosa.get_duration(y=y, sr=44100)
            safe = pd.concat([safe, pd.DataFrame({
                'name': [file],
                'length (minutes)': [duration / 60],
                'path': [f"data/safe/{file}"],
                'class': [DataClass.SAFE]
            })], ignore_index=True)

load_dangerous_data()
load_safe_data()

In [4]:
dangerous.head()

Unnamed: 0,name,length (minutes),path,class
0,1 Hitter Quitter.wav,2.052451,data/dangerous/1 Hitter Quitter.wav,DataClass.DANGEROUS
1,14 year old boy wants to fight a grown man no ...,0.506195,data/dangerous/14 year old boy wants to fight ...,DataClass.DANGEROUS
2,ANGRY BLACK MAN ON NYC SUBWAY HATES WOMEN!.wav,9.825911,data/dangerous/ANGRY BLACK MAN ON NYC SUBWAY H...,DataClass.DANGEROUS
3,ANOTHER FIGHT ON THE 2 TRAIN NYC SMFH LMFAO! @...,2.828965,data/dangerous/ANOTHER FIGHT ON THE 2 TRAIN NY...,DataClass.DANGEROUS
4,Argument Ends With Dude Getting Jumped For Get...,3.342899,data/dangerous/Argument Ends With Dude Getting...,DataClass.DANGEROUS


In [5]:
safe.head()

Unnamed: 0,name,length (minutes),path,class
0,TTC Subway 100.wav,5.921944,data/safe/TTC Subway 100.wav,DataClass.SAFE
1,TTC Subway 101.wav,3.405497,data/safe/TTC Subway 101.wav,DataClass.SAFE
2,TTC Subway 103.wav,7.752272,data/safe/TTC Subway 103.wav,DataClass.SAFE
3,TTC Subway 104.wav,8.777778,data/safe/TTC Subway 104.wav,DataClass.SAFE
4,TTC Subway 105.wav,3.527583,data/safe/TTC Subway 105.wav,DataClass.SAFE


In [24]:
#print out total duration of both 
total_dangerous_duration = dangerous['length (minutes)'].sum()/60
total_safe_duration = safe['length (minutes)'].sum()/60

print(f"Total dangerous duration: {total_dangerous_duration} hours")
print(f"Total safe duration: {total_safe_duration} hours")

Total dangerous duration: 8.103048286722096 hours
Total safe duration: 12.226635487528343 hours


In [7]:
train_split = 0.8
validation_split = 0.2

In [22]:
def split_data_by_duration(data):
    data = data.sort_values(by='length (minutes)', ascending=False)
    train_duration = data['length (minutes)'].sum() * train_split
    train = pd.DataFrame()
    validation = pd.DataFrame()
    current_duration = 0

    for index, row in data.iterrows():
        if current_duration + row['length (minutes)'] <= train_duration:
            train = pd.concat([train, pd.DataFrame([row])], ignore_index=True)
            current_duration += row['length (minutes)']
        else:
            validation = pd.concat([validation, pd.DataFrame([row])], ignore_index=True)

    return train, validation

train_dangerous, validation_dangerous = split_data_by_duration(dangerous)
train_safe, validation_safe = split_data_by_duration(safe)

print("Total duration of train dangerous data (hours):", train_dangerous['length (minutes)'].sum() / 60)
print("Total duration of validation dangerous data (hours):", train_safe['length (minutes)'].sum() / 60)

Total duration of train dangerous data (hours): 6.4820840702947855
Total duration of validation dangerous data (hours): 9.781063321995465


In [17]:
total_train_duration = train_dangerous['length (minutes)'].sum() + train_safe['length (minutes)'].sum()
total_train_duration / 60  # Convert to hours


np.float64(16.26314739229025)

In [18]:
def one_hot_encode_and_restructure(df):
    df['class'] = df['class'].apply(lambda x: 1 if x == DataClass.DANGEROUS else 0)
    return df[['name', 'length (minutes)', 'path', 'class']]

train_dangerous_encoded = one_hot_encode_and_restructure(train_dangerous)
validation_dangerous_encoded = one_hot_encode_and_restructure(validation_dangerous)
train_safe_encoded = one_hot_encode_and_restructure(train_safe)
validation_safe_encoded = one_hot_encode_and_restructure(validation_safe)

In [19]:
# shuffle  dangerous and safe data 
train = pd.concat([train_dangerous_encoded, train_safe_encoded], ignore_index=True)
train = train.sample(frac=1, random_state=42).reset_index(drop=True)

test = pd.concat([validation_dangerous_encoded, validation_safe_encoded], ignore_index=True)
test = test.sample(frac=1, random_state=42).reset_index(drop=True)

In [21]:
train_safe_length = train[train['class'] == 0]['length (minutes)'].sum()
train_dangerous_length = train[train['class'] == 1]['length (minutes)'].sum()

print(f"Total safe length in train (minutes): {train_safe_length}")
print(f"Total dangerous length in train (minutes): {train_dangerous_length}")

Total safe length in train (minutes): 9.781063321995465
Total dangerous length in train (minutes): 6.4820840702947855


In [12]:
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)
