# Real-time audio preprocessing notebook

* Import the necessary preprocessing libraries
* Inspect how much of each type of data we have (total duration per class)
* Split the data into training and validation sets
* For each of the two classes (safe and dangerous), assign files to each split in proportion to the split sizes, measured by audio duration
* Save the resulting dataframes so that they can be loaded into our training notebook later

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import transformers 
import librosa 

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [9]:
from enum import Enum

# Define the two classes of data and an empty metadata frame for each
class DataClass(Enum):
    """Label attached to every audio file handled in this notebook.

    The members themselves (not their string values) are what the loader
    functions store in the ``class`` column of the metadata frames.
    """

    # Ordinary background audio.
    SAFE = "safe"
    # Audio labelled as containing a dangerous situation.
    DANGEROUS = "dangerous"

def _empty_audio_frame():
    """Return a new, empty DataFrame with the per-file metadata schema.

    Columns: ``name`` (file name), ``length (minutes)`` (duration),
    ``path`` (location of the file) and ``class`` (label).
    Each call builds an independent frame, so the two classes never share state.
    """
    return pd.DataFrame({
        'name': pd.Series(dtype='str'),
        # Durations are stored as fractional minutes, so the column is float
        # rather than int.
        'length (minutes)': pd.Series(dtype='float'),
        'path': pd.Series(dtype='str'),
        'class': pd.Series(dtype='category'),
    })


# One metadata frame per class; both start empty and are filled by the loaders.
dangerous = _empty_audio_frame()
safe = _empty_audio_frame()

In [13]:
import os

# get the data from the files
def _collect_wav_metadata(data_dir, label):
    """Build one metadata row per ``.wav`` file found directly inside ``data_dir``.

    Parameters
    ----------
    data_dir : str
        Directory to scan. A relative value is resolved against the current
        working directory, both for listing and for loading the audio.
    label : DataClass
        Value stored in the ``class`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``length (minutes)``, ``path`` and ``class``;
        empty (but with those columns) when no ``.wav`` file is present.
    """
    records = []
    # Sorting makes the row order independent of the filesystem's listing order.
    for file in sorted(os.listdir(data_dir)):
        if not file.endswith(".wav"):
            continue
        path = f"{data_dir}/{file}"
        # NOTE(review): librosa.load decodes the whole file just to measure it;
        # a cheaper duration lookup may be available -- confirm before changing.
        y, sr = librosa.load(path)
        duration = librosa.get_duration(y=y, sr=sr)  # seconds
        records.append({
            'name': file,
            'length (minutes)': duration / 60,
            'path': path,
            'class': label,
        })
    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(records, columns=['name', 'length (minutes)', 'path', 'class'])


def load_dangerous_data(data_dir="data/dangerous"):
    """Fill the global ``dangerous`` frame with metadata for the dangerous clips.

    The global is replaced rather than appended to, so re-running the cell
    does not duplicate rows. ``data_dir`` is used both to list the files and
    to build the stored ``path`` values.
    """
    global dangerous
    dangerous = _collect_wav_metadata(data_dir, DataClass.DANGEROUS)


def load_safe_data(data_dir="data/safe"):
    """Fill the global ``safe`` frame with metadata for the safe clips.

    The global is replaced rather than appended to, so re-running the cell
    does not duplicate rows. ``data_dir`` is used both to list the files and
    to build the stored ``path`` values.
    """
    global safe
    safe = _collect_wav_metadata(data_dir, DataClass.SAFE)

# Populate the global `dangerous` and `safe` frames from the .wav files on disk.
load_dangerous_data()
load_safe_data()

In [14]:
# Preview the first rows of the dangerous-class metadata.
dangerous.head()

Unnamed: 0,name,length (minutes),path,class
0,1 Hitter Quitter.wav,2.052451,data/dangerous/1 Hitter Quitter.wav,DataClass.DANGEROUS
1,14 year old boy wants to fight a grown man no ...,0.506195,data/dangerous/14 year old boy wants to fight ...,DataClass.DANGEROUS
2,ANGRY BLACK MAN ON NYC SUBWAY HATES WOMEN!.wav,9.825911,data/dangerous/ANGRY BLACK MAN ON NYC SUBWAY H...,DataClass.DANGEROUS
3,ANOTHER FIGHT ON THE 2 TRAIN NYC SMFH LMFAO! @...,2.828965,data/dangerous/ANOTHER FIGHT ON THE 2 TRAIN NY...,DataClass.DANGEROUS
4,Argument Ends With Dude Getting Jumped For Get...,3.342899,data/dangerous/Argument Ends With Dude Getting...,DataClass.DANGEROUS


In [15]:
# Preview the first rows of the safe-class metadata.
safe.head()

Unnamed: 0,name,length (minutes),path,class
0,TTC Subway 100.wav,11.843889,data/safe/TTC Subway 100.wav,DataClass.SAFE
1,TTC Subway 101.wav,6.810994,data/safe/TTC Subway 101.wav,DataClass.SAFE
2,TTC Subway 103.wav,15.504545,data/safe/TTC Subway 103.wav,DataClass.SAFE
3,TTC Subway 104.wav,17.555556,data/safe/TTC Subway 104.wav,DataClass.SAFE
4,TTC Subway 105.wav,7.055166,data/safe/TTC Subway 105.wav,DataClass.SAFE


In [18]:
# Print out the total duration of each class.
# The column holds minutes, so dividing by 60 gives hours.
total_dangerous_duration = dangerous['length (minutes)'].sum() / 60
total_safe_duration = safe['length (minutes)'].sum() / 60

# Display the totals; without this the cell computes them but shows nothing.
print(f"Total duration of dangerous data: {total_dangerous_duration} hours")
print(f"Total duration of safe data: {total_safe_duration} hours")

Total duration of dangerous data: 8.1030487654321 hours
Total duration of safe data: 24.453270975056686 hours


In [22]:
# Fraction of each class's total duration that the training split may hold.
train_split = 0.8
# Fraction intended for validation. The split itself only reads `train_split`;
# validation receives whatever does not fit into the training budget.
validation_split = 0.2

# The two fractions are meant to partition the data, so they must add up to 1.
assert abs(train_split + validation_split - 1.0) < 1e-9, \
    "train_split and validation_split must sum to 1"

In [23]:
def split_data_by_duration(data, train_fraction=None):
    """Split ``data`` into train and validation frames by total audio duration.

    Rows are visited from longest to shortest. A row joins the training split
    if adding it keeps the running training duration within
    ``train_fraction`` of the overall duration; otherwise it goes to
    validation. A row that does not fit is skipped, not a stopping point:
    shorter rows after it may still be added to training.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'length (minutes)'`` column.
    train_fraction : float, optional
        Share of the total duration allowed in the training split.
        Defaults to the notebook-level ``train_split``.

    Returns
    -------
    (train, validation) : tuple of pandas.DataFrame
        Both keep the columns of ``data`` (even when empty) and have a fresh
        0..n-1 index, ordered from longest to shortest.

    Raises
    ------
    ValueError
        If ``train_fraction`` is outside ``[0, 1]``.
    """
    if train_fraction is None:
        train_fraction = train_split
    if not 0 <= train_fraction <= 1:
        raise ValueError(f"train_fraction must be within [0, 1], got {train_fraction!r}")

    # A stable sort keeps equal-length rows in their input order, so the split
    # is reproducible.
    ordered = data.sort_values(by='length (minutes)', ascending=False, kind='stable')
    train_budget = ordered['length (minutes)'].sum() * train_fraction

    # Decide row by row, but only record a flag; the frames are built once at
    # the end instead of being grown with concat inside the loop.
    goes_to_train = []
    current_duration = 0
    for duration in ordered['length (minutes)']:
        fits = current_duration + duration <= train_budget
        goes_to_train.append(fits)
        if fits:
            current_duration += duration

    train_mask = np.asarray(goes_to_train, dtype=bool)
    # Boolean selection preserves the columns, so an empty split still has
    # 'length (minutes)' and a proper CSV header.
    train = ordered[train_mask].reset_index(drop=True)
    validation = ordered[~train_mask].reset_index(drop=True)
    return train, validation

# Split each class on its own so both classes are divided in the same proportion.
train_dangerous, validation_dangerous = split_data_by_duration(dangerous)
train_safe, validation_safe = split_data_by_duration(safe)

# Every file must end up in exactly one of the two splits.
assert len(train_dangerous) + len(validation_dangerous) == len(dangerous), \
    "dangerous split lost or duplicated rows"
assert len(train_safe) + len(validation_safe) == len(safe), \
    "safe split lost or duplicated rows"

In [27]:
# Combined training duration across both classes; the columns hold minutes.
train_frames = (train_dangerous, train_safe)
total_train_duration = sum(frame['length (minutes)'].sum() for frame in train_frames)
total_train_duration / 60  # Convert to hours


np.float64(26.044210959939527)

In [28]:
# Write each split to its own CSV in the working directory, leaving out the index column.
csv_outputs = {
    'train_dangerous.csv': train_dangerous,
    'validation_dangerous.csv': validation_dangerous,
    'train_safe.csv': train_safe,
    'validation_safe.csv': validation_safe,
}
for csv_name, split_frame in csv_outputs.items():
    split_frame.to_csv(csv_name, index=False)