# Process Folder to Data-Frames

In [None]:
import numpy as np
import random as rand
from scipy.io import wavfile
import numpy as np
import random as rand
import matplotlib.pyplot as plt
import pandas as pd
import os
import datetime
from scipy import signal

def downsample(fs, data, target_samplerate):
    """Reduce ``data`` to ``target_samplerate`` where that is trivially possible.

    Only two cases are actually resampled/handled natively: halving the rate
    (every second sample is kept; no anti-aliasing filter is applied) and
    keeping the rate unchanged.  Any other target rate leaves the signal
    untouched and emits a warning, instead of failing on an unassigned local
    as the previous implementation did.

    Args:
        fs: Sample rate of ``data``.
        data: Sequence of samples (list or numpy array) supporting slicing.
        target_samplerate: Desired sample rate.

    Returns:
        Tuple ``(output_samplerate, output)`` where ``output_samplerate`` is
        the rate of the returned samples (``target_samplerate`` when the
        signal was halved, otherwise ``fs``).
    """
    if target_samplerate == fs / 2:
        # Keep the even-indexed samples (0, 2, 4, ...).
        return target_samplerate, data[::2]

    if target_samplerate != fs:
        import warnings

        warnings.warn(
            'downsample: cannot convert %s Hz to %s Hz; returning the signal '
            'at its original rate' % (fs, target_samplerate)
        )
    return fs, data

def make_mono(data):
    """Mix a two-channel signal down to one normalised channel.

    Input that is not a non-empty ``(n_samples, 2)`` array (already mono,
    empty, or with a different channel count) is returned unchanged.

    Args:
        data: Audio samples, either 1-D or shaped ``(n_samples, 2)``.

    Returns:
        The original ``data`` when it is not stereo; otherwise the sum of the
        two channels passed through ``normalise``.
    """
    samples = np.asarray(data)
    is_stereo = samples.ndim == 2 and samples.shape[1] == 2 and samples.shape[0] > 0
    if not is_stereo:
        return data

    # Sum in floating point: adding integer PCM channels (e.g. int16) directly
    # wraps around silently when the two channels together exceed the type's
    # range.
    channels = samples.astype(np.float64)
    return normalise(channels[:, 0] + channels[:, 1])

def normalise(x):
    """Scale ``x`` so that its largest element becomes 1.

    The divisor is the signed maximum of ``x`` (not the maximum magnitude).
    """
    peak = max(x)
    return x / peak

def zero_cross_rate(fs, N, block):
    """Return the zero-crossing rate of ``block``.

    Sign changes between consecutive samples (positive vs. non-positive) are
    counted, halved, and scaled by ``fs / N``.

    Args:
        fs: Sample rate.
        N: Window length in samples used for the scaling.
        block: Numpy array of samples.
    """
    sign_flips = np.diff(block > 0)
    half_crossings = np.size(np.nonzero(sign_flips)) * 0.5
    return half_crossings * fs / N

def rms_value(block):
    """Return the root-mean-square value of the samples in ``block``."""
    mean_square = np.mean(block**2)
    return np.sqrt(mean_square)

def spectral_centroid(block, samplerate):
    """Return the magnitude-weighted mean frequency of ``block``.

    Args:
        block: Real-valued samples.
        samplerate: Sample rate used to convert FFT bins to frequencies.
    """
    # One-sided spectrum and the matching non-negative bin frequencies.
    spectrum = np.abs(np.fft.rfft(block))
    bin_freqs = np.fft.rfftfreq(len(block), 1.0 / samplerate)
    weighted_total = np.sum(spectrum * bin_freqs)
    return weighted_total / np.sum(spectrum)

def spectral_entropy(block):
    """Return the Shannon entropy (natural log) of the power spectrum of ``block``.

    The power spectrum of the full FFT is normalised to sum to one and treated
    as a probability distribution.  Bins with zero power contribute nothing
    (the ``0 * log(0)`` term is taken as 0), and a block with no power at all
    yields ``0.0``; previously both cases produced NaN.

    Args:
        block: Samples of one analysis window.

    Returns:
        The spectral entropy as a float.
    """
    spectrum = np.fft.fft(block)
    power = np.abs(spectrum) ** 2 / len(spectrum)
    total_power = np.sum(power)
    if total_power == 0:
        return 0.0

    probabilities = power / total_power
    # Drop exactly-zero bins so log() is never evaluated at 0; NaNs in the
    # input still propagate because NaN != 0.
    occupied = probabilities[probabilities != 0]
    return float(-np.sum(occupied * np.log(occupied)))

def makeSpeechArray(dataIn, fs, windowLength, overlap, threshold):
    """Build a per-sample 0/1 array marking blocks whose level exceeds ``threshold``.

    The normalised signal is cut into consecutive blocks of
    ``overlap * windowLength * fs`` samples.  Each block is Hann-windowed and
    its RMS level in dB is compared against ``threshold``; samples of blocks
    above the threshold are set to 1.

    Args:
        dataIn: Input samples.
        fs: Sample rate.
        windowLength: Window length, multiplied by ``fs`` to get samples.
        overlap: Fraction of the window length used as the block size.
        threshold: Level in dB that a block's RMS must exceed.

    Returns:
        Numpy array of zeros and ones with the same length as ``dataIn``.
    """
    dataIn = normalise(dataIn)

    hop = overlap * (windowLength * fs)
    total_samples = len(dataIn)
    block_count = int(np.floor(total_samples / hop))

    speechArray = np.zeros(total_samples)
    for index in range(block_count):
        start = int(round(index * hop))
        stop = int(round((index + 1) * hop))
        segment = dataIn[start:stop]
        windowed = np.hanning(len(segment)) * segment

        level_db = 20 * np.log10(rms_value(windowed))
        if level_db > threshold:
            speechArray[start:stop] = 1

    return speechArray

def process_path(path):
    """Return the ``.wav`` files directly inside ``path`` that can be read.

    Every candidate file is opened with ``scipy.io.wavfile.read`` and converted
    to ``float32``; files for which that fails are left out and counted.  The
    number of failures is printed.  A missing ``path`` is reported with a
    message and results in an empty list.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        List of file paths that were read successfully.
    """
    import glob
    import os

    from scipy.io import wavfile
    import numpy as np

    if os.path.exists(path):
        candidates = glob.glob(os.path.join(path, '*.wav'))
    else:
        print(path + ' does not exist')
        candidates = []

    readable = []
    exceptions = 0
    for filename in candidates:
        try:
            fs, data = wavfile.read(filename)
            np.array(data, dtype=np.float32)
        except Exception:
            # Unreadable or malformed file: skip it.  ``Exception`` (rather
            # than a bare ``except``) lets Ctrl-C and SystemExit through.
            exceptions += 1
        else:
            readable.append(filename)
    print('no of exceptions: ' + str(exceptions))
    return readable

class TimeFrameFeatures:
    """Cut a signal into time frames and extract per-window acoustic features.

    Each time frame is split into short analysis windows; for every window the
    zero-crossing rate, spectral centroid, RMS, spectral entropy and MFCCs are
    computed.  ``process_data`` stores the results on the instance as parallel
    lists with one entry per time frame.
    """
    
    def __init__(self, windowLength, windowOverlap, timeFrameLength, timeFrameOverlap):
        """Store the framing configuration.

        Args:
            windowLength: Analysis-window length.  It is multiplied by the
                sample rate in ``process_data``, i.e. seconds when the sample
                rate is in Hz.
            windowOverlap: Window hop expressed as a fraction of the window
                length (window ``n`` starts at ``n * windowOverlap * N``).
            timeFrameLength: Time-frame length, same unit as ``windowLength``.
            timeFrameOverlap: Frame hop expressed as a fraction of the frame
                length.
        """
        
        self.windowLength = windowLength
        self.windowOverlap = windowOverlap
        self.timeFrameLength = timeFrameLength
        self.timeFrameOverlap = timeFrameOverlap
    
    def process_data(self, fs, data, label_array):
        """Extract features from ``data`` and store them on the instance.

        All result lists are reset on every call, then filled with one entry
        per time frame:

        * ``absTimeList``    - frame start (frame index * hop fraction * frame length)
        * ``timeFrames``     - per-window centre times relative to the frame start
        * ``ZCRFrames``, ``CENTFrames``, ``RMSFrames``, ``SPEC_ENTFrames``
                             - per-window feature lists
        * ``MFCCFrames``     - per-window raw results of ``mfcc``
        * ``MFCCList``       - ``[flat list]`` of the first MFCC row of each window
        * ``waveformFrames`` - the samples of the frame
        * ``labelFrames``    - ``max`` of ``label_array`` over the frame

        Args:
            fs: Sample rate of ``data``.
            data: Samples; sliced and multiplied by a numpy window, so
                presumably a 1-D numpy array - TODO confirm with callers.
            label_array: Per-sample labels, indexed with the same sample
                positions as ``data``.
        """
        # Third-party dependency, imported lazily when features are computed.
        from python_speech_features import mfcc

        # NOTE(review): windowsPerFrame is computed but not used below.
        windowsPerFrame = int(round(self.timeFrameLength/self.windowLength))
        # Despite its name this is the number of SAMPLES in one time frame.
        framesPerSecond = int(round(fs * self.timeFrameLength))
        # Number of frame start positions when stepping by
        # framesPerSecond * timeFrameOverlap samples.
        NframesPerFile = int(len(data)/(framesPerSecond*self.timeFrameOverlap))

        # NOTE(review): threshold is set here but not read anywhere in this class.
        self.threshold = 0
        self.timeFrames = []
        self.labelFrames = []
        self.MFCCFrames = []
        self.ZCRFrames = []
        self.CENTFrames = []
        self.RMSFrames = []
        self.SPEC_ENTFrames = []
        self.waveformFrames = []
        self.MFCCList = []
        self.absTimeList = []
        for frameNum in range(NframesPerFile):
            # Start of this frame, in the unit of timeFrameLength.
            absTime = (frameNum*self.timeFrameOverlap*self.timeFrameLength)
            # Sample bounds of the frame; slices near the end of the signal
            # may come out shorter than a full frame.
            C = int(round((frameNum*self.timeFrameOverlap)*framesPerSecond))
            D = int(round((frameNum*self.timeFrameOverlap+1)*framesPerSecond))
            dataFrame = data[C:D]
            labels = label_array[C:D]
            # The frame takes the largest label found among its samples.
            label = max(labels)

            # N: samples per analysis window; c: window length in units of N;
            # o: hop as a fraction of N.  The initial n is overwritten by the
            # loop below.
            N = int(round(fs * self.windowLength))
            c = 1
            n = 1
            o = self.windowOverlap
            # Number of hops of size N*o that fit in the frame.
            No_N = len(dataFrame)/(N*o)

            ZCR = []
            CENT = []
            RMS = []
            SPEC_ENT = []
            MFCC = []
            Time_Index = []
            # NOTE(review): Waveform is never filled or stored.
            Waveform = []

            # NFFT and x are shared by all windows of this frame (see the
            # while-loop below); Size never changes, so that loop only exits
            # through its break.
            NFFT = 0
            Size = True
            x = 0
            for n in range(0, int(No_N)):
                # Sample bounds of window n inside the frame.
                A = int(round((n*o)*N))
                B = int(round((n*o+c)*N))
                block = dataFrame[A:B]
                # Apply a Hann window before computing any feature.
                window = np.hanning(len(block))
                block = block * window

                cross_rate = zero_cross_rate(fs, N, block)
                spec_cent = spectral_centroid(block, fs)
                rms = rms_value(block)
                spec_entropy = spectral_entropy(block)

                # Raise NFFT through powers of two (2**x) while the block is
                # longer than NFFT; stop once NFFT exceeds the block length
                # or x reaches 13.  Because NFFT and x persist between
                # windows, later windows normally leave NFFT unchanged and
                # only advance x.
                while Size == True:
                    if len(block) > NFFT:
                        NFFT = 2**x
                    x += 1
                    if x == 13 or NFFT > len(block):
                        break
                # MFCCs of the already-windowed block, using the same window
                # length and hop as the outer framing.
                mfcc_values = mfcc(block, fs, winlen=self.windowLength, nfft=NFFT, winstep=(self.windowLength)*self.windowOverlap)
                MFCC.append(mfcc_values)
                ZCR.append(cross_rate)
                CENT.append(spec_cent)
                RMS.append(rms)
                SPEC_ENT.append(spec_entropy)
                # Centre of the window, relative to the start of the frame.
                time = (n*o + c/2)*N*(1/fs)
                Time_Index.append(time)

            self.absTimeList.append(absTime)
            self.timeFrames.append(Time_Index)
            self.MFCCFrames.append(MFCC)
            # Flatten the first row of every window's MFCC result into one
            # list (presumably mfcc returns one row per internal frame -
            # TODO confirm against python_speech_features).
            Out = []
            for frame in MFCC:
                frame = frame[0]
                for f in frame:
                    Out.append(f)

            # Stored wrapped in a one-element list; consumers unwrap with [0].
            self.MFCCList.append([Out])
            self.ZCRFrames.append(ZCR)
            self.CENTFrames.append(CENT)
            self.RMSFrames.append(RMS)
            self.SPEC_ENTFrames.append(SPEC_ENT)
            self.waveformFrames.append(dataFrame) 
            self.labelFrames.append(label)

def make_title(files):
    """Build a title from the first two file paths in ``files``.

    For each path the directory is dropped and the file name is cut at its
    first ``'.'``; the first two resulting stems are joined with ``'_'``.
    Fewer than two paths raises ``IndexError``.
    """
    stems = [os.path.basename(path).split('.')[0] for path in files]
    return stems[0] + '_' + stems[1]
            
def process_to_dataframe(df, title, label, list_, obj):
    """Extract features from each wav file in ``list_`` and add them to ``df``.

    Every file is read, converted to ``float32``, divided by its maximum and
    handed to ``obj.process_data``.  One row per time frame is then added,
    holding the per-frame feature lists together with ``label``, ``title``
    and the file name.

    Args:
        df: DataFrame to extend; it is not modified in place.
        title: Value for the ``'Description'`` column.
        label: Value for the ``'Label'`` column of every added row.
        list_: Iterable of wav file paths.
        obj: Feature extractor exposing ``process_data(fs, data, label_array)``
            and the result lists it fills (``absTimeList``, ``RMSFrames``,
            ``ZCRFrames``, ``CENTFrames``, ``SPEC_ENTFrames``, ``MFCCList``,
            ``timeFrames``).

    Returns:
        A DataFrame containing the rows of ``df`` followed by the new rows,
        with a fresh integer index when rows were added.
    """
    rows = []
    for count, file in enumerate(list_, start=1):
        print('Processing.... ' + file)
        fs, data = wavfile.read(file)  # read
        data = np.array(data, dtype=np.float32)
        data = data/max(data)

        ## --------------------- Get Features -----------------------------
        # process_data takes a per-sample label array as its third argument
        # (it has no ``rms_threshold`` keyword).  The per-frame labels it
        # derives are not used here - ``label`` is written to every row - so
        # a zero array of matching length is sufficient.
        obj.process_data(fs, data, np.zeros(len(data)))

        ## -------------- Collect one row per time frame -------------------
        for n, time in enumerate(obj.absTimeList):
            rows.append({'RMS': obj.RMSFrames[n],
                         'ZCR': obj.ZCRFrames[n],
                         'CENT': obj.CENTFrames[n],
                         'SPEC_ENT': obj.SPEC_ENTFrames[n],
                         'MFCC': obj.MFCCList[n][0],  # unwrap the one-element list
                         'Label': label,
                         'Time': obj.timeFrames[n],
                         'Absoloute Time': time,
                         'Description': title,
                         'File Name': file})
        print('done ' + str(count) + '/' + str(len(list_)))

    if not rows:
        return df

    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call; build all new rows once and combine them in one step.
    new_rows = pd.DataFrame(rows)
    if len(df) == 0:
        # Nothing to concatenate with: keep df's column order and put any
        # additional columns after it (this also avoids concatenating an
        # empty frame, which recent pandas versions warn about).
        columns = list(df.columns) + [c for c in new_rows.columns if c not in df.columns]
        return new_rows.reindex(columns=columns)
    return pd.concat([df, new_rows], ignore_index=True)

def process_to_dataframe_from_stream(df, title, fs, data, label_array, obj):
    """Extract features from an in-memory signal and add them to ``df``.

    ``obj.process_data(fs, data, label_array)`` is run once; one row per time
    frame is then added, holding the per-frame feature lists, the frame label
    (converted to ``int``) and ``title``.

    Args:
        df: DataFrame to extend; it is not modified in place.
        title: Value for the ``'Description'`` column.
        fs: Sample rate of ``data``.
        data: Samples to analyse.
        label_array: Per-sample labels passed through to ``process_data``.
        obj: Feature extractor exposing ``process_data`` and the result lists
            it fills (``absTimeList``, ``RMSFrames``, ``ZCRFrames``,
            ``CENTFrames``, ``SPEC_ENTFrames``, ``MFCCList``, ``labelFrames``,
            ``timeFrames``).

    Returns:
        A DataFrame containing the rows of ``df`` followed by the new rows,
        with a fresh integer index when rows were added.
    """
    ## --------------------- Get Features -----------------------------
    obj.process_data(fs, data, label_array)

    ## -------------- Collect one row per time frame -------------------
    rows = [{'RMS': obj.RMSFrames[n],
             'ZCR': obj.ZCRFrames[n],
             'CENT': obj.CENTFrames[n],
             'SPEC_ENT': obj.SPEC_ENTFrames[n],
             'MFCC': obj.MFCCList[n][0],  # unwrap the one-element list
             'Label': int(obj.labelFrames[n]),
             'Time': obj.timeFrames[n],
             'Absoloute Time': time,
             'Description': title}
            for n, time in enumerate(obj.absTimeList)]
    print('done :' + title)

    if not rows:
        return df

    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call; build all new rows once and combine them in one step.
    new_rows = pd.DataFrame(rows)
    if len(df) == 0:
        # Nothing to concatenate with: keep df's column order and put any
        # additional columns after it (this also avoids concatenating an
        # empty frame, which recent pandas versions warn about).
        columns = list(df.columns) + [c for c in new_rows.columns if c not in df.columns]
        return new_rows.reindex(columns=columns)
    return pd.concat([df, new_rows], ignore_index=True)

## Filtering 
def butter_highpass(cutoff, fs, order=5):
    """Design a digital Butterworth high-pass filter.

    The cutoff is normalised by the Nyquist frequency (``fs / 2``) before
    being passed to ``scipy.signal.butter``.

    Returns:
        Tuple ``(b, a)`` of numerator and denominator coefficients.
    """
    nyquist = 0.5 * fs
    return signal.butter(order, cutoff / nyquist, btype='high', analog=False)

def butter_lowpass(cutoff, fs, order=5):
    """Design a digital Butterworth low-pass filter.

    The cutoff is normalised by the Nyquist frequency (``fs / 2``) before
    being passed to ``scipy.signal.butter``.

    Returns:
        Tuple ``(b, a)`` of numerator and denominator coefficients.
    """
    nyquist = 0.5 * fs
    return signal.butter(order, cutoff / nyquist, btype='low', analog=False)

def butter_highpass_filter(data, cutoff, fs, order=5):
    """High-pass ``data`` with a Butterworth filter applied forwards and backwards.

    Coefficients come from ``butter_highpass``; ``scipy.signal.filtfilt``
    performs the filtering.
    """
    numerator, denominator = butter_highpass(cutoff, fs, order=order)
    return signal.filtfilt(numerator, denominator, data)

def butter_lowpass_filter(data, cutoff, fs, order=5):
    """Low-pass ``data`` with a Butterworth filter applied forwards and backwards.

    Coefficients come from ``butter_lowpass``; ``scipy.signal.filtfilt``
    performs the filtering.
    """
    numerator, denominator = butter_lowpass(cutoff, fs, order=order)
    return signal.filtfilt(numerator, denominator, data)

def butter_bandpass_filter(data, fs, highpass=125, lowpass=4000, order=5):
    """Band-limit ``data`` by high-passing at ``highpass`` then low-passing at ``lowpass``.

    Both stages use the same filter ``order``.
    """
    without_lows = butter_highpass_filter(data, highpass, fs, order=order)
    return butter_lowpass_filter(without_lows, lowpass, fs, order=order)

# Main Body

In [None]:
# Driver cell: for every readable wav file in Path, build a feature DataFrame
# and pickle it into Target_Path as <file stem>.pkl.
Path = '/Volumes/RED_HD/DISSERTATION/Dataset2/90-10/Noise_Training'
Target_Path = '/Volumes/RED_HD/DISSERTATION/Dataset2/90-10/Data_Training'

# declare if path is speech data or noise data 
# (True: labels come from makeSpeechArray; False: every sample is labelled 0)
speech = False

FILES = process_path(Path)
target_samplerate = 22050

# Analysis settings: window and time-frame lengths (20 ms / 250 ms), hop
# fraction shared by windows and frames, and the dB threshold used by
# makeSpeechArray.
windowLength = 20/1000
timeFrameLength = 250/1000
overlap = 0.5
threshold = -25

# NOTE(review): the three names below are not referenced anywhere in the
# visible code.
NUMBER_GENERATED = 300
Operator = True
INDEX = 0
# loop should start here -----------------------------------------
for file in FILES:
    print(file)
    #try:
        # pick files at random
    # Load, mix to mono, bring to the target rate where supported, normalise
    # and band-limit (defaults of butter_bandpass_filter).
    fs, data = wavfile.read(file)
    data = make_mono(data)
    fs, data = downsample(fs, data, target_samplerate)
    data = normalise(data)
    data = butter_bandpass_filter(data, fs)

    # generate speech array
    if speech == True:
        speechArray = makeSpeechArray(data, fs, windowLength, overlap, threshold)
    else:
        # Noise-only material: label every sample 0.
        speechArray = np.zeros(len(data))


    # weight background noise to be background noise
    # because the system does not need to be noise invariant 
    # NOTE(review): the intent of the comment above is unclear; the statement
    # below simply re-normalises the filtered signal.

    audio = normalise(data)

    # process output into acoustic parameters and save as a dataframe
    df = pd.DataFrame(columns=[
                            'Description',
                            'Label',
                            'Absoloute Time',
                            'RMS',
                            'ZCR', 
                            'CENT',
                            'SPEC_ENT', 
                            'MFCC',
                            'Time'])

    # Title = file name without directory, cut at the first '.'.
    head, tail = os.path.split(file)
    title = tail.split('.')[0]

    # The same hop fraction is used for analysis windows and time frames.
    featuresObject = TimeFrameFeatures(windowLength, overlap, timeFrameLength, overlap)
    df = process_to_dataframe_from_stream(df, title, fs, audio, speechArray, featuresObject)

    # save audiofile and dataframe:
    # NOTE(review): only the DataFrame is saved here, and `timestamp` is
    # built but not used in the output path.
    now = datetime.datetime.now()
    timestamp = str(now.minute)+'-'+str(now.hour)+'-'+str(now.day)+'-'+str(now.month)

    dataframe_location = Target_Path + '/' + title + '.pkl'
    df.to_pickle(dataframe_location)

    # Drop the reference to this file's DataFrame before the next iteration.
    del df

    # Disabled per-file error handling (matches the commented-out `try` above).
    #except:
     #   print('------------------------------')
      #  print('FAILED : ')
       # print(file)
        #print('------------------------------')

print('done') 