In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
from scipy.signal import welch
from detect_peaks import detect_peaks
from scipy.stats import kurtosis
from scipy.stats import skew
import matplotlib.backends.backend_pdf
from sklearn import model_selection
from sklearn.metrics import classification_report
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
import copy

In [2]:
def GenerateIMSDictionary(FileOfInterest,TrainingDataFile,HomeDirectory):
    """
    Returns UserInput - Dictionary of relevant info (see UserInputs2WorkingForm)

    GenerateIMSDictionary(
        FileOfInterest - File name of a single IMS file
        TrainingDataFile - File name of the training data CSV
        HomeDirectory - Directory whose file listing is stored in the dictionary
    )

    Example:
        UserInput = GenerateIMSDictionary('2003.10.22.12.06.24',"TrainingData.csv",os.getcwd())

    The purpose of this function is to quickly hardcode the parameters that
    would otherwise be taken from the GUI and hand them, together with the
    bearing 1 X-direction signal read from FileOfInterest, to
    UserInputs2WorkingForm.
    """

    #Hardcoded bearing info
    #NOTE(review): the two diameters are scaled by 254, not 25.4, so they are
    #ten times the millimetre value if the leading factors are inches - confirm.
    #BearingInfomation only uses them as ratios.
    shaft_speed = 2000 / 60                  #Shaft rotational speed [Hz], n
    element_count = 16                       #No. of rolling elements [-], N
    element_diameter = 0.331*254             #Diameter of a rolling element, Bd
    pitch_diameter = 2.815*254               #Pitch diameter, Pd
    contact_angle = 15.17 * np.pi / 180      #Contact angle [rad], Phi
    sample_rate = 20000                      #Sampling frequency

    #Read the signal first, then list the home directory
    signal = getIMSB1XData(FileOfInterest)
    listing = os.listdir(HomeDirectory)

    return UserInputs2WorkingForm(
        shaft_speed,
        element_count,
        element_diameter,
        pitch_diameter,
        contact_angle,
        sample_rate,
        FileOfInterest,
        signal,
        HomeDirectory,
        listing,
        TrainingDataFile,
    )

In [3]:
def UserInputs2WorkingForm(n,N,Bd,Pd,phi,SampleFrequency,FileName,RawData,HomeDirectory,directory,TrainingDataFile):
    """
    Returns a dictionary of relevant file information

    UserInputs2WorkingForm(
        n - Shaft rotational speed [Hz]
        N - No. of rolling elements [-]
        Bd - Diameter of a rolling element
        Pd - Pitch diameter
        phi - Contact angle [rad]
        SampleFrequency - Sampling frequency of RawData
        FileName - Title of file containing the raw data
        RawData - Array containing the raw data
        HomeDirectory - Location of the home directory
        directory - File listing stored under 'Working Directory'
        TrainingDataFile - Title of CSV containing the training data
        )

    Collects the bearing characteristics and the signal in one dictionary.
    The number of samples and the total sampling time are derived from
    RawData and SampleFrequency. The rest of the functions take this
    dictionary as their input.
    """
    #Derived quantities
    sample_count = len(RawData)
    sample_period = 1/SampleFrequency
    total_time = sample_period*sample_count

    #Arrange (key order matters to anything that lists the keys)
    working_form = {}
    working_form['n'] = n
    working_form['N'] = N
    working_form['Bd'] = Bd
    working_form['Pd'] = Pd
    working_form['Phi'] = phi
    working_form['Sampling Frequency'] = SampleFrequency
    working_form['Time of Sampling'] = total_time
    working_form['Number of Samples'] = sample_count
    working_form['File of Interest'] = FileName
    working_form['HomeDirectory'] = HomeDirectory
    working_form['Working Directory'] = directory
    working_form['TrainingFileName'] = TrainingDataFile
    working_form['Signal Data of Interest'] = RawData
    return working_form

In [4]:
def getIMSB1XData(FileOfInterest):
    """
    Returns a 1-D array holding the first column of an IMS data file

    Subfunction for UserInputs2WorkingForm
    getIMSB1XData(
        FileOfInterest - Title of file containing the raw data from the IMS dataset
    )
    The file is read as a header-less table whose eight columns are named
    b1x, b1y, ..., b4y; only the first column (b1x) is returned.
    Dataset source given by the original author:
    http://data-acoustics.com/measurements/bearing-faults/bearing-4/
    """
    channel_names = ['b1x','b1y','b2x','b2y','b3x','b3y','b4x','b4y']

    #Read the table and label its channels (fails unless there are 8 columns)
    frame = pd.read_table(FileOfInterest,header = None)
    frame.columns = channel_names

    #First channel only
    first_channel = frame.values[:,0]
    return np.transpose(first_channel)

In [5]:
def getValuesFromRawData(filename):
    """
    Returns time, amp (tuple) which are both lists of values

    getValuesFromRawData(
        filename - Name of the header-less CSV file containing raw data
                   (column 0 = time, column 1 = value)
        )

    Gets the time and amplitude from the raw data from the microcontroller.
    Reading stops at the first row (after row 0) whose time and value are
    both 0, which marks the end of the recording. If the file has no such
    row, every remaining row is returned.
    """

    # Read Data
    dataset = pd.read_csv(filename, header = None, index_col = False)
    dataset = dataset.rename(columns={0: "Time", 1: "Value"})

    times = dataset['Time'].values
    values = dataset['Value'].values

    #Instantiate
    time = []
    amp = []

    #NOTE(review): row 0 is never read, exactly as before (the old loop
    #incremented its index before the first access). Confirm whether the
    #first row is a marker or a real sample.
    #Iterating over the slices stops at the last row, so the loop can no
    #longer index one element past the end when there is no 0,0 terminator.
    for t, v in zip(times[1:], values[1:]):
        #Break when seeing the end line of 0 0
        if t == 0 and v == 0:
            break
        time.append(t)
        amp.append(v)

    return time,amp

In [6]:
def ReplaceSignalDataofInterest(Data,UserInput,filename):
    """
    Returns a copy of UserInput with a different signal and file name

    ReplaceSignalDataofInterest(
        Data - The signal data to store
        UserInput - Dictionary of relevant info (see UserInputs2WorkingForm)
        filename - File name to store as the file of interest
        )

    The input dictionary is left untouched; a shallow copy is returned with
    'Signal Data of Interest' and 'File of Interest' replaced.
    This function was created to "cheat" the system and allow the generation
    of training data by manually inputting the actual data, because
    getIMSB1XData() only reads the B1X column.
    """
    #Shallow copy, then overwrite the two entries
    replaced = UserInput.copy()
    replaced.update({
        'Signal Data of Interest': Data,
        'File of Interest': filename,
    })
    return replaced

In [7]:
def BearingInfomation(UserInput):
    """
    Returns a dictionary with Bearing Characteristic Frequencies

    BearingInfomation(
        UserInput - Dictionary of relevant info (see UserInputs2WorkingForm)
        )

    Computes BPFI, BPFO, BSF and FTF from the shaft speed 'n', the element
    count 'N', the diameters 'Bd' and 'Pd' and the contact angle 'Phi'.
    The results are in the same unit as 'n'.
    """
    #Unpack the entries that are needed
    shaft_speed, element_count = UserInput['n'], UserInput['N']
    element_diameter, pitch_diameter = UserInput['Bd'], UserInput['Pd']
    contact_angle = UserInput['Phi']

    #Diameter ratio projected on the contact angle; shared by all formulas
    ratio = element_diameter/pitch_diameter*np.cos(contact_angle)
    half_count = element_count/2

    return {
        "BPFI": half_count*(1 + ratio)*shaft_speed,
        "BPFO": half_count*(1 - ratio)*shaft_speed,
        "BSF":  (pitch_diameter/(2*element_diameter))*(1-(ratio)**2)*shaft_speed,
        "FTF":  (1/2)*(1 - ratio)*shaft_speed
    }

In [8]:
def RemoveDCOffset(UserInput):
    """
    Returns a modified copy of the dictionary

    RemoveDCOffset(
        UserInput - Dictionary of relevant info (see UserInputs2WorkingForm)
        )

    Subtracts the mean of 'Signal Data of Interest' from the signal (removes
    the DC bias). The input dictionary itself is not modified.
    """
    key = "Signal Data of Interest"

    #Work on a shallow copy so the caller's dictionary keeps its signal
    centred = UserInput.copy()
    signal = centred[key]
    centred[key] = signal - np.mean(signal)

    return centred

In [9]:
def FourierTransform(UserInput):
    """
    Returns a dictionary that contains the frequency and frequency amplitude arrays

    FourierTransform(
        UserInput - Dictionary of relevant info (see UserInputs2WorkingForm)
    )

    Performs a fast Fourier transform on 'Signal Data of Interest' and keeps
    the first half of the spectrum. Amplitudes are the FFT magnitudes divided
    by 'Number of Samples'; frequencies are bin index / 'Time of Sampling'.
    """
    #Get Needed Info
    signal = UserInput['Signal Data of Interest']
    sample_count = UserInput['Number of Samples']
    total_time = UserInput['Time of Sampling']

    #Number of bins on the one-sided axis
    half = int(sample_count/2)

    #Frequency axis (two-sided, then cut to one side)
    frequencies = (np.arange(sample_count)/(total_time))[:half]

    #FFT magnitude, normalized by the sample count, one side only
    amplitudes = (abs(np.fft.fft(signal))/sample_count)[:half]

    return {
        "Frequency": frequencies,
        "Freq. Amp.": amplitudes
        }

In [10]:
def get_psd_values(UserInput):
    """
    Returns a dictionary that contains the frequency and the PSD arrays

    get_psd_values(
        UserInput - Dictionary of relevant info (see UserInputs2WorkingForm)
    )

    Estimates the power spectral density of 'Signal Data of Interest' with
    scipy.signal.welch, using 'Sampling Frequency' as fs and welch's default
    settings otherwise.
    """
    #welch returns (frequencies, power spectral density)
    frequencies, density = welch(
        UserInput['Signal Data of Interest'],
        fs=UserInput['Sampling Frequency'],
    )

    return {"Frequency": frequencies, "PSD": density}

In [11]:
def autocorr(x):
    """
    Taken from:
    https://ipython-books.github.io/103-computing-the-autocorrelation-of-a-time-series/

    Returns the autocorrelation of the signal x for non-negative lags

    autocorr(
        x - signal of interest
        )

    Correlates x with itself in 'full' mode and returns the second half of
    the result, starting at the middle element (the zero-lag term).
    """
    full_correlation = np.correlate(x, x, mode='full')
    zero_lag = len(full_correlation)//2
    return full_correlation[zero_lag:]

In [12]:
def get_autocorr_values(UserInput):
    """
    Modified from:
    https://ipython-books.github.io/103-computing-the-autocorrelation-of-a-time-series/

    Returns a dictionary with the lag axis ("X Values") and the
    autocorrelation of the signal ("Autocorr Values")

    get_autocorr_values(
        UserInput - Dictionary of relevant info (see UserInputs2WorkingForm)
    )

    Performs autocorrelation on 'Signal Data of Interest'.
    """
    #Get needed info
    signal = UserInput['Signal Data of Interest']
    total_time = UserInput['Time of Sampling']
    sample_count = UserInput['Number of Samples']

    #NOTE(review): the axis is built as (total sampling time) * lag index,
    #not (sample period) * lag index. Confirm which spacing is intended
    #before changing it; peak locations taken from this axis end up in the
    #feature table.
    lag_axis = np.array([total_time * lag for lag in range(sample_count)])

    return {
        "X Values": lag_axis,
        "Autocorr Values": autocorr(signal)
        }

In [13]:
def TimeDomainInformation(UserInput):
    """
    Returns a dictionary with Time Domain Characteristics

    TimeDomainInformation(
        UserInput - Dictionary of relevant info (see UserInputs2WorkingForm)
        )

    Calculates, for 'Signal Data of Interest':
        RMS          - root mean square, sqrt(mean(sig**2))
        STD, Mean    - standard deviation and mean
        Max, Min     - extreme values
        Peak-to-Peak - Max minus Min
        Max ABS      - largest absolute value
        Kurtosis     - scipy.stats.kurtosis with default settings
        Skew         - scipy.stats.skew with default settings

    Note: "RMS" used to hold the mean square (no square root). Feature
    tables generated with the old definition should be regenerated so
    training and test features stay consistent.
    """
    #Get Needed Info
    sig = UserInput['Signal Data of Interest']

    #Computed once and reused for Max, Min and Peak-to-Peak
    sig_max = np.max(sig)
    sig_min = np.min(sig)

    #Arrange
    x = {
        "RMS": np.sqrt(np.mean(sig**2)),
        "STD": np.std(sig),
        "Mean": np.mean(sig),
        "Max": sig_max,
        "Min": sig_min,
        "Peak-to-Peak": (sig_max - sig_min),
        "Max ABS": np.max(abs(sig)),
        "Kurtosis": kurtosis(sig),
        "Skew": skew(sig),
    }

    return x

In [14]:
def Magnitude(Y):
    """
    Returns a float that is the magnitude of the array Y

    Magnitude(
     Y - an array of numbers to get the magnitude
    )

    The magnitude is the square root of the sum of the squared elements.
    """
    #Sum of squares, accumulated element by element starting from 0
    sum_of_squares = sum(element**2 for element in Y)

    #Square Root
    return sum_of_squares ** 0.5

In [15]:
def Normalize(UserInput):
    """
    Returns a copy of the dictionary with a normalized signal

    Normalize(
        UserInput - Dictionary of relevant info (see UserInputs2WorkingForm)
    )

    Divides 'Signal Data of Interest' by its magnitude (see Magnitude).
    The input dictionary itself is not modified.
    """
    key = 'Signal Data of Interest'

    #Shallow copy so the caller keeps the unscaled signal
    normalized = UserInput.copy()
    signal = normalized[key]

    #Scale by the magnitude of the signal
    normalized[key] = signal / Magnitude(signal)

    return normalized

In [16]:
def PosMagnitude(Y):
    """
    Returns a float that is the positive magnitude of the array Y

    PosMagnitude(
     Y - an array of numbers to get the magnitude
    )

    Same as Magnitude, but only the elements greater than 0 contribute.
    """
    #Sum of squares of the strictly positive elements only
    positive_squares = sum(element**2 for element in Y if element > 0)

    #Square Root
    return positive_squares ** 0.5

In [17]:
def plotPeaks(X,Y,xlabel,ylabel,Title):
    """
    Returns a figure

    plotPeaks(
        X - Independent data array
        Y - Dependent data array
        xlabel - string for the x-axis label
        ylabel - string for the y-axis label
        Title - string for the title
    )

    Plots Y against X, marks every detected peak with a red star and shows
    the figure. Peaks are detected on Y scaled by its positive magnitude
    (see PosMagnitude), with the same settings as GetSortedPeak.
    """
    #Scale so the peak thresholds do not depend on the signal's units
    scaled = Y/PosMagnitude(Y)

    #Get indices of peaks
    peak_indices = detect_peaks(
        scaled,
        edge = 'rising',
        mph = .04,
        mpd = 5,
        threshold = 0.15*np.std(scaled),
    )

    #Plot base figure
    fig = plt.figure()
    plt.plot(X,Y)

    #Mark each peak on the unscaled curve
    for index in peak_indices:
        plt.scatter(X[index],Y[index], c= 'r', marker='*',s = 80)

    #Set Graph Features
    plt.xlabel(xlabel, fontsize=12)
    plt.ylabel(ylabel, fontsize=12)
    plt.title(Title)
    plt.grid(True)
    plt.show()

    return fig

In [18]:
def GetSortedPeak(X,Y):
    """
    SubFunction for FrequencyDomainInformation

    Returns (amplitudes, locations) of the detected peaks, as two lists
    sorted by descending amplitude

    GetSortedPeak(
        X - Independent Variable
        Y - Dependent Variable
        )

    Uses detect_peaks function taken from Github:
    __author__ = "Marcos Duarte, https://github.com/demotu/BMC"

    Peaks are detected on Y scaled by its positive magnitude (see
    PosMagnitude). The returned amplitudes are the unscaled Y values and the
    locations are the matching X values.
    """
    #An earlier version detected peaks on the unscaled Y with
    #mph = 0.1*nanmax(Y), threshold = 0.05*nanmax(Y) and mpd = 2.

    #Set Parameters
    scaled = Y/PosMagnitude(Y)

    #Get indices of peaks
    peak_indices = detect_peaks(
        scaled,
        edge = 'rising',
        mph = .04,
        mpd = 5,
        threshold = 0.15*np.std(scaled),
    )

    #Get values corresponding to indices
    amplitudes = [Y[index] for index in peak_indices]
    locations = [X[index] for index in peak_indices]

    #Ascending argsort read backwards gives descending amplitude
    descending = np.argsort(amplitudes)[::-1]

    #Reorder both lists with the same permutation so they stay matched
    sorted_amplitudes = [amplitudes[position] for position in descending]
    sorted_locations = [locations[position] for position in descending]

    return sorted_amplitudes, sorted_locations

In [19]:
def FrequencyDomainInformation(UserInput):
    """
    Returns a dictionary with Frequency Domain Characteristics
    Top 5 peak locations and amplitudes for:
    fft
    psd
    correlation

    FrequencyDomainInformation(
        UserInput - Dictionary of relevant info (see UserInputs2WorkingForm)
        )

    Note: 77777 refers to a blank cell (fewer than 5 peaks were found).
        Blank cells have to be filled in to work with a pandas dataframe.
        The filler is the number 77777, so every entry of the returned
        dictionary is a scalar.
    """
    BLANK = 77777   #Originally -999
    TOP = 5         #Number of peaks reported per quantity

    #Call FFT, PSD, and Correlation Values
    x1 = FourierTransform(UserInput)
    x2 = get_psd_values(UserInput)
    x3 = get_autocorr_values(UserInput)
    FTamp,FTfreq = GetSortedPeak(x1['Frequency'],x1['Freq. Amp.'])
    PSDamp,PSDfreq = GetSortedPeak(x2['Frequency'],x2['PSD'])
    Cor,CorTime = GetSortedPeak(x3['X Values'],x3['Autocorr Values'])

    #Key prefix and values, in the column order of the feature table
    groups = [
        ("FFT Frq @ Peak", FTfreq),
        ("FFT Amp @ Peak", FTamp),
        ("PSD Frq @ Peak", PSDfreq),
        ("PSD Amp @ Peak", PSDamp),
        ("Autocorrelate Time @ Peak", CorTime),
        ("Autocorrelate @ Peak", Cor),
    ]

    #Arrange
    x = {}
    for prefix, values in groups:
        #Keep the top entries and fill up with the blank marker
        top_values = list(values[:TOP])
        top_values += [BLANK]*(TOP - len(top_values))
        for rank, value in enumerate(top_values, start=1):
            x["%s %d" % (prefix, rank)] = value
    return x

In [20]:
def getAbsoluteTime(file):
    """
    Subfunction for StateInformation

    Returns the "magnitude" of the time stamp, in seconds

    getAbsoluteTime(
        file - file name of the form YYYY.MM.DD.hh.mm.ss
        )

    Turns the time stamp in an IMS file name into a number of seconds so
    that file names can be compared. Month 10 is taken as the start, every
    month is counted as 31 days and the year is parsed but not used.
    """
    #Character positions of year, month, day, hour, minute, second
    field_slices = ((0, 4), (5, 7), (8, 10), (11, 13), (14, 16), (17, 19))
    year, month, day, hour, minute, second = [
        int(file[start:stop]) for start, stop in field_slices
    ]

    #Seconds per unit
    per_minute = 60
    per_hour = 60*per_minute
    per_day = 24*per_hour
    per_month = 31*per_day

    return (second
            + per_minute*minute
            + per_hour*hour
            + per_day*day
            + per_month*(month - 10))


In [21]:
"""
http://mkalikatzarakis.eu/wp-content/uploads/2018/12/IMS_dset.html
Previous work done on this dataset states that seven different states of health were observed:

Early (initial run-in of the bearings)
Normal
Suspect (the health seems to be deteriorating)
Imminent failure (for bearings 1 and 2, which didn’t actually fail, but were severely worn out)
Inner race failure (bearing 3)
Rolling element failure (bearing 4)
Stage 2 failure (bearing 4)
For the first test (the one we are working on), the following labels have been proposed per file:

Bearing 1
early: 2003.10.22.12.06.24 - 2003.10.23.09.14.13
suspect: 2003.10.23.09.24.13 - 2003.11.08.12.11.44 (bearing 1 was in suspicious health from the beginning, but showed some self-healing effects)
normal: 2003.11.08.12.21.44 - 2003.11.19.21.06.07
suspect: 2003.11.19.21.16.07 - 2003.11.24.20.47.32
imminent failure: 2003.11.24.20.57.32 - 2003.11.25.23.39.56

Bearing 2
early: 2003.10.22.12.06.24 - 2003.11.01.21.41.44
normal: 2003.11.01.21.51.44 - 2003.11.24.01.01.24
suspect: 2003.11.24.01.11.24 - 2003.11.25.10.47.32
imminent failure: 2003.11.25.10.57.32 - 2003.11.25.23.39.56

Bearing 3
early: 2003.10.22.12.06.24 - 2003.11.01.21.41.44
normal: 2003.11.01.21.51.44 - 2003.11.22.09.16.56
suspect: 2003.11.22.09.26.56 - 2003.11.25.10.47.32
Inner race failure: 2003.11.25.10.57.32 - 2003.11.25.23.39.56

Bearing 4
early: 2003.10.22.12.06.24 - 2003.10.29.21.39.46
normal: 2003.10.29.21.49.46 - 2003.11.15.05.08.46
suspect: 2003.11.15.05.18.46 - 2003.11.18.19.12.30
Rolling element failure: 2003.11.19.09.06.09 - 2003.11.22.17.36.56
Stage 2 failure: 2003.11.22.17.46.56 - 2003.11.25.23.39.56
"""

def StateInformation(UserInput,BearingNum):
    """
    Returns a Dictionary of a Bearing State

    StateInformation(
        UserInput - Dictionary of relevant info (see UserInputs2WorkingForm)
        BearingNum - Bearing number (1-4) to know which label table to use
        )

    Compares the time stamp in 'File of Interest' with the stage boundaries
    of the chosen bearing and returns {"State": StateDict()[label]}.
    The label is "ERROR" for any other BearingNum.

    This function is used to generate a known outcome for the training data.
    This function is only intended to aid in generating the training data.
    """
    #For each bearing: (last file of a stage, label of that stage) in
    #chronological order, followed by the label that applies after the last
    #boundary. The first boundary of bearing 1 reads "2013"; the year does
    #not enter the value computed by getAbsoluteTime.
    transitions = (
        (1, (("2013.10.23.09.14.13", "Early"),
             ("2003.11.08.12.11.44", "Suspect"),
             ("2003.11.19.21.06.07", "Normal"),
             ("2003.11.24.20.47.32", "Suspect")),
            "Imminent Failure"),
        (2, (("2003.11.01.21.41.44", "Early"),
             ("2003.11.24.01.01.24", "Normal"),
             ("2003.11.25.10.47.32", "Suspect")),
            "Imminent Failure"),
        (3, (("2003.11.01.21.41.44", "Early"),
             ("2003.11.22.09.16.56", "Normal"),
             ("2003.11.25.10.47.32", "Suspect")),
            "Inner Race Failure"),
        (4, (("2003.10.29.21.39.46", "Early"),
             ("2003.11.15.05.08.46", "Normal"),
             ("2003.11.18.19.12.30", "Suspect"),
             ("2003.11.22.17.36.56", "Rolling Element Failure")),
            "Stage 2 Failure"),
    )

    #Compute time for comparison
    absolutetime = getAbsoluteTime(UserInput['File of Interest'])

    #Get state / output error if no state possible
    label = "ERROR"
    for bearing, stages, final_label in transitions:
        if BearingNum == bearing:
            #Past every boundary unless an earlier stage claims the file
            label = final_label
            for last_file, stage_label in stages:
                if absolutetime <= getAbsoluteTime(last_file):
                    label = stage_label
                    break
            break

    #NOT in the original model
    State2Int = StateDict()

    return {"State": State2Int[label]}

In [22]:
def MotorInformation(UserInput):
    """
    Returns a Dictionary containing motor characteristics used in the IMS dataset

    MotorInformation(
        UserInput - Dictionary of relevant info (see UserInputs2WorkingForm);
                    currently not read
        )

    Only valid for the IMS dataset: both values are hardcoded.
    WILL NEED TO BE UPDATED GOING FORWARD
    """
    motor = {}
    motor["Motor Type AC(1)-DC(0)"] = 1
    motor["Shaft Speed [Hz]"] = 2000/60
    return motor

In [23]:
def getCompleteDataFrame(UserInput,BearingNum):
    """
    Returns a one-row Dataframe for a sample, including its state

    getCompleteDataFrame(
        UserInput - Dictionary of relevant info (see UserInputs2WorkingForm)
        BearingNum - Bearing Num to know which failure type
        )

    The signal has its DC offset removed and is normalized before the
    features are computed. Columns are ordered: state, motor, bearing,
    time domain, frequency domain.

    This function is used to generate a known outcome for the training data.
    This function is only intended to aid in generating the training data.
    """
    #Same preprocessing order as getTESTDataFrame, for consistency
    prepared = Normalize(RemoveDCOffset(UserInput.copy()))

    #Feature groups
    bearing = BearingInfomation(prepared)
    time_domain = TimeDomainInformation(prepared)
    frequency_domain = FrequencyDomainInformation(prepared)
    state = StateInformation(prepared,BearingNum)
    motor = MotorInformation(prepared)

    #Arrange: merge in column order
    row = {}
    for group in (state, motor, bearing, time_domain, frequency_domain):
        row.update(group)

    return pd.DataFrame(row, index=[0])

In [24]:
def getTESTDataFrame(UserInput):
    """
    Returns a one-row Dataframe that does not need the state

    getTESTDataFrame(
        UserInput - Dictionary of relevant info (see UserInputs2WorkingForm)
        )

    The signal has its DC offset removed and is normalized before the
    features are computed. Columns are ordered: motor, bearing, time domain,
    frequency domain (no state column).
    """
    #Same preprocessing order as getCompleteDataFrame, for consistency
    prepared = Normalize(RemoveDCOffset(UserInput.copy()))

    #Feature groups
    bearing = BearingInfomation(prepared)
    time_domain = TimeDomainInformation(prepared)
    frequency_domain = FrequencyDomainInformation(prepared)
    motor = MotorInformation(prepared)

    #Arrange (with no state info): merge in column order
    row = {}
    for group in (motor, bearing, time_domain, frequency_domain):
        row.update(group)

    return pd.DataFrame(row, index=[0])

In [25]:
def getTESTMatrix(UserInput):
    """
    Returns an array that can be directly plugged into PredictModel()

    getTESTMatrix(
        UserInput - Dictionary of relevant info (see UserInputs2WorkingForm)
    )

    The array holds every column of getTESTDataFrame except the last one.
    """
    features = getTESTDataFrame(UserInput)

    #NOTE(review): the last feature column is dropped here. The training
    #loaders slice off their last column the same way - confirm this is
    #intended rather than a leftover.
    return features.values[:, :-1]

In [26]:
def getPlot(X,Y,xlabel,ylabel,Title):
    """
    Subfunction of getGraphs
    Returns a figure

    getPlot(
        X - Data for independent variable
        Y - Data for dependent variable
        xlabel - X-axis label
        ylabel - Y-axis label
        Title - Title of figure
        )

    Draws a line plot of Y against X in a random colour, with a grid.
    The figure is not shown here.
    """
    #One figure holding one set of axes
    fig, axes = plt.subplots()

    #Random RGB colour for the line
    axes.plot(X,Y,c = np.random.rand(3,))

    axes.set_xlabel(xlabel, fontsize=12)
    axes.set_ylabel(ylabel, fontsize=12)
    axes.set_title(Title)
    axes.grid(True)

    return fig

In [27]:
def getGraphs(UserInput):
    """
    Returns a list of six figures

    getGraphs(
        UserInput - Dictionary of relevant info (see UserInputs2WorkingForm)
        )

    This function generates figures for:
    Raw time series
    Time series with no DC offset
    Normalized time series
    FFT
    PSD
    Correlation
    """
    raw_signal = UserInput['Signal Data of Interest']

    #Create time series array: one entry per sample.
    #Built from the sample index instead of np.arange with a float step,
    #whose length can come out one element off and no longer match the signal.
    t = np.arange(len(raw_signal))/UserInput['Sampling Frequency']

    #Perform DC Offset removal, normalization, FFT, PSD, Correlation
    UserInput1 = RemoveDCOffset(UserInput)
    UserInput2 = Normalize(UserInput1)
    x1 = FourierTransform(UserInput2)
    x2 = get_psd_values(UserInput2)
    x3 = get_autocorr_values(UserInput2)

    #Get Figures
    figs = []
    figs.append(getPlot(t,raw_signal,"time (s)","Amplitude","Raw Data"))
    figs.append(getPlot(t,UserInput1['Signal Data of Interest'],"time (s)","Amplitude","Raw Data w/ Removed DC Offset"))
    figs.append(getPlot(t,UserInput2['Signal Data of Interest'],"time (s)","Amplitude","Normalized Raw Data"))
    #The y-axis of the spectrum is an amplitude, not a time
    figs.append(getPlot(x1['Frequency'],x1['Freq. Amp.'],'Frequency [Hz]',"Amplitude","FFT"))
    figs.append(getPlot(x2['Frequency'],x2['PSD'],'Frequency [Hz]','PSD [V**2 / Hz]',"PSD"))
    figs.append(getPlot(x3['X Values'],x3['Autocorr Values'],'time delay [s]',"Autocorrelation amplitude","Autocorrelation"))

    return figs

In [28]:
def getBarPlot(X,Y,xlabel,Title):
    """
    Returns a figure

    getBarPlot(
        X - Bar lengths
        Y - Labels of the bars (used as y-axis tick labels)
        xlabel - X-axis label
        Title - Title of figure
        )

    Draws a horizontal bar chart with one bar per entry of Y.
    The figure is not shown here.
    """
    #One bar position per label
    positions = np.arange(len(Y))

    #Bar plot
    fig, axes = plt.subplots()
    axes.barh(positions, X, align='center')
    axes.set_xlabel(xlabel, fontsize=12)
    axes.set_yticks(positions)
    axes.set_yticklabels(Y)
    axes.set_title(Title)
    axes.grid(True)
    return fig

In [29]:
def truncate(f, n):
    '''
    Truncates/pads a float f to n decimal places without rounding

    Returns a string. Values whose default formatting uses exponent
    notation are formatted with fixed-point formatting instead.

    https://stackoverflow.com/questions/783897/truncating-floats-in-python/51172324#51172324
    '''
    text = '{}'.format(f)

    #Exponent notation cannot be cut at the decimal point
    if 'e' in text.lower():
        return '{0:.{1}f}'.format(f, n)

    #Cut (or zero-pad) the digits after the decimal point
    whole, _, fraction = text.partition('.')
    padded_fraction = fraction + '0'*n
    return whole + '.' + padded_fraction[:n]

In [30]:
def GetSplitTrainingData(UserInput, seed = 6):
    """
    Returns X_train, X_test, Y_train, Y_test

    GetSplitTrainingData(
        UserInput - Dictionary of relevant info (see UserInputs2WorkingForm)
        seed - random number for splitting of test and training (default = 6)
        )

    Reads the training CSV named by 'TrainingFileName' from 'HomeDirectory'
    and splits it 80/20 into training and test sets. Column 0 of the table
    is the result (Y); the columns after it, except the last one, are the
    features (X).

    Raises FileNotFoundError if the training file is not listed in
    'Working Directory'.
    """
    #Find the training file among the listed files
    training_name = UserInput['TrainingFileName']
    if training_name not in UserInput['Working Directory']:
        raise FileNotFoundError(
            "Training file %r is not listed in the working directory" % (training_name,)
        )

    #The listing holds bare file names, so read relative to the home
    #directory (falls back to the current directory if none is stored)
    training_path = os.path.join(UserInput.get('HomeDirectory') or '', training_name)
    dataset = pd.read_csv(training_path,header = 0,index_col = 0)

    #Get the values
    X = dataset.values[:,1:(dataset.shape[1]-1)]
    Y = dataset.values[:,0]

    #Set splitting parameters
    validation_size = 0.20

    #Split data
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

    return X_train, X_test, Y_train, Y_test

In [31]:
def GetTrainingData(UserInput):
    """
    Returns X_train, Y_train, dataset

    GetTrainingData(
        UserInput - Dictionary of relevant info (see UserInputs2WorkingForm)
        )

    Reads the training CSV named by 'TrainingFileName' from 'HomeDirectory'
    and returns the whole table as a training set (no split):
        X_train - the columns after column 0, except the last one
        Y_train - column 0 (the result)
        dataset - the DataFrame that was read

    Raises FileNotFoundError if the training file is not listed in
    'Working Directory'.
    """
    #Find the training file among the listed files
    training_name = UserInput['TrainingFileName']
    if training_name not in UserInput['Working Directory']:
        raise FileNotFoundError(
            "Training file %r is not listed in the working directory" % (training_name,)
        )

    #The listing holds bare file names, so read relative to the home
    #directory (falls back to the current directory if none is stored)
    training_path = os.path.join(UserInput.get('HomeDirectory') or '', training_name)
    dataset = pd.read_csv(training_path,header = 0,index_col = 0)

    #Return the entire sets
    X_train = dataset.values[:,1:(dataset.shape[1]-1)]
    Y_train = dataset.values[:,0]

    return X_train, Y_train, dataset

In [32]:
def GetTESTDataFrameNames(UserInput):
    """
    Returns a list of strings

    GetTESTDataFrameNames(
        UserInput - Dictionary of relevant info (see UserInputs2WorkingForm)
        )

    Reads the training CSV named by 'TrainingFileName' from 'HomeDirectory'
    and returns the names of its columns (the first CSV column is used as
    the index and is not included).

    Raises FileNotFoundError if the training file is not listed in
    'Working Directory'.
    """
    #Find the training file among the listed files
    training_name = UserInput['TrainingFileName']
    if training_name not in UserInput['Working Directory']:
        raise FileNotFoundError(
            "Training file %r is not listed in the working directory" % (training_name,)
        )

    #The listing holds bare file names, so read relative to the home
    #directory (falls back to the current directory if none is stored)
    training_path = os.path.join(UserInput.get('HomeDirectory') or '', training_name)
    dataset = pd.read_csv(training_path,header = 0,index_col = 0)

    #Get names
    return list(dataset.columns)

In [33]:
def TrainModel(X_train,Y_train,random_state=None):
    """
    Returns a classifier that has been fit

    TrainModel(
        X_train - Training Data
        Y_train - Results of Training Data for supervised learning
        random_state - optional seed passed to the classifier; the default
                       (None) leaves the fit unseeded, as before
        )

    Currently only fits RandomForestClassifier
    (200 trees, min_samples_split of 10).
    """

    #Fit final model
    classifier = RandomForestClassifier(
        min_samples_split= 10,
        n_estimators = 200,
        random_state = random_state,
    )
    classifier = classifier.fit(X_train, Y_train)

    return classifier

In [34]:
def get_key(value,dictionary):
    """
    Returns a list of keys

    get_key(
        value - iterable of values to look up
        dictionary - dictionary to search
        )

    For each entry of value, in order, collects every key of dictionary
    whose value equals that entry. Entries without a match add nothing.

    Modified from:
    https://www.geeksforgeeks.org/python-get-key-from-value-in-dictionary/
    """
    return [
        key
        for wanted in value
        for key, stored in dictionary.items()
        if stored == wanted
    ]

In [35]:
def PredictModel(classifier,X_test):
    """
    Returns a tuple of prediction in integer form, string form

    PredictModel(
        classifier - fitted classifier
        X_test - data to be tested
        )

    The string form is obtained by looking the predicted values up in
    StateDict() with get_key.
    """
    #Get key-value relationship
    state_to_int = StateDict()

    #Predict
    predicted = classifier.predict(X_test)

    #Integer form first, then the matching keys (strings)
    return predicted, get_key(predicted,state_to_int)

In [36]:
def PredictProbModel(classifier,X_test):
    """
    Returns the prediction probabilities scaled to percent (out of 100 not 1)
    
    PredictProbModel(
        classifier - fitted classifier
        X_test - data to be tested
        )
    """
    """
    COMBINE WITH PREDICT MODEL GOING FORWARD
    """
    
    #Probability prediction as given by the classifier
    Probability = classifier.predict_proba(X_test)
    
    #Scale to percent
    Percent = Probability*100
    
    return Percent

In [37]:
def GetUserInputNames(UserInput):
    """
    Returns a list of strings
    
    GetUserInputNames(
        UserInput - Dictionary of relevant info (see UserInputs2WorkingForm)
        )
    
    This returns the keys of UserInput in iteration order
    """
    #Iterating a dictionary yields its keys
    return list(UserInput)

In [38]:
def FeatureComparison(models,X_train, X_test, Y_train, Y_test,UserInput):
    """
    returns tuple: 
        results - a list of classification reports (one per model that could be scored)
        string - a list of messages: classifier name plus its classification report,
                 or a note that the fit / the report failed
        string1 - a list of messages naming classifiers without a feature importance graph
        time - a list of messages recording how long fit + prediction + report took
        fig - a list of figures containing the top feature importances per classifier
        
    FeatureComparison(
        models - dictionary {name: classifier} of classifiers for testing
        X_train - Training Data
        X_test - Data to be predicted
        Y_train - Results of Training Data for supervised learning
        Y_test - Known results for X_test
        UserInput - Dictionary of relevant info (see UserInputs2WorkingForm)
        )
        
    The function's usefulness lies in its ability to test mass quantities of classifiers.
    A model that fails never aborts the comparison; the failure is recorded in
    string / string1 instead.
    """
    
    #Instantiate
    results = []
    string = []
    string1 = []
    fig = []
    time = []
    
    #Append when appropriate
    for ModelName in models:
        
        #Time process time
        before = datetime.now()
        
        #Fit Model
        #CTest is reset for every model so that a failed fit can never fall back
        #on the estimator left over from the previous iteration
        CTest = None
        error = False
        try:
            CTest = models[ModelName].fit(X_train, Y_train)
        except Exception:
            error = True
        
        #Classification report
        if not error:
            try:
                Y_pred = CTest.predict(X_test)
                try:
                    temporary = classification_report(Y_test,Y_pred)
                except Exception:
                    #Retry with rounded predictions (np.round_ was removed in NumPy 2.0)
                    temporary = classification_report(Y_test,np.round(Y_pred))
                results.append(temporary)
                string.append("{} has the following results: \n\n {} \n\n".format(ModelName,temporary))
            except Exception:
                string.append("{} failed during classification_report.".format(ModelName))
        else:
            string.append("{} failed during the fit.".format(ModelName))
        
        #Time fitting and scoring
        end = datetime.now()
        time.append("{} took {} time".format(ModelName,(end-before)))
        
        #Try figures
        NoImportance = "{} has no feature importance".format(ModelName)
        if error:
            string1.append(NoImportance)
        else:
            try:
                m = CTest.feature_importances_
                m1 = GetTESTDataFrameNames(UserInput)
                #Sort names by importance and keep the 10 most important ones
                Z = [x for _,x in sorted(zip(m,m1))]
                Z1 = sorted(m)
                fig.append(getBarPlot(Z1[-10:],Z[-10:],"Relative Importance",ModelName))
            except Exception:
                string1.append(NoImportance)
            
    return results,string,string1,time,fig

In [39]:
def _WriteLinesIfAny(FileName,lines):
    """
    Writes every entry of lines to FileName (one per line); does nothing when lines is empty
    """
    if not lines:
        return
    with open(FileName, 'w') as writeFile:
        for entry in lines:
            writeFile.write("{}\n".format(entry))


def GenerateComparisonResultFiles(results,string,string1,time,fig,str1 = "Graphs.pdf",\
                                  str2 = "Time.txt",str3 = "NoGraphs.txt",str4 = "Scoring.txt"):
    """
    returns True upon completion
    
    GenerateComparisonResultFiles(
        results - an array of classification results (not written to any file)
        string - an array of classifier names and classification results
        string1 - an array of classifiers that do not have feature importance graphs
        time - an array of time stamps recording how long a fit/prediction/classification
                result took for a figure
        fig - an array of figures containing feature importance for classifiers
        str1 - title of pdf file relating to fig (Default: "Graphs.pdf")
        str2 - title of txt file relating to time (Default: "Time.txt")
        str3 - title of txt file relating to string1 (Default: "NoGraphs.txt")
        str4 - title of txt file relating to string (Default: "Scoring.txt")
        )
        
    The function generates up to 4 files: the pdf is always opened, each txt file
    is only written when its corresponding array is not empty.
    This function is to be used after FeatureComparison()
    """
    #PDF of figures (the context manager closes the pdf even if a figure fails to save)
    with matplotlib.backends.backend_pdf.PdfPages(str1) as pdf:
        for figure in fig:
            pdf.savefig(figure,dpi=300, bbox_inches = "tight")

    #.txt of Time
    _WriteLinesIfAny(str2,time)

    #.txt of NoGraphs
    _WriteLinesIfAny(str3,string1)

    #.txt Scoring Results
    _WriteLinesIfAny(str4,string)
    
    return True

In [40]:
def SignalGenerator(t):
    """
    returns numpy.ndarray
    
    SignalGenerator(
        t - numpy.ndarray holding the time series
        )
        
    Generates a practice signal: the sum of three sine waves, each with a random
    magnitude, random integer frequency in [1, 1000) and random phase, and each
    with its own white noise added.
    """
    Samples = len(t)
    
    #The order of the random draws is kept fixed (noise, phase, frequency, magnitude)
    #so that a seeded generator reproduces the same signal
    Noise = [np.random.randn(Samples) for _ in range(3)]          # white noise per tone
    Phase = np.random.randn(3)                                    #radians
    Frequency = [np.random.randint(1,1000) for _ in range(3)]     #Hz
    Magnitude = np.random.randn(3)
    
    #One noisy sine wave per tone
    Tones = [
        Magnitude[k] * np.sin(2 * np.pi * Frequency[k] * t + Phase[k] ) + Noise[k]
        for k in range(3)
    ]
    
    return Tones[0] + Tones[1] + Tones[2]

In [41]:
def GenerateTrainingFile(string,HomeDirectory = "/Users/tbryan/Desktop/9 2019 Fall/ECEN 403/Programming/ProgramsForDemo/Final"):
    """
    returns True upon completion
    
    GenerateTrainingFile(
        string - filename of the CSV file to create (include CSV extension)
        HomeDirectory - directory that contains Data/IMS/1st_test and in which the
                        CSV file is written (Default: the original hard coded path)
    )
    
    Every file of Data/IMS/1st_test (except ".DS_Store") is read, its 8 columns are
    treated as bearing 1 X, bearing 1 Y, ... bearing 4 X, bearing 4 Y, and the first
    row of getCompleteDataFrame() for each column becomes one row of the training
    file. Rows are grouped per channel (all B1X rows, then all B1Y rows, ...).
    
    Side effect: the current working directory is HomeDirectory when the function
    returns (also when it fails part way).
    
    Raises ValueError if the test folder holds no data files.
    """
    
    #Resolve once so that changing directories below cannot invalidate a relative path
    HomeDirectory = os.path.abspath(HomeDirectory)
    TestFolder = '1st_test'
    NumberOfChannels = 8   #2 channels (X, Y) for each of the 4 bearings
    
    #One list of feature rows per channel
    ChannelRows = [[] for _ in range(NumberOfChannels)]
    ColumnTitle = None
    
    #This is where the data is
    os.chdir(os.path.join(HomeDirectory,'Data','IMS'))
    try:
        FileNames = sorted(os.listdir(TestFolder))
        os.chdir(TestFolder)
        IMSDictionary = GenerateIMSDictionary('2003.10.22.12.06.24'," ",os.getcwd())
        
        #For each file
        for FileName in FileNames:
            if FileName == ".DS_Store":
                continue
            
            #Read Data
            values = pd.read_table(FileName,header = None).values
            
            #Get each bearing channel from file
            for Channel in range(NumberOfChannels):
                DF = ReplaceSignalDataofInterest(np.transpose(values[:,Channel]),IMSDictionary,FileName)
                
                #Channels 0,1 -> bearing 1; 2,3 -> bearing 2; ...
                Features = getCompleteDataFrame(DF,Channel // 2 + 1)
                
                #Get Column Title (taken from the bearing 1 X frame)
                if Channel == 0:
                    ColumnTitle = Features.columns
                
                ChannelRows[Channel].append(Features.values[0,:])
    finally:
        #Always end up in HomeDirectory, even if reading a file failed
        os.chdir(HomeDirectory)
    
    if ColumnTitle is None:
        raise ValueError("No data files found in '{}'".format(TestFolder))
    
    #Combine Dataframes (DataFrame.append was removed in pandas 2.0, concat keeps the same row order and index)
    TD = pd.concat([pd.DataFrame(rows,columns = ColumnTitle) for rows in ChannelRows])
    
    #To csv
    TD.to_csv(string)

    return True

In [42]:
def GetFinalModelForComparison(MinSampleSplit=2, Nestimators = 10):
    """
    Returns a dictionary {'RandomForestClassifier': classifier} holding the final
    ML classifier, with the min_samples_split and n_estimators settings taken
    from the arguments
    
    GetFinalModelForComparison(MinSampleSplit=2, Nestimators = 10)
    """
    #Build the (unfitted) classifier first, then wrap it under its name
    forest = RandomForestClassifier(n_estimators = Nestimators,min_samples_split = MinSampleSplit)
    
    return {'RandomForestClassifier': forest}

In [43]:
def StateDict():
    """
    Returns a dictionary "Key": Value
    
    StateDict()
    
    The key is the more verbose description of the output state of the training data
    The value is the int that is defined to relate to the corresponding key. 
    """
    
    #Regular states are numbered consecutively from 0 in this order
    StateNames = (
        "Early",
        "Suspect",
        "Normal",
        "Imminent Failure",
        "Inner Race Failure",
        "Rolling Element Failure",
        "Stage 2 Failure",
    )
    State2Int = {name: code for code, name in enumerate(StateNames)}
    
    #Sentinel code kept well outside the regular range
    State2Int["ERROR"] = 77777
    
    return State2Int

In [44]:
def GetReducedFeatureArraysFromDataFrame(dataset):
    """
    Returns tuple: ReducedFeatureTrainingData, dataset[['State']]
    ReducedFeatureTrainingData - dictionary of feature extraction dataframes
    dataset[['State']] - corresponding dataframe with outcome states
    
    GetReducedFeatureArraysFromDataFrame(
        dataset - dataframe containing the training data
        )
    
    Keys '2' ... '10' hold the first 2 ... 10 columns of the ranked feature list,
    keys '3a', '4a', '5a' hold the first 3, 4, 5 columns of the alternative list
    and 'all' holds every column except 'State'.
    """
    
    #Ranked feature list: set 'N' uses its first N entries
    RankedFeatures = ['RMS','FTF','Max ABS','Skew','PSD Frq @ Peak 5','Autocorrelate Time @ Peak 5',
                      'Min','PSD Frq @ Peak 1','FFT Frq @ Peak 1','PSD Frq @ Peak 2']
    
    #Alternative feature list: set 'Na' uses its first N entries
    AlternativeFeatures = ['RMS','Max ABS','Skew','Max','Min']
    
    #Feature Extraction
    ReducedFeatureTrainingData = {}
    for Count in range(2,len(RankedFeatures) + 1):
        ReducedFeatureTrainingData[str(Count)] = dataset[RankedFeatures[:Count]]
    for Count in (3,4,5):
        ReducedFeatureTrainingData['{}a'.format(Count)] = dataset[AlternativeFeatures[:Count]]
    ReducedFeatureTrainingData['all'] = dataset.drop(columns=['State'])
    
    return ReducedFeatureTrainingData, dataset[['State']]

In [45]:
def GetFinalReducedFeatureArraysFromDataFrame(dataset):
    """
    Returns tuple: ReducedFeatureTrainingData, dataset[['State']]
    ReducedFeatureTrainingData - dictionary of feature extraction dataframes
    dataset[['State']] - corresponding dataframe with outcome states
    
    GetFinalReducedFeatureArraysFromDataFrame(
        dataset - dataframe containing the training data
        )
    """
    
    #Feature Extraction
    #Same as GetReducedFeatureArraysFromDataFrame() but with only the '10' feature set
    FinalFeatures = ['RMS','FTF','Max ABS','Skew','PSD Frq @ Peak 5','Autocorrelate Time @ Peak 5',
                     'Min','PSD Frq @ Peak 1','FFT Frq @ Peak 1','PSD Frq @ Peak 2']
    ReducedFeatureTrainingData = {'10': dataset[FinalFeatures]}
    
    #Outcome states stay a one-column dataframe
    States = dataset[['State']]
    
    return ReducedFeatureTrainingData, States

In [46]:
def Get10FeaturesFromDataFrame(df):
    """
    Returns a dataframe of the 10 important features
    
    Get10FeaturesFromDataFrame(
        df - dataframe of the getTestDataFrame that contains the data to be analyzed
        )
    """
    #Names of the 10 features, in the order they are returned
    ImportantFeatures = [
        'RMS',
        'FTF',
        'Max ABS',
        'Skew',
        'PSD Frq @ Peak 5',
        'Autocorrelate Time @ Peak 5',
        'Min',
        'PSD Frq @ Peak 1',
        'FFT Frq @ Peak 1',
        'PSD Frq @ Peak 2',
    ]
    
    #Extract Features
    return df[ImportantFeatures]

In [47]:
def ReducedFeatureComparison(ReducedFeatureTrainingData,StateTrain):
    """
    Returns tuple: result, header which are lists of strings
    result - contains one classification report per feature set
    header - contains one formatted heading per feature set
    
    ReducedFeatureComparison(
        ReducedFeatureTrainingData - dictionary of dataframes with extracted features
        StateTrain - dataframe of the outcome states associated with ReducedFeatureTrainingData
    )
    
    Every feature set is split 80/20 with a fixed seed, a model is trained with
    TrainModel() on the larger part and scored on the smaller part.
    """
    
    #Set Splitting Features
    validation_size = 0.20
    seed = 6
    
    #Instantiate
    result, header = [], []
    
    #For each combination of Feature Extraction Models
    for FeatureSetName, FeatureFrame in ReducedFeatureTrainingData.items():
        
        #Split features and the first column of the states the same way
        xtrain, xtest, ytrain, ytest = model_selection.train_test_split(
            FeatureFrame.values,
            StateTrain.values[:,0],
            test_size=validation_size,
            random_state=seed,
        )
        
        #Train
        ClassifierReduced = TrainModel(xtrain,ytrain)
        
        #Predict (only the raw prediction is needed here, not the string form)
        ypred = PredictModel(ClassifierReduced,xtest)[0]
        
        #Append strings
        result.append(classification_report(ytest,ypred))
        header.append('Feature Number ({}):'.format(FeatureSetName))
            
    return result,header

In [48]:
def GenerateFeatureFile(string,result,header):
    """
    Returns True upon completion
    
    GenerateFeatureFile(
        string - string with file name (include extension)
        result - contains classification reports
        header - contains formatted strings, one per entry of result
    )
    
    Each header is written on its own line followed by its classification report.
    """
    
    #Write to file (the with block closes the file)
    with open(string, 'w') as writeFile:
        for position, report in enumerate(result):
            writeFile.write("{}\n{}\n".format(header[position],report))
    
    return True