# BackEnd for Classifiers
Provides generalized methods for training, testing and saving classifiers.


## Setup Environment

In [2]:
# Machine Learning Packages
import sklearn
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn import neighbors, datasets
from sklearn.metrics import classification_report, confusion_matrix

# Utility Packages
import re
import inspect
import collections
import matplotlib.pyplot as plt
import pickle
import os
import joblib
import types

# Data Packages
import pandas as pd
import numpy as np
import pytimber
db = pytimber.LoggingDB()

# Decorative Packages
import ipywidgets as widgets
from ipywidgets import IntProgress
from matplotlib.colors import ListedColormap
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

SyntaxError: positional argument follows keyword argument (<ipython-input-2-2fe9d552a2d4>, line 30)

## DataSet Creation

In [2]:
def lockout():
    """Ask the user to confirm a reset before cached data is overwritten.

    The prompt is repeated until the reply consists only of digits and
    evaluates to 0 or 1.

    Returns:
        2 when the user confirms the reset (answer 1), 0 when the user
        declines (answer 0). Callers in this file replace a requested
        ``reset == 1`` with this value, so 2 means "confirmed reset".
    """
    prompt = "Are you sure you want to reset? 1: Yes, 0: No"
    resp = input(prompt)
    while True:
        # fullmatch (not match) so that replies such as "1abc" are rejected
        # here instead of raising ValueError inside int().
        if re.fullmatch(r"\d+", resp):
            answer = int(resp)
            if answer == 1:
                return 2
            if answer == 0:
                return 0
        print("Please type a number, 0 or 1")
        resp = input(prompt)

## Generate Data from Catalogue
Use probe ID and Fill Numbers from a CSV-stored catalogue to retrieve Probe Data. <br>
Probe data is cached to the local data sub-directory and is returned as a Pandas DataFrame.

**bin_spec** (passed to the functions below as `feature_spec`) is a variable that specifies how many bins to use for generating features:
* A tuple of length 2, e.g. (3,6), specifies the lower bound 3 and upper bound 6 with step 1 (the upper bound itself is excluded, as with `np.arange`)
* A tuple of length 3, e.g. (2,12,3), specifies the lower bound 2 and upper bound 12 with step 3 (upper bound excluded)
* An integer specifies to generate only 1 cut

Note for future development: If a new feature set is required, pass a different feature function <br>
E.g. Fourier Coefficients, Wavelet Transform terms...

In [75]:
def get_catalogue(catalogue_name):
    """Read ``data/catalogue/<catalogue_name>_Catalogue.csv`` below the current working directory.

    Args:
        catalogue_name: Prefix of the catalogue CSV file name.

    Returns:
        The catalogue as a pandas DataFrame.

    Raises:
        FileNotFoundError: The catalogue CSV does not exist.
        TypeError: The loaded object is not a DataFrame.
        ValueError: One of the columns 'Probe ID', 'Response' or 'Fill' is missing.
    """
    csv_name = "{}_Catalogue.csv".format(catalogue_name)
    path = os.path.join(os.getcwd(), "data", "catalogue", csv_name)

    # Guard clause: bail out before attempting to parse a missing file.
    if not os.path.isfile(path):
        raise FileNotFoundError("{} does not appear to exist".format(path))

    catalogue = pd.read_csv(path)
    if not isinstance(catalogue, pd.DataFrame):
        raise TypeError("Catalogue provided is not a Pandas Dataframe")

    header = catalogue.columns.values
    for required_column in ("Probe ID", "Response", "Fill"):
        if required_column not in header:
            raise ValueError("Expected to find 'Probe ID','Response' and'Fill' in Catalogue header")
    return catalogue

def process_feature_spec(feature_spec):
    """Normalise a feature (bin) specification into a 1-D array of feature counts.

    Args:
        feature_spec: One of
            * a NumPy array, returned unchanged;
            * a tuple ``(lower, upper)``, expanded with ``np.arange`` using step 1;
            * a tuple ``(lower, upper, step)``, expanded with ``np.arange``;
            * a single Python or NumPy integer, wrapped in a length-1 array.
            As with ``np.arange``, ``upper`` itself is not included.

    Returns:
        numpy.ndarray of feature counts to iterate over.

    Raises:
        ValueError: The tuple does not have 2 or 3 entries, or the value is
            of an unsupported type.
    """
    # The module is imported as ``np``; the bare name ``numpy`` is not defined.
    if isinstance(feature_spec, np.ndarray):
        return feature_spec

    if isinstance(feature_spec, tuple):
        if len(feature_spec) == 2:
            lower, upper = feature_spec
            return np.arange(lower, upper, 1)
        if len(feature_spec) == 3:
            lower, upper, step = feature_spec
            return np.arange(lower, upper, step)
        raise ValueError("Bins specification tuple must have 2 or 3 values: {}".format(feature_spec))

    # Accept every NumPy integer width (e.g. int32 produced on some platforms),
    # but keep rejecting bool, which is technically an int subclass.
    if isinstance(feature_spec, (int, np.integer)) and not isinstance(feature_spec, bool):
        return np.atleast_1d(feature_spec)  # 1D array so that it is iterable

    raise ValueError("Bin Specification {} is of type {} but must be a tuple or int".format(feature_spec,type(feature_spec)))
    
def generate_data_from_catalogue(catalogue_name, feature_spec=4, reset=0, feature_generator = bin_data, verbose=0):
    """Build one labelled feature dataset per entry of ``feature_spec`` from a catalogue.

    For every catalogue row whose Response is not "UNDETERMINED" the processed
    gauge data is loaded from (or generated and pickled to)
    ``data/probes/<fill>/<gauge>/Probe_<gauge>_Fill<fill>.p`` and the features
    are loaded from (or generated and pickled to)
    ``<that folder>/<catalogue_name>/<n>_Intervals_Gauge_<gauge>.p``.
    Each resulting dataset is written to
    ``data/datasets/<catalogue_name>/<n>_features_<catalogue_name>.csv`` unless
    that file already exists and no reset was confirmed.

    Args:
        catalogue_name: Name passed to ``get_catalogue``; also used for folder names.
        feature_spec: Bin specification understood by ``process_feature_spec``.
        reset: 0 reuses cached files; 1 asks for confirmation through
            ``lockout`` (which yields 2 or 0); any non-zero value after that
            regenerates the probe data and overwrites the dataset CSVs.
        feature_generator: Callable ``(readings, no_of_features) -> features``.
        verbose: 1 prints catalogue summaries, 2 prints per-probe details.

    Returns:
        dict mapping the number of feature columns to a DataFrame holding the
        columns ``Bin0..``, ``Response`` (NORMAL -> 0, COUPLED -> 1),
        ``Probe ID`` and ``Fill``; or ``(None, None, None)`` when the progress
        counter does not match the number of probes.
    """
    catalogue = get_catalogue(catalogue_name)
    if verbose==1: print("{0}Catalogue {2}{1}: {3}".format(color.BOLD,color.END,catalogue_name,catalogue.shape))

    if reset == 1: reset = lockout() # To prevent accidental overwriting

    # Binning Setup
    feature_spec = process_feature_spec(feature_spec)

    # Ignore UNDETERMINED gauges
    catalogue = catalogue[catalogue.Response != "UNDETERMINED"]
    if verbose == 1: print("Ignoring UNDETERMINED gauges in Catalogue, New Shape: {}".format(catalogue.shape))
    catalogue = catalogue.reset_index(drop=True) # Start Index from 0

    # Display Progress
    progressBar = IntProgress(min=0, max=catalogue.shape[0],description='Progress:',bar_style='info')
    display(progressBar)

    # Output
    # Rows = No. Of Valid Probes, Columns = No.of features
    Xs = [np.zeros((catalogue.shape[0],no_of_features)) for no_of_features in feature_spec]
    y = catalogue.Response.replace(to_replace=['NORMAL','COUPLED'],value=[0,1])
    lookup = pd.concat((catalogue['Probe ID'],catalogue['Fill']),axis=1)

    for index, row in catalogue.iterrows():
        fill_no = row['Fill']
        gauge_id = row['Probe ID'].replace(".", "-")

        # Setup Paths
        file_name = "Probe_{0}_Fill{1}.p".format(gauge_id,str(fill_no))
        folder = os.path.join(os.getcwd(),"data","probes",str(fill_no),gauge_id)
        file_path = os.path.join(folder,file_name)
        feature_folder = os.path.join(folder,catalogue_name)

        # Create Folder if it does not exist already
        os.makedirs(feature_folder, exist_ok=True)

        if os.path.isfile(file_path) and reset == 0:
            if verbose == 2: print("Loading existing data for {} in Fill {}".format(gauge_id,fill_no))
            # Only load the Processed Gauge Data (pgd) once, then use it for each feature.
            # NOTE: pickle.load executes arbitrary code; only use cache folders you trust.
            with open(file_path,"rb") as pgd_file:
                pgd = pickle.load(pgd_file)
            # Xs is addressed by position in feature_spec, not by
            # ``no_of_features - feature_spec[0]``, which is only valid for
            # step-1 specifications.
            for position, no_of_features in enumerate(feature_spec):
                feature_file_name = "{}_Intervals_Gauge_{}.p".format(no_of_features,gauge_id)
                feature_path = os.path.join(feature_folder,feature_file_name)
                if os.path.isfile(feature_path):
                    with open(feature_path,"rb") as feature_file:
                        features = pickle.load(feature_file)
                else:
                    # Generate first, then open the file: a failing generator
                    # must not leave an empty (unloadable) cache file behind.
                    features = feature_generator(pgd.pressure_readings[pgd.mask],no_of_features)
                    with open(feature_path,"wb") as feature_file:
                        pickle.dump(features, feature_file)
                Xs[position][index] = features
        else:
            if verbose == 2: print("Saving new data for {} in Fill {}".format(gauge_id,fill_no))
            pgd = processed_gauge_data(row['Probe ID'], fill_no)
            pgd.generate_data() # Retrieve pressure readings, mask etc.
            with open(file_path,"wb") as pgd_file:
                pickle.dump(pgd, pgd_file)
            for position, no_of_features in enumerate(feature_spec):
                try:
                    features = feature_generator(pgd.pressure_readings[pgd.mask],no_of_features)
                except Exception:
                    if verbose == 2: print("Could not generate data for {0} in fill {1}.".format(gauge_id,fill_no))
                    continue
                feature_file_name = "{}_Intervals_Gauge_{}.p".format(no_of_features,gauge_id)
                feature_path = os.path.join(feature_folder,feature_file_name)
                with open(feature_path, "wb") as feature_file:
                    pickle.dump(features, feature_file)
                Xs[position][index] = features

        # NOTE(review): the counter advances even when feature generation
        # failed above (those rows stay all-zero), so the mismatch check below
        # does not currently detect such failures - confirm intended behaviour.
        progressBar.value += 1

    if progressBar.value != y.shape[0]:
        print("{} probes failed to load from Catalogue. Use verbose=2 to find them.".format(y.shape[0]-progressBar.value))
        return None, None, None

    datasets = {} # Map no.of features to dataset dataFrame
    folder = os.path.join(os.getcwd(),"data","datasets",catalogue_name)
    os.makedirs(folder, exist_ok=True)
    for X in Xs:
        X = pd.DataFrame(X,columns=["Bin"+str(i) for i in range(X.shape[1])])
        dataset = pd.concat((X,y,lookup),axis=1)
        datasets[X.shape[1]] = dataset

        dataset_name = "{}_features_{}.csv".format(X.shape[1],catalogue_name)
        dataset_path = os.path.join(folder,dataset_name)

        if os.path.isfile(dataset_path) and reset == 0:
            if verbose == 1: print("Dataset already exists")
        else:
            dataset.to_csv(dataset_path, index=False, header = True)

        if verbose == 2: display(dataset.head())
    return datasets
            
# Close files! CHECK
# Intervals can be a single number or a tuple with a lower and upper limit CHECK
    # if type(intervals) is tuple
# Use name of catalogue as name of dataset handle CHECK

In [81]:
def get_or_create_datasets(catalogue_name, feature_spec=4, reset=0, feature_generator = bin_data, verbose=0):
    """Return the datasets for ``feature_spec``, loading cached CSVs where possible.

    Cached datasets are read from
    ``data/datasets/<catalogue_name>/<n>_features_<catalogue_name>.csv`` when
    ``reset`` is 0. Anything that is missing is produced by
    ``generate_data_from_catalogue``.

    Args:
        catalogue_name: Name of the catalogue (validated through ``get_catalogue``).
        feature_spec: Bin specification understood by ``process_feature_spec``.
        reset: 0 reuses cached CSVs; 1 asks for confirmation via ``lockout``.
        feature_generator: Feature function forwarded to ``generate_data_from_catalogue``.
        verbose: 1 prints the catalogue summary, 2 displays the head of each loaded dataset.

    Returns:
        dict mapping the number of features to its dataset DataFrame (or
        whatever ``generate_data_from_catalogue`` returns when nothing was cached).
    """
    catalogue = get_catalogue(catalogue_name)
    if verbose==1: print("{0}Catalogue {2}{1}: {3}".format(color.BOLD,color.END,catalogue_name,catalogue.shape))

    if reset == 1: reset = lockout() # To prevent accidental overwriting

    # Binning Setup
    feature_spec = process_feature_spec(feature_spec)

    datasets = {}
    if reset == 0:
        folder = os.path.join(os.getcwd(),"data","datasets",catalogue_name)
        for no_of_features in feature_spec:
            dataset_name = "{}_features_{}.csv".format(no_of_features,catalogue_name)
            dataset_path = os.path.join(folder,dataset_name)
            if os.path.isfile(dataset_path):
                dataset = pd.read_csv(dataset_path)
                datasets[no_of_features] = dataset
                if verbose == 2: display(dataset.head())

    # Everything was cached: nothing to generate.
    if len(datasets) == len(feature_spec):
        return datasets

    # Nothing was cached: generate the full range in one pass.
    if not datasets:
        return generate_data_from_catalogue(catalogue_name, feature_spec, reset, feature_generator, verbose)

    # Only some were cached: generate the missing ones individually.
    for no_of_features in feature_spec:
        if no_of_features in datasets:
            continue
        # feature_generator must be passed explicitly; otherwise ``verbose``
        # would land in the feature_generator parameter slot.
        generated = generate_data_from_catalogue(catalogue_name, int(no_of_features), reset,
                                                 feature_generator, verbose)
        datasets[no_of_features] = generated[no_of_features]
    return datasets

# Check entire feature_spec Range
# If entire range is free: load full feature_spec as one
# If only a few are free: load individually

# Manual Grid Search
Step-by-step Grid Search used for Diagnostics and conceptual awareness.

In [None]:
def split_dataset(X,y, seed = 0, verbose = 0):
    """Split features and labels into an 80% train / 20% test partition.

    Args:
        X: Feature data handed to ``train_test_split``.
        y: Labels handed to ``train_test_split``.
        seed: Value used as ``random_state`` of the split.
        verbose: When >= 1 the shapes of the four resulting parts are printed.

    Returns:
        Tuple ``(X_train, X_test, X_train_labels, X_test_labels)``.
    """
    parts = train_test_split(X, y, test_size=0.2, random_state=seed)
    X_train, X_test, X_train_labels, X_test_labels = parts

    if verbose >= 1:
        # The template keeps its embedded line break and indentation verbatim.
        summary = """>>>> {0}Splitting Dataset{1} <<<<\n{0}Train set:{1} Data {2} Labels {3}
                            {0}Test set:{1}  Data {4} Labels {5}"""
        shapes = (X_train.shape, X_train_labels.shape, X_test.shape, X_test_labels.shape)
        print(summary.format(color.BOLD, color.END, *shapes))

    return X_train, X_test, X_train_labels, X_test_labels




In [None]:
def cross_validate_sets(X_train, X_train_labels, verbose = 0,
                        cv_splitter = None, scorer = None, classifier = None, seed = 0):
    """Cross-validate a default-constructed classifier on the training data.

    Any of ``scorer``, ``cv_splitter`` and ``classifier`` that is left as None
    is chosen interactively through ``pick_tool``.

    Args:
        X_train: Training features.
        X_train_labels: Training labels.
        verbose: When >= 1 the first five sorted train/validation indices of
            every split are displayed.
        cv_splitter: Splitter class; it is instantiated here with
            ``test_size=0.2, train_size=0.8, random_state=seed``.
        scorer: Plain score function (wrapped with ``make_scorer``) or a
            ready-made scorer object.
        classifier: Classifier class; instantiated without arguments.
        seed: Random state for the splitter.

    Returns:
        Tuple ``(cv_splitter, average_cv_score)`` with the instantiated
        splitter and the mean of the cross-validation scores.
    """
    if scorer is None: scorer = metrics.make_scorer(pick_tool("Scorer","_score",moduleName="metrics"))
    if isinstance(scorer, types.FunctionType): scorer = metrics.make_scorer(scorer)
    if cv_splitter is None:
        cv_splitter = pick_tool("CV-Splitter","(Fold|Split|Leave)",moduleName="model_selection")
    cv_splitter = cv_splitter(test_size=0.2,train_size=0.8,random_state=seed)
    if classifier is None:
        classifier = pick_tool("Machine Learning Algorithm","Classifier",moduleName=None)
    average_cv_score = np.mean(cross_val_score(classifier(), X_train, X_train_labels, cv=cv_splitter, scoring = scorer))

    # Sizes are derived from the data passed in, not from notebook globals X / y.
    n_samples = len(X_train)
    print(""">>>> Cross Validation <<<<
    {0}Average CV score{1}: {2}
    {0}Train Set{1}: {3}
    {0}Validation set{1}: {4}""".format(color.BOLD,color.END,average_cv_score,
                                       cv_splitter.train_size * n_samples,
                                       cv_splitter.test_size * n_samples))
    if verbose >= 1:
        for training_indices, validation_indices in cv_splitter.split(X_train, X_train_labels):
            display(np.sort(training_indices)[0:5])
            display(np.sort(validation_indices)[0:5])
    return cv_splitter, average_cv_score

In [83]:
def find_best_parameter_manualy(X_train, X_train_labels, scorer = None, cv_splitter=None, classifier = None, seed = 0, verbose=1, show_plot=True):
    """Step-by-step search over the values of one interactively chosen parameter.

    For every candidate value the classifier is fitted on each train split of
    the cross-validation splitter and scored on the matching validation split.
    The value with the highest mean validation score wins.

    Args:
        X_train: Training features (indexed with ``.iloc``).
        X_train_labels: Training labels (indexed with ``.iloc``).
        scorer: Score function called as ``scorer(y_true=..., y_pred=...)``;
            picked interactively when None.
        cv_splitter: Splitter class, instantiated with
            ``test_size=0.2, train_size=0.8, random_state=seed``; picked
            interactively when None.
        classifier: Classifier class; picked interactively when None.
        seed: Random state for the splitter.
        verbose: >= 1 prints documentation and a summary, >= 2 also prints
            per-split scores.
        show_plot: Plot mean score against the parameter values when True.

    Returns:
        Tuple ``(best_score, best_param_setting)``.
    """
    if scorer is None: scorer = pick_tool("Scorer","_score",moduleName="metrics")
    if cv_splitter is None:
        cv_splitter = pick_tool("CV-Splitter","(Fold|Split|Leave)",moduleName="model_selection")
    cv_splitter = cv_splitter(test_size=0.2,train_size=0.8,random_state=seed)
    if classifier is None: classifier = pick_tool("Machine Learning Algorithm","Classifier",moduleName=None)

    # Provide Help using the classifier's DocString
    if verbose >= 1: print(">>>>{0}DOCUMENTATION{1}<<<<\n{2}\n>>>>{0}END DOCUMENTATION{1}<<<<".format(color.BOLD,color.END,re.split("-{8,}",classifier.__doc__)[1]))

    # Find and Select 1 parameter to vary from the Classifier
    parameters = inspect.getfullargspec(classifier).args
    parameters.remove("self")
    for i, name in enumerate(parameters): print("{0}: {1}".format(i,name))

    param_range = None
    while param_range is None:
        resp = input("Select Parameter by typing its associated number >>>")
        # fullmatch so that "1abc" is rejected instead of crashing int().
        if not re.fullmatch(r"\d+", resp) or int(resp) >= len(parameters):
            continue
        param_name = parameters[int(resp)]
        param_range = set_param_limits(param_name,classifier)
        if param_range is None:
            # An empty answer means "use default"; a manual sweep needs values.
            print("A manual search needs explicit values for '{}'".format(param_name))

    # Split count comes from the data passed in, not from a notebook global X.
    n_splits = cv_splitter.get_n_splits(X_train)
    progress_bar = IntProgress(min=0, max=n_splits*len(param_range),description='Progress:',bar_style='info') # instantiate the bar
    display(progress_bar)

    best_score = -1
    scores = np.zeros((len(param_range),n_splits))
    best_param_setting = -1

    for param_count, val in enumerate(param_range):
        splits = cv_splitter.split(X_train,X_train_labels)
        for split_count, (training_indices, validating_indices) in enumerate(splits):
            X_build,X_build_labels = X_train.iloc[training_indices], X_train_labels.iloc[training_indices]
            X_valid, X_valid_labels = X_train.iloc[validating_indices], X_train_labels.iloc[validating_indices]
            model = classifier(**{param_name:val}).fit(X_build,X_build_labels)
            xhat = model.predict(X_valid)
            if verbose >= 2:
                print("""{0}{2} Using {3} {4}{1}
                Train Set {5}: {6}
                Validation Set {5}: {7}
                """.format(color.BOLD,color.END,
                           classifier.__name__,param_name, val,
                           scorer.__name__,
                           scorer(y_true=X_build_labels,y_pred=model.predict(X_build)),
                           scorer(X_valid_labels,xhat)))
            scores[param_count,split_count] = scorer(y_true=X_valid_labels,y_pred=xhat)
            progress_bar.value += 1

        mean_for_value = np.mean(scores[param_count])
        if mean_for_value > best_score:
            best_score = mean_for_value
            best_param_setting = val

    mean_score = np.mean(scores,axis=1)

    if verbose >= 1:
        print("""
        {0}Best {2}{1}: {3}
        {0}Best {4}{1}: {5} 
        {0}Mean {2}{1}:""".format(color.BOLD,color.END,
                   scorer.__name__,
                   np.max(mean_score), param_name,
                   best_param_setting))
        display(pd.DataFrame(mean_score,index=param_range,columns=[param_name]).transpose())

    if show_plot:
        plt.figure()
        plt.plot(param_range,mean_score,'g')
        plt.ylabel(scorer.__name__)
        plt.xlabel(param_name)
        plt.xticks(param_range)
        plt.xlim((param_range[0],param_range[-1]))
        plt.tight_layout()
        plt.show()
    return best_score, best_param_setting

## Generalized Grid Search
Find the best parameters for maximizing any desired score using Grid Search cross validation. <br>
Can pre-specify Classifier, Scorer, CV_Splitter, Seed and the Parameter Grid but otherwise offers full <br> user-controlled customization of each of these settings. <br>
Note that supplying Datasets is NOT optional, these are needed to perform the GridSearch.

In [None]:
def pick_tool(toolType,searchTerm,moduleName=None):
    """Let the user pick, by index, an attribute of a module whose name matches a regex.

    Args:
        toolType: Human-readable description used in the prompts.
        searchTerm: Regular expression applied with ``re.search`` to every
            name in ``dir(module)``.
        moduleName: Expression naming the module, resolved in this file's
            namespace (e.g. ``"metrics"``). When None the user first picks an
            sklearn algorithm sub-module, which is then imported.

    Returns:
        The selected attribute of the module.

    Raises:
        ValueError: No attribute of the module matches ``searchTerm``.
    """
    if moduleName is None:
        import importlib
        moduleName = pick_tool("Algorithm Type","(neural_network|neighbors|svm|gaussian_process|tree|ensemble|naive_bayes|discriminant_analysis)","sklearn").__name__
        # Make sure the chosen sub-module is loaded before it is resolved below.
        importlib.import_module(moduleName)

    print(">>>> Pick a {} <<<<".format(toolType))
    # NOTE(security): eval resolves moduleName in this module's namespace;
    # only pass trusted, hard-coded module names, never user input.
    module = eval(moduleName)
    tool_list = [tool for tool in dir(module) if re.search(searchTerm,tool)]
    if not tool_list:
        # Without this guard the selection prompt below could never be satisfied.
        raise ValueError("No {} matching '{}' found in {}".format(toolType,searchTerm,moduleName))

    # Print the menu in groups; step and group width must match, otherwise
    # the last entry of every group is silently left out of the listing.
    group_size = 6
    for start in range(0,len(tool_list),group_size):
        stop = min(len(tool_list), start+group_size)
        print("".join(f'{pair}\n' for pair in [(i,tool_list[i]) for i in range(start,stop)]))

    resp = input("Type the Index of the {}>>>".format(toolType))
    while True:
        found = re.match(r"\d+",resp)
        match = int(found[0]) if found else -1 # -1 marks an invalid reply
        if 0 <= match < len(tool_list):
            print("Selected {} as {}".format(tool_list[match],toolType))
            tool = tool_list[match]
            break
        resp = input("Type the Index of the {}>>>".format(toolType))
    print("{}.{}".format(moduleName,tool))
    return getattr(module,tool)

def set_param_limits(param_name, classifier):
    """Interactively read the candidate value(s) for one classifier parameter.

    The reply is evaluated as a Python expression and validated by
    constructing ``classifier`` with it (element by element for non-string
    sequences). Invalid replies trigger a new prompt.

    Args:
        param_name: Name of the constructor parameter.
        classifier: Classifier class used to validate the value(s).

    Returns:
        None when the user just presses enter (use the default); otherwise the
        evaluated value, wrapped in a list unless it is already a non-string
        sequence or a NumPy array.
    """
    # collections.Sequence was removed in Python 3.10; the ABC lives in collections.abc.
    from collections.abc import Sequence

    while True:
        limit = input("'{}' - Set Limits (press enter to use default) >>>".format(param_name))
        if limit == "":
            return None
        try:
            # NOTE(security): eval executes whatever is typed at the prompt;
            # acceptable for a local interactive notebook only.
            limit = eval(limit)
            if isinstance(limit, Sequence) and not isinstance(limit, str):
                for elt in limit: classifier(**{param_name:elt})
            else:
                classifier(**{param_name:limit})
        except TypeError:
            print("'{}' is not of a valid type for '{}'".format(limit,param_name))
            continue
        except (NameError, SyntaxError, ValueError):
            print("{} contains invalid syntax, check carefully".format(limit))
            continue

        is_collection = isinstance(limit, Sequence) or isinstance(limit, np.ndarray)
        if not is_collection or isinstance(limit, str):
            limit = [limit] # A single value becomes a one-element grid
        return limit

def find_best_parameters(datasets, param_grid = None, classifier = None, cv_splitter = None, scorer=None, seed = 0, verbose=0):
    """Grid-search every dataset and report the best-scoring configuration.

    Args:
        datasets: Mapping from number of features to a dataset DataFrame whose
            last three columns are not features and which has a ``Response`` column.
        param_grid: Mapping of parameter name to candidate values; entries
            that are None are dropped. When None the grid is collected
            interactively through ``set_param_limits``.
        classifier: Classifier class; picked interactively when None.
        cv_splitter: Splitter class, instantiated with
            ``test_size=0.2, train_size=0.8, random_state=seed``; picked
            interactively when None.
        scorer: Score function (wrapped with ``make_scorer``) or scorer
            object; picked interactively when None.
        seed: Random state for the splitter and the train/test split.
        verbose: Forwarded to ``GridSearchCV``; >= 2 also prints the best
            parameters per dataset.

    Returns:
        Tuple ``(best_no_of_features, best_params, best_score)``.
    """
    if scorer is None:
        scorer = metrics.make_scorer(pick_tool("Scorer","_score",moduleName="metrics"))
    if isinstance(scorer, types.FunctionType):
        scorer = metrics.make_scorer(scorer)

    splitter_class = cv_splitter
    if splitter_class is None:
        splitter_class = pick_tool("CV-Splitter","(Fold|Split|Leave)",moduleName="model_selection")
    cv_splitter = splitter_class(test_size=0.2,train_size=0.8,random_state=seed)

    if classifier is None:
        classifier = pick_tool("Machine Learning Algorithm","Classifier",moduleName=None)

    # Show the first section of the classifier's docstring as inline help.
    help_text = re.split("-{8,}",classifier.__doc__)[1]
    print(">>>>{0}DOCUMENTATION{1}<<<<\n{2}\n>>>>{0}END DOCUMENTATION{1}<<<<".format(color.BOLD,color.END,help_text))

    if param_grid is None:
        parameters = inspect.getfullargspec(classifier).args
        parameters.remove("self")
        param_grid = {}
        for param_name in parameters:
            param_grid[param_name] = set_param_limits(param_name,classifier)

    # Parameters left at their default (None) are not part of the search.
    filtered_param_grid = {}
    for name, values in param_grid.items():
        if values is not None:
            filtered_param_grid[name] = values

    best_params = None
    best_no_of_features = -1
    best_score = -1

    for no_of_features, dataset in datasets.items():
        feature_columns = dataset.shape[1] - 3
        X = dataset.iloc[:,0:feature_columns]
        y = dataset.Response
        X_train, X_test, X_train_labels, X_test_labels = split_dataset(X,y, seed = seed)

        generated_models = GridSearchCV(classifier(), filtered_param_grid, scoring=scorer, cv=cv_splitter, verbose=verbose)
        generated_models.fit(X_train,X_train_labels)

        if verbose >= 2:
            print("Best Parameters for {} features: \n {}".format(no_of_features,generated_models.best_params_))

        if generated_models.best_score_ > best_score:
            best_no_of_features = no_of_features
            best_params = generated_models.best_params_
            best_score = generated_models.best_score_

    return best_no_of_features, best_params, best_score


# Model Evaluation
Use unseen Testing Set to evaluate the performance of a trained model.

In [None]:
def model_evaluation(datasets, testing_dataset = None, classifier = None, no_of_features = None, best_params = None, scorer = None, seed = 0, verbose = 0):
    """Train a classifier with the chosen parameters and report its test performance.

    Args:
        datasets: Mapping from number of features to a dataset DataFrame whose
            last three columns are not features and which has a ``Response`` column.
        testing_dataset: Optional dataset with the same layout. When given,
            the model is trained on the whole selected dataset and tested on
            this one; otherwise the selected dataset is split with ``split_dataset``.
        classifier: Classifier class; picked interactively when None.
        no_of_features: Key of the dataset to use. When this or
            ``best_params`` is None both are determined with ``find_best_parameters``.
        best_params: Keyword arguments for the classifier constructor.
        scorer: Score function called as ``scorer(y_true=..., y_pred=...)``;
            picked interactively when None.
        seed: Random state used for splitting / searching.
        verbose: >= 1 prints shapes and the confusion-matrix legend, >= 2 also
            prints the chosen parameters.

    Returns:
        The fitted model.
    """
    if scorer is None: scorer = pick_tool("Scorer","_score",moduleName="metrics")

    if classifier is None: classifier = pick_tool("Machine Learning Algorithm","Classifier",moduleName=None)

    if no_of_features is None or best_params is None: # Did not yet optimize the model
        # The keyword is ``cv_splitter``; a misspelt name raises TypeError.
        no_of_features, best_params, _ = find_best_parameters(datasets,classifier=classifier,
                                                              cv_splitter=None,scorer=scorer,
                                                              seed = seed, verbose = verbose)

    # datasets is keyed by the number of features itself, not by an offset.
    dataset = datasets[no_of_features]
    X,y = dataset.iloc[:,0:dataset.shape[1]-3],dataset.Response

    if testing_dataset is None:
        X_train, X_test, X_train_labels, X_test_labels = split_dataset(X,y, seed = seed)
    else:
        X_train, X_train_labels = X,y
        X_test = testing_dataset.iloc[:,0:testing_dataset.shape[1]-3]
        X_test_labels = testing_dataset.Response

    if verbose >= 1: print("X:{} y:{}".format(X.shape,y.shape))
    classifier = classifier(**best_params)
    if verbose >= 2: print("Optimized with {} features and parameters:\n{}".format(no_of_features,best_params))
    model = classifier.fit(X_train,X_train_labels)
    yhat = model.predict(X_test)
    xhat = model.predict(X_train)
    print("{0}Train Set {2} {1}: {3}".format(color.BOLD,color.END,scorer.__name__,scorer(y_true=X_train_labels,y_pred=xhat)))
    print("{0}Test Set {2} {1}: {3}".format(color.BOLD,color.END,scorer.__name__,scorer(y_true=X_test_labels,y_pred=yhat)))
    if verbose >= 1: print(">>>>{} Confusion Matrix {}<<<<\nTrue Positive,False Negative\nFalse Positive,True Negative".format(color.BOLD,color.END))
    get_confusion_matrix(X_test_labels,yhat) # Called for its plot; the matrix itself is not needed here
    print("{}Classification Report{}".format(color.BOLD,color.END))
    print(classification_report(X_test_labels, yhat))

    return model

def get_confusion_matrix(y_test_labels,yhat,normal=False):
    """Compute the confusion matrix of the predictions and plot it on a new figure.

    Args:
        y_test_labels: True labels.
        yhat: Predicted labels.
        normal: Forwarded to ``plot_confusion_matrix`` as ``normalize``.

    Returns:
        The matrix returned by ``confusion_matrix``.
    """
    matrix = confusion_matrix(y_test_labels, yhat)

    # Draw the matrix on its own figure with the two class captions.
    plt.figure()
    plot_options = {"classes": ["Normal","Anomalous"], "normalize": normal, "title": 'Confusion matrix'}
    plot_confusion_matrix(matrix, **plot_options)

    return matrix

    

# Model Handling
## Saving Model to File
Preserve a trained model for future use

In [1]:
def save_model(model, best_no_features):
    """Persist a trained model with joblib under a user-supplied name.

    The file is written to ``data/models/<model class name>/`` below the
    current working directory as ``<name>_<best_no_features>.joblib``. The
    user is asked again whenever saving fails.

    Args:
        model: Trained model to store.
        best_no_features: Number of features, appended to the file name.
    """
    print("{}Model Desc:{}\n {}".format(color.BOLD,color.END,model))
    print("Please enter a name for your model\n%sNote%s: the no.of.features will be appended)"%(color.UNDERLINE,color.END))
    folder = os.path.join(os.getcwd(),"data","models",type(model).__name__)
    os.makedirs(folder, exist_ok=True)

    while True:
        resp = input(">>>")
        file_name = str(resp)+"_"+str(best_no_features)+'.joblib'
        try:
            joblib.dump(model, os.path.join(folder,file_name))
        except Exception as err:
            # Keep the retry behaviour, but tell the user why it failed and
            # let KeyboardInterrupt / SystemExit through.
            print("Could not save model as {}: {}".format(file_name,err))
            continue
        break
    print("Successfully saved model to {}".format(file_name))

## Load Model from File
Load a previously saved model to use for classification

In [None]:
def load_model():
    """List every file below ``data/models`` and load the one the user selects.

    Returns:
        The object returned by ``joblib.load`` for the chosen file.

    Raises:
        FileNotFoundError: No files were found below the models directory.
    """
    model_directory = os.path.join(os.getcwd(),"data","models")

    # Number every file found anywhere below the models directory.
    valid_files = {}
    for root, _dirs, files in os.walk(model_directory, topdown=True):
        for file_name in files:
            count = len(valid_files)
            print("{}: {}".format(count,file_name))
            valid_files[count] = (root,file_name)

    if not valid_files:
        # Without any candidates the selection prompt could never be satisfied.
        raise FileNotFoundError("No saved models found in {}".format(model_directory))

    while True:
        resp = input("Type the number associated with the model you would like to load")
        # fullmatch so that "1abc" is rejected instead of crashing int().
        if re.fullmatch(r"\d+",resp) and int(resp) in valid_files:
            file_path = os.path.join(*valid_files[int(resp)])
            break
        print("Please select one of the listed file names by number")

    # NOTE(security): joblib.load unpickles the file; only load models you trust.
    model = joblib.load(file_path)
    print(">>>{}Model Loaded{}<<<\n {}".format(color.BOLD,color.END,model))
    return model
