# Vacuum Gauge Classifier
Advanced tool for classifying vacuum gauges 

* [Setup](#setup)
    * [Environment](#env)
    * [Data Retrieval](#retrieval)
    * [Data Normalization](#norm)
    * [Data Masking](#masking)
* [Dataset Creation](#create)
    * [Level Data](#level)
    * [Plot Levels](#plotlevels)
    * [Generate Dataset](#gen)
    * [Convert Dataset](#convert)
* [K-Neighbours Classifiaton](#kneighbours)
    * [Holdout](#holdout)
    * [Parameter Optimization](#crossvalidate)
    * [Final Evaluation](#eva)
    * [Exhaustive Evaluation](#exhaust)
* [Use Model](#use)
    * [Save Model](#save)
    * [Load Model](#load)

# <a id='setup'> Setup </a>
## <a id='env'>Environment </a>

In [None]:
from ipywidgets import IntProgress
import pandas as pd
import numpy as np
import pytimber
import matplotlib.pyplot as plt
import pickle
import os
db = pytimber.LoggingDB()

from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
from sklearn.metrics import classification_report, confusion_matrix
import csv


# <a id='create'> Dataset Creation </a>

In [None]:
def lockout():
    resp = input("Are you sure you want to reset? 1: Yes, 0: No")
    while True:
        try:
            resp = int(resp)
        except:
            print("Please type a number, 0 or 1")
            resp = input("Are you sure you want to reset? 1: Yes, 0: No")
            continue
        if resp == 1:
            return 2
        elif resp == 0:
            return 0
        else:
            print("Please type a number, 0 or 1")
            resp = input("Are you sure you want to reset? 1: Yes, 0: No")

## <a id='gen'> Generate Dataset </a>
Use CSV file containing probe/gauge information and their ground truth to generate a 2D array containing the pressure levels (features)

In [None]:
def setup_data(df=pd.read_csv(r'CompleteProbeCatalogue2.csv'), variant="default", limit=None, cuts = 9, verbose=False, showPlot=False, reset = 0):
    if reset == 1: reset = lockout()
    
    if verbose:
        print("Shape of Data: (%d,%d)" % df.shape)
    data = []
    count = 0
    limit = df.shape[0] if limit is None else limit 
    
    progressBar = IntProgress(min=0, max=limit,description='Progress:',bar_style='info') # instantiate the bar
    display(progressBar)
    
    for index, row in df.iterrows():
        gauge_id = row['Probe ID']
        if row['Response'] == "UNDETERINED":
            print("Skipping UNDETERMINED gauge {}".format(gauge_id))
            continue
        if (count > limit):
            break
            
        file_name = "probe_%s_fill%d.p"%(gauge_id,row['Fill'])
        press_file_name = "pressure_levels_for_fill_%d_w_%d_cuts.p"%(row['Fill'],cuts)
        variant_folder = os.path.join(os.getcwd(),'data','probes',str(row['Fill']),gauge_id,variant)
        main_folder = os.path.join(os.getcwd(),'data','probes',str(row['Fill']),gauge_id)
        if not os.path.exists(variant_folder):
            os.makedirs(variant_folder)
        if os.path.isfile(os.path.join(main_folder,file_name)) and reset == 0:
            if verbose:
                print("Loading existing data for {} in Fill {}".format(gauge_id,row['Fill']))
            pgd = pickle.load(open(os.path.join(main_folder,file_name),"rb"))
            if os.path.isfile(os.path.join(variant_folder,press_file_name)):
                pressure_levels = pickle.load(open(os.path.join(variant_folder,press_file_name),"rb"))
            else:
                pressure_levels = level_data(pgd.pressure_readings[pgd.mask],cuts)
                pickle.dump(pressure_levels, open( os.path.join(variant_folder,press_file_name),"wb"))
        else:
            if verbose:
                print("Saving data for %s"%gauge_id)
            pgd = processed_gauge_data(gauge_id, row['Fill'])
            try:
                pressure_levels = pgd.generate_data(cuts=cuts)
            except RuntimeError:
                print("Could not generate data for {} in fill {}".format(gauge_id,fill))
                continue
            pickle.dump( pgd, open( os.path.join(main_folder,file_name), "wb" ))
            pickle.dump(pressure_levels, open( os.path.join(variant_folder,press_file_name), "wb"))

        data.append(pressure_levels)

        if showPlot:
            pgd.plot_data()
        if verbose:
            print("Probe %s label:%s"%(gauge_id,row['Steepness']))
            print("%sClassificaton:%s %s \n"%(color.BOLD,color.END,row['Response']))
        count += 1
        progressBar.value += 1
        
    return data

## <a id='convert'> Convert to SciKit Learn Dataset </a>

In [1]:
def get_or_create_gauge_dataset(X = None, cuts = None, df = pd.read_csv(r'CompleteProbeCatalogue2.csv'), reset = 0, variant="default", file_name="ProbeDataSet.csv"):
    if reset == 1: reset = lockout()
    
    if cuts is not None:
        folder=os.path.join(os.getcwd(),"data","datasets","cuts"+str(cuts),variant)
        file_name = file_name.replace(".", "_%d_cuts."%cuts)
    else:
        folder=os.path.join(os.getcwd())
    
    if not os.path.exists(folder):
        os.makedirs(folder)
    

    print("%sAccessing:%s %s\n"% (color.BOLD,color.END, os.path.join(folder,file_name))) 
    
    # Ensure Indices align by resetting
    df.reset_index(inplace=True, drop=True)
    
    # Get uncategorisable gauges 
    bad_indices = df.index[df['Response'].str.contains("UNDETERMINED")].tolist() 
    # Get Duplicates
    duplicate_indices = np.nonzero(df.duplicated(subset=["Probe ID","Fill"],keep='first'))[0]
    
    # Remove uncategorisable & duplicate gauges
    droppable_indices = np.unique(np.concatenate((bad_indices,duplicate_indices),axis=0))         
    df = df.drop(droppable_indices)
    y = df.pop('Response').replace(to_replace=['NORMAL','COUPLED'],value=[0,1])
    if os.path.isfile(os.path.join(folder,file_name)) and reset==0:
        if cuts is not None:
            files = os.listdir(folder)             
            for i in range(0,len(files)):
                print("%s%d: %s%s" %(color.BOLD,i,color.END,files[i]))
            print("Type number of desired dataset to load it")
            while True:
                resp = input(">>>")
                try:
                    file_name = files[int(resp)]
                except:
                    continue
                break
        print("Loading Dataset from Memory...")
        X = pd.read_csv(os.path.join(folder,file_name))
        y = X.pop('Response')
    else:
        print("Loading Dataset from passed Variables...")
    
        X = pd.DataFrame(np.array(X), columns = ['Interval '+str(i) for i in range(len(X[0]))])
        print("Loaded X to shape {} for y of shape {}".format(X.shape,y.shape))
        if X.shape[0] > y.shape[0]:
            X = X.drop(droppable_indices)
        print("Changed X to shape {} for y of shape {}".format(X.shape,y.shape))
        dataset = pd.concat((X,y),axis=1)
        dataset.to_csv(os.path.join(folder,file_name), index=False, header = True)
    
    return X,y.astype(int), pd.concat((df['Probe ID'],df['Fill']),axis=1)

NameError: name 'pd' is not defined

## Generate Multiple Datasets

In [1]:
def create_datasets(df=pd.read_csv("CompleteProbeCatalogue2.csv"), min_cuts = 1, max_cuts = 8, min_fill = None, max_fill = None, reset = 0, variant = "default", verbose=False):
    if reset == 1: reset = lockout()
    
    # Remove Duplicates
    df.drop_duplicates(subset=["Probe ID","Fill"],inplace=True)
    df.sort_values(axis=0,by="Fill",inplace=True)
    if min_fill is not None and max_fill is not None:
        df = df.loc[df.Fill >= min_fill].loc[df.Fill <= max_fill]
        
    datasets = []
    for cut in range(min_cuts,max_cuts):
        folder=os.path.join(os.getcwd(),"data","datasets","cuts"+str(cut),variant)
        if not os.path.exists(folder):
            os.makedirs(folder)
        file_name = "ProbeDataSet_%d_cuts.csv"%(cut)
        if os.path.isfile(os.path.join(folder,file_name)) and reset == 0:
            print("%s already exists"%file_name)
            X,y,lookup = get_or_create_gauge_dataset(df = df,
                                                     file_name=os.path.join("data","datasets","cuts"+str(cut),variant,file_name),
                                                     reset=reset)
            datasets.append((X,y,lookup))
        else:
            data = setup_data(df=df, limit=None, variant=variant, cuts = cut, verbose=verbose, reset = reset)
            X,y,lookup = get_or_create_gauge_dataset(X=data, df = df,
                                                     file_name=os.path.join("data","datasets","cuts"+str(cut),variant,file_name),
                                                     reset=reset)
            datasets.append((X,y,lookup))
    return datasets

NameError: name 'pd' is not defined

# <a id='kneighbours'> K-Nearest-Neighbours Classification </a>
## <a id='holdout'> Setup Training and Testing sets </a>
Split the data into a test and training set using a random seed. 

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, LeaveOneOut, StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

def prepare_sets(X,y, seed = 3):
    # Optional: Normalize to mean = 0 var = 1?

    X_train, X_test, X_train_labels, X_test_labels = train_test_split( X, y,
                                                                       test_size=0.2, random_state=seed)
    print ('%sTrain set:%s Data %s Labels %s'% (color.BOLD,color.END, X_train.shape,  X_train_labels.shape))
    print ('%sTest set:%s  Data %s Labels %s'% (color.BOLD,color.END, X_test.shape,  X_test_labels.shape))
    return X_train, X_test, X_train_labels, X_test_labels

### <a id='crossvalidate'> Grid Search Cross-Validation </a>
Split dataset differently over multiple iterations to improve generalization. <br>
Test over range of k-values using parameter_grid. <br>
Uses full features (X) and targets (y) since the cross-validation splits the dataset 5 times. <br>
<a href="https://scikit-learn.org/stable/modules/cross_validation.html">SciKit Learn Documentation: Cross Validation<a> <br>
<img src="https://scikit-learn.org/stable/_images/grid_search_workflow.png" alt="Grid Search Workflow" width="400" align="left"> </img>

Here we have used a Stratified K fold, which preserves the original distribution of the dataset (about 50% of each class) in both the training_set and testing_set.<br>
Since the dataset is split in 5, we have 20% of the data put aside for testing and 80% for training. Hence we need 5 passes to test all possibilities. <br>
<img src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_0071.png" alt="Stratified KFold" width="600" align="left"></img>

In [None]:
def cross_validate_sets(X, y, verbose = True,
                        cv_splitter = StratifiedKFold(n_splits=5,random_state=0)):
    classifier = KNeighborsClassifier(n_neighbors=5, weights='distance')
    
    cv_splitter = StratifiedShuffleSplit(n_splits=10,test_size=0.2,train_size=0.8,random_state=0)
    average_cv_score = np.mean(cross_val_score(classifier, X, y, cv=cv_splitter))
    print("Average CV score: %.3f"%average_cv_score)
    print ('%sTrain set:%s %d'% (color.BOLD,color.END, cv_splitter.train_size * len(X)))
    print ('%sValidation set:%s %d'% (color.BOLD,color.END, cv_splitter.test_size * len(X)))
    if verbose:
        for training_indices, validation_indices in cv_splitter.split(X,y):
            display(np.sort(training_indices)[0:5])
            display(np.sort(validation_indices)[0:5])
    return cv_splitter, average_cv_score

In [None]:
def find_optimal_k(cv_splitter,X,y,verbose=1,showPlot=True):
    Ks = 20
    acc = np.zeros((Ks,cv_splitter.get_n_splits(X)))
    std = np.zeros((Ks,cv_splitter.get_n_splits(X)))
    best_score, best_k = 0, None

    progessBar = IntProgress(min=0, max=cv_splitter.get_n_splits(X)*Ks,description='Progress:',bar_style='info') # instantiate the bar
    display(progessBar)

    for k in range(0,Ks):
        split_count = 0
        scores = np.zeros(cv_splitter.n_splits)
        for training_indices, testing_indices in cv_splitter.split(X,y):
            X_train,X_train_labels = X.iloc[training_indices], y.iloc[training_indices]
            X_test, X_test_labels = X.iloc[testing_indices], y.iloc[testing_indices]
            model = KNeighborsClassifier(n_neighbors=k+1).fit(X_train,X_train_labels)
            X_test_predictions = model.predict(X_test)
            if verbose >= 2:
                print("Model using k:%d"%k)
                print("Train set Accuracy: ", metrics.accuracy_score(X_train_labels, model.predict(X_train)))
                print("Test set Accuracy: ", metrics.accuracy_score(X_test_labels, X_test_predictions))
            score = metrics.accuracy_score(X_test_labels, X_test_predictions)
            scores[split_count] = score
            std_dev = np.std(X_test_predictions==X_test_labels)/np.sqrt(X_test_predictions.shape[0])
            acc[k,split_count] = score
            std[k,split_count] = std_dev
            split_count += 1
            progessBar.value += 1
        if np.mean(scores) > best_score:
            best_score = np.mean(scores)
            best_k = k+1
    mean_acc = np.mean(acc, axis=1)
    mean_std = np.mean(std, axis=1)
    best_neighbors = np.argmax(mean_acc)+1
    if verbose >= 1:
        print("%sMean Accuracies:%s"% (color.BOLD,color.END))
        display(pd.DataFrame(mean_acc).transpose())
        print("\n%sBest Accuracy:%s %.1f%%" % (color.BOLD,color.END, np.max(mean_acc)*100))
        
        print("\n%sBest K-value:%s %d"% (color.BOLD,color.END, best_neighbors))
    
    if showPlot:
        plt.figure()
        plt.plot(range(1,Ks+1),mean_acc,'g')
        plt.fill_between(range(1,Ks+1),mean_acc - 1 * mean_std,mean_acc + 1 * mean_std, alpha=0.10)
        plt.legend(('Accuracy ', '$\pm$3$\sigma$'))
        plt.ylabel('Accuracy ')
        plt.xlabel('K-Value')
        plt.xticks([i for i in range(1,Ks+1)])
        plt.xlim((1,Ks+1))
        plt.tight_layout()
        plt.show()
    return best_score, best_neighbors

## Model Generator
Uses grid search (implemented as above) to find the optimal parameters. <br>
This cell is essetially a compact form of the previous section.

In [None]:
def model_generator(X,y,cv_splitter, scorer=metrics.make_scorer(metrics.accuracy_score), verbose = 1):
    classifier = KNeighborsClassifier()
    param_grid = {'n_neighbors': np.arange(1, 20)}
    models = GridSearchCV(classifier, param_grid, cv=cv_splitter, iid = True, verbose=verbose, scoring = scorer) 
    models.fit(X,y)
    print("%sOptimal neighbours:%s %d\n%sScore:%s %.1f%%"%(color.BOLD,color.END, models.best_params_['n_neighbors'],color.BOLD,color.END,models.best_score_*100))
    return models.best_params_['n_neighbors']

# Optimal Intervals

In [None]:
def find_optimal_cuts(datasets):
    best_score = -1.0
    best_neighbors = -1
    best_cut = -1
    best_average = (-1,-1,-1)
    for dataset in datasets:
        X,y,lookup = dataset
        X_train, X_test, X_train_labels, X_test_labels = prepare_sets(X,y, seed = 12)
        cuts = X.shape[1]
        print("\n%sNumber of Cuts: %d %s"%(color.BOLD,cuts,color.END))
        cv_splitter, average_cv_score = cross_validate_sets(X_train, X_train_labels, verbose = False)
        score, neighbours = find_optimal_k(cv_splitter,X_train,X_train_labels,verbose=1,showPlot=False)
        if score > best_score and neighbours > 1:
            best_score = score
            best_cut = cuts
            best_neighbors = neighbours
        if average_cv_score > best_average[2]:
            best_average = (cuts,neighbours,average_cv_score)
    print("%sBest Performance%s for %d cuts and K = %d with %.1f%% accuracy"%(color.BOLD,color.END,best_cut,best_neighbors,best_score*100))
    print("%sBest Average%s for %d cuts and K = %d with %.1f%% accuracy"%(color.BOLD,color.END,best_average[0],best_average[1],best_average[2]*100))
    return best_cut, best_neighbors

## <a id='eva'>Final Evaluaton </a>
Uses seeded training_set (not cross validated!) for a final evaluation of the model, including the standard deviation.

In [None]:
def get_confusion_matrix(y_test,yhat,labels=None):# Compute confusion matrix
    cnf_matrix = confusion_matrix(y_test, yhat, labels=labels)

    # Plot non-normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=['Unexpected','Expected'],normalize= False,  title='Confusion matrix')
    
    return cnf_matrix

def classic_eval(lookup,y,X_test,X_test_labels,variant, verbose=False):
    print(X_test)
    print(X_test_labels)
    test_indices = X_test.index
    test_set = lookup.iloc[test_indices]
    pos = 0
    curve_correct, gradient_correct = 0, 0
    for index,row in test_set.iterrows():
        file_name = "probe_%s_fill%d.p"%(row['Probe ID'],row['Fill'])
        folder = os.path.join(os.getcwd(),'data','probes',row['Probe ID'],variant)
        if not os.path.exists(folder):
            os.makedirs(folder)
        if verbose:
            print(os.path.join(folder,file_name))
        if os.path.isfile(os.path.join(folder,file_name)):
            if verbose:
                print("Loading existing data for %s"%row['Probe ID'])
            pgd = pickle.load(open(os.path.join(folder,file_name),"rb"))
        else:
            if verbose:
                print("Saving data for %s"%gauge_id)
            pgd = processed_gauge_data(row['Probe ID'], int(row['Fill']))
            try:
                pgd.generate_data(cuts=cuts,get_levels=False)
            except RuntimeError:
                print("Could not generate data for {} in fill {}".format(gauge_id,fill))
                continue
            pickle.dump( pgd, open( os.path.join(folder,file_name), "wb" ))

        # Forward Fourier Transform
        pressure_transform, spectrum, deltaT = forward_fourier_transform(pgd.time_readings,pgd.pressure_readings)
        # Constrained Inverse Fourier Transform
        time_constrained, signal_constrained = filtered_inverse_fourier_transform(pressure_transform,deltaT,spectrum[0],40)
        # Curve Fitting
        try:
            fit1,fit2, coupled = fit_curves(time_constrained,signal_constrained,pgd.mask,verbose=False)
        except UnboundLocalError:
            print("Could not fit curves for {} in fill {}".format(row['Probe ID'],row['Fill']))
            pos+=1
            continue
        
        f,ax,coupled2 = VG_analyzer_simple(row['Probe ID'],row['Fill'],plot=False)
        
        if verbose:
            print("Ground Truth: %d (test_labels) for index %d"%(X_test_labels.loc[index],index)) #y.iloc[index]
            print("Curve Fit Prediction: %s"%coupled)
            print("Gradient Prediction: %s\n"%coupled2)
#         print(len(y))
#         print(index)
        if (coupled == X_test_labels.loc[index]):
            curve_correct +=1
        if (coupled2 == X_test_labels.loc[index]):
            gradient_correct +=1
        pos+=1
    
    print("Curve Fit Accuracy %.1f"%(100*(float(curve_correct)/len(test_set))))
    print("Curve Correct %d out of %d"%(curve_correct,len(test_set)))
    
    print("Gradient Accuracy %.1f"%(100*(float(gradient_correct)/len(test_set))))
    print("Gradient Correct %d out of %d"%(gradient_correct,len(test_set)))
    
def final_eval(datasets,best_cut,best_neighbors, eval_dataset = None, variant="default", verbose=False, seed = 12, evaluate_classic=False):
    X,y,lookup= datasets[(datasets[0][0].shape[1])+best_cut-2]
    if eval_dataset is None:
        X_train, X_test, X_train_labels, X_test_labels = prepare_sets(X,y, seed = seed)
    else:
        X_train, X_test, X_train_labels, X_test_labels = prepare_sets(X,y, seed = seed)
        
        eval_X,eval_X_labels,eval_lookup = eval_dataset
        X_test = eval_X
        X_test_labels = eval_X_labels
        X_train = X
        X_train_labels = y
        
    print("Y:{} X:{}".format(len(y),len(X_test)))
    classifier = KNeighborsClassifier(n_neighbors=best_neighbors)
    print("%sNeighbors:%s %d" % (color.BOLD,color.END,best_neighbors))
    print("%sCuts:%s %d" % (color.BOLD,color.END,X.shape[1]))
    model = classifier.fit(X_train,X_train_labels)
    yhat = model.predict(X_test)
    xhat = model.predict(X_train)
    training_std_dev = np.std(xhat==X_train_labels)/np.sqrt(xhat.shape[0]) # std error appropriate?
    testing_std_dev = np.std(yhat==X_test_labels)/np.sqrt(yhat.shape[0]) # std error
    print("%sTrain set Accuracy:%s %.3f \xc2\xb1 %.3f" %(color.BOLD, color.END, metrics.accuracy_score(X_train_labels, xhat),training_std_dev))
    print("%sTest set Accuracy:%s %.3f \xc2\xb1 %.3f\n" %(color.BOLD, color.END, metrics.accuracy_score(X_test_labels, yhat),testing_std_dev))
    cnf_matrix = get_confusion_matrix(X_test_labels,yhat)
    print("{}Classification Report{}".format(color.BOLD,color.END))
    print(classification_report(X_test_labels, yhat))
    
    if evaluate_classic:
        classic_eval(lookup,y,X_test, X_test_labels, variant, verbose=verbose)
    return model, X_test.index

# <a id='use'>Use Model</a>
## <a id='save'>Store Model</a>

In [None]:
def save_model(model):
    print("{}Model Desc:{}\n {}".format(color.BOLD,color.END,model))
    print("Please enter a name for your model\n%sNote%s: no. of cuts will be appended)"%(color.UNDERLINE,color.END))
    folder = os.path.join(os.getcwd(),"data","models","k_neighbours")
    if not os.path.exists(folder):
        os.makedirs(folder)

    while True:
        resp = input(">>>")
        try:
            saved_model = dump(model, os.path.join(folder,str(resp)+"_"+str(best_cut)+'.joblib'))
            break
        except:
            continue
    print("Successfully saved model to {}".format(str(resp)+"_"+str(best_cut)+'.joblib'))

## <a id='load'>Load Model</a>

In [None]:
def pick_model():
    folder = os.path.join(os.getcwd(),"data","models","k_neighbours")
    files = os.listdir(folder)
    for i in range(0,len(files)):
        print("%s%d: %s%s" %(color.BOLD,i,color.END,files[i]))
    print("Type number of desired model to load it")
    while True:
        resp = input(">>>")
        try:
            file_name = files[int(resp)]
        except:
            continue
        model = load(os.path.join(folder,file_name))
        print("Model Loaded with %d neighbours"%model.n_neighbors)
        break
    return model, file_name