# Objective: Classify echo curves based on Stencil type

## Loading Data

In [2]:
import os, sys
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns ; sns.set()

from google.colab import drive
drive.mount('/content/drive')

sys.path.append("/content/drive/MyDrive/GSOC-NMR-project/Work/Notebooks")
from auxillary_functions import *
from polynomial_featextract import poly_featextract

Mounted at /content/drive


In [3]:
# Import the raw data and the params.txt file from the dataset directory.
# NOTE(review): load_data / load_params are not defined in this notebook;
# presumably they arrive via the star-import of auxillary_functions — confirm.

datadir_path = "/content/drive/MyDrive/GSOC-NMR-project/Work/Data/2021-06-21_classify_datagen_all_funcs"
raw_data = load_data(path=datadir_path,as_df=False)
print("Shape of Raw Data:",raw_data.shape)
params_data = load_params(path=datadir_path)
# Stencil type : {'0' : 'Gaussian', '1' : 'Power Law', '2' : 'RKKY'}

Finished loading data into numpy array. Took 18.93s
Shape of Raw Data: (10500, 942)
Finsihed loading parameters file


In [4]:
### Selecting a time-window of 150 steps around the echo-pulse
# NOTE(review): the 2/3 argument presumably gives the echo position as a
# fraction of the curve length (the recorded run reports timestep 628 for
# 942-step curves) — confirm against get_window in auxillary_functions.

offset = 150
shifted_data, center = get_window(raw_data,2/3,width=offset)
print("The Echo pulse occurs at timestep:",center)

# Rescaled data: every row (axis=1) is divided by that row's own maximum value
rscl_data = shifted_data / np.max(shifted_data,axis=1,keepdims=True)

The Echo pulse occurs at timestep: 628


# Machine Learning

Here we use 5-fold cross-validation for each model

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
from sklearn.pipeline import Pipeline

# Linear Models

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

In [6]:
# Tree models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

## Training 

In [7]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_validate

### Linear Models

In [27]:
# Setup Pipeline

def linear_pipeline(model):
    """Return a two-step Pipeline: quantile scaling followed by ``model``.

    The 'scaler' step is a QuantileTransformer configured with
    ``output_distribution='normal'``; the 'model' step is the estimator
    passed in by the caller.
    """
    steps = [
        ('scaler', QuantileTransformer(output_distribution='normal')),
        ('model', model),
    ]
    return Pipeline(steps)

def tree_pipeline(model):
    """Return a single-step Pipeline that wraps ``model`` without any scaler."""
    steps = [('model', model)]
    return Pipeline(steps)

def get_model_statistics(model, X, y, linear=True, scoring=None, cv=5):
    """Cross-validate ``model`` and summarise every collected metric.

    Parameters
    ----------
    model : estimator
        Classifier to evaluate; it is wrapped in a pipeline before fitting.
    X, y : array-like
        Features and labels, forwarded unchanged to ``cross_validate``.
    linear : bool, default True
        Truthy -> use ``linear_pipeline`` (quantile scaling before the model);
        falsy -> use ``tree_pipeline`` (model only).
    scoring : list of str, optional
        Scorer names for ``cross_validate``. Defaults to accuracy, weighted
        precision / recall / F1 and weighted one-vs-rest ROC-AUC.
        NOTE(review): ROC-AUC presumably needs an estimator exposing class
        probabilities; pass a shorter list for models that lack them.
    cv : int or CV splitter, default 5
        Cross-validation strategy forwarded to ``cross_validate``.

    Returns
    -------
    dict
        ``{'<Name>_Mean': {...}, '<Name>_Std': {...}, '<Name>_params': str}``
        where ``<Name>`` is the text of ``str(model)`` before the first '('.
        The mean / std are taken over the folds for every column returned by
        ``cross_validate`` (timings included).
    """
    # Local import: the notebook never imports pandas explicitly, so do not
    # rely on a star-import elsewhere to provide ``pd``.
    import pandas as pd

    modelname = str(model).split('(')[0]
    print(f"Running CV for model: {modelname}")

    if scoring is None:
        scoring = ['accuracy', 'precision_weighted', 'recall_weighted',
                   'f1_weighted', 'roc_auc_ovr_weighted']

    # A single code path for both model families; using truthiness instead of
    # ``== True`` / ``== False`` means the result is always defined.
    build_pipeline = linear_pipeline if linear else tree_pipeline
    results_kf = cross_validate(estimator=build_pipeline(model),
                                verbose=True, X=X, y=y,
                                scoring=scoring, cv=cv, n_jobs=-1)
    results_kf_df = pd.DataFrame(results_kf)

    return {f'{modelname}_Mean': dict(results_kf_df.mean()),
            f'{modelname}_Std': dict(results_kf_df.std()),
            f'{modelname}_params': str(model)}

In [28]:
# Features: the rescaled echo windows; labels: the 'stencil_type' column of the parameters table
X, y = rscl_data, params_data['stencil_type'].values

In [16]:
# Try out a linear SVC first (one-vs-rest multi-class strategy, fixed seed)

svm = LinearSVC(multi_class='ovr',random_state=0)

In [18]:
# Cross-validate the LinearSVC behind the quantile-scaling pipeline (linear=True)
res = get_model_statistics(svm, X,y, linear=True)

Running CV for model: LinearSVC


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.3min finished


In [19]:
# Display the CV summary dictionary for the LinearSVC run
res

{'LinearSVC_Mean': {'fit_time': 27.051352882385252,
  'score_time': 0.628878116607666,
  'test_accuracy': 0.9103809523809524,
  'test_f1_weighted': 0.9101807055519735,
  'test_precision_weighted': 0.9128219897369547,
  'test_recall_weighted': 0.9103809523809524},
 'LinearSVC_Std': {'fit_time': 5.38397175635031,
  'score_time': 0.11962683203742101,
  'test_accuracy': 0.0044974167440448125,
  'test_f1_weighted': 0.004617201117200342,
  'test_precision_weighted': 0.0035288562027393252,
  'test_recall_weighted': 0.0044974167440448125},
 'LinearSVC_params': "LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n          intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n          multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,\n          verbose=0)"}

In [20]:
# Try out a one-vs-rest SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

In [22]:
# Cross-validate a default SVC wrapped in OneVsRestClassifier, behind the scaling pipeline
res2 = get_model_statistics(OneVsRestClassifier(SVC()), X,y, linear=True)

Running CV for model: OneVsRestClassifier


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.3min finished


In [23]:
# Display the CV summary dictionary for the one-vs-rest SVC run
res2

{'OneVsRestClassifier_Mean': {'fit_time': 39.93767776489258,
  'score_time': 7.576337766647339,
  'test_accuracy': 0.9448571428571428,
  'test_f1_weighted': 0.9445842326402925,
  'test_precision_weighted': 0.9501776844508527,
  'test_recall_weighted': 0.9448571428571428},
 'OneVsRestClassifier_Std': {'fit_time': 3.4712215082351285,
  'score_time': 0.5621657518171798,
  'test_accuracy': 0.004754756539568229,
  'test_f1_weighted': 0.0048282386411086856,
  'test_precision_weighted': 0.003926427864875071,
  'test_recall_weighted': 0.004754756539568229},
 'OneVsRestClassifier_params': "OneVsRestClassifier(estimator=SVC(C=1.0, break_ties=False, cache_size=200,\n                                  class_weight=None, coef0=0.0,\n                                  decision_function_shape='ovr', degree=3,\n                                  gamma='scale', kernel='rbf', max_iter=-1,\n                                  probability=False, random_state=None,\n                                  shrinking=T

In [None]:
# Candidate models that are evaluated through the quantile-scaling pipeline.
# NOTE: despite the list's name, not every entry is a linear model
# (k-NN, Gaussian process and naive Bayes are included as well).
Linearmodels = [KNeighborsClassifier(n_neighbors=3, n_jobs=-1),
                LogisticRegression(multi_class='ovr',n_jobs=-1, max_iter=800),
                LogisticRegression(multi_class='multinomial',n_jobs=-1, max_iter=800),
                SGDClassifier(loss='modified_huber',shuffle=True, random_state=0),
                # GaussianProcessClassifier spells its strategies 'one_vs_rest' /
                # 'one_vs_one'; 'ovr' is rejected when fitting the three stencil classes.
                GaussianProcessClassifier(random_state=0, multi_class='one_vs_rest',n_jobs=-1),
                GaussianNB(), BernoulliNB()]

In [None]:
# Cross-validate every candidate in Linearmodels and collect the summary
# dictionaries; the bare print() separates the per-model progress logs.
LinearResults = []
for model in Linearmodels:
    LinearResults.append(get_model_statistics(model, X,y, linear=True))
    print()

In [None]:
# Persist the collected results as the plain-text str() of the list
# (overwrites any existing file at this location).
# path = "/content/drive/MyDrive/GSOC-NMR-project/Work/Notebooks"
with open("/content/drive/MyDrive/GSOC-NMR-project/Work/Notebooks/linearmodel_classification_results.txt", 'w') as wf:
    wf.write(str(LinearResults))

### Tree based models

In [24]:
# Tree-based candidates, evaluated without a scaler. Each one is seeded, like
# the other stochastic models in this notebook, so reruns are reproducible.
treemodels = [DecisionTreeClassifier(random_state=0),
              ExtraTreesClassifier(random_state=0),
              # TODO: add GradientBoostingClassifier(random_state=0) later
              RandomForestClassifier(random_state=0)]

In [25]:
# Try GradientBoostingClassifier later

In [29]:
# One-off CV of a random forest without a scaler (linear=False); the forest is
# seeded so that the reported scores can be reproduced.
get_model_statistics(RandomForestClassifier(random_state=0), X,y, linear=False)

Running CV for model: RandomForestClassifier


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   49.1s finished


{'RandomForestClassifier_Mean': {'fit_time': 17.00789132118225,
  'score_time': 0.13930120468139648,
  'test_accuracy': 0.9770476190476192,
  'test_f1_weighted': 0.9770637304613707,
  'test_precision_weighted': 0.977498167424104,
  'test_recall_weighted': 0.9770476190476192,
  'test_roc_auc_ovr_weighted': 0.998627006802721},
 'RandomForestClassifier_Std': {'fit_time': 2.6172328494576362,
  'score_time': 0.019708951851013203,
  'test_accuracy': 0.00383030048613144,
  'test_f1_weighted': 0.003828909131523392,
  'test_precision_weighted': 0.0037861202538075527,
  'test_recall_weighted': 0.00383030048613144,
  'test_roc_auc_ovr_weighted': 0.0004363141624558061},
 'RandomForestClassifier_params': "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n                       criterion='gini', max_depth=None, max_features='auto',\n                       max_leaf_nodes=None, max_samples=None,\n                       min_impurity_decrease=0.0, min_impurity_split=None,\n      

In [None]:
# Cross-validate every candidate in treemodels (no scaler, linear=False) and
# collect the summary dictionaries; print() separates the progress logs.
TreeResults = []
for model in treemodels:
    TreeResults.append(get_model_statistics(model, X,y, linear=False))
    print()

Running CV for model: DecisionTreeClassifier


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   16.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.



Running CV for model: ExtraTreesClassifier


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.



Running CV for model: RandomForestClassifier



[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   49.1s finished


In [None]:
# Persist the tree-model results as the plain-text str() of the list
# (overwrites any existing file at this location).
with open("/content/drive/MyDrive/GSOC-NMR-project/Work/Notebooks/treemodel_classification_results_ts.txt", 'w') as wf:
    wf.write(str(TreeResults))

## Classification metrics

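Most of these metrics are related and can be derived from the confusion matrix (precision, recall, F1-score, Cohen's kappa, Matthews correlation coefficient); ROC-AUC and log loss additionally require the predicted class probabilities or scores.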

1. Precision
2. Recall
3. F1-score
4. ROC-AUC score (AUROC)
5. Cohen kappa score
6. Matthews correlation coefficient
7. Log loss