In [142]:
#Import Packages as needed
from dataclasses import dataclass
import os
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import layers
from keras_nlp.layers import SinePositionEncoding
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

**LOAD DATA**

In [63]:
# Read the dataset from the JSON file into a DataFrame (one row per record).
df = pd.read_json("ipt_12.json")


**DATA EXPLORATION**


In [64]:
# Show (rows, columns), then the per-column dtype and non-null count.
print(df.shape)
df.info()

(167, 13)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 167 entries, 0 to 166
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   case         167 non-null    int64  
 1   run          167 non-null    int64  
 2   VB           146 non-null    float64
 3   DOC          167 non-null    float64
 4   time         167 non-null    int64  
 5   material     167 non-null    int64  
 6   feed         167 non-null    float64
 7   smcAC        167 non-null    object 
 8   smcDC        167 non-null    object 
 9   vib_table    167 non-null    object 
 10  vib_spindle  167 non-null    object 
 11  AE_table     167 non-null    object 
 12  AE_spindle   167 non-null    object 
dtypes: float64(3), int64(4), object(6)
memory usage: 18.3+ KB


In [65]:
np.unique(df.material)  # The material variable takes only two distinct values

array([1, 2])

In [66]:
# Analysis of the output target variable (VB).
# The pandas reductions below skip NaN entries. The builtin min()/max() compare
# element by element, so their result depends on where NaN values sit in the
# column (VB still contains missing values at this point in the notebook).

target = df.VB
print("target min = ", target.min())
print("target max = ", target.max())
print("target mean = ", target.mean())
# ddof=0 gives the population variance, matching what np.var() reports.
print("target variance = ", target.var(ddof=0))

target min =  0.0
target max =  1.53
target mean =  0.33760273972602733
target variance =  0.06741000656783637


**Observation**: `VB` and `run` are observed to be highly correlated, and `time` and `run` are also highly correlated; hence dimensionality reduction could be applied.

**DATA PREPROCESSING**

In [68]:
# Discard every sample (row) that contains a NaN value, then renumber the rows.
# reset_index() keeps the previous row labels as an extra 'index' column.
df = df.dropna().reset_index()
print(df.shape)

(146, 14)


**PCA ANALYSIS**

In [35]:
# Column labels of the frame after the NaN rows were removed.
df.columns

Index(['index', 'case', 'run', 'VB', 'DOC', 'time', 'material', 'feed',
       'smcAC', 'smcDC', 'vib_table', 'vib_spindle', 'AE_table', 'AE_spindle'],
      dtype='object')

In [90]:
## Scalar (non time series) features only
df_nts = df[['case', 'run', 'DOC', 'time', 'material', 'feed']]

## Spearman rank correlation between the scalar features
print("Correlation Matrix\n")
print(df_nts.corr(method="spearman"))

## Standardise every feature to zero mean and unit variance
## NOTE(review): the scaler and the PCA below are fitted on all rows, i.e.
## before the train/test split performed later in the notebook.
df_nts_std = StandardScaler().fit_transform(df_nts)

## Project the six standardised features onto five principal components
pca = PCA(n_components=5)
principalcomponents = pca.fit_transform(df_nts_std)

principalDf = pd.DataFrame(
    data=principalcomponents,
    columns=[f"pc{i}" for i in range(1, 6)],
)

## Append the target column next to the components (aligned on the row index)
final_PCA_Df = pd.concat([principalDf, df[['VB']]], axis="columns")

print("\n Final Data Frame after PCA Anaysis \n\n" , final_PCA_Df)


Correlation Matrix

              case       run       DOC      time  material      feed
case      1.000000  0.001273 -0.190007 -0.097283  0.480170 -0.220376
run       0.001273  1.000000 -0.242757  0.945198 -0.250403 -0.053238
DOC      -0.190007 -0.242757  1.000000 -0.217484 -0.062652  0.152552
time     -0.097283  0.945198 -0.217484  1.000000 -0.434648 -0.089983
material  0.480170 -0.250403 -0.062652 -0.434648  1.000000 -0.039640
feed     -0.220376 -0.053238  0.152552 -0.089983 -0.039640  1.000000

 Final Data Frame after PCA Anaysis 

           pc1       pc2       pc3       pc4       pc5    VB
0   -1.477380  2.514569 -0.124263  0.184014 -0.028704  0.00
1   -0.968396  2.427279 -0.006992 -0.036247 -0.186995  0.11
2   -0.500965  2.359379  0.085010 -0.223683 -0.310427  0.20
3   -0.267249  2.325429  0.131011 -0.317401 -0.372143  0.24
4   -0.060985  2.293559  0.174050 -0.402420 -0.430022  0.29
..        ...       ...       ...       ...       ...   ...
141 -1.398645 -1.428144 -0.681897 -1.

**DATA PREPARATION**


In [89]:
# Time-series columns: each cell holds a whole sequence of readings.
df_ts = df[['smcAC', 'smcDC', 'vib_table', 'vib_spindle', 'AE_table', 'AE_spindle']]

In [92]:
# Put the PCA/target frame and the time-series columns side by side,
# aligned on the shared row index.
df_update = pd.concat([final_PCA_Df, df_ts], axis="columns")

display(df_update)


Unnamed: 0,pc1,pc2,pc3,pc4,pc5,VB,smcAC,smcDC,vib_table,vib_spindle,AE_table,AE_spindle
0,-1.477380,2.514569,-0.124263,0.184014,-0.028704,0.00,"[-0.0170898438, 0.263671875, 0.2075195312, 0.3...","[0.625, 0.810546875, 0.7812500000000001, 0.849...","[0.078125, 0.0854492188, 0.078125, 0.073242187...","[0.3149414062, 0.3015136719, 0.3039550781, 0.3...","[0.0872802734, 0.0982666016, 0.0921630859, 0.0...","[0.1037597656, 0.1232910156, 0.1049804688, 0.1..."
1,-0.968396,2.427279,-0.006992,-0.036247,-0.186995,0.11,"[0.1123046875, 0.009765625, -0.1513671875, -0....","[0.1318359375, 0.3955078125, 0.7568359375, 0.8...","[0.0830078125, 0.0756835938, 0.0659179688, 0.0...","[0.3161621094, 0.3112792969, 0.302734375, 0.32...","[0.1129150391, 0.0994873047, 0.1049804688, 0.1...","[0.1397705078, 0.12145996090000001, 0.12451171..."
2,-0.500965,2.359379,0.085010,-0.223683,-0.310427,0.20,"[0.3295898438, 0.3076171875, 0.2709960938, 0.2...","[0.68359375, 0.87890625, 0.83984375, 0.9033203...","[0.0708007812, 0.0732421875, 0.0732421875, 0.0...","[0.3076171875, 0.3100585938, 0.3063964844, 0.3...","[0.1037597656, 0.0927734375, 0.0891113281, 0.0...","[0.1202392578, 0.10681152340000001, 0.09765625..."
3,-0.267249,2.325429,0.131011,-0.317401,-0.372143,0.24,"[-0.5786132812, -0.5834960938, -0.5224609375, ...","[1.3818359375, 1.396484375, 1.40625, 1.4208984...","[0.0659179688, 0.0659179688, 0.068359375, 0.06...","[0.322265625, 0.2990722656, 0.3137207031, 0.29...","[0.0909423828, 0.0726318359, 0.0823974609, 0.0...","[0.1239013672, 0.1013183594, 0.111694335900000..."
4,-0.060985,2.293559,0.174050,-0.402420,-0.430022,0.29,"[0.3002929688, 0.2563476562, 0.1708984375, 0.0...","[1.435546875, 1.4404296875, 1.4453125, 1.44531...","[0.0610351562, 0.0659179688, 0.0610351562, 0.0...","[0.3088378906, 0.3039550781, 0.3063964844, 0.2...","[0.0927734375, 0.0994873047, 0.09765625, 0.091...","[0.1086425781, 0.1245117188, 0.1141357422, 0.1..."
...,...,...,...,...,...,...,...,...,...,...,...,...
141,-1.398645,-1.428144,-0.681897,-1.608131,-0.267672,0.56,"[-0.1953125, -0.2294921875, -0.0439453125, 0.0...","[1.4013671875, 1.40625, 1.40625, 1.4013671875,...","[0.0634765625, 0.0610351562, 0.0634765625, 0.0...","[0.2856445312, 0.2880859375, 0.2807617188, 0.2...","[0.0836181641, 0.0866699219, 0.0854492188, 0.1...","[0.0939941406, 0.1031494141, 0.1019287109, 0.1..."
142,-1.192382,-1.460014,-0.638858,-1.693150,-0.325551,0.70,"[-0.68359375, -0.693359375, -0.546875, -0.3613...","[1.376953125, 1.38671875, 1.4013671875, 1.4111...","[0.056152343800000004, 0.0634765625, 0.0585937...","[0.2612304688, 0.263671875, 0.2697753906, 0.26...","[0.1025390625, 0.1141357422, 0.107421875, 0.10...","[0.1232910156, 0.1293945312, 0.1318359375, 0.1..."
143,-2.365982,-0.629131,0.971481,-1.602255,0.391525,0.24,"[-0.2001953125, -0.009765625, 0.0830078125, 0....","[1.40625, 1.4013671875, 1.396484375, 1.3867187...","[0.0659179688, 0.0708007812, 0.0634765625, 0.0...","[0.2795410156, 0.2844238281, 0.3015136719, 0.2...","[0.1147460938, 0.0952148438, 0.1019287109, 0.0...","[0.1397705078, 0.1123046875, 0.1202392578, 0.1..."
144,-2.035809,-0.686631,1.048674,-1.746196,0.287277,0.40,"[-0.205078125, -0.2392578125, -0.15625, 0.1269...","[1.3818359375, 1.38671875, 1.38671875, 1.38183...","[0.068359375, 0.05859375, 0.0708007812, 0.0659...","[0.2893066406, 0.3002929688, 0.2917480469, 0.3...","[0.0988769531, 0.0964355469, 0.0897216797, 0.0...","[0.1147460938, 0.1123046875, 0.106811523400000..."


In [99]:
#Extract Feature Dataframe
# Every principal component pc1..pc5 is selected exactly once. Listing 'pc4'
# twice duplicates that column and leaves pc3 out of the feature set.

X = df_update[['pc1', 'pc2', 'pc3', 'pc4', 'pc5','smcAC','smcDC', 'vib_table', 'vib_spindle', 'AE_table', 'AE_spindle' ]]

In [104]:
#Extract Reference Dataframe
# The double brackets keep the target as a one-column DataFrame, not a Series.

y = df_update[['VB']]

In [106]:
# Train and Test data split
# 20% of the rows are held out for testing; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [125]:
# Separate the time-series columns from the remaining scalar
# (principal-component) columns.
ts_columns = ['smcAC','smcDC', 'vib_table', 'vib_spindle', 'AE_table', 'AE_spindle']

X_train_ts = X_train[ts_columns]
X_test_ts = X_test[ts_columns]

# Take "everything that is not a time-series column" instead of re-listing the
# component names: a label list with a repeated name (such as 'pc4' twice)
# returns that column several times and omits any component left off the list.
X_train_nts = X_train.drop(columns=ts_columns)
X_test_nts = X_test.drop(columns=ts_columns)

print(len(X_train_ts))

116


In [140]:
#Convert Time series Training dataframe into a 3 dimensional numpy array
# Resulting layout: (training rows, samples per sequence, time-series columns).

n_train = len(X_train_ts)
# Sequence length is read from the first cell rather than hard-coded
# (the constant used previously was 9000).
n_ts_samples = len(X_train_ts.iloc[0, 0])
n_ts_features = len(X_train_ts.columns)

X_train_ts_np = np.zeros((n_train,n_ts_samples,n_ts_features))

# Iterate over the frame's own columns so the channel order always agrees with
# n_ts_features and no second list of names has to be kept in sync.
for index, col in enumerate(X_train_ts.columns):
    # Each cell is a list of readings; stacking the rows of one column yields
    # an (n_train, n_ts_samples) matrix that fills one channel of the array.
    X_train_ts_np[:, :, index] = np.array(X_train_ts[col].to_list())

In [141]:
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    """Apply one transformer encoder block to ``inputs``.

    The block consists of a self-attention stage followed by a feed-forward
    stage made of two kernel-size-1 convolutions. Each stage normalises its
    input first and adds a residual connection around itself.

    Args:
        inputs: Tensor entering the block.
        head_size: Forwarded to ``MultiHeadAttention`` as ``key_dim``.
        num_heads: Number of attention heads.
        ff_dim: Filter count of the first (ReLU) convolution.
        dropout: Rate used inside the attention layer and by the two
            ``Dropout`` layers of the block.

    Returns:
        Output tensor of the block; its last dimension equals that of
        ``inputs`` because the second convolution uses ``inputs.shape[-1]``
        filters.
    """
    # ---- self-attention stage ----
    attn_in = layers.LayerNormalization(epsilon=1e-6)(inputs)
    self_attention = layers.MultiHeadAttention(
        key_dim=head_size, num_heads=num_heads, dropout=dropout
    )
    attn_out = self_attention(attn_in, attn_in)  # query and value are the same tensor
    attn_out = layers.Dropout(dropout)(attn_out)
    skip = attn_out + inputs

    # ---- feed-forward stage ----
    ffn = layers.LayerNormalization(epsilon=1e-6)(skip)
    ffn = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(ffn)
    ffn = layers.Dropout(dropout)(ffn)
    # Project back to the input width so the residual addition is possible.
    n_channels = inputs.shape[-1]
    ffn = layers.Conv1D(filters=n_channels, kernel_size=1)(ffn)
    return ffn + skip


def build_model(
    input_shape, head_size, num_heads, ff_dim,
    num_transformer_blocks, mlp_units, dropout=0, mlp_dropout=0,
):
    """Assemble the transformer-based regression model.

    Args:
        input_shape: Shape of one sample (without the batch dimension).
        head_size: ``key_dim`` of every attention layer.
        num_heads: Number of attention heads per encoder block.
        ff_dim: Filter count of the first convolution in each encoder block.
        num_transformer_blocks: How many encoder blocks are stacked.
        mlp_units: Iterable with the width of each dense layer of the head.
        dropout: Dropout rate inside the encoder blocks.
        mlp_dropout: Dropout rate after each dense layer of the head.

    Returns:
        An uncompiled ``keras.Model`` with a single linear output unit.
    """
    inputs = keras.Input(shape=input_shape)

    # Add the sinusoidal position encoding onto the raw input.
    positional = SinePositionEncoding()(inputs)
    encoded = inputs + positional

    for _block in range(num_transformer_blocks):
        encoded = transformer_encoder(encoded, head_size, num_heads, ff_dim, dropout)

    # NOTE(review): with data_format="channels_first" the pooling averages over
    # the last axis of `encoded` - confirm this is the intended axis.
    head = layers.GlobalAveragePooling1D(data_format="channels_first")(encoded)
    for units in mlp_units:
        head = layers.Dense(units, activation="relu")(head)
        head = layers.Dropout(mlp_dropout)(head)

    prediction = layers.Dense(1)(head)  # linear output, no activation
    return keras.Model(inputs, prediction)

def model_training(train):
    """
    Build, compile and fit the transformer regression model.

    Args:
        train: Currently unused. NOTE(review): the function reads the
            notebook-level ``X_train_ts_np`` and ``Y_train`` instead of this
            argument; it is kept so existing calls keep working - confirm
            what callers are expected to pass.

    Returns:
        The fitted ``keras.Model``. Early stopping is configured with
        ``restore_best_weights=True``.
    """
    # One sample has the shape of the training array minus its first axis.
    input_shape = X_train_ts_np.shape[1:]
    model=build_model(
                        input_shape,
                        head_size=256,
                        num_heads=4,
                        ff_dim=4,
                        num_transformer_blocks=4,
                        mlp_units=[128],
                        mlp_dropout=0.4,
                        dropout=0.25,
                     )
    model.compile(
                    loss='mean_squared_error',
                    optimizer=keras.optimizers.Adam(learning_rate=1e-4),
                    # `metrics` is documented as a list; a bare Metric object
                    # is rejected by newer Keras versions.
                    metrics=[keras.metrics.MeanSquaredError()],
    )
    # Stop once the monitored quantity has not improved for 10 epochs.
    callbacks = [keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)]

    model.fit(
                X_train_ts_np,
                Y_train,
                validation_split=0.2,  # last 20% of the training rows are used for validation
                epochs=200,
                batch_size=16,
                callbacks=callbacks,
    )

    return model



# def metric(validation_data, model):
#     """
#     Standard metrics and plotting should be same
#     Metrics should be computed on validation data(unseen data)
#     1. Balanced accuracy score
#     2. Confusion matrix
#     3. Per-class accuracy
#     """
#     X_val, Y_val = validation_data
     #Y_pred = model.predict(X_val)> 0.5
#     ba=balanced_accuracy_score(Y_val, Y_pred)
#     cm=confusion_matrix(Y_val, Y_pred)
#     # cm_display = ConfusionMatrixDisplay(cm).plot()
#     metrics=[cm,ba]
#     return metrics

# def validation(metrics, metrics_validation):
#     """
#     Comparing the results with provided Series Embedder
#     Plot confusion matrices of self analysis and LSTM with balanced_accuracy
    
#     """
#     cm_model = ConfusionMatrixDisplay(metrics[0]).plot()
#     cm_lstm = ConfusionMatrixDisplay(metrics_validation[0]).plot()

    # metrics=metric(val,model_self)
    
#     lstm_cm,lstm_balanced_accuracy=lstm(preprocessed_data,target='labels')
#     metrics_validation = [lstm_cm, lstm_balanced_accuracy]
#     validation(metrics,metrics_validation) 