In [1]:
# importing required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, accuracy_score
import time
import datasource_config

In [2]:
# using config file for source data path
path = datasource_config.CLASSIFICATION_SOURCE_DATA_PATH

# read the data and keep required columns
df = pd.read_parquet(path)
df1 = df[['fault_class', 'individual', 'po', 'pdmp','pin']]

# display the dataframe
print("Input data\n")
display(df1)

Input data



Unnamed: 0,fault_class,individual,po,pdmp,pin
0,3,5,"[0.230999, 0.146074, 0.045333, -0.082227, -0.2...","[-0.194942, -0.187465, -0.180348, -0.173447, -...","[-1.731534, -1.755668, -1.738321, -1.691324, -..."
1,8,5,"[0.267242, 0.181471, 0.066593, -0.058671, -0.1...","[-0.246213, -0.273524, -0.308485, -0.348401, -...","[-1.553456, -1.535349, -1.530819, -1.498527, -..."
2,4,5,"[0.470566, 0.445597, 0.413041, 0.369268, 0.310...","[0.276173, 0.256051, 0.233923, 0.207697, 0.179...","[-1.322575, -1.380218, -1.382825, -1.375426, -..."
3,2,5,"[-0.102403, -0.197452, -0.314519, -0.436627, -...","[0.0658, 0.057669, 0.051469, 0.047736, 0.04593...","[-1.409725, -1.365621, -1.333474, -1.283352, -..."
4,2,5,"[-0.0248, -0.119669, -0.252136, -0.41008, -0.5...","[0.27768, 0.272525, 0.265093, 0.256061, 0.2469...","[-1.42767, -1.41073, -1.396357, -1.354666, -1...."
...,...,...,...,...,...
34140,1,6,"[-0.559606, -0.681762, -0.706164, -0.698716, -...","[-0.051017, -0.055426, -0.061033, -0.070764, -...","[-1.191121, -0.940782, -0.640797, -0.310252, -..."
34141,1,6,"[0.378552, 0.316211, 0.229241, 0.119138, -0.00...","[0.027704, 0.00884, -0.005844, -0.01568, -0.02...","[-1.671334, -1.706622, -1.711954, -1.683148, -..."
34142,1,6,"[0.303343, 0.21896, 0.113885, -0.00383, -0.147...","[-0.037775, -0.053561, -0.073233, -0.097449, -...","[-1.728436, -1.737226, -1.713788, -1.683285, -..."
34143,1,6,"[0.420105, 0.371285, 0.301655, 0.215862, 0.128...","[0.095459, 0.086675, 0.078642, 0.068852, 0.057...","[-1.537821, -1.593619, -1.599173, -1.589957, -..."


In [4]:
# data pre-processing to expand array values into single columns
data = df1

pdmp_df = pd.DataFrame(data.pdmp.tolist())
pin_df = pd.DataFrame(data.pin.tolist())
po_df = pd.DataFrame(data.po.tolist())
intr_df = pd.merge(pdmp_df, pin_df, left_index=True, right_index=True)
intr2_df = pd.merge(intr_df, po_df, left_index=True, right_index=True)
final_df = pd.merge(data, intr2_df, left_index=True, right_index=True).drop(["po", "pdmp", "pin"], axis=1)
sensor_data = final_df.add_prefix('sensor' + '_')

# display the data after splitting he array data
print("Dataset after splitting the array data\n")
display(sensor_data)

Dataset after splitting the array data



Unnamed: 0,sensor_fault_class,sensor_individual,sensor_0_x,sensor_1_x,sensor_2_x,sensor_3_x,sensor_4_x,sensor_5_x,sensor_6_x,sensor_7_x,...,sensor_561,sensor_562,sensor_563,sensor_564,sensor_565,sensor_566,sensor_567,sensor_568,sensor_569,sensor_570
0,3,5,-0.194942,-0.187465,-0.180348,-0.173447,-0.168120,-0.164938,-0.162964,-0.160443,...,1.050622,1.034456,1.006829,0.981712,0.982890,0.995431,0.997414,0.981866,0.969041,0.971418
1,8,5,-0.246213,-0.273524,-0.308485,-0.348401,-0.388465,-0.423755,-0.450124,-0.466246,...,0.912746,0.908318,0.922986,0.935062,0.936170,0.941296,0.953447,0.959849,0.951849,0.934557
2,4,5,0.276173,0.256051,0.233923,0.207697,0.179566,0.149646,0.118753,0.088254,...,0.921702,0.926112,0.928393,0.930782,0.940371,0.956150,0.957770,0.939279,0.927833,0.935658
3,2,5,0.065800,0.057669,0.051469,0.047736,0.045938,0.047479,0.052721,0.061419,...,0.422891,0.428285,0.432260,0.430246,0.419753,0.400856,0.375741,0.351293,0.335347,0.331829
4,2,5,0.277680,0.272525,0.265093,0.256061,0.246930,0.238751,0.234863,0.237788,...,0.388398,0.383101,0.390316,0.395677,0.395287,0.384548,0.363969,0.339599,0.317733,0.305070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34140,1,6,-0.051017,-0.055426,-0.061033,-0.070764,-0.087975,-0.112000,-0.141544,-0.174510,...,0.945983,0.945088,0.944416,0.952775,0.959355,0.946094,0.927905,0.931097,0.954974,0.976086
34141,1,6,0.027704,0.008840,-0.005844,-0.015680,-0.020587,-0.017767,-0.007995,0.007947,...,0.948468,0.960261,0.954239,0.927050,0.902973,0.902458,0.920599,0.937222,0.946607,0.952880
34142,1,6,-0.037775,-0.053561,-0.073233,-0.097449,-0.124788,-0.155356,-0.188065,-0.222948,...,0.951262,0.939103,0.912608,0.901967,0.918477,0.943463,0.955312,0.953058,0.956575,0.964654
34143,1,6,0.095459,0.086675,0.078642,0.068852,0.057889,0.044362,0.029811,0.012646,...,0.915304,0.936355,0.945232,0.937995,0.914667,0.887997,0.889253,0.923887,0.949244,0.947865


In [6]:
# selecting input features and target class
features = sensor_data.iloc[:, 1:]
target = sensor_data.iloc[:, 0]

# Displaying the input features and target classes
print("Input features\n")
display(features)
print("Target classes\n")
display(target)

Input features



Unnamed: 0,sensor_individual,sensor_0_x,sensor_1_x,sensor_2_x,sensor_3_x,sensor_4_x,sensor_5_x,sensor_6_x,sensor_7_x,sensor_8_x,...,sensor_561,sensor_562,sensor_563,sensor_564,sensor_565,sensor_566,sensor_567,sensor_568,sensor_569,sensor_570
0,5,-0.194942,-0.187465,-0.180348,-0.173447,-0.168120,-0.164938,-0.162964,-0.160443,-0.157301,...,1.050622,1.034456,1.006829,0.981712,0.982890,0.995431,0.997414,0.981866,0.969041,0.971418
1,5,-0.246213,-0.273524,-0.308485,-0.348401,-0.388465,-0.423755,-0.450124,-0.466246,-0.472565,...,0.912746,0.908318,0.922986,0.935062,0.936170,0.941296,0.953447,0.959849,0.951849,0.934557
2,5,0.276173,0.256051,0.233923,0.207697,0.179566,0.149646,0.118753,0.088254,0.057600,...,0.921702,0.926112,0.928393,0.930782,0.940371,0.956150,0.957770,0.939279,0.927833,0.935658
3,5,0.065800,0.057669,0.051469,0.047736,0.045938,0.047479,0.052721,0.061419,0.076642,...,0.422891,0.428285,0.432260,0.430246,0.419753,0.400856,0.375741,0.351293,0.335347,0.331829
4,5,0.277680,0.272525,0.265093,0.256061,0.246930,0.238751,0.234863,0.237788,0.244928,...,0.388398,0.383101,0.390316,0.395677,0.395287,0.384548,0.363969,0.339599,0.317733,0.305070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34140,6,-0.051017,-0.055426,-0.061033,-0.070764,-0.087975,-0.112000,-0.141544,-0.174510,-0.208695,...,0.945983,0.945088,0.944416,0.952775,0.959355,0.946094,0.927905,0.931097,0.954974,0.976086
34141,6,0.027704,0.008840,-0.005844,-0.015680,-0.020587,-0.017767,-0.007995,0.007947,0.025575,...,0.948468,0.960261,0.954239,0.927050,0.902973,0.902458,0.920599,0.937222,0.946607,0.952880
34142,6,-0.037775,-0.053561,-0.073233,-0.097449,-0.124788,-0.155356,-0.188065,-0.222948,-0.261021,...,0.951262,0.939103,0.912608,0.901967,0.918477,0.943463,0.955312,0.953058,0.956575,0.964654
34143,6,0.095459,0.086675,0.078642,0.068852,0.057889,0.044362,0.029811,0.012646,-0.007764,...,0.915304,0.936355,0.945232,0.937995,0.914667,0.887997,0.889253,0.923887,0.949244,0.947865


Target classes



0        3
1        8
2        4
3        2
4        2
        ..
34140    1
34141    1
34142    1
34143    1
34144    1
Name: sensor_fault_class, Length: 34145, dtype: int32

In [None]:
# parameters for finding the best model that fits the data
parameter_grid = {'n_estimators': [200, 400, 500, 600, 700], 'min_samples_leaf': [55, 85, 100],
                  'max_features': ['auto', 'sqrt'], 'bootstrap': [True, False]}

number_models = 2
random_forest_model = RandomForestClassifier()

# searching the best set of parameters for classifier
classifier = RandomizedSearchCV(
    estimator=random_forest_model,
    param_distributions=parameter_grid,
    n_iter=number_models,
    scoring='accuracy',
    n_jobs=2,
    cv=5,
    refit=True,
    return_train_score=True)

classifier.fit(x_train, y_train)
predictions = classifier.predict(x_train)

In [8]:
# split the train and test data
x_train, x_test, y_train, y_test = train_test_split(features, classes, test_size=0.20)

# normalise the data to bring all the values into single scale range
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# creating Random Forest classifier object with the best parameters received from above function
classifier = RandomForestClassifier(n_estimators= classifier.best_params_['n_estimators'],
                                    criterion='entropy',
                                    min_samples_leaf= classifier.best_params_['min_samples_leaf'],
                                    max_features='sqrt', bootstrap=True)

# training the model
st_time = time.time()
classifier.fit(x_train, y_train)
end_tm = time.time()
print("Training Time : " + str(round(end_tm - st_time)) + " seconds\n")

train_pred = classifier.predict(x_train)
print("Training Accuracy : " + str((accuracy_score(y_train, train_pred) * 100)) + "%\n")

# testing the model on test data
y_pred = classifier.predict(x_test)

# performance evaluation using different metrics
r2_val = r2_score(y_test, y_pred)
print("R2 value : " + str(r2_val) + "\n")
print("Testing Accuracy : " + str((accuracy_score(y_test, y_pred) * 100)) + "%\n")

Training Time : 767 seconds

Training Accuracy : 98.33430956216137%

R2 value : 0.9652500446961111

Testing Accuracy : 97.94991946112168%

