# Preprocessing

In [1]:
import scipy.io # To use the '.mat' files
import seaborn as sns
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

In [2]:
for root, dirs, files in os.walk("0hp_load_48_KHz", topdown=False):
    for file_name in files:
        path = os.path.join(root, file_name)
        print(path)

0hp_load_48_KHz\14_BA.mat
0hp_load_48_KHz\14_IR.mat
0hp_load_48_KHz\14_OR1.mat
0hp_load_48_KHz\21_BA.mat
0hp_load_48_KHz\21_IR.mat
0hp_load_48_KHz\21_OR1.mat
0hp_load_48_KHz\21_OR2.mat
0hp_load_48_KHz\21_OR3.mat
0hp_load_48_KHz\7_BA.mat
0hp_load_48_KHz\7_IR.mat
0hp_load_48_KHz\7_OR1.mat
0hp_load_48_KHz\7_OR2.mat
0hp_load_48_KHz\7_OR3.mat
0hp_load_48_KHz\N.mat


In [3]:
path = r'0hp_load_48_KHz/7_OR3.mat'
mat = scipy.io.loadmat(path)

In [4]:
key = list(mat.keys())[3]
data = mat.get(key)
data_table = pd.DataFrame(data)
data_table

Unnamed: 0,0
0,0.328152
1,0.240116
2,0.153332
3,0.070303
4,-0.000626
...,...
129964,0.103056
129965,0.137686
129966,0.161468
129967,0.235944


We will calculate following time domain features.

* Maximum value
* Minimum value
* Mean value 
* Standard deviation (Unbiased std)
* Root mean square value (RMS)
* Skewness
* Kurtosis
* Crest factor = $\frac{\text{Max}}{\text{RMS}}$
* Form factor = $\frac{\text{RMS}}{\text{Mean}}$

In [5]:
def compute_kurtosis(x):
    
    n = len(x)
    fourth_moment = np.sum((x - np.mean(x))**4) / n
    s_4 = np.std(x, ddof = 1) ** 4
    return fourth_moment / s_4 - 3

In [6]:
def compute_skewness(x):
    
    n = len(x)
    third_moment = np.sum((x - np.mean(x))**3) / n
    s_3 = np.std(x, ddof = 1) ** 3
    return third_moment/s_3

# Tạo ma trận thông kê từ 0hp_load_48_KHz/7_OR3.mat

In [34]:
feature_matrix = pd.DataFrame()

data_splits = np.array_split(data, 1300)
for i in range (0,len(data_splits)):
    temp =[]
    temp = data_splits[i]
    row_matrix = np.zeros((1,9))
    row_matrix[0,0] = np.max(temp)
    row_matrix[0,1] = np.min(temp)
    row_matrix[0,2] = np.mean(temp)
    row_matrix[0,3] = np.std(temp, ddof = 1)
    row_matrix[0,4] = np.sqrt(np.mean(temp ** 2))
    row_matrix[0,5] = compute_skewness(temp)
    row_matrix[0,6] = compute_kurtosis(temp)
    row_matrix[0,7] = row_matrix[0,0]/row_matrix[0,4]
    row_matrix[0,8] = row_matrix[0,4]/row_matrix[0,2]
    df = pd.DataFrame(row_matrix)
    df = df.reset_index(drop=True)
    feature_matrix = feature_matrix._append(df)
    # feature_matrix = feature_matrix._append(df)

# Thêm cột lỗi
fault = np.full((len(feature_matrix),1), file_name[:-4])
feature_matrix[9] = fault
# Sắp xếp lại thứ tự
# feature_matrix = feature_matrix.sort_index()
# Thêm nhãn cho các cột
feature_matrix.columns = ['Max','Min','Mean','Std','RMS','Skewness','Kurtosis','Crest Factor','Form Factor', 'Fault']
# feature_matrix.to_csv('feature_matrix_Bearing_fault_{}.csv'.format(file_name[:-4]))

In [None]:
feature_matrix

# Tạo ma trận từ tất cả các file

In [32]:

all_fault = pd.DataFrame()

for root, dirs, files in os.walk("0hp_load_48_KHz", topdown=False):
    for file_name in files:
        
        path = os.path.join(root, file_name)
        mat = scipy.io.loadmat(path)
        print(path)
        key = list(mat.keys())[3]
        data = mat.get(key)
        
        feature_matrix = pd.DataFrame()
        data_splits = np.array_split(data, 1300)
        for i in range (0,len(data_splits)):
            temp = data_splits[i]
            row_matrix = np.zeros((1,9))
            row_matrix[0,0] = np.max(temp)
            row_matrix[0,1] = np.min(temp)
            row_matrix[0,2] = np.mean(temp)
            row_matrix[0,3] = np.std(temp, ddof = 1)
            row_matrix[0,4] = np.sqrt(np.mean(temp ** 2))
            row_matrix[0,5] = compute_skewness(temp)
            row_matrix[0,6] = compute_kurtosis(temp)
            row_matrix[0,7] = row_matrix[0,0]/row_matrix[0,4]
            row_matrix[0,8] = row_matrix[0,4]/row_matrix[0,2]
            df = pd.DataFrame(row_matrix)
            feature_matrix = pd.concat([df,feature_matrix], axis = 0)
            # feature_matrix = feature_matrix._append(df)

        # Thêm cột lỗi
        fault = np.full((len(feature_matrix),1), file_name[:-4])
        feature_matrix[9] = fault
        # Sắp xếp lại thứ tự
        feature_matrix = feature_matrix.sort_index()
        # Thêm nhãn cho các cột
        feature_matrix.columns = ['Max','Min','Mean','Std','RMS','Skewness','Kurtosis','Crest Factor','Form Factor', 'Fault']
        # Tạo ra file csv
        feature_matrix.to_csv('feature_matrix_Bearing_fault_{}.csv'.format(file_name[:-4]),index=False)
        all_fault = all_fault._append(feature_matrix)
        del feature_matrix
                

0hp_load_48_KHz\14_BA.mat
0hp_load_48_KHz\14_IR.mat
0hp_load_48_KHz\14_OR1.mat
0hp_load_48_KHz\21_BA.mat
0hp_load_48_KHz\21_IR.mat
0hp_load_48_KHz\21_OR1.mat
0hp_load_48_KHz\21_OR2.mat
0hp_load_48_KHz\21_OR3.mat
0hp_load_48_KHz\7_BA.mat
0hp_load_48_KHz\7_IR.mat
0hp_load_48_KHz\7_OR1.mat
0hp_load_48_KHz\7_OR2.mat
0hp_load_48_KHz\7_OR3.mat
0hp_load_48_KHz\N.mat


In [35]:
df1 = pd.read_csv('feature_matrix_Bearing_fault_7_BA.csv')
df2 = pd.read_csv('feature_matrix_Bearing_fault_7_IR.csv')
df3 = pd.read_csv('feature_matrix_Bearing_fault_7_OR1.csv')
df4 = pd.read_csv('feature_matrix_Bearing_fault_7_OR2.csv')
df5 = pd.read_csv('feature_matrix_Bearing_fault_7_OR3.csv')
df = pd.concat([df1,df2,df3,df4,df5])
# all_fault.to_csv('feature_matrix_Bearing_all_fault.csv')

# SVM Training

In [36]:
X = df.iloc[:,:9]
y = df.iloc[:,9]

#Lấy ngẫu nhiên 1/2 bộ dữ liệu
# sample = pd.DataFrame()
# sample = all_fault.sample(n = int(len(all_fault)/2))
# X_sample = sample.iloc[:,:9]
# y_sample = sample.iloc[:,9]


In [37]:
X

Unnamed: 0,Max,Min,Mean,Std,RMS,Skewness,Kurtosis,Crest Factor,Form Factor
0,1.496140,-1.484455,0.007574,0.620992,0.619376,-0.041264,-0.030887,2.415562,81.771266
1,5.231273,-4.850248,0.029874,1.966988,1.961949,0.067251,0.114224,2.666365,65.674300
2,0.621827,-0.612645,0.004548,0.175455,0.175044,0.232886,1.463115,3.552393,38.485995
3,2.827851,-2.650067,-0.012208,1.131340,1.128377,-0.010299,-0.417208,2.506122,-92.432653
4,2.161369,-1.564165,0.045503,0.438901,0.440085,1.005534,7.294407,4.911257,9.671614
...,...,...,...,...,...,...,...,...,...
1295,0.547198,-0.204234,0.139587,0.191731,0.236385,0.219789,-0.718637,2.314864,1.693461
1296,0.546990,-0.291227,0.119993,0.192735,0.226216,-0.186659,-0.110010,2.417998,1.885235
1297,0.518826,-0.309794,0.134127,0.228751,0.264185,-0.098171,-1.338364,1.963876,1.969660
1298,0.396369,-0.131428,0.128217,0.145628,0.193481,0.088255,-1.045008,2.048616,1.509014


In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.20)

In [39]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

parameters = {
    'kernel': ['linear', 'rbf'],
    'C': [0.01, 0.1, 0.5, 1, 5, 10],
    'gamma': [0.01, 0.1, 0.5, 1, 5, 10]
}

grid = GridSearchCV(SVC(), parameters, cv=5, verbose=2, refit=True)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [16]:
# Khởi tạo mô hình SVM
svm_model = SVC(kernel='linear', C=10, gamma=0.01)

# Huấn luyện mô hình
svm_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score
# Đánh giá độ chính xác trên tập test
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on the test set:", accuracy)