In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import signal
from scipy.stats import pearsonr
import os
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from tqdm import tqdm

import matplotlib as mpl
mpl.rcParams['agg.path.chunksize'] = 10000
from joblib import dump, load

In [3]:
fs=100
def extrtact_features(mse, Pxx_den):
    # 均值 方差
    f_m=np.mean(Pxx_den)
    f_std= np.std(Pxx_den)
    # 四分位距number of outlier
    Q1 = np.percentile(Pxx_den, 25)
    Q3 = np.percentile(Pxx_den, 75)
    IQR = Q3 - Q1
    num_outlier = sum((Pxx_den < (Q1 - 1.5 * IQR)) | (Pxx_den > (Q3 + 1.5 * IQR)))
    #MSE
    f_mse=sum(np.square(Pxx_den-np.mean(Pxx_den)))/len(Pxx_den)
    
    p_mse = sum(np.square(Pxx_den - np.mean(Pxx_den))) / len(Pxx_den)
    
    #Max diff
    f_md=np.max(Pxx_den)-np.min(Pxx_den)
    return mse,f_m,f_std,IQR,num_outlier,p_mse,f_mse,f_md

def all_features(ex,ey,ez):
    ex_mse = sum(np.square(ex - np.mean(ex))) / len(ex)
    ey_mse = sum(np.square(ey - np.mean(ey))) / len(ey)
    ez_mse = sum(np.square(ez - np.mean(ez))) / len(ez)
    ex,ey,ez=ex-np.mean(ex), ey-np.mean(ey), ez-np.mean(ez)
    sos = signal.butter(2, 35, 'lowpass',fs=100,output='sos')
    ex = signal.sosfilt(sos, ex)
    ey = signal.sosfilt(sos, ey)
    ez = signal.sosfilt(sos, ez)
    #calculate PSD
    fre, Pxx_den = signal.periodogram(ex, fs)
    Pxx_den=10*np.log10(abs(Pxx_den))[1:]
    fre, Pyy_den = signal.periodogram(ey, fs)
    Pyy_den=10*np.log10(abs(Pyy_den))[1:]
    fre, Pzz_den = signal.periodogram(ez, fs)
    Pzz_den=10*np.log10(abs(Pzz_den))[1:]
    fx=extrtact_features(ex_mse, Pxx_den)
    fy=extrtact_features(ey_mse, Pyy_den)
    fz=extrtact_features(ez_mse, Pzz_den)
    return [fx,fy,fz]

In [13]:
# 初步分好的数据，选择文件数量较少的两天
# 读取分好的文件路径
path='C:\xxx\dataset\init_label\\'
True_path, False_path=[], []
# folder_210=os.listdir(path+'2_10\\')
o_path='C:\xxx\dataset\EQ_20220210_172034\\'
for f in os.listdir(path+'2_10\True\\'):xxx
    True_path.append(o_path+f[:-4]+'.csv')
for f in os.listdir(path+'2_10\False\\'):
    False_path.append(o_path+f[:-4]+'.csv')

o_path='C:\xxx\dataset\EQ_90s_02and03_15\EQ_90s_02_15\\'
for f in os.listdir(path+'2_15\True\\'):
    True_path.append(o_path+f[:-4]+'.csv')
for f in os.listdir(path+'2_15\False\\'):
    False_path.append(o_path+f[:-4]+'.csv')

o_path='C:\xxx\dataset\EQ_90s_02and03_15\EQ_90s_03_15\\'
for f in os.listdir(path+'3_15\True\\'):
    True_path.append(o_path+f[:-4]+'.csv')
for f in os.listdir(path+'3_15\False\\'):
    False_path.append(o_path+f[:-4]+'.csv')

In [None]:
# 根据分好的文件路径读取数据，获取初步特征，并设置初步标签
X, Y=[], []
for f in True_path:
    df=pd.read_csv(f)
    df=np.array(df)[:,:3]
    ex,ey,ez=df[:,0],df[:,1],df[:,2]
    X.append(all_features(ex,ey,ez))
    Y.append(1)
for f in False_path:
    df=pd.read_csv(f)
    df=np.array(df)[:,:3]
    ex,ey,ez=df[:,0],df[:,1],df[:,2]
    X.append(all_features(ex,ey,ez))
    Y.append(0)
X=np.array(X)
X=X.reshape(X.shape[0],-1)
Y=np.array(Y)


In [33]:
pd.value_counts(Y)

0    652
1    216
dtype: int64

In [34]:
# z-score 正则化
ss = StandardScaler()
z_X = ss.fit_transform(X)

In [84]:
# 使用初步划分的训练集训练一个机器学习模型
from sklearn.ensemble import RandomForestClassifier
x_train,x_test,y_train,y_test = train_test_split(X, Y, test_size = 0.3,random_state = 42)

clf = RandomForestClassifier(n_estimators=15)
clf.fit(x_train, y_train)
label=clf.predict(x_test)

In [86]:
print(confusion_matrix(label, y_test))
print(accuracy_score(label, y_test))

[[176  33]
 [ 24  28]]
0.7816091954022989


In [None]:
#保存模型
from joblib import dump, load
dump(clf, './models/rf.model')

In [108]:
#读取模型并测试
rf=load('./models/rf.model')
label=rf.predict(x_test)
print(confusion_matrix(label, y_test))
print(accuracy_score(label, y_test))

[[176  33]
 [ 24  28]]
0.7816091954022989


In [104]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
clf.fit(x_train, y_train)
label=clf.predict(x_test)
print(confusion_matrix(label, y_test))
print(accuracy_score(label, y_test))

[[183  44]
 [ 17  17]]
0.7662835249042146


In [106]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=200)
clf.fit(x_train, y_train)
label=clf.predict(x_test)
print(confusion_matrix(label, y_test))
print(accuracy_score(label, y_test))

[[175  29]
 [ 25  32]]
0.7931034482758621


In [109]:
dump(clf, './models/gboost.model')
gboost=load('./models/gboost.model')
label=gboost.predict(x_test)
print(confusion_matrix(label, y_test))
print(accuracy_score(label, y_test))

[[175  29]
 [ 25  32]]
0.7931034482758621
