#### 使用上一步初步训练的模型来为更多的数据打上初步标签，再进行人工处理划分

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import signal
from scipy.stats import pearsonr
import os
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from tqdm import tqdm
 
import matplotlib as mpl
mpl.rcParams['agg.path.chunksize'] = 10000
from joblib import dump, load

In [3]:
fs = 100  # sampling frequency handed to signal.periodogram in all_features


def extrtact_features(mse, Pxx_den):
    """Summarise one spectrum as a fixed-length tuple of statistics.

    Args:
        mse: Value computed by the caller; passed through unchanged as the
            first element of the result.
        Pxx_den: 1-D array-like of spectral values.

    Returns:
        The 8-tuple ``(mse, mean, std, IQR, num_outlier, p_mse, f_mse,
        max_diff)``. ``p_mse`` and ``f_mse`` are the same quantity (the mean
        squared deviation from the mean); both slots are kept so the feature
        layout stays the same for existing consumers.
    """
    Pxx_den = np.asarray(Pxx_den)

    # Mean and standard deviation.
    f_m = np.mean(Pxx_den)
    f_std = np.std(Pxx_den)

    # Interquartile range and the number of points outside the 1.5 * IQR
    # fences.
    Q1, Q3 = np.percentile(Pxx_den, [25, 75])
    IQR = Q3 - Q1
    outside = (Pxx_den < (Q1 - 1.5 * IQR)) | (Pxx_den > (Q3 + 1.5 * IQR))
    num_outlier = np.count_nonzero(outside)

    # Mean squared deviation from the mean, computed once with vectorised
    # NumPy reductions instead of the built-in sum() over an ndarray.
    f_mse = np.mean(np.square(Pxx_den - f_m))
    p_mse = f_mse

    # Largest minus smallest value.
    f_md = np.max(Pxx_den) - np.min(Pxx_den)
    return mse, f_m, f_std, IQR, num_outlier, p_mse, f_mse, f_md

def _axis_features(samples, sos):
    """Return the extrtact_features tuple for a single axis of samples."""
    samples = np.asarray(samples, dtype=float)
    centered = samples - np.mean(samples)
    # Mean squared deviation of the unfiltered signal; taken before the
    # low-pass filter is applied.
    raw_mse = np.mean(np.square(centered))
    filtered = signal.sosfilt(sos, centered)
    # Periodogram converted to dB, skipping the first (0 Hz) bin. A zero
    # power value yields -inf here; that is left as-is for the caller to
    # clean up.
    _, psd = signal.periodogram(filtered, fs)
    psd_db = 10 * np.log10(np.abs(psd))[1:]
    return extrtact_features(raw_mse, psd_db)


def all_features(ex, ey, ez):
    """Extract the per-axis feature tuples for one three-axis recording.

    Each axis is mean-removed, low-pass filtered (2nd-order Butterworth,
    cutoff 35 in the units of the module-level ``fs``), turned into a dB
    periodogram and summarised by ``extrtact_features``.

    Args:
        ex, ey, ez: 1-D array-likes holding the samples of the three axes.

    Returns:
        A list ``[fx, fy, fz]`` with one ``extrtact_features`` tuple per axis.
    """
    # Design the filter once and reuse it for all axes; the module-level
    # sampling frequency is used for both the filter and the periodogram so
    # the two can never disagree.
    sos = signal.butter(2, 35, 'lowpass', fs=fs, output='sos')
    return [_axis_features(axis, sos) for axis in (ex, ey, ez)]

In [5]:
# Read the data that needs to be labelled and extract features from each file.
# Backslashes are escaped explicitly: an unescaped "\x" followed by non-hex
# characters is a SyntaxError in a normal string literal.
path = 'C:\\xxx\\dataset\\EQ_20210829_130323\\'
files = sorted(os.listdir(path))

X = []
for name in tqdm(files):
    f = os.path.join(path, name)
    # Only the first three columns are used, one per axis.
    df = np.array(pd.read_csv(f))[:, :3]
    ex, ey, ez = df[:, 0], df[:, 1], df[:, 2]
    X.append(all_features(ex, ey, ez))
X = np.array(X)
# Flatten the per-axis feature tuples into one row per file.
X = X.reshape(X.shape[0], -1)


In [7]:
# Partial handling of abnormal values: overwrite infinite entries of X with 0,
# then do the same for NaN entries.
for is_bad in (np.isinf, np.isnan):
    idx = is_bad(X)
    X[idx] = 0

In [84]:
# Fit a 15-tree random forest on a 70/30 split of (X, Y) and predict the
# held-out part.
# NOTE(review): Y is not assigned anywhere in the visible part of this file;
# confirm where it comes from before running this cell.
from sklearn.ensemble import RandomForestClassifier

x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

# fit() returns the estimator itself, so construction and fitting are chained.
clf = RandomForestClassifier(n_estimators=15).fit(x_train, y_train)
label = clf.predict(x_test)

In [8]:
# Load the saved model and predict labels for the unlabelled data in X.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
from joblib import dump, load

model_file = './models/gboost.model'
rf = load(model_file)
label = rf.predict(X)

In [9]:
# Save the predicted labels (np.save appends ".npy" to the file name).
np.save(file='./label/init_label/labels_8_29', arr=label)