In [1]:
%load_ext autoreload
from script.feature_extractor import FeatureExtractor
from script.classifier import Classifier
from script.signal_manager import SignalMgr
from script.filter import Filter
from script.data_reader import DataReader
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
import xgboost as xgb

%matplotlib inline

In [2]:
# Locations of the raw signal datasets used by the cells below.
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs where these volumes/directories exist.
# Directory of slight-skew defect samples (folder name reads "斜角_轻微").
INVALID_SLIGHT_DATA_FPATH='/Volumes/workspace/projects/signal_classification/data/特殊次品样本/斜角_轻微.20190515/'
# Directory of severe-skew defect samples (folder name reads "斜角_严重").
INVALID_BAD_DATA_FPATH='/Volumes/workspace/projects/signal_classification/data/特殊次品样本/斜角_严重.20190515/'
# Directory holding the full, re-labelled dataset; 'result.csv' is read from it.
# NOTE(review): the name is spelled FAPTH (sic); it is referenced under this spelling elsewhere.
FULL_DATA_FAPTH='/Volumes/workspace/projects/signal_classification/data/1005_0830重新标注文件_Data._20180609.0830'
MISS_LABEL_NORMAL_FPATH='/Users/changkong/project/signal_classification/data/20190623标记/20190623NEW'  # misclassified positive samples
# Directory of glue-defect samples (folder name reads "DATA-胶水").
GLUE_ABNORMAL_FPATH='/Volumes/workspace/projects/signal_classification/data/特殊次品样本/DATA-胶水/'

In [3]:
data_reader = DataReader()

In [4]:
# Load the index built from 'result.csv' of the full dataset and drop the 'sys_result' column.
norm_df = data_reader.create_single_index(FULL_DATA_FAPTH+'/'+'result.csv').drop(labels='sys_result', axis=1)
# NOTE(review): this summary of the reason==4 rows is computed but never shown,
# because only the last expression of a cell is displayed.
norm_df[norm_df.reason==4].describe()
norm_df.head()

Unnamed: 0,case_name,channel_id,case_path,expect_result,reason
0,20180325_090536,1,/Volumes/workspace/projects/signal_classificat...,0,-1
1,20180325_090637,1,/Volumes/workspace/projects/signal_classificat...,1,9
2,20180325_091016,1,/Volumes/workspace/projects/signal_classificat...,1,5
3,20180325_091047,1,/Volumes/workspace/projects/signal_classificat...,1,5
4,20180325_091103,1,/Volumes/workspace/projects/signal_classificat...,1,5


In [5]:
# List every signal found under INVALID_SLIGHT_DATA_FPATH, without the 'sys_result' column.
invalid_slight_df = data_reader.get_signal_list(INVALID_SLIGHT_DATA_FPATH).drop(labels=['sys_result'], axis=1)
# Label all of these rows expect_result=1 and tag them with reason code 61.
invalid_slight_df['expect_result'] = 1
invalid_slight_df['reason'] = 61
invalid_slight_df.head()

Unnamed: 0,case_name,channel_id,case_path,expect_result,reason
0,20190515_204352497,1,/Volumes/workspace/projects/signal_classificat...,1,61
1,20190515_204352497,2,/Volumes/workspace/projects/signal_classificat...,1,61
2,20190515_204352497,3,/Volumes/workspace/projects/signal_classificat...,1,61
3,20190515_204352497,4,/Volumes/workspace/projects/signal_classificat...,1,61
4,20190515_204352497,5,/Volumes/workspace/projects/signal_classificat...,1,61


In [6]:
# List every signal found under GLUE_ABNORMAL_FPATH, without the 'sys_result' column.
glue_df = data_reader.get_signal_list(GLUE_ABNORMAL_FPATH).drop(labels=['sys_result'], axis=1)
# Label all of these rows expect_result=1 and tag them with reason code 71.
glue_df['expect_result'] = 1
glue_df['reason'] = 71
glue_df.head()

Unnamed: 0,case_name,channel_id,case_path,expect_result,reason
0,20190711_144110136,1,/Volumes/workspace/projects/signal_classificat...,1,71
1,20190711_144110136,2,/Volumes/workspace/projects/signal_classificat...,1,71
2,20190711_144110136,3,/Volumes/workspace/projects/signal_classificat...,1,71
3,20190711_144110136,4,/Volumes/workspace/projects/signal_classificat...,1,71
4,20190711_144110136,5,/Volumes/workspace/projects/signal_classificat...,1,71


In [7]:
# List every signal found under INVALID_BAD_DATA_FPATH, without the 'sys_result' column.
invalid_bad_df = data_reader.get_signal_list(INVALID_BAD_DATA_FPATH).drop(labels=['sys_result'], axis=1)
# Label all of these rows expect_result=1 and tag them with reason code 62.
invalid_bad_df['expect_result'] = 1
invalid_bad_df['reason'] = 62
invalid_bad_df.head()

Unnamed: 0,case_name,channel_id,case_path,expect_result,reason
0,20190515_203431979,1,/Volumes/workspace/projects/signal_classificat...,1,62
1,20190515_203431979,2,/Volumes/workspace/projects/signal_classificat...,1,62
2,20190515_203431979,3,/Volumes/workspace/projects/signal_classificat...,1,62
3,20190515_203431979,4,/Volumes/workspace/projects/signal_classificat...,1,62
4,20190515_203431979,5,/Volumes/workspace/projects/signal_classificat...,1,62


In [8]:
sigMgr = SignalMgr()
# feature = sigMgr.get_features('xxx', request_param={'skip_row':[1], 'model_path':['train']})

In [9]:
# Start model iteration/training: assemble the training, test and evaluation data.

def data_prepare(train_path, test_path, eval_path, bad_df=None, slight_df=None, seed=None):
    """Split the skew-defect samples into train/test/eval sets and cache them as CSV.

    Each input frame is split roughly 80/20 with a random mask; the 20% parts
    form the evaluation set. The remaining 80% parts are stacked and split
    roughly 80/20 again into train and test. The three resulting frames are
    written to disk so later runs can reuse one fixed split instead of
    re-drawing it (which would make metrics such as AUC drift between runs).

    Args:
        train_path: destination CSV for the training rows.
        test_path: destination CSV for the test rows.
        eval_path: destination CSV for the held-out evaluation rows.
        bad_df: frame of severe-skew samples; defaults to the notebook-level
            ``invalid_bad_df``.
        slight_df: frame of slight-skew samples; defaults to the notebook-level
            ``invalid_slight_df``.
        seed: optional integer seed. When given, the split is reproducible;
            when ``None`` the global ``np.random`` state is used, as before.

    Returns:
        None. The function only writes the three CSV files.
    """
    if bad_df is None:
        bad_df = invalid_bad_df
    if slight_df is None:
        slight_df = invalid_slight_df

    # ``np.random`` and ``RandomState`` both expose ``rand``.
    rng = np.random if seed is None else np.random.RandomState(seed)

    def _split(frame):
        # Random ~80/20 partition of the rows of ``frame``.
        mask = rng.rand(len(frame)) < 0.8
        return frame[mask], frame[~mask]

    # Hold out part of each defect set for the final evaluation.
    bad_train_df, bad_eval_df = _split(bad_df)
    slight_train_df, slight_eval_df = _split(slight_df)

    # ``DataFrame.append`` no longer exists in pandas 2.x; ``pd.concat`` is the replacement.
    eval_mix_df = pd.concat([bad_eval_df, slight_eval_df]).reset_index(drop=True)
    # Full training pool, which is split once more into train and test.
    train_mix_df = pd.concat([slight_train_df, bad_train_df]).reset_index(drop=True)
    train_df, test_df = _split(train_mix_df)

    # Cache the split on disk so every iteration evaluates on the same rows.
    train_df.to_csv(train_path, index=False)
    test_df.to_csv(test_path, index=False)
    eval_mix_df.to_csv(eval_path, index=False)

In [10]:
# data_prepare("../data/train_skew.csv", "../data/test_skew.csv", "../data/eval_skew.csv")

In [11]:
data_root = '../data'
# Load the cached training splits (general set and skew-defect set) and stack them.
train_tmp_df = pd.read_csv(data_root + '/' + 'train.csv')
train_skew_df = pd.read_csv(data_root + '/' + 'train_skew.csv')
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train_df = pd.concat([train_tmp_df, train_skew_df]).reset_index(drop=True)

In [12]:
print (train_df['case_path'][0])

/Volumes/workspace/projects/signal_classification/data/特殊次品样本/长短.20190515/20190515_195250029/Channel_2.csv


In [13]:
# CSV with the manual labels that is read into ``label_df`` below.
MISSING_LABEL_DF_FPATH='/Users/changkong/project/signal_classification/data/20190623标记/result_man.csv'
# Every signal found under MISS_LABEL_NORMAL_FPATH, without the 'sys_result' column.
missing_classify_possible_df = data_reader.get_signal_list(MISS_LABEL_NORMAL_FPATH).drop(labels=['sys_result'], axis=1)
# Read the label file with explicit column names; its first row is skipped.
label_df = pd.read_csv(MISSING_LABEL_DF_FPATH, header=None, skiprows=1, names=['case_name', 'sys_result', 'expect_result', 'reason', 'channel_id'])
# Keep only the rows whose expected result is 0.
target_df = label_df[label_df.expect_result == 0].reset_index(drop=True)
# Attach the signal-list columns to those rows by joining on (case_name, channel_id).
tmp = target_df[['case_name', 'channel_id']].merge(missing_classify_possible_df, on=['case_name', 'channel_id'])
# NOTE(review): the result of the next line is discarded; only the last expression of a cell is displayed.
missing_classify_possible_df[missing_classify_possible_df.case_name == '20190623_000136435'].head()
target_df[target_df.case_name == '20190623_000136435']

Unnamed: 0,case_name,sys_result,expect_result,reason,channel_id
0,20190623_000136435,1,0,-1,8


In [14]:
# Build a new frame with expect_result forced to 0. ``assign`` returns a copy,
# so the merged frame ``tmp`` is no longer modified through an alias.
missing_positive_df = tmp.assign(expect_result=0)

In [15]:
msk = np.random.rand(len(missing_positive_df)) < 0.8
missing_positive_train_df = missing_positive_df[msk]
missing_positive_eval_df = missing_positive_df[~msk]

In [16]:
# The cached split below was produced once with:
# missing_positive_train_df.to_csv(data_root + '/' + 'mp_train.csv', index=False)
# missing_positive_eval_df.to_csv(data_root + '/' + 'mp_test.csv', index=False)
mp_train_df = pd.read_csv(data_root + '/' + 'mp_train.csv')
# Rebuild train_df from its three source frames instead of extending train_df
# itself: re-running this cell then no longer adds the same rows again.
# sort=False keeps the columns in order of first appearance when the frames
# do not share exactly the same columns.
train_df = pd.concat([train_tmp_df, train_skew_df, mp_train_df], sort=False).reset_index(drop=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [17]:
# Load the three cached test splits and stack them in the same order as the training data.
test_tmp_df = pd.read_csv(data_root + '/' + 'test.csv')
test_skew_df = pd.read_csv(data_root + '/' + 'test_skew.csv')
test_mp_df = pd.read_csv(data_root + '/' + 'mp_test.csv')
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
test_df1 = pd.concat([test_tmp_df, test_skew_df]).reset_index(drop=True)
# sort=False keeps the columns in order of first appearance when the frames
# do not share exactly the same columns.
test_df = pd.concat([test_df1, test_mp_df], sort=False).reset_index(drop=True)

In [18]:
test_df.head()
test_mp_df.head()
mp_train_df.head()
test_mp_df.count()

case_name        261
channel_id       261
case_path        261
expect_result    261
dtype: int64

In [19]:
# Partition the training rows: label 0, rows with reason code 6/61/62,
# and label-1 rows with any other reason.
normal_cases = train_df.loc[train_df['expect_result'] == 0]
other_defect_cases = train_df.loc[(train_df['expect_result'] == 1) & ~train_df['reason'].isin([6, 61, 62])]
defet_cases = train_df.loc[train_df['reason'].isin([6, 61, 62])]

In [20]:
def get_features(df):
    """Summarise the 'unit_interviene_length_diff' feature over all cases in ``df``.

    For every path in ``df['case_path']`` the feature dict is requested from a
    fresh ``SignalMgr`` and the mean and standard deviation of its
    'unit_interviene_length_diff' entry are recorded.

    Returns:
        A 6-tuple: (min of means, max of means, min of stds, max of stds,
        mean of means, mean of stds).
    """
    manager = SignalMgr()
    per_case_mean, per_case_std = [], []
    for case_path in df['case_path']:
        extracted = manager.get_features(
            case_path, request_param={'skip_row':[1], 'model_path':['train']})
        length_diff = extracted['unit_interviene_length_diff']
        per_case_mean.append(np.mean(length_diff))
        per_case_std.append(np.std(length_diff))
    return (
        np.min(per_case_mean),
        np.max(per_case_mean),
        np.min(per_case_std),
        np.max(per_case_std),
        np.mean(per_case_mean),
        np.mean(per_case_std),
    )

In [21]:
# The data separates clearly on these features, so they are used for analysis and training.
# (An earlier, reduced variant of this list left out the two 'cyclic_intense_*' entries.)
# The names are kept in reverse alphabetical order.
feature_names = sorted(
    [
        'peaks_num', 'down_peaks_num', 'up_edges_num', 'down_edges_num',
        'peak_edge_ratio', 'down_peak_edge_ratio',
        'edge_diff_10', 'edge_diff_20', 'width_diff_10',
        'negative_peak_num', 'max_down_peak_point',
        'inter_diff_mean', 'inter_diff_delta',
        'skewness_mean', 'skewness_delta',
        'cyclic_intense_nopeak', 'cyclic_intense_downpeak',
    ],
    reverse=True
)
                
def features(df_full, feature_names):
    """Build a feature table with one row per case and one column per feature name.

    Args:
        df_full: frame with a 'case_path' column; each entry is passed to the
            notebook-level ``sigMgr.get_features``.
        feature_names: keys to pick out of the dict returned for each case.

    Returns:
        pandas.DataFrame whose columns are ``feature_names`` and whose rows
        follow the order of ``df_full['case_path']``.
    """
    columns = {name: [] for name in feature_names}

    for case_path in df_full['case_path']:
        extracted = sigMgr.get_features(
            case_path, request_param={'skip_row':[1], 'model_path':['train']})
        for name in feature_names:
            columns[name].append(extracted[name])

    return pd.DataFrame(columns)

In [None]:
# Feature matrices for both splits; missing feature values are filled with 0.
train_x = features(train_df, feature_names).fillna(0)
test_x = features(test_df, feature_names).fillna(0)

# Labels, with -1 mapped to 0. ``replace`` returns a new Series, so this avoids
# the chained assignment ``y[y == -1] = 0`` on a column pulled out of a
# DataFrame (SettingWithCopyWarning, and possible mutation of train_df/test_df).
train_y = train_df['expect_result'].replace(-1, 0)
test_y = test_df['expect_result'].replace(-1, 0)
# test_df = pd.read_csv(data_root + '/' + 'test.csv')
# test_x = features(test_df, feature_names)
# test_y = test_df['expect_result']

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier as ada
from sklearn.linear_model import LogisticRegression as lg
from sklearn.metrics import classification_report

# Optional hand-picked feature subset (currently disabled):
# new_features = ['edge_diff_20', 'inter_diff_mean', 'skewness_mean','down_edges_num', 'cyclic_intense_downpeak', 'peaks_num', 'peak_edge_ratio', 'negative_peak_num', 'down_peaks_num', 'down_peak_edge_ratio']
# test_x_tmp = test_x[new_features]
# train_x_tmp = train_x[new_features]
train_x_tmp = train_x
test_x_tmp = test_x

# random_state is fixed so that re-running the cell reproduces the same model
# and the same classification report.
gdbtModel = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, max_depth=3,
                                       min_samples_split=3, random_state=0)
gdbtModel.fit(train_x_tmp, train_y)
pResult = gdbtModel.predict(test_x_tmp)
print(classification_report(test_y, pResult))

In [None]:
from sklearn.externals import joblib
# joblib.dump(gdbtModel, '../production/model')
# sum(pResult[-217:-1])

In [None]:
train_x.describe()

In [None]:
# Pair every importance with the column the model was actually fitted on.
# Taking the names from train_x_tmp (not from feature_names) keeps the labels
# right even if the column order differs or a feature subset was used for fitting.
for name, importance in zip(train_x_tmp.columns, gdbtModel.feature_importances_):
    print (name, importance)

In [None]:
df1 = pd.DataFrame({'intense': train_x['cyclic_intense_downpeak'], 'label':train_y})
dd1 = df1.groupby(['intense']).count().reset_index(drop=True)

In [None]:
dd1

In [None]:
dd2 = df1.groupby(['intense']).sum().reset_index(drop=True)

In [None]:
dd1['false_count'] = dd2['label']

In [None]:
dd1['ratio'] = dd1['false_count'] * 100.0 / dd1['label']

In [None]:
dd1

In [None]:
# print len(feature_names)
glue_df_x = features(glue_df, feature_names)
glue_df_y = glue_df['expect_result']

pResult = gdbtModel.predict(glue_df_x)
print(classification_report(glue_df_y, pResult))

In [None]:
miss_classify_df = glue_df[pResult == 0].reset_index(drop=True)

In [None]:
miss_features_x = features(miss_classify_df, feature_names)
miss_features_x.head(100)

In [None]:
path = miss_classify_df['case_path'][0]
signals = pd.read_csv(path, skiprows=1, header=None)
signals.plot()

In [None]:
feas = sigMgr.get_features(path, request_param={'skip_row':[1], 'model_path':['train']})
normalized_signals = feas['normalized_signals'] 
medfiltered_signals = Filter.medfilter(normalized_signals, 9)
peaks = normalized_signals - medfiltered_signals

In [None]:
pd.Series(peaks).plot()

In [None]:
pd.Series(Filter.nms(peaks, 9, True)).plot()

In [None]:
np.arange(0, len(peaks))[peaks < -1.0]

In [None]:
feas['down_peaks']

In [None]:
feas = sigMgr.get_features(path, request_param={'skip_row':[1], 'model_path':['train']})
pd.Series(feas['normalized_signals']).plot()

In [None]:
new_features = sigMgr.get_features(path, request_param={'skip_row':[1], 'model_path':['train']})
print (new_features['cyclic_downpeak_seq'])

In [None]:
print (new_features['cyclic_intense_downpeak'])

In [None]:
pd.Series(new_features['normalized_signals']).plot()

In [None]:
# dtest = xgb.DMatrix(glue_df_x, label=glue_df_y)
# predict = bst.predict(dtest)
# # print(classification_report(test_y, pResult))
# result = list()
# for score in predict:
#     if score >= 0.5:
#         result.append(1)
#     else:
#         result.append(0)
# print(classification_report(glue_df_y, result))

In [None]:
# Start the "拉须" investigation to collect more information about the errors.
# NOTE(review): "拉须" presumably names a glue-stringing defect — confirm the term.
# This cell redefines GLUE_ABNORMAL_FPATH, data_reader and glue_df; unlike the
# earlier glue_df, this one keeps 'sys_result' and has no 'reason' column.
GLUE_ABNORMAL_FPATH='/Volumes/workspace/projects/signal_classification/data/特殊次品样本/DATA-胶水/'
data_reader = DataReader()
glue_df = data_reader.get_signal_list(GLUE_ABNORMAL_FPATH)
glue_df['expect_result'] = 1

In [None]:
# Inspect one glue-defect sample. The original selected a whole Series of up
# to 100 paths, which neither pd.read_csv nor sigMgr.get_features can take as
# a single file path; pick the first path instead.
path = glue_df['case_path'].iloc[0]
# header=None matches the other raw-signal read in this notebook, so the first
# remaining row is treated as data rather than as column names.
signals = pd.read_csv(path, skiprows=1, header=None)
signals.plot()
feas = sigMgr.get_features(path, request_param={'skip_row':[1], 'model_path':['train']})
print (feas['cyclic_intense_downpeak'])

In [None]:
# Start the glue-waveform investigation: first see how good a result the
# existing baseline features give on these samples.
data_reader = DataReader()

# Per-sample values of the two cyclic-intensity features, in glue_df row order.
nohead_cyclic_intense = []
downpeak_cyclic_intense = []
# NOTE: ``path`` and ``feas`` are notebook-level names and are left holding the
# last sample once the loop finishes.
for path in glue_df['case_path']:
    feas = sigMgr.get_features(path, request_param={'skip_row':[1], 'model_path':['train']})
    nohead_cyclic_intense.append(feas['cyclic_intense_nopeak'])
    downpeak_cyclic_intense.append(feas['cyclic_intense_downpeak'])

In [None]:
# Report mean and standard deviation per feature. The original printed the
# mean of one list next to the std of the other, which describes neither.
print (np.mean(nohead_cyclic_intense), np.std(nohead_cyclic_intense))
print (np.mean(downpeak_cyclic_intense), np.std(downpeak_cyclic_intense))