-
Notifications
You must be signed in to change notification settings - Fork 0
/
DataPreProcess.py
115 lines (82 loc) · 3.61 KB
/
DataPreProcess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 27 13:34:14 2017
@author: Zhdun
"""
from copy import deepcopy
import numpy as np
from SMOTERDE import SMOTERDE
import SMOTER
import random
class DataPreProcessing:
    """Apply optional preprocessing steps (Burak filter, RUS, SMOTE variants) to a data set.

    The data set is expected to be a pandas ``DataFrame`` (it is accessed via
    ``.iloc`` / ``.shape``) whose last column is the label; the random
    under-sampler treats rows whose label equals 0 as the class to thin out.
    """

    def __init__(self, data, targetData, burakKNN, burak, smoter, rusRatio, rus,
                 Drange, metric, modelName, F, CR, PopulationSize, Lives, smoterde):
        """Store a private copy of the data and the preprocessing options.

        Args:
            data: Original (training) data; deep-copied so the caller's
                object is never modified.
            targetData: Target data set, only used by the Burak filter;
                deep-copied as well.
            burakKNN: Number of training instances the Burak filter selects
                for each target instance.
            burak: Whether to apply the Burak filter.
            smoter: Whether to apply SMOTER over-sampling.
            rusRatio: For random under-sampling, the percentage (0-100) of
                label-0 rows to keep. ``None`` means "keep as many label-0
                rows as there are other rows".
            rus: Whether to apply random under-sampling.
            Drange, metric, modelName, F, CR, PopulationSize, Lives:
                Forwarded unchanged to ``SMOTERDE``; their meaning is defined
                there (F/CR presumably are the differential-evolution
                mutation factor and crossover rate -- confirm in SMOTERDE).
            smoterde: Whether to apply the differential-evolution SMOTE
                over-sampling.
        """
        self.data = deepcopy(data)  # original data
        self.targetData = deepcopy(targetData)  # only needed by the Burak filter
        self.burakKNN = burakKNN
        self.burak = burak
        self.smoter = smoter
        self.rusRatio = rusRatio
        self.rus = rus
        self.Drange = Drange
        self.metric = metric
        self.modelName = modelName
        self.F = F
        self.CR = CR
        self.PopulationSize = PopulationSize
        self.Lives = Lives
        self.bestParas = None  # filled in by the SMOTERDE step
        self.smoterde = smoterde

    def __burak(self):
        """Keep only training rows that are among the nearest to some target row.

        For every target instance the ``burakKNN`` training instances with
        the smallest Euclidean distance (over all columns except the last)
        are selected; ``self.data`` is replaced by the union of the selected
        rows, in their original order, with a fresh index.
        """
        train = self.data.iloc[:, :-1].to_numpy(dtype=float)
        target = self.targetData.iloc[:, :-1].to_numpy(dtype=float)
        # Never ask for more neighbours than there are training rows.
        neighbours = min(self.burakKNN, train.shape[0])
        selected = set()
        for instance in target:
            # Distances from this target instance to every training row.
            distances = np.linalg.norm(train - instance, axis=1)
            for _ in range(neighbours):
                nearest = int(distances.argmin())
                selected.add(nearest)
                # Mask the row so the next argmin finds the next-nearest one.
                distances[nearest] = np.inf
        # sorted() makes the resulting row order deterministic.
        self.data = self.data.iloc[sorted(selected), :].reset_index(drop=True)

    def __rus(self):
        """Randomly under-sample the rows whose label (last column) equals 0.

        All other rows are kept and placed first; a random subset of the
        label-0 rows follows. When ``self.rusRatio`` is ``None`` the subset
        has as many rows as there are non-zero-label rows, and the
        corresponding percentage is stored back into ``self.rusRatio``.
        Otherwise ``rusRatio`` percent of the label-0 rows are kept. The
        requested count is clamped to the number of label-0 rows available,
        so an oversized ratio keeps all of them instead of raising.
        """
        frame = self.data
        is_negative = np.asarray(frame.iloc[:, -1] == 0, dtype=bool)
        negatives = np.flatnonzero(is_negative).tolist()
        positives = np.flatnonzero(~is_negative).tolist()
        if self.rusRatio is None:
            # Balanced sampling: take the count directly rather than going
            # through a percentage, which could truncate one row away.
            keep = len(positives)
            if negatives:
                self.rusRatio = 100.0 * len(positives) / len(negatives)
        else:
            keep = int(self.rusRatio / 100.0 * len(negatives))
        keep = max(0, min(keep, len(negatives)))
        kept_rows = positives + random.sample(negatives, keep)
        self.data = frame.iloc[kept_rows, :].reset_index(drop=True)

    def __SMOTER(self):
        """Replace ``self.data`` with the output of ``SMOTER`` (k=5, m=6, r=2)."""
        sampler = SMOTER.SMOTER(k=5, m=6, r=2, data=self.data)
        self.data = sampler.smoteR()

    def __SMOTERDE(self):
        """Run ``SMOTERDE`` and store its data and best parameters."""
        sampler = SMOTERDE(
            data=self.data,
            Drange=self.Drange,
            metric=self.metric,
            modelName=self.modelName,
            F=self.F,
            CR=self.CR,
            PopulationSize=self.PopulationSize,
            Lives=self.Lives,
        )
        self.data, self.bestParas = sampler.getData()

    def preProcess(self):
        """Run the enabled steps and return the processed data.

        The Burak filter (if enabled) runs first. After it, at most one
        re-sampling step runs, chosen in this priority order: random
        under-sampling, SMOTER, SMOTERDE.
        """
        if self.burak:
            self.__burak()
        if self.rus:
            self.__rus()
        elif self.smoter:
            self.__SMOTER()
        elif self.smoterde:
            self.__SMOTERDE()
        return self.data