# Resnet50 + autoencoder로 각각 이미지와 시계열 셋을 전처리한 후 XGB로 파인튠하여 최종분류
*부트스트랩으로 언밸런스데이터 오버샘플링해보기

*해결과제
1. 10번째 열 결측치 처리
2. 시간이 중복된 데이터 처리 -> 10번째 열이 있는 자료만 쓰려고 하였으나 10번째 열이 없는 데이터도 상당히 존재함
3. 1~9번째 열로 10번째 열을 예측하는 간단한 모델을 적합한 후 결측치를 처리하는 방향으로 고려 중

In [1]:
#import
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
from tqdm import tqdm
from glob import glob
import os
import json 
import random
from xgboost import XGBClassifier
from tensorflow.keras.applications.resnet import ResNet50
import tensorflow as tf
from tensorflow import keras
from sklearn import metrics


In [2]:
# Base directory for the notebook: the current working directory. The
# sample-data glob patterns further down are built relative to it.
path = os.getcwd()

In [3]:
# The provided sample data only contains two crops: paprika (prefix 3) and
# greenhouse grape (prefix 6). Keys are '<crop>_<disease>_<risk>' codes and
# values are the human-readable (Korean) class names.
label_description = {
    '3_00_0': '파프리카_정상',
    '3_a9_1': '파프리카흰가루병_초기',
    '3_a9_2': '파프리카흰가루병_중기',
    '3_a9_3': '파프리카흰가루병_말기',
    '3_a10_1': '파프리카잘록병_초기',
    '3_a10_2': '파프리카잘록병_중기',
    '3_a10_3': '파프리카잘록병_말기',
    '3_b3_1': '칼슘결핍_초기',
    '3_b3_2': '칼슘결핍_중기',
    '3_b3_3': '칼슘결핍_말기',
    '3_b6_1': '다량원소결핍 (N)_초기',
    '3_b6_2': '다량원소결핍 (N)_중기',
    '3_b6_3': '다량원소결핍 (N)_말기',
    '3_b7_1': '다량원소결핍 (P)_초기',
    '3_b7_2': '다량원소결핍 (P)_중기',
    '3_b7_3': '다량원소결핍 (P)_말기',
    '3_b8_1': '다량원소결핍 (K)_초기',
    '3_b8_2': '다량원소결핍 (K)_중기',
    '3_b8_3': '다량원소결핍 (K)_말기',
    '6_00_0': '시설포도_정상',
    '6_a11_1': '시설포도탄저병_초기',
    '6_a11_2': '시설포도탄저병_중기',
    '6_a11_3': '시설포도탄저병_말기',
    '6_a12_1': '시설포도노균병_초기',
    '6_a12_2': '시설포도노균병_중기',
    '6_a12_3': '시설포도노균병_말기',
    '6_b4_1': '일소피해_초기',
    '6_b4_2': '일소피해_중기',
    '6_b4_3': '일소피해_말기',
    '6_b5_1': '축과병_초기',
    '6_b5_2': '축과병_중기',
    '6_b5_3': '축과병_말기',
}

# Integer id <-> label code lookup tables. Ids follow the declaration order
# of label_description, starting at 0.
label_decoder = dict(enumerate(label_description))
label_encoder = {code: index for index, code in label_decoder.items()}

In [190]:
class DataController():
    """Load, scale and split the per-sample crop data.

    Each sample is a folder expected to hold one sensor CSV, one JPG image
    and one JSON annotation file. The class reads those files, min-max
    scales the sensor columns and builds stratified train/test folder lists.

    Label ids come from the module-level ``label_encoder`` mapping.
    """

    def __init__(self, csvfeatures, csvfeaturedict):
        """Store the feature configuration.

        Args:
            csvfeatures: CSV column names to keep, in output order.
            csvfeaturedict: mapping ``column -> [min, max]`` used for scaling.
        """
        self.csvfeatures = csvfeatures
        self.csvfeaturedict = csvfeaturedict

    def road_csv(self, foldnam, timenum):
        """Read a sensor CSV and return its first ``timenum`` rows.

        Only the columns listed in ``self.csvfeatures`` are kept.
        """
        df = pd.read_csv(foldnam)
        # The shortest CSV in the data set has 291 rows (unclear whether that
        # is an error), so every series is cut to one common length for now.
        return df.loc[:timenum - 1, self.csvfeatures]

    def scaling(self, minmaxdic, df):
        """Min-max scale the columns of ``df`` named in ``minmaxdic``.

        Args:
            minmaxdic: mapping ``column -> [min, max]``.
            df: frame holding at least the columns named in ``minmaxdic``.

        Returns:
            A new frame; ``df`` itself is left untouched. A column whose
            min equals its max is set to 0.0 instead of dividing by zero.

        Raises:
            KeyError: if a column of ``minmaxdic`` is missing from ``df``.
        """
        scaled = df.copy()
        for col, (low, high) in minmaxdic.items():
            column = scaled[col]
            span = high - low
            if span == 0:
                # Constant feature: (x - min) / 0 would yield NaN or inf.
                scaled[col] = column * 0.0
            else:
                scaled[col] = (column - low) / span
        return scaled

    def getimage(self, imgpath):
        """Load an image, resize it to 224x224 and scale pixels to [0, 1].

        The channel layout is whatever ``cv2.imread`` returns; no transpose
        or colour conversion is applied.

        Raises:
            FileNotFoundError: if OpenCV could not read ``imgpath``.
        """
        img = cv2.imread(imgpath)
        if img is None:
            # cv2.imread signals failure by returning None, not by raising.
            raise FileNotFoundError(f'could not read image: {imgpath}')
        img = cv2.resize(img, dsize=(224, 224), interpolation=cv2.INTER_AREA)
        return img.astype(np.float32) / 255

    def getlable(self, jsonpath):
        """Return the ``'<crop>_<disease>_<risk>'`` label code of a sample."""
        with open(jsonpath, 'r') as f:
            json_file = json.load(f)

        annotations = json_file['annotations']
        crop = annotations['crop']
        disease = annotations['disease']
        risk = annotations['risk']
        return f'{crop}_{disease}_{risk}'

    def getdata(self, datapath, timenum, featnum):
        """Load every sample folder in ``datapath`` into numpy arrays.

        Folders without a CSV file are skipped (at least one folder in the
        sample data is empty).

        Args:
            datapath: iterable of sample folder paths.
            timenum: number of time steps kept per sample.
            featnum: number of sensor features per time step.

        Returns:
            ``([csvarr, imgarr], lablearr)`` where ``csvarr`` has shape
            ``(n, timenum, featnum)``, ``imgarr`` has shape
            ``(n, 224, 224, 3)`` and ``lablearr`` has shape ``(n,)``.
            All three are float arrays.

        Raises:
            ValueError: if a CSV does not provide exactly
                ``timenum * featnum`` values after truncation.
        """
        imagesize = 224

        # Collect per-sample arrays and join them once at the end. Growing
        # the arrays with np.append re-copies all previous images each time.
        csv_samples, img_samples, labels = [], [], []

        for folder in datapath:
            csv_hits = glob(folder + '/*.csv')
            if not csv_hits:
                continue
            imgpath = glob(folder + '/*.jpg')[0]
            jsonpath = glob(folder + '/*.json')[0]

            frame = self.scaling(self.csvfeaturedict,
                                 self.road_csv(csv_hits[0], timenum))
            # A strict reshape rejects short or empty CSVs, so the three
            # arrays can never get out of step with each other.
            csv_samples.append(frame.to_numpy().reshape(1, timenum, featnum))
            img_samples.append(
                self.getimage(imgpath).reshape(1, imagesize, imagesize, 3))
            labels.append(label_encoder[self.getlable(jsonpath)])

        if csv_samples:
            csvarr = np.concatenate(csv_samples, axis=0).astype(float)
            imgarr = np.concatenate(img_samples, axis=0).astype(float)
        else:
            csvarr = np.empty((0, timenum, featnum), float)
            imgarr = np.empty((0, imagesize, imagesize, 3), float)
        lablearr = np.asarray(labels, dtype=float)

        return [csvarr, imgarr], lablearr

    def set_allfilelist_stratified_by_gruop(self, datapath):
        """Group sample folders by their encoded label.

        Folders without a JSON annotation are skipped.

        Returns:
            dict mapping the label id (as a string) to the list of folders
            carrying that label, in the order they appear in ``datapath``.
        """
        dic_by_group = {}

        for folder in datapath:
            json_hits = glob(folder + '/*.json')
            if not json_hits:
                continue
            label = label_encoder[self.getlable(json_hits[0])]
            dic_by_group.setdefault(str(label), []).append(folder)

        return dic_by_group

    def get_pathlist_stratified_dataset(self, allpath, stratified_flies_dict, sample_num_of_trainset):
        """Split every label group into train and test folders.

        The train share is ``sample_num_of_trainset / len(allpath)``, clamped
        to ``[0, 1]``, and is applied to each group separately. Groups with a
        single sample always go to the train set.

        Args:
            allpath: list of all sample folders (only its length is used).
            stratified_flies_dict: mapping label -> list of folders.
            sample_num_of_trainset: desired total number of train samples.

        Returns:
            ``{'train': [...], 'test': [...]}`` lists of folder paths.

        Raises:
            ValueError: if ``allpath`` is empty.
        """
        if not allpath:
            raise ValueError('allpath must not be empty')

        files_for_dataset = {'train': [], 'test': []}
        # Clamping keeps the per-group train count within [0, len(group)].
        dataset_ratio = min(max(sample_num_of_trainset / len(allpath), 0.0), 1.0)

        for members in stratified_flies_dict.values():
            if len(members) == 1:
                files_for_dataset['train'].append(members[0])
                continue

            train_num_of_group = int(np.round(len(members) * dataset_ratio))
            # Shuffle a boolean mask rather than the folders themselves, so
            # each split list keeps the original folder order.
            is_train_mask = ([True] * train_num_of_group
                             + [False] * (len(members) - train_num_of_group))
            random.shuffle(is_train_mask)

            for folder, is_train in zip(members, is_train_mask):
                files_for_dataset['train' if is_train else 'test'].append(folder)

        return files_for_dataset
                      
              
              
                
                

In [191]:
###############parameters######################
###############################################
# Sensor columns (CSV header names) used as time-series features.
csv_features = ['내부 온도 1 평균', '내부 온도 1 최고', '내부 온도 1 최저', '내부 습도 1 평균', '내부 습도 1 최고', 
                '내부 습도 1 최저', '내부 이슬점 평균', '내부 이슬점 최고', '내부 이슬점 최저']

# Every sample sits in its own folder below <cwd>/sample_data/sample_data.
# os.path.join keeps the glob patterns valid on any OS, not only Windows.
data_root = os.path.join(path, 'sample_data', 'sample_data')
allfile = glob(os.path.join(data_root, '*', '*.csv'))
csv_files = sorted(allfile)
if not csv_files:
    raise FileNotFoundError(f'no CSV files found under {data_root}')

temp_csv = pd.read_csv(csv_files[0])[csv_features]
max_arr, min_arr = temp_csv.max().to_numpy(), temp_csv.min().to_numpy()

# Running per-feature maximum / minimum over all CSV files.
for csv_file in tqdm(csv_files[1:]):
    temp_csv = pd.read_csv(csv_file)[csv_features]
    temp_max, temp_min = temp_csv.max().to_numpy(), temp_csv.min().to_numpy()
    # fmax/fmin ignore NaN, so one file with an entirely missing column
    # cannot wipe out the bounds gathered from all the other files.
    max_arr = np.fmax(max_arr, temp_max)
    min_arr = np.fmin(min_arr, temp_min)

# feature name -> [min, max], consumed by DataController.scaling.
csv_feature_dict = {csv_features[i]:[min_arr[i], max_arr[i]] for i in range(len(csv_features))}
csv_feature_dict
#
time_lag = 260                           # time steps kept per sample
number_of_feature = len(csv_features)    # stays in sync with csv_features
number_of_trainset = 400                 # target number of training samples

##################makedataset###################
################################################
# Data controller
dacon = DataController(csv_features,csv_feature_dict)
##############train, test file List############

###1. not stratified (kept for reference)
# data_files_list = glob(os.path.join(data_root, '*'))
# # shuffle
# random.shuffle(data_files_list)
# # the first number_of_trainset folders become the train set
# trainfiles = data_files_list[:number_of_trainset]
# # the rest become the test set
# testfiles = data_files_list[number_of_trainset:]

###2. stratified
data_files_list = glob(os.path.join(data_root, '*'))
tempset = dacon.set_allfilelist_stratified_by_gruop(data_files_list)
tempdic = dacon.get_pathlist_stratified_dataset(data_files_list,tempset,number_of_trainset)
trainfiles = tempdic['train']
testfiles = tempdic['test']

###set dataset
x_train,y_train = dacon.getdata(trainfiles,time_lag,number_of_feature)
x_test,y_test = dacon.getdata(testfiles,time_lag,number_of_feature)


100%|██████████| 498/498 [00:01<00:00, 259.31it/s]


In [226]:
# Report how many sample folders each label group contains.
print("데이터셋 그룹별 개수")
for group_id, folders in tempset.items():
    label_code = label_decoder[int(group_id)]
    print(f"group {group_id} - {label_code} - {label_description[label_code]} :", len(folders))

데이터셋 그룹별 개수
group 19 - 6_00_0 - 시설포도_정상 : 196
group 0 - 3_00_0 - 파프리카_정상 : 169
group 1 - 3_a9_1 - 파프리카흰가루병_초기 : 47
group 2 - 3_a9_2 - 파프리카흰가루병_중기 : 35
group 24 - 6_a12_2 - 시설포도노균병_중기 : 13
group 26 - 6_b4_1 - 일소피해_초기 : 3
group 3 - 3_a9_3 - 파프리카흰가루병_말기 : 12
group 23 - 6_a12_1 - 시설포도노균병_초기 : 3
group 20 - 6_a11_1 - 시설포도탄저병_초기 : 10
group 4 - 3_a10_1 - 파프리카잘록병_초기 : 1
group 6 - 3_a10_3 - 파프리카잘록병_말기 : 3
group 28 - 6_b4_3 - 일소피해_말기 : 3
group 29 - 6_b5_1 - 축과병_초기 : 2
group 21 - 6_a11_2 - 시설포도탄저병_중기 : 1
group 5 - 3_a10_2 - 파프리카잘록병_중기 : 1


In [197]:
class Eencoder(keras.layers.Layer):
    """Encoder half of the time-series autoencoder.

    Three stacked Conv1D layers (9, 18 and 36 filters; kernel sizes 81, 81
    and 100), each followed by a ReLU.
    """

    def __init__(self):
        super().__init__()
        self.block1_layer1 = tf.keras.layers.Conv1D(9, 81, activation='relu')
        self.block1_layer2 = tf.keras.layers.Conv1D(18, 81, activation='relu')
        self.block1_layer3 = tf.keras.layers.Conv1D(36, 100, activation='relu')

    def call(self, inputs):
        """Run ``inputs`` through the three convolutions in order."""
        # assumes inputs is (batch, time, features) — TODO confirm
        encoded = inputs
        for conv in (self.block1_layer1, self.block1_layer2, self.block1_layer3):
            # The layers already apply ReLU; the extra call is kept so the
            # computation stays exactly as before.
            encoded = tf.nn.relu(conv(encoded))
        return encoded
class Ddecoder(keras.layers.Layer):
    """Decoder half of the time-series autoencoder.

    Three stacked Conv1DTranspose layers (3, 6 and 9 filters; kernel sizes
    81, 81 and 100) that mirror the encoder's kernel sizes.
    """

    def __init__(self):
        super().__init__()
        self.block1_layer1 = tf.keras.layers.Conv1DTranspose(3, 81, activation='relu')
        self.block1_layer2 = tf.keras.layers.Conv1DTranspose(6, 81, activation='relu')
        self.block1_layer3 = tf.keras.layers.Conv1DTranspose(9, 100, activation='relu')

    def call(self, inputs):
        """Expand the latent code back into a sequence."""
        hidden = tf.nn.relu(self.block1_layer1(inputs))
        hidden = tf.nn.relu(self.block1_layer2(hidden))
        # The last layer relies on its built-in ReLU only.
        return self.block1_layer3(hidden)

# Eencoder()(x_train[0][:2,:,:]).shape
# Ddecoder()(Eencoder()(x_train[0][:2,:,:]))

class Autoencoder(tf.keras.Model):
    """Sequence autoencoder: ``Eencoder`` followed by ``Ddecoder``."""

    def __init__(self):
        super().__init__()
        self.encoder = Eencoder()
        self.decoder = Ddecoder()

    def call(self, input):
        """Encode ``input`` and return its reconstruction."""
        return self.decoder(self.encoder(input))

def loss(model, original):
    """Return the mean squared reconstruction error of ``model`` on ``original``."""
    residual = tf.subtract(model(original), original)
    return tf.reduce_mean(tf.square(residual))

def train(loss, model, opt, original):
    """Run one gradient-descent step of ``model`` on ``original``.

    Args:
        loss: callable ``loss(model, original)`` returning a scalar tensor.
        model: Keras model whose ``trainable_variables`` are updated.
        opt: optimizer exposing ``apply_gradients``.
        original: batch to reconstruct.

    Returns:
        The loss value computed before the update, so callers can log it.
    """
    # Only the forward pass needs recording. Differentiation and the
    # optimizer step run after the tape context has been closed.
    with tf.GradientTape() as tape:
        loss_value = loss(model, original)
    gradients = tape.gradient(loss_value, model.trainable_variables)
    opt.apply_gradients(zip(gradients, model.trainable_variables))
    return loss_value



In [201]:
# Train the autoencoder to reproduce the scaled sensor sequences
# (x_train[0]); the input doubles as the target.
automodel = Autoencoder()
opt = tf.optimizers.Adam()
loss_fn = keras.losses.MeanSquaredError()
automodel.compile(optimizer=opt, loss=loss_fn)
automodel.fit(x=x_train[0], y=x_train[0], batch_size=100, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1de8813ffd0>

In [202]:
# Reconstruction error of the trained autoencoder on the held-out sequences.
y_pred = automodel(x_test[0])
test_reconstruction_loss = loss_fn(x_test[0], y_pred)
print("testset에 오토인코더 손실함수 테스트: ", test_reconstruction_loss)

testset에 오토인코더 손실함수 테스트:  tf.Tensor(0.029283952, shape=(), dtype=float32)


In [203]:
class preprmodel(keras.Model):
    """Feature extractor joining image-model output with the sensor code.

    Args:
        imgmodel: callable model applied to the image batch.
        autoencoder: trained autoencoder; only its ``encoder`` is used.
        name: optional Keras model name.
    """

    def __init__(self, imgmodel, autoencoder, name=None):
        # Forward the name instead of silently dropping it.
        super().__init__(name=name)
        self.imgmodel = imgmodel
        self.autoencoder = autoencoder

    def call(self, inputs):
        """Return one concatenated feature row per sample.

        ``inputs[0]`` is passed to the encoder and ``inputs[1]`` to the
        image model.
        """
        x1 = self.imgmodel(inputs[1])
        code = self.autoencoder.encoder(inputs[0])
        # Flatten per sample with TF ops: no hard-coded latent width, and
        # no .numpy() call that would fail outside eager execution.
        x2 = tf.reshape(code, (tf.shape(code)[0], -1))

        return tf.concat([x1, x2], axis=1)
# Build the combined feature extractor and turn both splits into the
# feature matrices consumed by the classifier.
# NOTE(review): ResNet50 is created with its default head, so the image part
# of the features is presumably the ImageNet class output — confirm intended.
model_RESNET50 = ResNet50(weights='imagenet')
prepromodel = preprmodel(model_RESNET50, automodel)
x_prime_train, x_prime_test = (prepromodel(split) for split in (x_train, x_test))

In [204]:
# Final classifier trained on the concatenated image + sensor features.
# NOTE(review): scale_pos_weight is presumably only meaningful for binary
# objectives — confirm it has any effect in this multi-class setup.
xgb_params = {
    'random_state': 100,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'scale_pos_weight': 0.2,
}
xgb = XGBClassifier(**xgb_params)
xgb.fit(x_prime_train, y_train)

XGBClassifier(colsample_bytree=0.7, objective='multi:softprob',
              random_state=100, scale_pos_weight=0.2, subsample=0.7)

In [205]:
# Predict the test split and cross-tabulate true vs. predicted class names.
y_pred = xgb.predict(x_prime_test)
answer, predss = (
    np.array([label_description[label_decoder[int(code)]] for code in encoded])
    for encoded in (y_test, y_pred)
)

new_crosstab = pd.crosstab(index=answer, columns=predss, rownames=['answer'], colnames=['preds'])
new_crosstab

preds,시설포도_정상,시설포도노균병_중기,시설포도탄저병_초기,일소피해_말기,일소피해_초기,파프리카_정상,파프리카흰가루병_말기,파프리카흰가루병_중기,파프리카흰가루병_초기
answer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
시설포도_정상,39,0,0,0,0,0,0,0,0
시설포도노균병_중기,0,3,0,0,0,0,0,0,0
시설포도노균병_초기,0,0,0,0,0,0,0,1,0
시설포도탄저병_초기,0,0,2,0,0,0,0,0,0
일소피해_말기,0,0,0,1,0,0,0,0,0
일소피해_초기,0,0,0,0,1,0,0,0,0
파프리카_정상,0,0,0,0,0,34,0,0,0
파프리카잘록병_말기,1,0,0,0,0,0,0,0,0
파프리카흰가루병_말기,0,0,0,0,0,0,1,0,1
파프리카흰가루병_중기,0,0,0,0,0,0,0,0,7


In [206]:
from sklearn import metrics

# Confusion matrix over the encoded label ids.
confusion = metrics.confusion_matrix(y_test, y_pred)
print(confusion)

# Per-class precision, recall, F1 and support, to three decimals.
report = metrics.classification_report(y_test, y_pred, digits=3)
print(report)

[[34  0  0  0  0  0  0  0  0  0  0]
 [ 0  6  3  0  0  0  0  0  0  0  0]
 [ 0  7  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  1  0  0  0  0  0]
 [ 0  0  0  0  0 39  0  0  0  0  0]
 [ 0  0  0  0  0  0  2  0  0  0  0]
 [ 0  0  1  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  3  0  0]
 [ 0  0  0  0  0  0  0  0  0  1  0]
 [ 0  0  0  0  0  0  0  0  0  0  1]]
              precision    recall  f1-score   support

         0.0      1.000     1.000     1.000        34
         1.0      0.429     0.667     0.522         9
         2.0      0.000     0.000     0.000         7
         3.0      1.000     0.500     0.667         2
         6.0      0.000     0.000     0.000         1
        19.0      0.975     1.000     0.987        39
        20.0      1.000     1.000     1.000         2
        23.0      0.000     0.000     0.000         1
        24.0      1.000     1.000     1.000         3
        26.0      1.000     1.000     1.000         1
        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
