# Resnet50 + autoencoder로 각각 이미지와 시계열 셋을 전처리한 후 XGB로 파인튠하여 최종분류

*해결과제
1. 10번쨰열 결측치 처리
2. 데이터 중복시간있는것 처리 -> 10번째열 있는 자료만 쓰려고 하였으나 10번쨰열이 없는 데이터도 상당히 존재함
3. 1~9 -> 10 하는 간단한 모델 적합 후 결측치 처리하는 방향으로 고려중

In [2]:
#import
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
from tqdm import tqdm
from glob import glob
import os
import json 
import random
from xgboost import XGBClassifier
from tensorflow.keras.applications.resnet import ResNet50
import tensorflow as tf
from tensorflow import keras
from sklearn import metrics


In [3]:
path = os.getcwd()

In [5]:
# 제공된 sample data는 파프리카와 시설포도 2종류의 작물만 존재
label_description = {
 '3_00_0': '파프리카_정상',
 '3_a9_1': '파프리카흰가루병_초기',
 '3_a9_2': '파프리카흰가루병_중기',
 '3_a9_3': '파프리카흰가루병_말기',
 '3_a10_1': '파프리카잘록병_초기',
 '3_a10_2': '파프리카잘록병_중기',
 '3_a10_3': '파프리카잘록병_말기',
 '3_b3_1': '칼슘결핍_초기',
 '3_b3_2': '칼슘결핍_중기',
 '3_b3_3': '칼슘결핍_말기',
 '3_b6_1': '다량원소결핍 (N)_초기',
 '3_b6_2': '다량원소결핍 (N)_중기',
 '3_b6_3': '다량원소결핍 (N)_말기',
 '3_b7_1': '다량원소결핍 (P)_초기',
 '3_b7_2': '다량원소결핍 (P)_중기',
 '3_b7_3': '다량원소결핍 (P)_말기',
 '3_b8_1': '다량원소결핍 (K)_초기',
 '3_b8_2': '다량원소결핍 (K)_중기',
 '3_b8_3': '다량원소결핍 (K)_말기',
 '6_00_0': '시설포도_정상',
 '6_a11_1': '시설포도탄저병_초기',
 '6_a11_2': '시설포도탄저병_중기',
 '6_a11_3': '시설포도탄저병_말기',
 '6_a12_1': '시설포도노균병_초기',
 '6_a12_2': '시설포도노균병_중기',
 '6_a12_3': '시설포도노균병_말기',
 '6_b4_1': '일소피해_초기',
 '6_b4_2': '일소피해_중기',
 '6_b4_3': '일소피해_말기',
 '6_b5_1': '축과병_초기',
 '6_b5_2': '축과병_중기',
 '6_b5_3': '축과병_말기',
}

label_encoder = {key:idx for idx, key in enumerate(label_description)}
label_decoder = {val:key for key, val in label_encoder.items()}

In [6]:
class DataController():
    def __init__(self,csvfeatures,csvfeaturedict):
        self.csv_features = csvfeatures
        self.csv_feature_dict = csvfeaturedict
    
    def road_csv(self,foldnam,timenum):
        df = pd.read_csv(foldnam)
        return df.loc[:timenum-1,self.csv_features]      #csv파일 제일 짧은게 291개임 오류인지는 모르겠으나 일단 최소길이로 통일하여 처리
    
    def scaling(self,minmaxdic,df):
        for col in minmaxdic.keys():
            df.loc[:,col] = df.loc[:,col] - csv_feature_dict[col][0]
            df.loc[:,col] = df.loc[:,col] / (csv_feature_dict[col][1]-csv_feature_dict[col][0])
        return df
    
    
    def getimage(self,imgpath):
        img = cv2.imread(imgpath)
        img = cv2.resize(img, dsize=(224, 224), interpolation=cv2.INTER_AREA)
        img = img.astype(np.float32)/255  ##픽셀값을 0~1사이로 정규화
        # img = np.transpose(img, (2,0,1))
        return img
    
    def getlable(self,jsonpath):
        with open(jsonpath, 'r') as f:
            json_file = json.load(f)

        crop = json_file['annotations']['crop']
        disease = json_file['annotations']['disease']
        risk = json_file['annotations']['risk']
        label = f'{crop}_{disease}_{risk}'
        return label
    
    def getdata(self,datapath,timenum,featnum):

        csvarr = np.empty((0,timenum,featnum), float)
        imgarr = np.empty((0,224,224,3), float)
        lablearr = np.array([])
        
        # predictor = np.append(predictor,ndf.reshape(-1,timenum,featnum),axis = 0)
        
        for ind,i in enumerate(datapath):
            
            if glob(i + '/*.csv') == []:  #10462폴더 비어있음. 다음에 확인해보기
                pass
            else:
                csvpath = glob(i + '/*.csv')[0]
                imgpath = glob(i + '/*.jpg')[0]
                jsonpath = glob(i + '/*.json')[0]
                # con = DataController()
                df = self.road_csv(csvpath,timenum)
                df2 = self.scaling(csv_feature_dict,df).to_numpy().reshape(-1,timenum,featnum)
                imgdata = self.getimage(imgpath).reshape(-1,224,224,3)
                label = label_encoder[self.getlable(jsonpath)]
                # label = self.getlable(jsonpath)
                
                csvarr = np.append(csvarr,df2, axis = 0)
                imgarr = np.append(imgarr,imgdata, axis = 0)
                lablearr = np.append(lablearr,label)
            
        return [csvarr,imgarr],lablearr
            
        
        
        
          

In [7]:
###############parameters######################
###############################################
#프로젝트에 있는 모든 데이터폴터 불러오기
# 분석에 사용할 feature 선택
csv_features = ['내부 온도 1 평균', '내부 온도 1 최고', '내부 온도 1 최저', '내부 습도 1 평균', '내부 습도 1 최고', 
                '내부 습도 1 최저', '내부 이슬점 평균', '내부 이슬점 최고', '내부 이슬점 최저']

# csv_files = sorted(glob('sample_data/*/*.csv'))
allfile = glob(path + '\\sample_data\\sample_data\\*\\*.csv')
csv_files = sorted(allfile)

temp_csv = pd.read_csv(csv_files[0])[csv_features]
max_arr, min_arr = temp_csv.max().to_numpy(), temp_csv.min().to_numpy()

# feature 별 최대값, 최솟값 계산
for csv in tqdm(csv_files[1:]):
    temp_csv = pd.read_csv(csv)[csv_features]
    temp_max, temp_min = temp_csv.max().to_numpy(), temp_csv.min().to_numpy()
    max_arr = np.max([max_arr,temp_max], axis=0)
    min_arr = np.min([min_arr,temp_min], axis=0)

# feature 별 최대값, 최솟값 dictionary 생성
csv_feature_dict = {csv_features[i]:[min_arr[i], max_arr[i]] for i in range(len(csv_features))}
csv_feature_dict
#
time_lag = 260
number_of_feature = 9
number_of_trainset = 400

##################makedataset###################
################################################

#path + '\\sample_data\\sample_data\\*\\*.csv'
data_files_list = glob(path + '\\sample_data\\sample_data\\*')
#셔플
random.shuffle(data_files_list)
#앞에서 300번째까지 트레인셋으로
trainfiles = data_files_list[:number_of_trainset]
#나머지는 테스트셋으로
testfiles = data_files_list[number_of_trainset:]

# 데이터 컨트롤러
dacon = DataController(csv_features,csv_feature_dict)
# 배치화된 데이터셋 만들기
# test셋용으로는 train = False로 하여 배치안생성하게됨
x_train,y_train = dacon.getdata(trainfiles,time_lag,number_of_feature)
x_test,y_test = dacon.getdata(testfiles,time_lag,number_of_feature)


In [13]:
class Eencoder(keras.layers.Layer):
    def __init__(self):
        super(Eencoder, self).__init__()
        self.block1_layer1 = tf.keras.layers.Conv1D(9, 81, activation='relu',)#input_shape=input_shape[1:])
        self.block1_layer2 = tf.keras.layers.Conv1D(18, 81, activation='relu',)#input_shape=input_shape[1:])
        self.block1_layer3 = tf.keras.layers.Conv1D(36, 100, activation='relu',)#input_shape=input_shape[1:])
         
    def call(self, inputs):
        #LSTM파트
        lstm_x = self.block1_layer1(inputs)
        lstm_x = tf.nn.relu(lstm_x)
        lstm_x = self.block1_layer2(lstm_x)
        lstm_x = tf.nn.relu(lstm_x)
        lstm_x = self.block1_layer3(lstm_x)
        lstm_x = tf.nn.relu(lstm_x)
        
        return lstm_x
class Ddecoder(keras.layers.Layer):
    def __init__(self):
        super(Ddecoder, self).__init__()
        self.block1_layer1 = tf.keras.layers.Conv1DTranspose(3, 81, activation='relu',)#input_shape=input_shape[1:])
        self.block1_layer2 = tf.keras.layers.Conv1DTranspose(6, 81, activation='relu',)#input_shape=input_shape[1:])
        self.block1_layer3 = tf.keras.layers.Conv1DTranspose(9, 100, activation='relu',)#input_shape=input_shape[1:])
        
         
    def call(self, inputs):
        #LSTM파트
        x = self.block1_layer1(inputs)
        
        x = tf.nn.relu(x)
        x = self.block1_layer2(x)
        
        x = tf.nn.relu(x)
        x = self.block1_layer3(x)
       
        return x

# Eencoder()(x_train[0][:2,:,:]).shape
# Ddecoder()(Eencoder()(x_train[0][:2,:,:]))

class Autoencoder(tf.keras.Model): 
  def __init__(self,): 
    super(Autoencoder, self).__init__() 
    self.encoder = Eencoder() 
    self.decoder = Ddecoder() 
  
  def call(self, input): 
    code = self.encoder(input) 
    reconstructed = self.decoder(code) 
    return reconstructed

def loss(model, original): 
  reconstruction_error = tf.reduce_mean(tf.square(tf.subtract(model(original), original))) 
  return reconstruction_error

def train(loss, model, opt, original): 
  with tf.GradientTape() as tape: 
    gradients = tape.gradient(loss(model, original), model.trainable_variables) 
    gradient_variables = zip(gradients, model.trainable_variables) 
    opt.apply_gradients(gradient_variables)



In [14]:
automodel = Autoencoder()
opt = tf.optimizers.Adam()
loss_fn = keras.losses.MeanSquaredError()
automodel.compile(optimizer=opt, loss=loss_fn)
automodel.fit(x_train[0], x_train[0], 
                 batch_size=100, 
                 epochs=20,
                )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1a92a1f1940>

In [15]:
y_pred = automodel(x_test[0])
print("testset에 오토인코더 손실함수 테스트: ",loss_fn(x_test[0],y_pred))

testset에 오토인코더 손실함수 테스트:  tf.Tensor(0.032171927, shape=(), dtype=float32)


In [16]:
class preprmodel(keras.Model):
    def __init__(self, imgmodel,autoencoder,name = None):
        super(preprmodel, self).__init__()
        self.imgmodel = imgmodel
        self.autoencoder = autoencoder

    def call(self, inputs):
        x1 = self.imgmodel(inputs[1])
        x2 = self.autoencoder.encoder(inputs[0]).numpy().reshape(-1,36)


        x = tf.concat([x1,x2],axis=1)
        return(x)
model_RESNET50 = ResNet50(weights='imagenet')
prepromodel = preprmodel(model_RESNET50,automodel)
x_prime_train = prepromodel(x_train)
x_prime_test = prepromodel(x_test)

In [21]:
xgb = XGBClassifier(random_state=100, subsample= 0.7, colsample_bytree=0.7, scale_pos_weight=0.2)
xgb.fit(x_prime_train,y_train)

XGBClassifier(colsample_bytree=0.7, objective='multi:softprob',
              random_state=100, scale_pos_weight=0.2, subsample=0.7)

In [22]:
y_pred=xgb.predict(x_prime_test)
answer = np.array([label_description[label_decoder[int(val)]] for val in y_test])
predss = np.array([label_description[label_decoder[int(val)]] for val in y_pred])

new_crosstab = pd.crosstab(answer, predss, rownames=['answer'], colnames=['preds'])
new_crosstab

preds,시설포도_정상,시설포도노균병_중기,시설포도탄저병_초기,일소피해_말기,파프리카_정상,파프리카흰가루병_말기,파프리카흰가루병_중기,파프리카흰가루병_초기
answer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
시설포도_정상,36,0,0,0,0,0,0,0
시설포도노균병_중기,0,3,0,0,0,0,0,0
시설포도탄저병_초기,0,0,3,0,0,0,0,0
일소피해_말기,0,0,0,1,0,0,0,0
축과병_초기,1,0,0,0,0,0,0,0
파프리카_정상,0,0,0,0,37,0,0,0
파프리카흰가루병_말기,1,0,0,0,0,0,1,0
파프리카흰가루병_중기,0,0,0,0,0,1,2,2
파프리카흰가루병_초기,1,0,0,0,1,0,5,5


In [23]:
from sklearn import metrics

# Print the confusion matrix
print(metrics.confusion_matrix(y_test, y_pred))

# Print the precision and recall, among other metrics
print(metrics.classification_report(y_test, y_pred, digits=3))

[[37  0  0  0  0  0  0  0  0]
 [ 1  5  5  0  1  0  0  0  0]
 [ 0  2  2  1  0  0  0  0  0]
 [ 0  0  1  0  1  0  0  0  0]
 [ 0  0  0  0 36  0  0  0  0]
 [ 0  0  0  0  0  3  0  0  0]
 [ 0  0  0  0  0  0  3  0  0]
 [ 0  0  0  0  0  0  0  1  0]
 [ 0  0  0  0  1  0  0  0  0]]
              precision    recall  f1-score   support

         0.0      0.974     1.000     0.987        37
         1.0      0.714     0.417     0.526        12
         2.0      0.250     0.400     0.308         5
         3.0      0.000     0.000     0.000         2
        19.0      0.923     1.000     0.960        36
        20.0      1.000     1.000     1.000         3
        24.0      1.000     1.000     1.000         3
        28.0      1.000     1.000     1.000         1
        29.0      0.000     0.000     0.000         1

    accuracy                          0.870       100
   macro avg      0.651     0.646     0.642       100
weighted avg      0.861     0.870     0.859       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
