# 대스타

모델 확인

### Library

In [None]:
import os.path
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns

from pathlib import Path
from tqdm import tqdm
from time import perf_counter

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score
from IPython.display import Markdown, display

import zipfile
import gdown

## train

### train Data Download

In [None]:
# Download the training archive from Google Drive (base URL + file id) and
# save it locally as train.zip.
google_path = 'https://drive.google.com/uc?id='
file_id = '1bXqNgPQVaDdRHEaFPBsHOq2iUsBglGNK'
output_name = 'train.zip'
gdown.download(f'{google_path}{file_id}', output_name)

Downloading...
From: https://drive.google.com/uc?id=1bXqNgPQVaDdRHEaFPBsHOq2iUsBglGNK
To: /content/train.zip
100%|██████████| 1.37G/1.37G [00:09<00:00, 140MB/s]


'train.zip'

In [None]:
# Location of the training archive and the directory it is extracted into
# (both are consumed by the extraction cell that follows).
input_path = './train.zip'
output_path = './data'

In [None]:
# Unpack the archive. The context manager closes the zip file handle as soon
# as extraction finishes (or fails) instead of leaving it open.
with zipfile.ZipFile(input_path) as zip_data:
    zip_data.extractall(output_path)

In [None]:
# Root directory of the extracted training images.
dir_ = Path('/content/data/re_train')
# glob() yields paths in arbitrary filesystem order; sort them so that the
# seeded shuffle and train/test split applied later give the same result on
# every run and machine.
filepaths = sorted(dir_.glob(r'**/*.jpg'))

### Data Processing

In [None]:
def proc_img(filepath):
    """Build a shuffled DataFrame of image paths and their class labels.

    The label of each image is the name of the directory that directly
    contains it (``.../<label>/<image>.jpg``).

    Args:
        filepath: Iterable of image paths (``str`` or ``pathlib.Path``).

    Returns:
        A ``pandas.DataFrame`` with columns ``Filepath`` (paths as strings)
        and ``Label``; rows are shuffled with a fixed seed and the index is
        reset to 0..n-1.
    """
    # Materialise once so any iterable (not only an indexable list) works.
    filepath = list(filepath)

    # Path.parent.name does not depend on the path separator, unlike
    # splitting the string on "/".
    labels = [Path(p).parent.name for p in filepath]

    filepath = pd.Series(filepath, name='Filepath').astype(str)
    labels = pd.Series(labels, name='Label')

    # Put path and label side by side as two columns.
    df = pd.concat([filepath, labels], axis=1)

    # Shuffle all rows (fixed seed) and rebuild a clean 0..n-1 index.
    df = df.sample(frac=1, random_state=0).reset_index(drop=True)

    return df

In [None]:
# Build the shuffled (Filepath, Label) table for every training image.
df = proc_img(filepaths)

### train, test split

In [None]:
# Training/test split: hold out 10% of the rows as the test set (fixed seed).
# Alternative that trains on a 20% sample only, for when modeling takes too long:
# train_df,test_df = train_test_split(df.sample(frac=0.2), test_size=0.1,random_state=0)
train_df,test_df = train_test_split(df, test_size=0.1,random_state=0)
train_df.shape,test_df.shape

((38908, 2), (4324, 2))

In [None]:
from keras.preprocessing.image import ImageDataGenerator

# One generator with a 20% validation hold-out; no rescale or augmentation
# arguments are passed.
train_datagen = ImageDataGenerator(validation_split=0.2)

# Both iterators read the same directory with identical settings; only the
# `subset` selector differs (training is created first, then validation).
train_gen, val_gen = (
    train_datagen.flow_from_directory(
        '/content/data/re_train',
        target_size=(256, 256),
        batch_size=32,
        class_mode='categorical',
        subset=subset_name,
    )
    for subset_name in ('training', 'validation')
)

Found 34704 images belonging to 309 classes.
Found 8528 images belonging to 309 classes.


### CNN 정의

In [None]:
# Baseline CNN: two convolution/max-pool stages followed by a dense head
# with a 309-way softmax output.
# NOTE(review): the directory generators feeding this model apply no
# rescaling, so inputs are presumably raw 0-255 pixel values - confirm that
# this is intended.
cnn = tf.keras.models.Sequential([
    # Stage 1 - convolution + pooling on 256x256 RGB input
    tf.keras.layers.Conv2D(filters=32, kernel_size=3, activation='relu', input_shape=[256, 256, 3]),
    tf.keras.layers.MaxPool2D(pool_size=2, strides=2),
    # Stage 2 - second convolution + pooling
    tf.keras.layers.Conv2D(filters=32, kernel_size=3, activation='relu'),
    tf.keras.layers.MaxPool2D(pool_size=2, strides=2),
    # Flatten the feature maps for the fully connected layers
    tf.keras.layers.Flatten(),
    # Fully connected hidden layer
    tf.keras.layers.Dense(units=128, activation='relu'),
    # Output layer - one unit per class
    tf.keras.layers.Dense(units=309, activation='softmax'),
])

# Compile for multi-class classification with one-hot labels
cnn.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
cnn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 254, 254, 32)      896       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 127, 127, 32)     0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 125, 125, 32)      9248      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 62, 62, 32)       0         
 2D)                                                             
                                                                 
 flatten (Flatten)           (None, 123008)            0         
                                                                 
 dense (Dense)               (None, 128)               1

### CNN 학습

In [None]:
# Train the baseline CNN for 10 epochs, using val_gen as validation data.
cnn.fit(x = train_gen, validation_data = val_gen, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f83d026d2d0>

### 모델 성능 개선

In [None]:
def create_gen():
    """Create the image generators and the train/validation/test iterators.

    Reads the module-level ``train_df`` and ``test_df`` DataFrames, which
    must provide the columns ``Filepath`` and ``Label``. 10% of ``train_df``
    is reserved as the validation subset. No augmentation transforms are
    configured; images only go through the preprocessing function.

    Returns:
        Tuple ``(train_generator, test_generator, train_images, val_images,
        test_images)``.
    """
    # NOTE(review): MobileNetV2 preprocessing is used although the models
    # built later are ResNet50V2-based; both are presumably the same
    # [-1, 1] scaling - confirm against the Keras documentation.
    preprocess = tf.keras.applications.mobilenet_v2.preprocess_input

    train_generator = tf.keras.preprocessing.image.ImageDataGenerator(
        preprocessing_function=preprocess,
        validation_split=0.1,
    )
    test_generator = tf.keras.preprocessing.image.ImageDataGenerator(
        preprocessing_function=preprocess,
    )

    # Settings shared by all three iterators.
    common = dict(
        x_col='Filepath',          # column holding the file location
        y_col='Label',             # column holding the class name
        target_size=(256, 256),    # image size
        color_mode='rgb',          # number of image channels
        class_mode='categorical',  # one-hot encoded labels
        batch_size=32,
    )

    # `fill_mode` is an ImageDataGenerator constructor option (default
    # "nearest"), not a flow_from_dataframe parameter; it used to be passed
    # here, where it was silently ignored, so it has been dropped.
    train_images = train_generator.flow_from_dataframe(
        dataframe=train_df,
        shuffle=True,
        seed=0,
        subset='training',
        **common,
    )
    val_images = train_generator.flow_from_dataframe(
        dataframe=train_df,
        shuffle=True,
        seed=0,
        subset='validation',
        **common,
    )
    # Keep the test order fixed so predictions line up with test_df rows.
    test_images = test_generator.flow_from_dataframe(
        dataframe=test_df,
        shuffle=False,
        **common,
    )

    return train_generator, test_generator, train_images, val_images, test_images

### 사전 학습 모델 사용 : ResNet50V2

In [None]:
# Registry of pretrained backbones to benchmark: name -> {"model": Keras
# application constructor, "perf": placeholder that the training loop
# overwrites with the training time in seconds}.
models = {
    "ResNet50V2": {"model":tf.keras.applications.ResNet50V2, "perf":0}
}

In [None]:
# Create the generators and the train/validation/test image iterators
# from the current train_df / test_df split.
train_generator,test_generator,train_images,val_images,test_images=create_gen()

Found 35018 validated image filenames belonging to 309 classes.
Found 3890 validated image filenames belonging to 309 classes.
Found 4324 validated image filenames belonging to 309 classes.


In [None]:
def get_model(model, num_classes=309):
    """Build and compile a transfer-learning classifier on a frozen backbone.

    Args:
        model: A Keras application constructor (for example
            ``tf.keras.applications.ResNet50V2``) that accepts
            ``input_shape``, ``include_top``, ``weights`` and ``pooling``.
        num_classes: Number of units in the softmax output layer. Defaults
            to 309, the number of labels in this dataset.

    Returns:
        A ``tf.keras.Model`` compiled with the Adam optimizer, categorical
        cross-entropy loss and the accuracy metric.
    """
    # Load the pretrained backbone without its classification top;
    # pooling='avg' makes it output one feature vector per image.
    pretrained_model = model(
        input_shape=(256, 256, 3),
        include_top=False,
        weights='imagenet',
        pooling='avg',
    )
    # Freeze the backbone so only the newly added head is trained.
    pretrained_model.trainable = False

    # New classification head on top of the pooled backbone features.
    x = tf.keras.layers.Dense(128, activation='relu')(pretrained_model.output)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    outputs = tf.keras.layers.Dense(num_classes, activation='softmax')(x)

    # Use a separate name rather than rebinding the `model` argument.
    classifier = tf.keras.Model(inputs=pretrained_model.input, outputs=outputs)
    classifier.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return classifier

In [None]:
# Train every registered backbone for one epoch; record time and val accuracy
for name, model in models.items():

    # Replace the stored constructor with the compiled transfer-learning model
    # (`model` is the registry entry itself, i.e. models[name])
    m = get_model(model['model'])
    model['model'] = m

    # Time a single silent training epoch
    start = perf_counter()
    history = m.fit(train_images, validation_data=val_images, epochs=1, verbose=0)
    duration = round(perf_counter() - start, 2)

    # Store the training time and report it
    model['perf'] = duration
    print(f"{name:20} trained in {duration} sec")

    # Store the per-epoch validation accuracy, rounded to 4 decimals
    val_acc = history.history['val_accuracy']
    model['val_acc'] = [round(epoch_acc, 4) for epoch_acc in val_acc]

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50v2_weights_tf_dim_ordering_tf_kernels_notop.h5
ResNet50V2           trained in 145.31 sec


In [None]:
# Re-create the 90/10 split and the image iterators, then train a fresh
# ResNet50V2-based model for 10 epochs.
train_df,test_df = train_test_split(df, test_size=0.1, random_state=0)
train_generator,test_generator,train_images,val_images,test_images=create_gen()

model = get_model(tf.keras.applications.ResNet50V2)
history = model.fit(train_images,validation_data=val_images,epochs=10)

Found 35018 validated image filenames belonging to 309 classes.
Found 3890 validated image filenames belonging to 309 classes.
Found 4324 validated image filenames belonging to 309 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
def printmd(string):
    """Render *string* as Markdown in the notebook output."""
    display(Markdown(string))


# Predicted class index for every test image (argmax over class scores)
pred = np.argmax(model.predict(test_images), axis=1)

# Invert the name -> index mapping of the training iterator, then translate
# each predicted index back to its label name
labels = {index: class_name for class_name, index in train_images.class_indices.items()}
pred = [labels[k] for k in pred]

# test_images is created with shuffle=False, so predictions are in the same
# order as the rows of test_df
y_test = list(test_df.Label)
acc = accuracy_score(y_test, pred)
printmd(f'# Accuracy on the test set: {acc * 100:.2f}%')

# Accuracy on the test set: 92.65%

In [None]:
# Save the trained model to 'my_model.h5' in the working directory.
model.save('my_model.h5')

## predict

### Answer Data Download

In [None]:
# Download the question/target archive from Google Drive (base URL + file id)
# and save it locally as data.zip.
google_path = 'https://drive.google.com/uc?id='
file_id = '1TCdM4b9DI_U7Z-Fyu263NVwEtaLdaCvb'
output_name = 'data.zip'
gdown.download(f'{google_path}{file_id}', output_name)

Downloading...
From: https://drive.google.com/uc?id=1TCdM4b9DI_U7Z-Fyu263NVwEtaLdaCvb
To: /content/data.zip
100%|██████████| 2.70G/2.70G [00:44<00:00, 60.2MB/s]


'data.zip'

In [None]:
# Location of the question/target archive and the directory it is extracted
# into (both are consumed by the extraction cell that follows).
input_path = './data.zip'
output_path = './data'

In [None]:
# Unpack the archive. The context manager closes the zip file handle as soon
# as extraction finishes (or fails) instead of leaving it open.
with zipfile.ZipFile(input_path) as zip_data:
    zip_data.extractall(output_path)

### answer 

In [None]:
# Create an empty DataFrame that will hold the answer rows
answer = pd.DataFrame()

answer

In [None]:
# Question and target images live in separate directories. Each list has to
# be globbed from its own directory; previously both were globbed from the
# training directory `dir_`, so both lists contained the training images.
q_dir_ = Path('/content/data/questions')
q_filepaths = list(q_dir_.glob(r'**/*.jpg'))

t_dir_ = Path('/content/data/test')
t_filepaths = list(t_dir_.glob(r'**/*.jpg'))

In [None]:
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.inception_resnet_v2 import InceptionResNetV2, preprocess_input

In [None]:
IMAGE_SIZE = (256, 256)
label = 0  # 0 means the pair is not the same species


def _predict_classes(classifier, filepaths):
    """Return ``{path: predicted class index}`` for every image in *filepaths*.

    Each image is loaded at ``IMAGE_SIZE``, turned into a batch of one,
    passed through ``preprocess_input`` and classified by *classifier*.
    """
    predictions = {}
    for path in filepaths:
        pixels = image.img_to_array(image.load_img(path, target_size=IMAGE_SIZE))
        # The model expects a batch axis in front: (1, height, width, channels).
        batch = preprocess_input(pixels.reshape((1, *pixels.shape)))
        # Keep a plain int (not a 1-element array) so that two predictions
        # compare with `==` as an ordinary boolean.
        predictions[path] = int(np.argmax(classifier.predict(batch), axis=1)[0])
    return predictions


# Predicted class index for every question image and every target image.
q_prediction = _predict_classes(model, q_filepaths)
t_prediction = _predict_classes(model, t_filepaths)

In [None]:
# One row per (question, target) pair; label is 1 when both images were
# predicted as the same class and 0 otherwise.
# DataFrame.append returned a new frame whose result was discarded, leaving
# `answer` empty (and the method no longer exists in pandas 2.x), so the
# rows are collected first and the frame is built once.
# np.array_equal copes with predictions stored as ints or 1-element arrays.
answer = pd.DataFrame(
    [
        {
            'question': i,
            'target': j,
            'label': int(np.array_equal(q_prediction[i], t_prediction[j])),
        }
        for i in q_filepaths
        for j in t_filepaths
    ],
    columns=['question', 'target', 'label'],
)

#predict_classes 참고 https://precommer.tistory.com/48

In [None]:
# The resulting sheet must be 209245 rows x 3 columns, so the DataFrame
# index is not written (it would add a fourth, unnamed column).
answer.to_csv('/content/data/answer.csv', index=False)