In [None]:
import pandas as pd
from pathlib import Path

In [None]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [None]:
import pandas as pd

# Root directory of the competition data (note the trailing slash).
data_path = '/kaggle/input/plant-pathology-2020-fgvc7/'

# Read the three competition tables, in order, with one unpacking assignment.
train, test, submission = (
    pd.read_csv(f'{data_path}{csv_name}')
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)


In [None]:
# Show the (rows, columns) shape of the training and test tables side by side.
train.shape, test.shape


In [None]:
# Preview the first rows of the training table.
train.head()


In [None]:
# Preview the first rows of the test table.
test.head()


In [None]:
# Preview the first rows of the sample submission table.
submission.head()


In [None]:
# Split the training table into one DataFrame per class: each frame keeps
# the rows whose indicator column for that class equals 1.
healthy = train[train['healthy'].eq(1)]
multiple_diseases = train[train['multiple_diseases'].eq(1)]
rust = train[train['rust'].eq(1)]
scab = train[train['scab'].eq(1)]


In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

# mpl.rc changes matplotlib's runtime configuration; here it sets the font size to 15.
mpl.rc('font', size=15)
plt.figure(figsize=(7,7))

# Slice labels, in the same order as the counts passed to plt.pie below.
label = ['healthy', 'multiple diseases', 'rust', 'scab']
# Pie chart of the number of training rows per class; the trailing ';' hides the returned artists' repr.
plt.pie([len(healthy), len(multiple_diseases), len(rust), len(scab)], labels=label, autopct='%.1f%%');


In [None]:
import matplotlib.gridspec as gridspec
import cv2

def show_image(img_ids, rows=2, cols=3):
    """Plot the given images on a rows x cols grid.

    Each image is read from ``<data_path>/images/<img_id>.jpg`` and drawn
    in grid order.

    Args:
        img_ids: iterable of image ids; must hold at most ``rows * cols`` items.
        rows: number of grid rows.
        cols: number of grid columns.

    Raises:
        AssertionError: if more ids are given than the grid has cells.
        FileNotFoundError: if an image file cannot be read.
    """
    assert len(img_ids) <= rows*cols, (
        f'{len(img_ids)} images do not fit in a {rows}x{cols} grid')

    plt.figure(figsize=(15,8))
    grid = gridspec.GridSpec(rows, cols)
    # Path joining copes with data_path having (or lacking) a trailing slash.
    image_dir = Path(data_path) / 'images'

    for idx, img_id in enumerate(img_ids):
        img_path = image_dir / f'{img_id}.jpg'
        image = cv2.imread(str(img_path))
        if image is None:
            # cv2.imread signals an unreadable file by returning None instead of raising.
            raise FileNotFoundError(f'Could not read image: {img_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # cv2 loads BGR; imshow expects RGB
        ax = plt.subplot(grid[idx])
        ax.imshow(image)


In [None]:
# Display the frame holding the rows selected for the 'healthy' class.
healthy


In [None]:
num_of_imgs = 6
# For every class frame, keep the image ids of its final `num_of_imgs` rows
# (each result is a pandas Series).
last_healthy_img_ids = healthy['image_id'].iloc[-num_of_imgs:]
last_multiple_diseases_img_ids = multiple_diseases['image_id'].iloc[-num_of_imgs:]
last_rust_img_ids = rust['image_id'].iloc[-num_of_imgs:]
last_scab_img_ids = scab['image_id'].iloc[-num_of_imgs:]


In [None]:
# Plot the selected 'healthy' images.
show_image(last_healthy_img_ids)


In [None]:
# Plot the selected 'multiple_diseases' images.
show_image(last_multiple_diseases_img_ids)


In [None]:
# Plot the selected 'rust' images.
show_image(last_rust_img_ids)


In [None]:
# Plot the selected 'scab' images.
show_image(last_scab_img_ids)


In [None]:
import torch
import random
import numpy as np
import os

# Fix every random seed used in this notebook so that runs are repeatable.
seed = 50
# NOTE(review): PYTHONHASHSEED is presumably only honoured at interpreter start-up,
# so this likely affects child processes rather than the running kernel - confirm.
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)  # Python's built-in RNG
np.random.seed(seed)  # NumPy's global RNG (TODO: needs a fuller explanation)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# cuDNN settings: request deterministic algorithms and turn off auto-tuning (benchmark).
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# NOTE(review): this switches cuDNN off entirely; presumably a reproducibility
# choice, but it may slow GPU training - confirm it is intended.
torch.backends.cudnn.enabled = False


In [None]:
# Use the GPU when torch reports one as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)


In [None]:
from sklearn.model_selection import train_test_split # DataLoader에 쓸게 아니라면 sklearn을 쓰면 된다.

# Hold out 10% of the labelled rows for validation.
# NOTE(review): `train` is rebound to the 90% split, so re-running this cell
# splits the already-reduced frame again; re-run the data-loading cell first.
train, valid = train_test_split(train, 
                                test_size = 0.1,
                                stratify=train[['healthy', 'multiple_diseases', 'rust', 'scab']], # stratify on the four label columns so both splits keep the class proportions
                                random_state=50)


In [None]:
import cv2
from torch.utils.data import Dataset
import numpy as np

class ImageDataset(Dataset):
    """Dataset that loads the JPEG images listed in a DataFrame.

    Args:
        df: frame whose first column holds the image id; when ``is_test`` is
            False, columns 1-4 are read as the per-class indicator values.
        img_dir: directory containing ``<image id>.jpg`` files.
        transform: optional callable invoked as ``transform(image=image)``;
            its ``'image'`` entry replaces the loaded image.
        is_test: when True, ``__getitem__`` returns only the image.
    """

    def __init__(self, df, img_dir='./', transform=None, is_test=False):
        super().__init__()
        self.df = df
        self.img_dir = img_dir
        self.transform = transform
        self.is_test = is_test

    def __len__(self):
        """Return the number of rows in the backing frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the image at position ``idx`` (plus its class index unless ``is_test``).

        Raises:
            FileNotFoundError: if the image file cannot be read.
        """
        # iloc is purely positional, so this works whatever the frame's index labels are.
        img_id = self.df.iloc[idx, 0]
        # join() yields the same path whether or not img_dir ends with a separator.
        img_path = os.path.join(self.img_dir, img_id + '.jpg')
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread signals an unreadable file by returning None instead of raising.
            raise FileNotFoundError(f'Could not read image: {img_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # cv2 loads images as BGR

        if self.transform is not None:
            image = self.transform(image=image)['image']

        if self.is_test:
            return image

        # Columns 1-4 hold the per-class indicators; the position of the
        # largest value is used as the integer class label.
        label = int(np.argmax(self.df.iloc[idx, 1:5].to_numpy()))
        return image, label


In [None]:
import albumentations as A
from albumentations.pytorch import ToTensorV2


In [None]:
# Augmentation pipeline applied to training images; transforms run in the listed order.
transform_train = A.Compose([
    A.Resize(450, 650), # Discussion posts from top-ranked entrants say larger resized images perform better (they reportedly use around 800x800).
    A.RandomBrightnessContrast(brightness_limit=0.2,     # Brightness/contrast jitter so the model generalises across differently lit scenes.
                              contrast_limit=0.2, p=0.3), # TODO: look up each of these parameters in detail later.
    A.VerticalFlip(p=0.2),
    A.HorizontalFlip(p=0.5), 
    A.ShiftScaleRotate(         # Shift, scale and rotate transform
        shift_limit=0.1,
        scale_limit=0.2,
        rotate_limit=30, p=0.3),
    A.OneOf([A.Emboss(p=1),    # Emboss, sharpen or blur effect
             A.Sharpen(p=1),
             A.Blur(p=1)], p=0.3),    
    A.PiecewiseAffine(p=0.3),
    A.Normalize(),
    ToTensorV2()
]) # Worth experimenting with which processing yields the best performance.


In [None]:
# Pipeline for validation/test images: resize, normalise and convert to a tensor (no random augmentation).
transform_test = A.Compose([
    A.Resize(450, 650),
    A.Normalize(), # Bringing values into a similar range makes them easier to compare (small values train more easily through activation functions).
    ToTensorV2() # PyTorch works on tensor objects, so this conversion step is required.
])


In [None]:
# Directory that holds the competition's JPEG images.
img_dir = '/kaggle/input/plant-pathology-2020-fgvc7/images/'

# Training data gets the augmenting pipeline; validation data gets the plain one.
dataset_train = ImageDataset(df=train, transform=transform_train, img_dir=img_dir)
dataset_valid = ImageDataset(df=valid, transform=transform_test, img_dir=img_dir)


In [None]:
# Seeding helpers for the DataLoaders (multiprocessing-related setup; TODO: study the details later).
def seed_worker(worker_id):
    """Seed NumPy and `random` with torch's initial seed reduced modulo 2**32."""
    derived_seed = torch.initial_seed() % (1 << 32)
    random.seed(derived_seed)
    np.random.seed(derived_seed)

# Generator with a fixed seed of 0, handed to the DataLoaders.
g = torch.Generator().manual_seed(0)


In [None]:
from torch.utils.data import DataLoader

# Kept small because there is not much training data.
# Open question: isn't a larger batch size better? Some minimum size seems
# necessary for the model to learn the general trend.
batch_size = 4

# Only the training loader shuffles; both share the seeded generator and worker seeding.
loader_train = DataLoader(
    dataset=dataset_train,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
    worker_init_fn=seed_worker,
    generator=g,
)
loader_valid = DataLoader(
    dataset=dataset_valid,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
    worker_init_fn=seed_worker,
    generator=g,
)


In [None]:
!pip install efficientnet-pytorch==0.7.1


In [None]:
from efficientnet_pytorch import EfficientNet


In [None]:
# model = EfficientNet.from_pretrained('efficientnet-b7', num_classes=4)
# model = model.to(device)


In [None]:
import torch.nn as nn

# Load EfficientNet-B0 with pretrained weights and replace its final layer (`_fc`).
model = EfficientNet.from_pretrained('efficientnet-b0')
# The arguments below are evaluated before the assignment, so every
# `model._fc` reference still reads the original layer's sizes.
# New head: Linear -> ReLU -> Dropout(0.5) -> Linear producing 4 outputs.
model._fc = nn.Sequential(
    nn.Linear(model._fc.in_features, model._fc.out_features),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(model._fc.out_features, 4)
)


In [None]:
# Cross-entropy loss on the class-index labels; AdamW updates every model parameter.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    weight_decay=1e-4,
    lr=6e-5,
)


In [None]:
from sklearn.metrics import roc_auc_score
from tqdm.notebook import tqdm

epochs = 5

model = model.to(device)

for epoch in range(epochs):
    # ---- training phase ----
    model.train()
    epoch_train_loss = 0

    for images, labels in tqdm(loader_train):
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()  # clear the gradients left over from the previous step
        loss.backward()        # compute gradients
        optimizer.step()       # apply the parameter update

        # The loss is a batch mean; weight it by batch size to accumulate a per-sample sum.
        epoch_train_loss += loss.item() * images.shape[0]

    # Report the mean per-sample training loss for this epoch.
    print(f'epoch [{epoch+1}/{epochs}] - train_loss: {epoch_train_loss/len(loader_train.dataset):.4f}')

    # ---- validation phase ----
    model.eval()
    epoch_valid_loss = 0
    preds_list = []
    true_onehot_list = []

    with torch.no_grad():
        for images, labels in loader_valid:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)

            loss = criterion(outputs, labels)

            preds = torch.softmax(outputs.cpu(), dim=1).numpy()
            # Index the CPU identity matrix with CPU labels: the labels were moved
            # to `device` above, and mixing devices when indexing can fail.
            true_onehot = torch.eye(4)[labels.cpu()].numpy()
            preds_list.extend(preds)
            true_onehot_list.extend(true_onehot)

            epoch_valid_loss += loss.item() * images.shape[0]

    # The accumulated loss is a per-sample sum, so divide by the number of
    # samples (not the number of batches), matching the training loss above.
    print(f'epoch [{epoch+1}/{epochs}] - val_loss: {epoch_valid_loss/len(loader_valid.dataset):.4f}/ val_ROC AUC: {roc_auc_score(true_onehot_list,preds_list):.4f}')
            
        


In [None]:
# Test images carry no labels, so the dataset is built with is_test=True and
# the loader keeps the original row order (no shuffling).
dataset_test = ImageDataset(
    df=test,
    img_dir=img_dir,
    is_test=True,
    transform=transform_test,
)
loader_test = DataLoader(
    dataset=dataset_test,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
    worker_init_fn=seed_worker,
    generator=g,
)


In [None]:
model.eval()

# One row of four class probabilities per test row.
preds = np.zeros((len(test), 4))

with torch.no_grad():
    start = 0
    for images in loader_test:
        images = images.to(device)
        outputs = model(images)
        # No squeeze(): a final single-image batch must stay 2-D.
        preds_part = torch.softmax(outputs.cpu(), dim=1).numpy()
        # Advance by the actual batch length so the slice never relies on
        # the global batch_size matching the loader's.
        end = start + len(preds_part)
        preds[start:end] = preds_part
        start = end


In [None]:
# Fill the four class columns with the predicted probabilities and write the file to upload.
submission[['healthy', 'multiple_diseases', 'rust', 'scab']] = preds
submission.to_csv('submission.csv', index=False)  # fixed the misspelled 'submittion.csv'
