In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
'''
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
'''
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

"\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n    for filename in filenames:\n        print(os.path.join(dirname, filename))\n"

In [2]:
#load train.csv

train_df = pd.read_csv('../input/plant-pathology-2021-fgvc8/train.csv')
train_df.head(5)

Unnamed: 0,image,labels
0,800113bb65efe69e.jpg,healthy
1,8002cb321f8bfcdf.jpg,scab frog_eye_leaf_spot complex
2,80070f7fb5e2ccaa.jpg,scab
3,80077517781fb94f.jpg,scab
4,800cbf0ff87721f8.jpg,complex


In [3]:
#num of classes 
classes = len(train_df['labels'].value_counts())
print(classes) #12

12


In [4]:
#문자 라벨을 숫자 라벨로 변환
#make string label to numeric label
from sklearn.preprocessing import LabelEncoder

label = LabelEncoder()
label.fit(train_df['labels'])
train_df['label_id'] = label.transform(train_df['labels'])
label_dic = dict(sorted(train_df[['label_id', 'labels']].values.tolist())) #save for submission
del train_df['labels'] #we don't need this Series anymore

image_names = np.array(train_df['image'].values)
image_labels = np.array(train_df['label_id'].values)

print(image_names.shape) #18632

(18632,)


In [12]:
import matplotlib.pyplot as plt
from glob import glob
import cv2, torch
import torchvision.transforms as transforms
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader

device = torch.device('cuda' if torch.cuda.is_available else 'cpu')

transform = transforms.Compose(
    [transforms.ToTensor(),
    transforms.Resize((224, 224)),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

#커스텀 데이터셋 설정
#dataset and dataloader for train
class dataset(Dataset) :
    def __init__(self, image_list, image_names, image_labels, transform, device) :
        self.image_list = image_list
        self.image_names = image_names
        self.image_labels = image_labels
        self.transform = transform
    
    def __len__(self) :
        return len(self.image_list)
    
    def __getitem__(self, index) :
        x = cv2.imread(self.image_list[index])
        x = self.transform(x).to(device)
        
        image_name = image_list[index][49:]
        y = self.image_labels[np.where(image_names == image_name)]
        y = tensor.LongTensor([y,]).to(device)
        
        return x, y


#load train_images
image_list = glob('../input/plant-pathology-2021-fgvc8/train_images/*.jpg')


train_data = dataset(image_list, image_names, image_labels, transform, device)
train_data = DataLoader(train_data, batch_size = 32, shuffle = True) #not enough GPU-memory makes me set few batch_size

In [13]:
! pip install torchsummaryX

Collecting torchsummaryX
  Downloading torchsummaryX-1.3.0-py3-none-any.whl (3.6 kB)
Installing collected packages: torchsummaryX
Successfully installed torchsummaryX-1.3.0


In [15]:
import torch.nn as nn
from torch.optim import Adam
from torchvision.models import resnet34
from torchsummaryX import summary

class resnet(nn.Module) :
    def __init__(self, output) :
        super().__init__()
        self.model = resnet34(pretrained=False) #use Densenet
        self.model.fc = torch.nn.Linear(512, output) #change the last FC layer
        
    def forward(self, x) :
        output = self.model(x)
        return output

classes = 12
model = resnet(classes).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)

summary(model, torch.rand((1, 3, 224, 224)).float().to(device))

AssertionError: Torch not compiled with CUDA enabled

In [None]:
n_epoch = 30
torch.cuda.empty_cache()

model = model.train()

for epoch in range(n_epoch) :
    epoch_loss = 0
    epoch_acc = 0
    for i, (x, y) in enumerate(train_data) :
        x = x.to(device)
        y = y.reshape(-1)
        y = y.to(device)
        
        predict = model(x)
        print(y.shape, predict.shape)
        loss = criterion(predict, y)
        
        epoch_loss += loss / len(train_data)
        correct_prediction = torch.argmax(predict, 1) == y
        correct_prediction = correct_prediction.sum()
        epoch_acc += correct_prediction
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    epoch_acc = epoch_acc / (16 * len(train_loader))
    print('Epoch : {}/{},   loss : {:.5f},    acc : {:.5f}'.format(epoch+1, n_epoch, epoch_loss, epoch_acc))

In [None]:
valid_src = '../input/plant-pathology-2021-fgvc8/test_images'
valid_image_list = glob(valid_src + '/*.jpg')
valid_x = torch.FloatTensor(np.zeros((3, 3, 224, 224))).to(device)

for i, image in tqdm(enumerate(valid_image_list)) :
    img = cv2.imread(image)
    img = transform(img)
    
    valid_x[i] = img.to(device)

print(valid_x.shape)


model.eval()
with torch.no_grad() : predict = model.predict(valid_x)

predict_list = []
for i, x in enumerate(predict) :
    x = np.array(x)
    x = np.argmax(x)
    
    predict_list.append([valid_image_list[i][49:], x])
    
pred_df = pd.DataFrame.from_records(predict_list, columns = ['image', 'label_id'])
pred_df['labels'] = pred_df['label_id'].map(class_map)
del pred_df['label_id']
pred_df.head(30)

pred_df.to_csv("submission.csv", index = False)