## Import

In [1]:
# !pip3 install albumentations ;!pip3 install opencv-python ; !pip3 install tqdm

In [2]:
import random
import pandas as pd
import numpy as np
import os
import re
import glob
import cv2

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
import torchvision.models as models

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from tqdm.auto import tqdm

# torch.multiprocessing import
from torch import multiprocessing

import warnings
warnings.filterwarnings(action='ignore') 

In [3]:
# 경로지정
import os
os.chdir('../DATA')
os.getcwd()

'/Users/admin/Documents/GitHub/Dacon_papering_classification/DATA'

In [4]:
# seeds
random_seed = 42
torch.manual_seed(random_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(random_seed)
random.seed(random_seed)


In [5]:
# device 설정
device = torch.device('mps:0' if torch.backends.mps.is_available() else 'cpu')
# mps 확인
torch.backends.mps.is_available()

True

## Hyperparameter Setting

In [6]:
# hyperparameter
CFG = {
    'IMG_SIZE':224,
    'EPOCHS':5,
    'LEARNING_RATE':3e-4,
    'BATCH_SIZE':64,
    'SEED':2022
}

## Fixed RandomSeed

In [7]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

## Data Pre-processing

In [8]:
all_img_list = glob.glob('/train/*/*')

In [9]:
os.getcwd()

'/Users/admin/Documents/GitHub/Dacon_papering_classification/DATA'

In [10]:
ㅍ

In [11]:
all_img_list[:5]

['train/녹오염/8.png',
 'train/녹오염/9.png',
 'train/녹오염/12.png',
 'train/녹오염/13.png',
 'train/녹오염/11.png']

In [12]:
df = pd.DataFrame(columns=['img_path', 'label'])
df['img_path'] = all_img_list
df['label'] = df['img_path'].apply(lambda x : str(x).split('/')[1])
df

Unnamed: 0,img_path,label
0,train/녹오염/8.png,녹오염
1,train/녹오염/9.png,녹오염
2,train/녹오염/12.png,녹오염
3,train/녹오염/13.png,녹오염
4,train/녹오염/11.png,녹오염
...,...,...
3452,train/틈새과다/4.png,틈새과다
3453,train/틈새과다/2.png,틈새과다
3454,train/틈새과다/3.png,틈새과다
3455,train/틈새과다/1.png,틈새과다


In [13]:
df

Unnamed: 0,img_path,label
0,train/녹오염/8.png,녹오염
1,train/녹오염/9.png,녹오염
2,train/녹오염/12.png,녹오염
3,train/녹오염/13.png,녹오염
4,train/녹오염/11.png,녹오염
...,...,...
3452,train/틈새과다/4.png,틈새과다
3453,train/틈새과다/2.png,틈새과다
3454,train/틈새과다/3.png,틈새과다
3455,train/틈새과다/1.png,틈새과다


In [14]:
# train val split
train, val, _, _ = train_test_split(df, df['label'], test_size=0.2, stratify=df['label'], random_state=CFG['SEED'])


In [15]:
train

Unnamed: 0,img_path,label
1888,train/터짐/62.png,터짐
2018,train/터짐/151.png,터짐
1896,train/터짐/49.png,터짐
1395,train/오염/588.png,오염
957,train/면불량/56.png,면불량
...,...,...
2559,train/훼손/315.png,훼손
1146,train/곰팡이/22.png,곰팡이
511,train/꼬임/150.png,꼬임
3200,train/훼손/949.png,훼손


## Label-Encoding

In [16]:
le = preprocessing.LabelEncoder()
train['label'] = le.fit_transform(train['label'])
val['label'] = le.transform(val['label'])
train

Unnamed: 0,img_path,label
1888,train/터짐/62.png,15
2018,train/터짐/151.png,15
1896,train/터짐/49.png,15
1395,train/오염/588.png,10
957,train/면불량/56.png,6
...,...,...
2559,train/훼손/315.png,18
1146,train/곰팡이/22.png,2
511,train/꼬임/150.png,3
3200,train/훼손/949.png,18


In [17]:
# label-decoding
le.inverse_transform(train['label'])

array(['터짐', '터짐', '터짐', ..., '꼬임', '훼손', '울음'],
      dtype=object)

## CustomDataset

In [18]:
class CustomDataset(Dataset):
    def __init__(self, img_path_list, label_list, transforms=None):
        self.img_path_list = img_path_list
        self.label_list = label_list
        self.transforms = transforms
        
    def __getitem__(self, index):
        img_path = self.img_path_list[index]
        
        image = cv2.imread(img_path)
        
        if self.transforms is not None:
            image = self.transforms(image=image)['image']
        
        if self.label_list is not None:
            label = self.label_list[index]
            return image, label
        else:
            return image
        
    def __len__(self):
        return len(self.img_path_list)

In [19]:
train_transform = A.Compose([
                            A.Resize(CFG['IMG_SIZE'],CFG['IMG_SIZE']),
                            A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, always_apply=False, p=1.0),
                            ToTensorV2()
                            ])

test_transform = A.Compose([
                            A.Resize(CFG['IMG_SIZE'],CFG['IMG_SIZE']),
                            A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, always_apply=False, p=1.0),
                            ToTensorV2()
                            ])

In [20]:
#weighted sampling
a_list = train['label'].value_counts()
class_counts = train['label'].value_counts().to_list()
class_weights = [1 / a_list[i] for i in range(len(class_counts))]
num_samples = sum(class_counts)
labels_lists = train['label'].to_list()
weights = [class_weights[labels_lists[i]] for i in range(int(num_samples))]
sampler = WeightedRandomSampler(torch.DoubleTensor(weights), int(num_samples), replacement=True)
sampler

<torch.utils.data.sampler.WeightedRandomSampler at 0x159179610>

In [21]:
# make dataloader
train_dataset = CustomDataset(train['img_path'].values, train['label'].values, train_transform)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], sampler=sampler, shuffle=False, num_workers=0)

val_dataset = CustomDataset(val['img_path'].values, val['label'].values, test_transform)
val_loader = DataLoader(val_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class FocalLoss(nn.Module):
    def __init__(self, gamma=0, alpha=None, size_average=True):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        if isinstance(alpha,(float,int)): self.alpha = torch.Tensor([alpha]*19)
        self.alpha[18] = 1-alpha
        if isinstance(alpha,list): self.alpha = torch.Tensor(alpha)
        self.size_average = size_average

    def forward(self, input, target):
        if input.dim()>2:
            input = input.view(input.size(0),input.size(1),-1)  # N,C,H,W => N,C,H*W
            input = input.transpose(1,2)    # N,C,H*W => N,H*W,C
            input = input.contiguous().view(-1,input.size(2))   # N,H*W,C => N*H*W,C
        target = target.view(-1,1)

        logpt = F.log_softmax(input)
        logpt = logpt.gather(1,target)
        logpt = logpt.view(-1)
        pt = Variable(logpt.data.exp())

        if self.alpha is not None:
            if self.alpha.type()!=input.data.type():
                self.alpha = self.alpha.type_as(input.data)
            at = self.alpha.gather(0,target.data.view(-1))
            logpt = logpt * at

        loss = -1 * (1-pt)**self.gamma * logpt
        if self.size_average: return loss.mean()
        else: return loss.sum()

## Model Define

In [23]:
class BaseModel(nn.Module):
    def __init__(self, num_classes=len(le.classes_)):
        super(BaseModel, self).__init__()
        self.backbone = models.efficientnet_b0(pretrained=True)
        self.classifier = nn.Linear(1000, num_classes)
        
    def forward(self, x):
        x = self.backbone(x)
        x = self.classifier(x)
        return x

## Train

In [24]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    
    best_score = 0
    best_model = None
    
    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for imgs, labels in tqdm(iter(train_loader)):
            imgs = imgs.float().to(device)
            labels = labels.to(device)
            
            optimizer.zero_grad()
            
            output = model(imgs)
            loss = criterion(output, labels)
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
                    
        _val_loss, _val_score = validation(model, criterion, val_loader, device)
        _train_loss = np.mean(train_loss)
        print(f'Epoch [{epoch}], Train Loss : [{_train_loss:.5f}] Val Loss : [{_val_loss:.5f}] Val Weighted F1 Score : [{_val_score:.5f}]')
       
        if scheduler is not None:
            scheduler.step(_val_score)
            
        if best_score < _val_score:
            best_score = _val_score
            best_model = model
    
    return best_model

In [25]:
## Train

def trains(model, optimizer, train_loader, val_loader, scheduler, device):
    model.to(device)
#     criterion = nn.CrossEntropyLoss(label_smoothing=0.1).to(device)
    criterion = FocalLoss(gamma=2, alpha=0.25)
    
    best_score = 0
    best_model = None
    
    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for imgs, labels in tqdm(iter(train_loader)):
            imgs = imgs.float().to(device)
            labels = labels.to(device)
            
            optimizer.zero_grad()
            
            output = model(imgs)
            loss = criterion(output, labels)
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
                    
        _val_loss, _val_score = validation(model, criterion, val_loader, device)
        _train_loss = np.mean(train_loss)
        print(f'Epoch [{epoch}], Train Loss : [{_train_loss:.5f}] Val Loss : [{_val_loss:.5f}] Val Weighted F1 Score : [{_val_score:.5f}]')
       
        if scheduler is not None:
            scheduler.step(_val_score)
            
        if best_score < _val_score:
            best_score = _val_score
            best_model = model
    
    return best_model

In [26]:
def validation(model, criterion, val_loader, device):
    model.eval()
    val_loss = []
    preds, true_labels = [], []

    with torch.no_grad():
        for imgs, labels in tqdm(iter(val_loader)):
            imgs = imgs.float().to(device)
            labels = labels.to(device)
            
            pred = model(imgs)
            
            loss = criterion(pred, labels)
            
            preds += pred.argmax(1).detach().cpu().numpy().tolist()
            true_labels += labels.detach().cpu().numpy().tolist()
            
            val_loss.append(loss.item())
        
        _val_loss = np.mean(val_loss)
        _val_score = f1_score(true_labels, preds, average='weighted')
    
    return _val_loss, _val_score

In [27]:
# #!/bin/bash
# # os.chdir('train')
# # 변경 전 폴더명 리스트
# old_names = ["녹오염", "걸레받이수정", "꼬임", "석고수정", "오타공", "울음", "이음부불량",
#              "몰딩수정", "면불량", "창틀,문틀수정", "피스", "곰팡이", "반점", "들뜸", "오염",
#              "가구수정", "터짐", "훼손", "틈새과다"]

# # 변경 후 폴더명 리스트
# new_names = ["rust", "mop_cleaning", "twist", "gypsum_repair", "typo", "cry", "splice_defect",
#              "molding_repair", "wrinkle", "window_frame_repair", "piece", "mold", "stain", 
#              "uneven", "contamination", "furniture_repair", "crack", "damage", "excessive_gap"]

# ch_names = os.listdir()
# # 현재 폴더 경로
# current_dir = os.getcwd()

# # 폴더명 변경
# for old_name, new_name in zip(ch_names, old_names):
#     old_path = old_name
#     new_path = new_name
#     os.rename(old_path, new_path)



## Run!!

In [28]:
model = BaseModel()
model.eval()
optimizer = torch.optim.AdamW(params = model.parameters(), lr = CFG["LEARNING_RATE"])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2, threshold_mode='abs', min_lr=1e-8, verbose=True)

infer_model = trains(model, optimizer, train_loader, val_loader, scheduler, device)

  0%|          | 0/44 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Epoch [1], Train Loss : [0.24850] Val Loss : [0.25892] Val Weighted F1 Score : [0.63593]


  0%|          | 0/44 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Epoch [2], Train Loss : [0.06440] Val Loss : [0.20265] Val Weighted F1 Score : [0.68957]


  0%|          | 0/44 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Epoch [3], Train Loss : [0.04637] Val Loss : [0.19071] Val Weighted F1 Score : [0.70106]


  0%|          | 0/44 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Epoch [4], Train Loss : [0.03895] Val Loss : [0.20395] Val Weighted F1 Score : [0.70521]


  0%|          | 0/44 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Epoch [5], Train Loss : [0.02659] Val Loss : [0.18434] Val Weighted F1 Score : [0.73353]


## Inference

In [29]:
test = pd.read_csv('test.csv')

In [30]:
test_dataset = CustomDataset(test['img_path'].values, None, test_transform)
test_loader = DataLoader(test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [31]:
def inference(model, test_loader, device):
    model.eval()
    preds = []
    with torch.no_grad():
        for imgs in tqdm(iter(test_loader)):
            imgs = imgs.float().to(device)
            
            pred = model(imgs)
            
            preds += pred.argmax(1).detach().cpu().numpy().tolist()
    
    preds = le.inverse_transform(preds)
    return preds

In [41]:
preds

array(['훼손', '오염', '훼손', '몰딩수정', '훼손', '훼손',
       '훼손', '오염', '훼손', '훼손', '몰딩수정', '피스', '오염',
       '오염', '훼손', '오염', '오타공', '걸레받이수정', '오염',
       '오염', '곰팡이', '훼손', '훼손', '면불량', '꼬임',
       '훼손', '훼손', '오염', '훼손', '훼손', '훼손', '훼손',
       '훼손', '오염', '면불량', '훼손', '훼손', '훼손', '훼손',
       '훼손', '오염', '울음', '석고수정', '면불량', '훼손',
       '석고수정', '훼손', '터짐', '오염', '훼손', '오염', '오염',
       '훼손', '훼손', '훼손', '훼손', '훼손', '훼손', '훼손',
       '훼손', '오타공', '몰딩수정', '훼손', '오염', '훼손',
       '훼손', '오염', '오염', '오염', '피스', '오염', '오염',
       '훼손', '훼손', '훼손', '터짐', '훼손', '훼손', '터짐',
       '터짐', '꼬임', '오염', '훼손', '훼손', '걸레받이수정',
       '훼손', '꼬임', '훼손', '곰팡이', '면불량', '훼손',
       '훼

In [32]:
preds = inference(infer_model, test_loader, device)

  0%|          | 0/13 [00:00<?, ?it/s]

## Submission

In [39]:
submit = pd.read_csv('sample_submission.csv')
ss = submit.copy()

In [43]:
submit['label'] = preds

In [35]:
# result folder 생성
from datetime import datetime, timezone, timedelta

kst = timezone(timedelta(hours=9))
train_serial =  datetime.now(kst).strftime('%Y%m%d_%H%M%S')

Record_path = os.path.join('../result', train_serial)

os.makedirs(Record_path, exist_ok=True)




In [55]:
type(submit['label'][0])

str

In [37]:
submit.to_csv(os.path.join(Record_path,'submit.csv'), index=False)

In [67]:
submit['label'][0]

'훼손'

In [68]:
ss['label'][0]

'훼손'

In [69]:
import chardet
chardet.detect(submit['label'][0].encode())

{'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}

In [70]:
chardet.detect(ss['label'][0].encode())

{'encoding': 'utf-8', 'confidence': 0.7525, 'language': ''}

In [57]:
submit[submit['label']==ss['label'][0]]

Unnamed: 0,id,label


In [60]:
submit

Unnamed: 0,id,label
0,TEST_000,훼손
1,TEST_001,오염
2,TEST_002,훼손
3,TEST_003,몰딩수정
4,TEST_004,훼손
...,...,...
787,TEST_787,꼬임
788,TEST_788,터짐
789,TEST_789,오염
790,TEST_790,훼손


In [38]:
submit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 792 entries, 0 to 791
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      792 non-null    object
 1   label   792 non-null    object
dtypes: object(2)
memory usage: 12.5+ KB


In [40]:
ss.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 792 entries, 0 to 791
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      792 non-null    object
 1   label   792 non-null    object
dtypes: object(2)
memory usage: 12.5+ KB
