In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


import timm
import torch
import torch.nn as nn
from torch.optim import Adam
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import ImageFolder
from torch.optim.lr_scheduler import CosineAnnealingLR
import random
import torch.backends.cudnn as cudnn
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer,AutoImageProcessor

import cv2
from PIL import Image
from tqdm import tqdm
import albumentations as A
from albumentations.pytorch import ToTensorV2
from matplotlib import pyplot as plt
import seaborn as sns
# pip uninstall charset-normalizer
# pip install charset-normalizer

import warnings
warnings.filterwarnings("ignore")

import os
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 makes CUDA kernel launches synchronous, which is
# normally a debugging aid and slows GPU work - confirm it is still wanted for full runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# Expose only GPU index 0 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import wandb
from tqdm import tqdm
import fastdup
from augraphy import *

In [None]:
def random_seed(seed_num):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's global RNG and PyTorch (CPU, the
    current CUDA device and all CUDA devices) with ``seed_num``, then sets
    the cuDNN ``deterministic`` / ``benchmark`` flags.

    Args:
        seed_num: Integer seed shared by all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed_num)

    cudnn.deterministic = True
    cudnn.benchmark = False


random_seed(624)

In [None]:
class ImageDataset(Dataset):
    """Dataset that yields ``(image, target, file name)`` for each row of a CSV.

    The CSV is expected to have exactly two columns per row (file name and
    target), because each row is unpacked into ``name, target``.

    Args:
        csv: Path (or file-like object) accepted by ``pd.read_csv``.
        path: Directory that contains the image files named in the CSV.
        transform: Optional callable invoked as ``transform(image=img)`` that
            returns a mapping with an ``'image'`` entry (Albumentations style).
    """

    def __init__(self, csv, path, transform=None):
        self.df = pd.read_csv(csv).values
        self.path = path
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load row ``idx`` and return ``(img, target, name)``.

        The image is always decoded to 3-channel RGB. Converting with Pillow
        (instead of stacking the raw array three times) keeps grayscale
        images working and also handles RGBA / LA / palette images, which the
        stacking approach turned into arrays with an extra dimension.
        """
        name, target = self.df[idx]
        # The context manager closes the underlying file handle right after decoding.
        with Image.open(os.path.join(self.path, name)) as pil_img:
            img = np.array(pil_img.convert("RGB"))
        if self.transform:
            img = self.transform(image=img)['image']
        return img, target, name

In [None]:
# NOTE(review): `break` outside a loop is a SyntaxError, so this cell always fails.
# Presumably it is a deliberate barrier that stops "Run All" here - confirm before removing.
break

## EDA

## Save Augmentation

In [None]:
def augmentation(origin_img_path, origin_csv, aug_img_path, aug_csv_path,
                 transform, pipeline, N_ALBUM, N_AUGRAPHY, extra_augraphy=None):
    """Write augmented copies of every image in ``origin_csv`` and a matching label CSV.

    For each row, ``N_AUGRAPHY`` (plus a per-class bonus) Augraphy variants and
    ``N_ALBUM`` Albumentations variants are saved as ``<stem>_<k>.jpg`` with
    ``k`` counting from 1, and the original image is re-saved under its own name.

    Args:
        origin_img_path: Directory containing the source images.
        origin_csv: DataFrame with ``ID`` (file name) and ``target`` (label) columns.
        aug_img_path: Output directory for the images (created if missing).
        aug_csv_path: Path of the CSV that lists every written image.
        transform: Callable invoked as ``transform(image=img)['image']``.
        pipeline: Callable invoked as ``pipeline(img)`` returning an image.
        N_ALBUM: Number of ``transform`` variants per image.
        N_AUGRAPHY: Base number of ``pipeline`` variants per image.
        extra_augraphy: Optional ``{target: extra_count}`` mapping of additional
            ``pipeline`` variants for specific classes. Defaults to
            ``{3: 10, 4: 10, 7: 10, 14: 20}``.

    Returns:
        DataFrame with columns ``ID`` and ``target``; for each source image the
        augmented file names come first, followed by the original file name.

    Raises:
        FileNotFoundError: If a source image cannot be read.
        OSError: If an output image cannot be written.
    """
    if extra_augraphy is None:
        extra_augraphy = {3: 10, 4: 10, 7: 10, 14: 20}

    os.makedirs(aug_img_path, exist_ok=True)

    def _save(file_name, img):
        # cv2.imwrite signals failure through its return value only.
        out_path = os.path.join(aug_img_path, file_name)
        if not cv2.imwrite(out_path, img):
            raise OSError(f"Could not write image: {out_path}")

    # Collect plain records and build the DataFrame once at the end
    # (growing a DataFrame with concat inside the loop is quadratic).
    records = []

    for image_id, target in tqdm(zip(origin_csv['ID'], origin_csv['target']),
                                 total=len(origin_csv)):
        src_path = os.path.join(origin_img_path, image_id)
        image = cv2.imread(src_path)
        if image is None:
            # cv2.imread returns None instead of raising on a missing/corrupt file.
            raise FileNotFoundError(f"Could not read image: {src_path}")

        # splitext keeps the stem intact for extensions that are not 3 characters long.
        stem = os.path.splitext(image_id)[0]
        n_augraphy = N_AUGRAPHY + extra_augraphy.get(target, 0)
        save_count = 0

        # Augraphy
        # https://github.com/sparkfish/augraphy
        for _ in range(n_augraphy):
            save_count += 1
            _save(f'{stem}_{save_count}.jpg', pipeline(image))

        # Albumentations
        # https://github.com/albumentations-team/albumentations
        for _ in range(N_ALBUM):
            save_count += 1
            _save(f'{stem}_{save_count}.jpg', transform(image=image)['image'])

        # Re-save the original image next to its augmented copies.
        _save(image_id, image)

        records.extend(
            {'ID': f'{stem}_{k}.jpg', 'target': target}
            for k in range(1, save_count + 1)
        )
        records.append({'ID': image_id, 'target': target})

    aug_data = pd.DataFrame(records, columns=['ID', 'target'])
    aug_data.to_csv(aug_csv_path, index=False)

    return aug_data

# def makedf(origin_csv,aug_csv_path,N_ALBUM,N_AUGRAPHY):
#     length=N_ALBUM+N_AUGRAPHY+1
    
#     aug_data=pd.DataFrame(columns=['ID','target'])

#     for i in tqdm(range(len(origin_csv))):
#         image_id=origin_csv.iloc[i]['ID'] # 원본 이미지 파일이름
#         target=origin_csv.iloc[i]['target'] # 원본 이미지의 라벨값
        
#         tmp = pd.DataFrame({'ID': [f'{image_id[:-4]}_{i}.jpg' for i in range(1,length)]})
#         tmp['target'] = target
#         aug_data = pd.concat([aug_data, tmp], ignore_index=True) 
#         aug_data = pd.concat([aug_data, pd.DataFrame({'ID': f'{image_id}', 'target': [target]})], ignore_index=True) 
        
#     print("Origin data length: ",len(aug_data)//length)
#     print("Augment data length: ", len(aug_data)-len(aug_data)//length)
    
#     aug_data.to_csv(aug_csv_path,index=False)
     
#     return aug_data

In [None]:
# Ink phase: a single OneOf (p=1.0) choosing between InkShifter and BleedThrough.
ink_phase = AugmentationSequence([
    OneOf(
        [
            InkShifter(
                text_shift_scale_range=(18, 27),
                text_shift_factor_range=(1, 4),
                text_fade_range=(0, 2),
                blur_kernel_size=(5, 5),
                blur_sigma=0,
                noise_type="random",
            ),
            BleedThrough(
                intensity_range=(0.1, 0.3),
                color_range=(32, 224),
                ksize=(17, 17),
                sigmaX=1,
                # NOTE: random.uniform is evaluated once, when this cell runs, so
                # every image processed by this pipeline shares the same alpha.
                alpha=random.uniform(0.1, 0.2),
                offsets=(10, 20),
            ),
        ],
        p=1.0,
    ),
])

# Paper phase: a OneOf (p=0.5) between the two orderings of
# NoiseTexturize and BrightnessTexturize.
paper_phase = AugmentationSequence([
    OneOf(
        [
            AugmentationSequence([
                NoiseTexturize(sigma_range=(3, 10), turbulence_range=(2, 5), texture_width_range=(300, 500), texture_height_range=(300, 500), p=1),
                BrightnessTexturize(texturize_range=(0.9, 0.99), deviation=0.03, p=1),
            ]),
            AugmentationSequence([
                BrightnessTexturize(texturize_range=(0.9, 0.99), deviation=0.03, p=1),
                NoiseTexturize(sigma_range=(3, 10), turbulence_range=(2, 5), texture_width_range=(300, 500), texture_height_range=(300, 500), p=1),
            ]),
        ],
        p=0.5,
    ),
])

# Post phase: five OneOf groups (each p=0.2) followed by Folding (p=1).
# The "side"/"direction" options below take the *string* "random"; the bare name
# `random` would pass the imported `random` module instead, which matches none of
# the option names these augmentations compare against.
post_phase = AugmentationSequence([
    OneOf(
        [
            GlitchEffect(glitch_direction="random", glitch_number_range=(8, 16), glitch_size_range=(5, 50), glitch_offset_range=(10, 50), p=1),
            ColorShift(color_shift_offset_x_range=(3, 5), color_shift_offset_y_range=(3, 5), color_shift_iterations=(2, 3), color_shift_brightness_range=(0.9, 1.1), color_shift_gaussian_kernel_range=(3, 3), p=1),
        ],
        p=0.2,
    ),
    OneOf(
        [
            DirtyDrum(line_width_range=(1, 6), line_concentration=0.10080769473847595, direction=0, noise_intensity=0.7786602736626571, noise_value=(64, 224), ksize=(7, 7), sigmaX=0, p=0.2),
            DirtyRollers(line_width_range=(2, 32), scanline_type=0, numba_jit=1, p=1),
        ],
        p=0.2,
    ),
    OneOf(
        [
            LightingGradient(light_position=None, direction=None, max_brightness=255, min_brightness=0, mode='gaussian', linear_decay_rate=None, transparency=None, numba_jit=1, p=1),
            Brightness(brightness_range=(0.9, 1.1), min_brightness=0, min_brightness_value=(120, 150), numba_jit=1, p=1),
            Gamma(gamma_range=(0.9, 1.1), p=1),
        ],
        p=0.2,
    ),
    OneOf(
        [
            SubtleNoise(subtle_range=6, p=1),
            Jpeg(quality_range=(25, 95), p=1),
        ],
        p=0.2,
    ),
    OneOf(
        [
            BadPhotoCopy(noise_mask=None, noise_type=-1, noise_side="random", noise_iteration=(1, 2), noise_size=(1, 3), noise_value=[128, 196], noise_sparsity=[0.3, 0.6], noise_concentration=[0.1, 0.6], blur_noise=True, blur_noise_kernel=(7, 7), wave_pattern=False, edge_effect=True, numba_jit=1, p=1),
            ShadowCast(shadow_side="random", shadow_vertices_range=(1, 20), shadow_width_range=(0.3, 0.8), shadow_height_range=(0.3, 0.8), shadow_color=(0, 0, 0), shadow_opacity_range=(0.2, 0.9), shadow_iterations_range=(1, 2), shadow_blur_kernel_range=(101, 301), p=1),
        ],
        p=0.2,
    ),
    Folding(fold_x=None, fold_deviation=(0, 0), fold_count=2, fold_noise=0.01, fold_angle_range=(-360, 360), gradient_width=(0.1, 0.2), gradient_height=(0.01, 0.02), backdrop_color=(0, 0, 0), p=1),
])

In [None]:
# Combine the ink, paper and post phases defined above into one Augraphy pipeline.
pipeline = AugraphyPipeline(
    ink_phase=ink_phase,
    paper_phase=paper_phase,
    post_phase=post_phase,
)

In [None]:
# Albumentations pipeline used for the offline augmentation pass.
transform = A.Compose(
    [
        # Flips / right-angle rotations
        A.RandomRotate90(p=0.3),
        A.HorizontalFlip(p=0.3),
        A.VerticalFlip(p=0.3),
        # Noise and blur
        A.GaussNoise(p=0.3),
        A.OneOf(
            [
                A.MotionBlur(p=0.2),
                A.MedianBlur(blur_limit=3, p=0.1),
                A.Blur(blur_limit=3, p=0.1),
            ],
            p=0.3,
        ),
        # Affine jitter; exposed borders are filled with white
        A.ShiftScaleRotate(
            shift_limit=0.0625,
            scale_limit=0.2,
            rotate_limit=45,
            p=0.2,
            border_mode=cv2.BORDER_CONSTANT,
            value=[255, 255, 255],
        ),
        # Non-rigid distortions
        A.OneOf(
            [
                A.OpticalDistortion(p=0.3),
                A.GridDistortion(p=0.1),
                A.PiecewiseAffine(p=0.3),
            ],
            p=0.2,
        ),
        # Local contrast / sharpening
        A.OneOf(
            [
                A.CLAHE(clip_limit=2),
                A.Sharpen(),
                A.Emboss(),
            ],
            p=0.3,
        ),
        # Color
        A.RandomBrightnessContrast(p=0.3),
        A.HueSaturationValue(p=0.3),
    ]
)
# transform = A.Compose([
#                 A.Rotate(limit=5, border_mode=cv2.BORDER_CONSTANT),
#                 A.HorizontalFlip(p=0.3),
#                 A.VerticalFlip(p=0.3),  
#                 A.RandomRotate90(p=0.3),  
#                 A.Blur(blur_limit=4, p=0.3),  
#                 A.OpticalDistortion(p=0.3),  
#                 A.GridDistortion(p=0.3),  
#                 A.HueSaturationValue(hue_shift_limit=5, sat_shift_limit=20, val_shift_limit=10, p=0.3), 
#                 A.RandomBrightnessContrast(p=0.2),
#                 A.ShiftScaleRotate(shift_limit=(0.3), scale_limit=(0.3), border_mode=cv2.BORDER_CONSTANT)
#             ])

In [None]:
# data/train_right.csv is the cleaned label file written by the to_csv call in the
# label-cleaning cells further down in this notebook, so it must already exist.
train_origin = pd.read_csv("data/train_right.csv")

# Stratified 80/20 split, then the 20% hold-out is halved into validation and test.
train_data, valid_data = train_test_split(
    train_origin,
    test_size=0.2,
    stratify=train_origin['target'],
    random_state=624,
)
valid_data, test_data = train_test_split(
    valid_data,
    test_size=0.5,
    stratify=valid_data['target'],
    random_state=624,
)

print(len(train_data), len(valid_data), len(test_data))

# Give every split a fresh 0..n-1 index.
train_data, valid_data, test_data = (
    split.reset_index(drop=True) for split in (train_data, valid_data, test_data)
)

In [None]:
dataframe = [train_data, valid_data, test_data]
prefix = ['train', 'valid', 'test']
num = [[30, 10], [15, 5], [15, 5]]   # [N_ALBUM, N_AUGRAPHY] for each split

origin_img_path = 'data/train/'  # folder holding the original images

for split_name, origin_csv, (n_album, n_augraphy) in zip(prefix, dataframe, num):
    print(f"{split_name} Augmentation start")
    print("#" * 30)

    aug_img_path = f'data/aug_{split_name}/'
    aug_csv_path = f"data/aug_{split_name}.csv"

    aug_data = augmentation(
        origin_img_path,
        origin_csv,
        aug_img_path,
        aug_csv_path,
        transform,
        pipeline,
        n_album,
        n_augraphy,
    )


In [None]:
# NOTE(review): `break` outside a loop is a SyntaxError, so this cell always fails.
# Presumably it is a deliberate barrier that stops "Run All" here - confirm before removing.
break

## Upload wandb later

bae951753/Document Images Classification/nod23v2g

In [None]:
import wandb
from tqdm import tqdm

# W&B run ids and the leaderboard score to attach to each one (index-aligned lists).
id_list = ['nod23v2g']
lbscore_list = [0.9084]

# Optional extras, currently disabled:
# config_list = ["CosineAnnealingLR"]
# tag_list = ['LR_scheduler=CosineAnnealingLR']

print(len(id_list))

In [None]:
# Re-open each W&B run by id and record its leaderboard score in the run config.
for i, run_id in enumerate(tqdm(id_list)):
    run = wandb.init(project="Document Images Classification", id=run_id, resume='allow')
    try:
        run.config.update({'LB score': lbscore_list[i]})
        #run.config.update({'lr_scheduler':config_list[0]})
        #run.tags+=(tag_list[0],)
    finally:
        # Always close the run, even if the update fails, so the next
        # wandb.init call does not start with a run still open.
        run.finish()

In [None]:
# NOTE(review): `break` outside a loop is a SyntaxError, so this cell always fails.
# Presumably it is a deliberate barrier that stops "Run All" here - confirm before removing.
break

## fastdup

In [None]:
## https://visual-layer.readme.io/docs/analyzing-labeled-images
## https://github.com/visual-layer/fastdup?tab=readme-ov-file

In [None]:
# Mapping between numeric targets and class names.
meta_df = pd.read_csv('data/meta.csv')
label2id = dict(zip(meta_df['class_name'], meta_df['target']))
id2label = dict(zip(meta_df['target'], meta_df['class_name']))

# Annotation table for fastdup: image path in `filename`, class name in `label`.
train = (
    pd.read_csv("data/train.csv")
    .assign(ID=lambda frame: "data/train/" + frame['ID'].astype(str))
    .set_axis(['filename', 'label'], axis=1)
)
train['label'] = train['label'].map(id2label)

train.head()

In [None]:
# Index the training images with fastdup, using the labelled annotation table.
# NOTE(review): `d=17` is passed together with model_path='clip' - confirm this value is intended.
fd = fastdup.create(input_dir="data/train")
fd.run(
    annotations=train,
    model_path='clip',
    d=17,
)

In [None]:
# Build the fastdup visual reports for the run above.
# The last call is the cell's final expression, so its return value is what the cell displays.
fd.vis.duplicates_gallery()    # create a visual gallery of duplicates
fd.vis.outliers_gallery()      # create a visual gallery of anomalies
fd.vis.component_gallery()     # create a visualization of connected components
fd.vis.stats_gallery()         # create a visualization of images statistics (e.g. blur)
fd.vis.similarity_gallery()    # create a gallery of similar images

In [None]:
similar = fd.similarity()

# Near-duplicate pairs (distance >= 0.95) whose two images carry different labels.
cross_label = (
    similar
    .query("label_from != label_to and distance >= 0.95")
    .reset_index(drop=True)
)

# Each pair can appear twice, as (a, b) and (b, a). Sort the two endpoints within
# every row so both orientations share one key, then keep the first occurrence.
# The key lives in a separate frame of scalar columns: a column of Python lists
# cannot be passed to drop_duplicates because lists are unhashable.
pair_keys = pd.DataFrame(np.sort(cross_label[['from', 'to']].to_numpy(), axis=1))
report = cross_label.loc[~pair_keys.duplicated(keep='first').to_numpy()].reset_index(drop=True)
report

In [None]:
import matplotlib.pyplot as plt
from PIL import Image

# Show every (from, to) pair of `report` side by side, two pairs per grid row
# (4 columns). Handles an odd number of pairs, a single grid row and an empty report.
n_pairs = report.shape[0]

if n_pairs == 0:
    print("No conflicting pairs to display")
else:
    n_rows = (n_pairs + 1) // 2  # ceil(n_pairs / 2)
    # squeeze=False keeps `axs` 2-D even when there is only one grid row.
    fig, axs = plt.subplots(nrows=n_rows, ncols=4, figsize=(20, 4 * n_rows), squeeze=False)

    # Turn every axis off up front so an unused trailing slot stays blank.
    for ax in axs.ravel():
        ax.axis('off')

    for pair_idx in range(n_pairs):
        row = report.iloc[pair_idx]
        grid_row, col_offset = pair_idx // 2, (pair_idx % 2) * 2
        # The left-hand pair keeps the space before the newline used in the original titles.
        sep = " \n" if col_offset == 0 else "\n"

        for side, col in (('from', col_offset), ('to', col_offset + 1)):
            filename = row[f'filename_{side}']
            label = row[f'label_{side}']
            # Image id = file name without directory and extension.
            image_id = os.path.splitext(os.path.basename(filename))[0]

            ax = axs[grid_row, col]
            # Close the file once matplotlib has been handed the image.
            with Image.open(filename) as img:
                ax.imshow(img)
            ax.set_title(f"Label: {label}{sep} id: {image_id}")

    plt.tight_layout()
    plt.show()


In [None]:
# Reload the raw label file and the class metadata for the cleaning steps below.
train = pd.read_csv("data/train.csv")
meta = pd.read_csv('data/meta.csv')

In [None]:
# Duplicates whose label is wrong: remove them.
mislabeled_duplicates = [
    'aec62dced7af97cd.jpg',
    'c5182ab809478f12.jpg',
    '1ec14a14bbe633db.jpg',
]
train_right = train.loc[~train['ID'].isin(mislabeled_duplicates)]

In [None]:
# Not labeling errors, but too hard to tell apart: remove them as well.
ambiguous_images = [
    '4a38e395726fbc06.jpg',
    'af650bfc45cb3c46.jpg',
    'dda2df9797b370e7.jpg',
    'b709b64897d9233f.jpg',
]
train_right = train_right.loc[~train_right['ID'].isin(ambiguous_images)]

In [None]:
# Relabel this image as class 3.
train_right.loc[train_right['ID'].eq('8646f2c3280a4f49.jpg'), 'target'] = 3

In [None]:
# Relabel this image as class 7.
train_right.loc[train_right['ID'].eq('45f0d2dfc7e47c03.jpg'), 'target'] = 7

In [None]:
# Persist the cleaned labels; data/train_right.csv is what the train/valid/test split cell reads.
train_right = train_right.reset_index(drop=True)
train_right.to_csv("data/train_right.csv", index=False)

In [4]:
# Load the three augmented label files and show their row counts (train, valid, test).
data, data2, data3 = (
    pd.read_csv(csv_path)
    for csv_path in ("data/aug_train.csv", "data/aug_valid.csv", "data/aug_test.csv")
)

tuple(len(frame) for frame in (data, data2, data3))

(54380, 3666, 3697)