## 정형 sklearn

# library import

In [1]:
import os
import struct
import numpy as np
import matplotlib.pyplot as plt

import gzip
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from tqdm import tqdm
import time

## 이미지 추출, 이미지 출력, 라벨 list 생성 함수 (dataload)
[함수 인자 입력 순서](https://naon.me/posts/til18)

In [2]:
def extract_imgs(n, train = True):
    if train:
        imgs = gzip.open("train-images-idx3-ubyte.gz", 'r')
    else:
        imgs = gzip.open("t10k-images-idx3-ubyte.gz", 'r')
    
    img_size = 28
    num_imgs = n
    
    imgs.read(16) # 꼭 있어야 하는 코드였다. 출력을 위한게 아니었음.
    
    buf = imgs.read(img_size * img_size * num_imgs)
    data = np.frombuffer(buf, dtype = np.uint8).astype(np.float32)
    data = data.reshape(num_imgs, img_size, img_size, 1)
    
    return data

def print_imgs(data, idx):
    img = np.asarray(data[idx]).squeeze()
    plt.imshow(img)
    plt.show()

def label_li(fin, train = True):
    li = []
    
    if train:
        labels = gzip.open("train-labels-idx1-ubyte.gz", 'r')
    else:
        labels = gzip.open("t10k-labels-idx1-ubyte.gz", 'r')
    
    labels.read(8) # 꼭 있어야 하는 코드였다. 출력을 위한게 아니었음.
    
    for i in range(0, fin): # start 늘린 수 만큼 뒤쪽 출력이 안됨.
        buf = labels.read(1)
        label = np.frombuffer(buf, dtype = np.uint8).astype(np.int64)
        li.extend(list(label))
        
    return li

In [17]:
### test cell
imgs = gzip.open("train-images-idx3-ubyte.gz", 'r')
# list(imgs)
img_size = 28
num_imgs = 10000
imgs.read(16)
buf = imgs.read(img_size * img_size * num_imgs)
data = np.frombuffer(buf, dtype = np.uint8).astype(np.float32)
data.shape
data

b'\x00\x00\x08\x03\x00\x00\xea`\x00\x00\x00\x1c\x00\x00\x00\x1c'

(7840000,)

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

## 환경설정

In [3]:
import torch
import torch.nn as nn
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' # Arrange GPU devices starting from 0
os.environ['CUDA_VISIBLE_DEVICES'] = '2' # Set the GPU 2 to use
# 멀티 gpu 시스템은 하나의 GPU에서 코드가 실행되게 프로그래밍을 해도 모든 GPU에 메모리를 할당함.
# 따라서 2번 gpu만 사용하도록 코드를 짜 준 것.
## 돌아가는 것 확인 함.

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
#GPU 체크 및 할당
if torch.cuda.is_available():    
    #device = torch.device("cuda:0")
    print('Device:', device)
    print('Current cuda device:', torch.cuda.current_device()) # 현재 cuda가 사용할 GPU
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

Device: cuda
Current cuda device: 0
There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3090


NVIDIA GeForce RTX 3090 with CUDA capability sm_86 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70.
If you want to use the NVIDIA GeForce RTX 3090 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



## 하이퍼파라미터 튜닝

In [5]:
CFG = {
    'IMG_SIZE':28, #이미지 사이즈
    'EPOCHS':50, #에포크
    'LEARNING_RATE':1e-5, #학습률
    'BATCH_SIZE':32, #배치사이즈
    'SEED':41, #시드
}

### 모델의 재현성을 위한 random seed 고정

In [6]:
# Seed 고정
import random
import numpy as np

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed) # 랜덤시드 고정
    torch.cuda.manual_seed(seed) # gpu 사용할 경우 랜덤시드 고정
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
seed_everything(CFG['SEED'])

In [7]:
numdat = 10000
data = extract_imgs(numdat)
labels = label_li(numdat)

In [9]:
data.shape

(10000, 28, 28, 1)

In [9]:
## reshape to 2d array
data = data.reshape(-1, 28*28)

In [17]:
data.shape

(10000, 784)

In [22]:
type(labels)

list

In [20]:
data[:3]
labels[:3]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

[5, 0, 4]

In [22]:
import pandas as pd

In [23]:
dat = pd.DataFrame(data)
dat['labels'] = labels

In [24]:
dat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,labels
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
9996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8
9997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6
9998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9


In [26]:
dat.labels.value_counts()

1    1127
7    1070
3    1032
6    1014
0    1001
2     991
4     980
9     978
8     944
5     863
Name: labels, dtype: int64

In [33]:
dat = pd.concat([dat[dat.labels==0][:863], dat[dat.labels==1][:863], dat[dat.labels==2][:863], dat[dat.labels==3][:863], dat[dat.labels==4][:863], dat[dat.labels==5][:863], dat[dat.labels==6][:863], dat[dat.labels==7][:863], dat[dat.labels==8][:863], dat[dat.labels==9][:863]], axis = 0, ignore_index = True)

In [38]:
863*0.75

647.25

In [44]:
from sklearn.model_selection import train_test_split
train, valid = train_test_split(dat, test_size = 0.2, shuffle = True)

In [45]:
train.labels.value_counts()
valid.labels.value_counts()

1    703
4    698
7    695
3    693
0    693
5    691
6    690
8    681
2    681
9    679
Name: labels, dtype: int64

9    184
8    182
2    182
6    173
5    172
0    170
3    170
7    168
4    165
1    160
Name: labels, dtype: int64

In [49]:
import lightgbm as lgbm
LGBM = lgbm.LGBMClassifier()

In [51]:
LGBM.fit(train.iloc[:,:-1], train.iloc[:,-1])

LGBMClassifier()

In [56]:
from sklearn.metrics import accuracy_score as acc
results = []
y_pred = LGBM.predict(valid.iloc[:,:-1])
        
# y_pred_test = model_.predict(X_test) # 예측

acc_score = acc(valid.iloc[:,-1], y_pred)

results.append(['LGBM', acc_score]) #[:str(model).index("(")]

print(results[:10])


[['LGBM', 0.9582850521436849]]
