<a href="https://colab.research.google.com/github/DoItSon/playdata/blob/main/%EB%94%A5%EB%9F%AC%EB%8B%9D/09_Fine_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 미세조정(Fine Tuning)
- 사전 학습된 모델의 가중치를 이용하여, 새로운 문제를 해결하기 위해 최소한의 가중치(예: 출력층)를 추가한 뒤 모델을 추가로 학습하는 방법

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random # 시드 고정을 위해
import os # 시드 고정을 위해

def reset_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    and switches cuDNN to deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects Python processes started after this point; the running
    # interpreter fixed its hash seed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)  # CPU generator
    torch.cuda.manual_seed_all(seed)  # every visible GPU, not just the current one
    torch.backends.cudnn.deterministic = True  # restrict cuDNN to deterministic algorithms
    # Autotuning may choose a different kernel on each run, which breaks
    # run-to-run reproducibility, so it is disabled as well.
    torch.backends.cudnn.benchmark = False

# 데이터 경로를 변경하시오

In [3]:
# Folder that holds the input CSV; change it to wherever the data lives.
DATA_PATH = "/content/drive/MyDrive/data/"
SEED = 42  # shared seed for reset_seeds and the KFold splitter
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [4]:
# Load the review dataset from DATA_PATH and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved index column - confirm whether it should be read with index_col=0.
df = pd.read_csv(f"{DATA_PATH}imdb.csv")
df.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [5]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# 전이학습 실습해보기

In [6]:
# Checkpoint name used to load both the tokenizer and the pretrained encoder.
model_name = "bert-base-uncased"

In [7]:
from transformers import AutoTokenizer, AutoModel # tokenizers differ between checkpoints, so load the one that matches model_name

tokenizer = AutoTokenizer.from_pretrained(model_name)

# 사전학습모델 토크나이저
- add_special_tokens
    - True: 특수 토큰 포함하겠다.
- max_length
    -  문장의 최대 길이 조절
- padding
    - "max_length" : max_length로 지정한 길이까지 패딩 (지정하지 않으면 모델이 입력받을 수 있는 최대 길이, 보통 512)
    - True : 배치에서 가장 긴 문장의 길이에 맞춰 패딩
- truncation
    - True : 문장이 최대 길이를 넘으면 자르겠다.


In [8]:
# Tokenize the first review, padding/truncating it to the maximum length.
# Plain Python lists are returned here; requesting tensors (return_tensors=...)
# would also add a batch dimension that then has to be handled.
token = tokenizer(df["review"][0], add_special_tokens=True,padding="max_length", truncation=True)
token

{'input_ids': [101, 1037, 2200, 1010, 2200, 1010, 2200, 4030, 1011, 3048, 1010, 6614, 3238, 3185, 2055, 1037, 24305, 1010, 15013, 2402, 2158, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

# 학습데이터와 정답 데이터 생성

In [9]:
# Inputs are the raw review strings; labels are reshaped into a single column (N, 1).
train = df["review"].to_numpy()
target = df["sentiment"].to_numpy().reshape(-1,1)
train.shape , target.shape

((748,), (748, 1))

# 데이터셋

In [10]:
class ReviewDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes review texts on access and pairs them with optional labels.

    Each item is a dict with an ``"x"`` entry (the tokenizer output with every
    value converted to a ``LongTensor``) and, when labels were given, a ``"y"``
    entry (the label as a ``float32`` tensor).

    Args:
        tokenizer: Callable applied to a single text. It must accept the
            keyword arguments ``add_special_tokens``, ``padding`` and
            ``truncation`` and return a mutable mapping of integer sequences.
        x: Indexable, sized collection of texts.
        y: Optional indexable collection of labels aligned with ``x``.
        max_length: Optional length to pad/truncate to. With ``None`` (the
            default) no ``max_length`` is passed and the tokenizer decides.
    """

    def __init__(self, tokenizer, x, y=None, max_length=None):
        self.tokenizer = tokenizer
        self.x = x
        self.y = y
        self.max_length = max_length

    def __len__(self):
        # len() works for NumPy arrays as well as plain lists.
        return len(self.x)

    def __getitem__(self, idx):
        item = {"x": self.__tokenizer(self.x[idx])}
        if self.y is not None:
            # torch.tensor copies the label value. torch.Tensor(...) would read
            # a bare integer label as a size and return uninitialized memory.
            item["y"] = torch.tensor(self.y[idx], dtype=torch.float32)
        return item

    def __tokenizer(self, text):
        """Tokenize one text and convert every field of the result to a LongTensor."""
        options = {"add_special_tokens": True, "padding": "max_length", "truncation": True}
        if self.max_length is not None:
            options["max_length"] = self.max_length
        inputs = self.tokenizer(text, **options)
        # Convert in place so the mapping type returned by the tokenizer is kept.
        for key in list(inputs.keys()):
            inputs[key] = torch.LongTensor(inputs[key])
        return inputs

In [11]:
# Smoke-test the dataset: pull one un-shuffled batch of size 1 and look at it.
dt = ReviewDataset(tokenizer,train,target)
dl = torch.utils.data.DataLoader(dt, batch_size=1,shuffle=False) 
batch = next(iter(dl))
# batch["x"] is a mapping of tensors; the model takes keyword arguments,
# so the mapping is unpacked with ** when it is passed in.
batch

{'x': {'input_ids': tensor([[  101,  1037,  2200,  1010,  2200,  1010,  2200,  4030,  1011,  3048,
           1010,  6614,  3238,  3185,  2055,  1037, 24305,  1010, 15013,  2402,
           2158,  1012,   102,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     

# 사전학습모델 생성

In [12]:
# Load the pretrained encoder on its own to inspect its raw outputs.
model = AutoModel.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
# Names of the encoded fields contained in the batch.
batch["x"].keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [14]:
# Pass the encoded fields as keyword arguments; which outputs come back depends on the model.
outputs = model(**batch["x"])
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [15]:
# Per-token hidden states: (batch, input length, feature dimension).
outputs["last_hidden_state"].shape

torch.Size([1, 512, 768])

In [16]:
# Pooled output: one embedding vector per input.
outputs["pooler_output"].shape

torch.Size([1, 768])

In [17]:
# The same two outputs can also be read by position.
outputs[0].shape , outputs[1].shape 

(torch.Size([1, 512, 768]), torch.Size([1, 768]))

# 모델

In [18]:
class Net(torch.nn.Module):
    """Pretrained encoder followed by a linear layer that emits one value per input."""

    def __init__(self, model_name):
        super().__init__()
        backbone = AutoModel.from_pretrained(model_name)
        # Input width follows the encoder's hidden size (768 for bert-base-uncased).
        head = torch.nn.Linear(backbone.config.hidden_size, 1)
        self.model = backbone
        self.output_layer = head

    def forward(self, x):
        # x is a mapping of encoder inputs, expanded into keyword arguments.
        encoded = self.model(**x)
        # Position 1 of the encoder output is fed to the linear head.
        pooled = encoded[1]
        return self.output_layer(pooled)

In [19]:
# Build the classifier. The "weights ... were not used" warning printed here
# lists checkpoint weights that BertModel does not load.
model = Net(model_name)
# Sanity check: run the single batch from above through the model.
model(batch["x"]) 

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([[0.1068]], grad_fn=<AddmmBackward0>)

In [20]:
def train_loop(dataloader, model, loss_fn, optimizer, device):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        dataloader: Sized iterable of batches. Each batch maps ``"x"`` to the
            model inputs (an object with ``.to(device)`` or a mapping of
            tensors) and ``"y"`` to the target tensor.
        model: Module called with the device-resident inputs.
        loss_fn: Callable ``loss_fn(pred, target)`` returning a scalar tensor.
        optimizer: Optimizer that updates the parameters of ``model``.
        device: Device the inputs and targets are moved to.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    epoch_loss = 0
    model.train()
    for batch in tqdm(dataloader):
        inputs = batch["x"]
        if hasattr(inputs, "to"):
            inputs = inputs.to(device)
        else:
            # Collation may hand back a plain dict, which has no .to(); move
            # each tensor individually instead.
            inputs = {key: value.to(device) for key, value in inputs.items()}

        pred = model(inputs)
        loss = loss_fn(pred, batch["y"].to(device))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    epoch_loss /= len(dataloader)

    return epoch_loss

In [21]:
@torch.no_grad() 
def test_loop(dataloader,model,loss_fn,device): 
    epoch_loss = 0
    model.eval() 

    pred_list = []
    sig = torch.nn.Sigmoid()

    for batch in tqdm(dataloader):
        
        pred = model(batch["x"].to(device))
        if batch.get("y") is not None: 
            loss = loss_fn(pred, batch["y"].to(device))
            epoch_loss += loss.item()
        
        pred = sig(pred)
        pred = pred.to("cpu").numpy()
        pred_list.append(pred)

    epoch_loss /= len(dataloader)

    pred = np.concatenate(pred_list) 
    return epoch_loss , pred 

In [22]:
n_splits = 5  # number of cross-validation folds
epochs = 20  # upper bound on epochs per fold
batch_size = 16 # a batch size of 32 runs out of GPU memory
loss_fn = torch.nn.BCEWithLogitsLoss()  # takes the model's raw outputs; the sigmoid is applied inside the loss

In [23]:
from sklearn.model_selection import KFold
# Shuffled K-fold splitter, seeded with SEED so the folds are the same on every run.
cv = KFold(n_splits=n_splits,shuffle=True, random_state=SEED)

In [24]:
from sklearn.metrics import accuracy_score

# 학습

- 옵티마이저의 기본 학습률은 커서 손실이 튄다.
    - 옵티마이저에 더 작은 lr을 직접 지정한다.

In [None]:
# Cross-validated fine-tuning with early stopping on validation accuracy.
# With is_holdout set, only the first fold is trained (a single hold-out split).
is_holdout = True 
reset_seeds(SEED)
best_score_list = []  # best validation accuracy of each trained fold
for i,(tri,vai) in enumerate(cv.split(train)):
    
    # Start every fold from a fresh copy of the pretrained model.
    model = Net(model_name).to(device)
    optimizer = torch.optim.Adam(model.parameters(),lr=0.0001) 

    train_dt = ReviewDataset(tokenizer,train[tri],target[tri])
    valid_dt = ReviewDataset(tokenizer,train[vai],target[vai])
    train_dl = torch.utils.data.DataLoader(train_dt, batch_size=batch_size, shuffle=True)
    # Validation order is kept fixed so predictions line up with target[vai].
    valid_dl = torch.utils.data.DataLoader(valid_dt, batch_size=batch_size,shuffle=False)

    best_score = 0
    patience = 0  # epochs since the validation accuracy last improved

    for epoch in range(epochs):
        
        train_loss = train_loop(train_dl, model, loss_fn,optimizer,device )
        valid_loss , pred = test_loop(valid_dl, model, loss_fn,device  )
        # Turn the sigmoid outputs into 0/1 labels with a 0.5 threshold.
        pred = (pred > 0.5).astype(int) 

        score = accuracy_score(target[vai],pred )
        patience += 1
        print(train_loss,valid_loss,score,sep="\t") 
        if best_score < score:
            # New best accuracy: reset the counter and checkpoint this fold's weights.
            patience = 0
            best_score = score
            torch.save(model.state_dict(),f"model_{i}.pth")

        # Stop after 3 consecutive epochs without improvement.
        if patience == 3:
            break
    
    best_score_list.append(best_score)

    if is_holdout:
        break

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/38 [00:00<?, ?it/s]