# 320_Fine_Tuning_Custom_Datasets

In [2]:
# !pip install transformers

In [3]:
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch.nn.functional as F
import pandas as pd

In [4]:
# Load the tab-separated SMS Spam Collection. Column names are supplied
# explicitly through `names`.
messages = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=["label", "message"],
)
df = messages  # both names refer to the same DataFrame
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Check the (rows, columns) size of the loaded frame.
df.shape

(5572, 2)

In [6]:
# Raw message texts as a plain Python list (the tokenizer is fed lists later).
X = df['message'].tolist()

In [7]:
# Raw string labels ('ham' / 'spam') as a plain Python list.
y = df['label'].tolist()

In [8]:
# Peek at the first ten raw string labels.
y[:10]

['ham', 'ham', 'spam', 'ham', 'ham', 'spam', 'ham', 'ham', 'spam', 'spam']

In [9]:
# Preview the one-hot encoding of the labels: one indicator column per class.
pd.get_dummies(y).head(3)

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1


get_dummies : 
 
    drop_first : bool, default False. Whether to get k-1 dummies out of k categorical levels by removing the first level.

In [10]:
# Encode the labels as integers: ham -> 0, spam -> 1.
# * Built from df['label'] instead of from `y` itself, so the cell is
#   idempotent; re-running it no longer fails on already-encoded values.
# * .astype(int) guarantees integer labels even on pandas versions where
#   get_dummies returns boolean columns; bool labels would otherwise be
#   turned into bool tensors downstream.
y = pd.get_dummies(df['label'], drop_first=True)['spam'].astype(int).tolist()
y[:10]

[0, 0, 1, 0, 0, 1, 0, 0, 1, 1]

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# Sizes of the four partitions, in the order train/test texts, train/test labels.
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

(4457, 1115, 4457, 1115)

In [12]:
# Inspect a few raw training messages before tokenization.
X_train[:10]

["No I'm good for the movie, is it ok if I leave in an hourish?",
 'If you were/are free i can give. Otherwise nalla adi entey nattil kittum',
 'Have you emigrated or something? Ok maybe 5.30 was a bit hopeful...',
 'I just got home babe, are you still awake ?',
 'Kay... Since we are out already ',
 "Me i'm not workin. Once i get job...",
 'What he said is not the matter. My mind saying some other matter is there.',
 'Oh yeah! And my diet just flew out the window',
 'sorry, no, have got few things to do. may be in pub later.',
 'Ill call you evening ill some ideas.']

### 1. Call the pre-trained model
### 2. Call the tokenizer
이제 토큰화를 처리해 보겠습니다. 우리는 사전 훈련된 DistilBert를 사용하여 분류기를 훈련할 것이므로 DistilBert 토크나이저를 사용합시다.

In [13]:
# Load the fast tokenizer that belongs to the 'distilbert-base-uncased'
# checkpoint (its files are downloaded on first use).
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [14]:
# Tokenize both splits with identical settings: over-long messages are
# truncated and each split is padded independently.
train_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (X_train, X_test)
)

In [15]:
# List the fields produced by the tokenizer; these become the model inputs.
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

### Convert encodings to Tensors

이제 레이블과 인코딩을 Dataset 개체로 변환해 보겠습니다. PyTorch에서 이것은 `torch.utils.data.Dataset` 객체를 서브클래싱하고 `__len__` 및 `__getitem__`을 구현하여 수행됩니다. TensorFlow에서 입력 인코딩과 레이블을 `from_tensor_slices` 생성자 메서드에 전달합니다. 
이렇게 구성하면 배치의 각 키가 우리가 훈련할 모델의 `DistilBertForSequenceClassification.forward` 메서드의 인자 이름(`input_ids`, `attention_mask`, `labels`)과 일치하므로, 데이터를 그대로 배치로 묶어 모델에 전달할 수 있습니다.

In [None]:
import torch

class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    The class name is kept as-is because the rest of the notebook refers to
    it; in this notebook it wraps the SMS spam data.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``,
            ``attention_mask``) to a sequence holding one entry per example.
        labels: Sequence of labels, aligned with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``labels``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        label = torch.tensor(self.labels[idx])
        # Boolean labels (e.g. produced by a boolean one-hot column) would
        # yield a bool tensor, which is not a valid class index; promote them
        # to int64. Labels of any other dtype are passed through unchanged.
        if label.dtype == torch.bool:
            label = label.long()
        item['labels'] = label
        return item

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.labels)

# Wrap each split's encodings and labels in a Dataset the trainer can index.
train_dataset, test_dataset = (
    IMDbDataset(encodings, labels)
    for encodings, labels in ((train_encodings, y_train), (test_encodings, y_test))
)

이제 데이터 세트가 준비되었으므로 🤗 `Trainer`/`TFTrainer` 또는 기본 PyTorch/TensorFlow를 사용하여 모델을 미세 조정할 수 있습니다. [training](https://huggingface.co/transformers/training.html)을 참조하세요.

- Training warmup steps :  

    - 이는 일반적으로 설정된 수의 훈련 단계(워밍업 단계)에 대해 매우 낮은 학습률을 사용한다는 것을 의미합니다. 워밍업 단계 후에 "일반" 학습률 또는 학습률 스케줄러를 사용합니다. 또한 워밍업 단계 수에 따라 학습률을 점진적으로 높일 수 있습니다.

- weight_decay : 가중치 감쇠. L2 regularization

In [None]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,                # report the training loss every 10 steps
    # Schedule and batch sizes.
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Optimisation settings.
    warmup_steps=500,                # warmup steps for the learning-rate scheduler
    weight_decay=0.01,
)

### Training

In [None]:
# Start from the pre-trained DistilBERT checkpoint; the classification head
# is newly initialised and learned during fine-tuning.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

# The Trainer ties together the model, the hyper-parameters and both datasets.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Fine-tune; the returned TrainOutput is displayed as the cell result.
trainer.train()

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_clas

Step,Training Loss
10,0.6955
20,0.6759
30,0.6449
40,0.5979
50,0.5422
60,0.4517
70,0.361
80,0.2061
90,0.2152
100,0.1016


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1116, training_loss=0.08447323672287332, metrics={'train_runtime': 208.9983, 'train_samples_per_second': 42.651, 'train_steps_per_second': 5.34, 'total_flos': 548894189854416.0, 'train_loss': 0.08447323672287332, 'epoch': 2.0})

In [None]:
# Compute the evaluation loss and runtime statistics on the held-out split.
trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 1115
  Batch size = 16


{'epoch': 2.0,
 'eval_loss': 0.03940277174115181,
 'eval_runtime': 8.0746,
 'eval_samples_per_second': 138.087,
 'eval_steps_per_second': 8.669}

In [None]:
# Run inference on the held-out split. The result bundles the raw logits
# (`predictions`), the true labels (`label_ids`) and a metrics dict.
prediction = trainer.predict(test_dataset)
prediction

***** Running Prediction *****
  Num examples = 1115
  Batch size = 16


PredictionOutput(predictions=array([[ 4.1589484, -3.634178 ],
       [-3.631867 ,  3.9939091],
       [ 4.0680013, -3.5911086],
       ...,
       [ 4.171697 , -3.7119305],
       [-3.7207432,  4.046697 ],
       [ 4.061095 , -3.5852869]], dtype=float32), label_ids=array([0, 1, 0, ..., 0, 1, 0]), metrics={'test_loss': 0.03940277174115181, 'test_runtime': 7.526, 'test_samples_per_second': 148.154, 'test_steps_per_second': 9.301})

In [None]:
# Convert the logits array (first field of the prediction result) to a tensor.
y_logit = torch.tensor(prediction.predictions)
y_logit

tensor([[ 4.1589, -3.6342],
        [-3.6319,  3.9939],
        [ 4.0680, -3.5911],
        ...,
        [ 4.1717, -3.7119],
        [-3.7207,  4.0467],
        [ 4.0611, -3.5853]])

In [None]:
# Turn logits into predicted class ids (0 = ham, 1 = spam).
# `dim=1` is passed explicitly: calling softmax without a dimension is
# deprecated and emits a warning. Each row holds the two class scores of one
# example, so both softmax and argmax run across dimension 1.
y_pred = F.softmax(y_logit, dim=1).argmax(dim=1).numpy()
y_pred

  """Entry point for launching an IPython kernel.


array([0, 1, 0, ..., 0, 1, 0])

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Overall fraction of test messages classified correctly.
print(accuracy_score(y_test, y_pred))

# Per-class breakdown of correct and incorrect predictions.
cm=confusion_matrix(y_test, y_pred)
cm

0.9937219730941704


array([[952,   3],
       [  4, 156]])

In [None]:
# Persist the fine-tuned model (config and weights) to the 'senti_model' directory.
trainer.save_model('senti_model')

Saving model checkpoint to senti_model
Configuration saved in senti_model/config.json
Model weights saved in senti_model/pytorch_model.bin


### Fine-tuning with native PyTorch/TensorFlow

네이티브 PyTorch 또는 TensorFlow를 사용하여 모델을 직접 훈련할 수도 있습니다.

In [None]:
# `AdamW` is taken from torch.optim: the implementation that used to be
# exported by `transformers` is deprecated in favour of the PyTorch one.
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Fresh copy of the pre-trained checkpoint, independent of the Trainer run above.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)
model.train()  # training mode (enables dropout)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# weight_decay=0.0 and eps=1e-6 mirror the defaults of the deprecated
# transformers.AdamW, so the optimisation settings stay the same as before.
optim = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()  # clear gradients left over from the previous step
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Passing `labels` makes the model return the loss as its first output.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()  # switch back to inference mode (disables dropout)