<a href="https://colab.research.google.com/github/Dash400air/Bert_task/blob/main/SST_2ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GPU check

In [8]:
!nvidia-smi

Mon Sep 20 08:13:41 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Setup

In [9]:
!pip install transformers pytorch-lightning textstat



In [10]:
import os
import random

import pandas as pd
import numpy as np

import textstat

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl

from transformers import RobertaTokenizer, RobertaForSequenceClassification

from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.metrics import f1_score, matthews_corrcoef
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

In [11]:
train = pd.read_csv('/content/drive/MyDrive/GRI/SST-2/SST-2/train.tsv', sep='\t')
test = pd.read_csv('/content/drive/MyDrive/GRI/SST-2/SST-2/dev.tsv', sep='\t')
exam= pd.read_csv('/content/drive/MyDrive/GRI/SST-2/SST-2/test.tsv', sep='\t')

# Max length

In [12]:
train['label'].value_counts()

1    37569
0    29780
Name: label, dtype: int64

In [13]:
def get_maxlength(df, columns:list):
    count_df = pd.DataFrame()
    for column in columns:
        texts = df[column].tolist()
        count = []
        for text in texts:
            text = str(text)
            count.append(textstat.lexicon_count(text, removepunct=False))
        count_df[column] = count
    return count_df

count_df = get_maxlength(train, ['sentence'])
count_df.describe()

Unnamed: 0,sentence
count,67349.0
mean,9.409553
std,8.073806
min,1.0
25%,3.0
50%,7.0
75%,13.0
max,52.0


In [14]:
count_df = get_maxlength(test, ['sentence'])
count_df.describe()

Unnamed: 0,sentence
count,872.0
mean,19.548165
std,8.7639
min,2.0
25%,13.0
50%,19.0
75%,26.0
max,47.0


In [15]:
count_df = get_maxlength(exam, ['sentence'])
count_df.describe()

Unnamed: 0,sentence
count,1821.0
mean,19.233937
std,8.922386
min,2.0
25%,12.0
50%,18.0
75%,25.0
max,56.0


# Config

In [16]:
class Config:
    def __init__(self):
        self.model = 'roberta-base'
        self.tokenizer = RobertaTokenizer.from_pretrained(self.model)
        self.sc = RobertaForSequenceClassification
        self.text_max = 35
        self.seed = 42

Config = Config()

# Seed

In [17]:
def seed_torch(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

seed_torch(Config.seed)

# Kfold

In [18]:
def get_train_data(train):
    Fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=Config.seed)
    for n, (train_index, val_index) in enumerate(Fold.split(train, train["label"])):
        train.loc[val_index, "fold"] = int(n)
    train["fold"] = train["fold"].astype(np.uint8)

    return train

train = get_train_data(train)

In [19]:
train.head()

Unnamed: 0,sentence,label,fold
0,hide new secretions from the parental units,0,2
1,"contains no wit , only labored gags",0,4
2,that loves its characters and communicates som...,1,2
3,remains utterly satisfied to remain the same t...,0,1
4,on the worst revenge-of-the-nerds clichés the ...,0,4


# Dataset

In [20]:
class BaseDataset(Dataset):
    def __init__(self, df):
        self.df = df

        self.sentence = df['sentence'].tolist()
        self.labels = df['label']

        self.tokenizer = Config.tokenizer
        self.encoded = self.tokenizer(self.sentence,
                                 return_tensors='pt',
                                 max_length = Config.text_max,
                                 padding='max_length',
                                 truncation = True, 
                                 add_special_tokens=True,
                                 return_attention_mask=True
                                 )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        input_ids = self.encoded['input_ids'][idx]
        attention_mask = self.encoded['attention_mask'][idx]
        label = torch.tensor(self.labels[idx])
        output = {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': label}
        return output

# Dataloader

In [28]:
def get_dataloader(train, fold):
    trn_idx = train[train["fold"] != fold].index
    val_idx = train[train["fold"] == fold].index

    train_folds = train.loc[trn_idx].reset_index(drop=True)
    valid_folds = train.loc[val_idx].reset_index(drop=True)

    train_dataset = BaseDataset(train_folds)
    valid_dataset = BaseDataset(valid_folds)

    train_loader = DataLoader(
            train_dataset,
            batch_size=32,
            shuffle=True,
            drop_last=True,
        )
    valid_loader = DataLoader(
            valid_dataset,
            batch_size=32,
            shuffle=False,
        )
    return train_loader, valid_loader

def get_testloader(test):
  test_dataset = BaseDataset(test)
  test_loader = DataLoader(
          test_dataset,
          batch_size=32,
          shuffle=False,
      )
  return test_loader

# Model

In [22]:
class SequenceClassification_pl(pl.LightningModule):

  def __init__(self, model_name, num_labels, lr):
    # model_name: Transformersのモデル名
    # num_labels: ラベルの数
    # lr: 学習率

    super().__init__()

    self.save_hyperparameters()
    
    self.bert_sc = Config.sc.from_pretrained(
        model_name,
        num_labels=num_labels
    )

  def forward(self, **inputs):
    return self.bert_sc(**inputs)

  def training_step(self, batch, batch_idx):
    output = self.bert_sc(**batch)
    loss = output.loss
    self.log('train_loss', loss)
    return loss

  def validation_step(self, batch, batch_idx):
    output = self.bert_sc(**batch)
    val_loss = output.loss
    self.log('val_loss', val_loss)

  def test_step(self, batch, batch_idx):
    labels = batch.pop('labels').detach().cpu().numpy() #labelsについて，GPU上のTensorではfbeta_scoreが受け付けてくれないため，CPUに移動させ，Numpyに変換
    output = self.bert_sc(**batch)
    labels_predicted = output.logits.detach().cpu().numpy().argmax(-1)  #同上

    accuracy = accuracy_score(labels, labels_predicted)
    recall = recall_score(labels, labels_predicted)
    precision = precision_score(labels, labels_predicted)
    f1 = f1_score(labels, labels_predicted)
    mcc = matthews_corrcoef(labels, labels_predicted)
    
    values = {'accuracy': accuracy, 'recall': recall,
              'precision': precision, 'f1': f1, 'matthews_corrcoef': mcc}
    self.log_dict(values)
  
  def configure_optimizers(self):
    optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams.lr)
    scheduler = {'scheduler': torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer), 
                 'monitor': 'val_loss'}
    return [optimizer], scheduler

# Run

In [23]:
model = SequenceClassification_pl(
    Config.model, 
    num_labels=2, 
    lr=2e-5)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [24]:
checkpoint = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    save_weights_only=True,
    dirpath='model/',
)

early_stopping = pl.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    mode='min'
)

trainer = pl.Trainer(
    gpus=-1,
    max_epochs=10,
    callbacks = [checkpoint, early_stopping]
)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [25]:
dataloader_train, dataloader_val = get_dataloader(train, 1)

In [26]:
trainer.fit(model, dataloader_train, dataloader_val)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type                             | Params
-------------------------------------------------------------
0 | bert_sc | RobertaForSequenceClassification | 124 M 
-------------------------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
498.589   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

# Test

In [29]:
dataloader_test = get_testloader(test)

In [30]:
result = trainer.test(test_dataloaders=dataloader_test)

  "`trainer.test(test_dataloaders)` is deprecated in v1.4 and will be removed in v1.6."
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'accuracy': 0.9311926364898682,
 'f1': 0.9291689395904541,
 'matthews_corrcoef': 0.8620530366897583,
 'precision': 0.9301403164863586,
 'recall': 0.9335532784461975}
--------------------------------------------------------------------------------


# Examination

In [31]:
best_model = '/content/drive/MyDrive/GRI/SST-2/epoch=2-step=5048.ckpt'
predict_model = SequenceClassification_pl.load_from_checkpoint(best_model)
bert_sc = predict_model.bert_sc.cuda()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [34]:
def examine(texts):
    encoded = Config.tokenizer(
        texts,
        padding='longest',
        return_tensors='pt'
    )

    encoded = {k: v.cuda() for k, v in encoded.items()}

    with torch.no_grad():
      output = bert_sc(**encoded)
    label_predicted = output.logits.detach().cpu().numpy().argmax(-1)

    for text, label in zip(texts, label_predicted.tolist()):
        print(f'"{text}"')
        if label:
            print('Positive')
        else:
            print('Negative')
        print('----------------------')

データ数：1821\
エンコード＋推測時間：7秒

---


1データあたり3.8(ms)

In [36]:
exam.shape

(1821, 2)

In [35]:
examine(exam['sentence'].tolist())

[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
Negative
----------------------
"this insightful , oscar-nominated documentary , in which children on both sides of the ever-escalating conflict have their say away from watchful parental eyes , gives peace yet another chance ."
Positive
----------------------
"all but the most persnickety preteens should enjoy this nonthreatening but thrilling adventure ."
Positive
----------------------
"i admired this work a lot ."
Positive
----------------------
"this concoction , so bizarre to the adult mind , is actually a charming triumph where its intended under-12 audience is concerned ."
Positive
----------------------
"( a ) rare , beautiful film ."
Positive
----------------------
"the pianist is a fine valedictory work for polanski , made richer by his own experiences , making his other movies somehow richer in the bargain ."
Positive
----------------------
"if you 're like me , a sucker for a good old fashion romance and someone who shamelessly