<a href="https://colab.research.google.com/github/Dash400air/Bert_task/blob/main/MRPC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **MRPC** 2文が意味的に同じか否かを判別

# GPU Check

In [1]:
!nvidia-smi

Thu Sep 16 13:03:07 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Setup

In [2]:
!pip install transformers pytorch-lightning textstat



In [3]:
import os
import random

import pandas as pd
import numpy as np

import textstat

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl

from transformers import RobertaTokenizer, RobertaForSequenceClassification

from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.metrics import f1_score, matthews_corrcoef
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

In [4]:
train = pd.read_csv('/content/drive/MyDrive/GRI/MRPC/MSRP/msr_paraphrase_train.txt',
                    sep='\t', error_bad_lines=False)
test = pd.read_csv('/content/drive/MyDrive/GRI/MRPC/MSRP/msr_paraphrase_test.txt',
                   sep='\t', error_bad_lines=False)

b'Skipping line 102: expected 5 fields, saw 6\nSkipping line 656: expected 5 fields, saw 6\nSkipping line 867: expected 5 fields, saw 6\nSkipping line 880: expected 5 fields, saw 6\nSkipping line 980: expected 5 fields, saw 6\nSkipping line 1439: expected 5 fields, saw 6\nSkipping line 1473: expected 5 fields, saw 6\nSkipping line 1822: expected 5 fields, saw 6\nSkipping line 1952: expected 5 fields, saw 6\nSkipping line 2009: expected 5 fields, saw 6\nSkipping line 2230: expected 5 fields, saw 6\nSkipping line 2506: expected 5 fields, saw 6\nSkipping line 2523: expected 5 fields, saw 6\nSkipping line 2809: expected 5 fields, saw 6\nSkipping line 2887: expected 5 fields, saw 6\nSkipping line 2920: expected 5 fields, saw 6\nSkipping line 2944: expected 5 fields, saw 6\nSkipping line 3241: expected 5 fields, saw 6\nSkipping line 3358: expected 5 fields, saw 6\nSkipping line 3459: expected 5 fields, saw 6\nSkipping line 3491: expected 5 fields, saw 6\nSkipping line 3643: expected 5 fields

# Config

In [5]:
class Config:
    def __init__(self):
        self.model = 'roberta-large'
        self.tokenizer = RobertaTokenizer.from_pretrained(self.model)
        self.sc = RobertaForSequenceClassification
        self.text_max = 75
        self.seed = 42

Config = Config()

# Seed

In [6]:
def seed_torch(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

seed_torch(Config.seed)

# Max length

In [7]:
def get_maxlength(df, columns:list):
    count_df = pd.DataFrame()
    for column in columns:
        texts = df[column].tolist()
        count = []
        for text in texts:
            text = str(text)
            count.append(textstat.lexicon_count(text, removepunct=False))
        count_df[column] = count
    return count_df

count_df = get_maxlength(train, ['#1 String', '#2 String'])
count_df.describe()

Unnamed: 0,#1 String,#2 String
count,3938.0,3938.0
mean,19.294058,18.945912
std,8.507456,7.204997
min,6.0,1.0
25%,15.0,15.0
50%,19.0,19.0
75%,23.0,23.0
max,317.0,252.0


In [8]:
count_df_t = get_maxlength(test, ['#1 String', '#2 String'])
count_df_t.describe()

Unnamed: 0,#1 String,#2 String
count,1639.0,1639.0
mean,19.411836,18.990848
std,13.607237,8.315691
min,7.0,1.0
25%,15.0,15.0
50%,19.0,18.0
75%,23.0,23.0
max,448.0,205.0


# Preprocessing

In [9]:
def strize(texts):
    texts_str = []
    for text in texts:
        text = str(text)
        texts_str.append(text)
    return texts_str

In [10]:
s1 = train['#1 String'].tolist()
s2 = train['#2 String'].tolist()

s1_str = strize(s1)
s2_str = strize(s2)

train['#1 String'] = s1_str
train['#2 String'] = s2_str

In [11]:
s1_t = test['#1 String'].tolist()
s2_t = test['#2 String'].tolist()

s1_str_t = strize(s1_t)
s2_str_t = strize(s2_t)

test['#1 String'] = s1_str_t
test['#2 String'] = s2_str_t

In [12]:
def preprocess(df):
    df['sentence'] = df['#1 String'] + ' [SEP] [CLS] ' + df['#2 String']
    return df

train = preprocess(train)
test = preprocess(test)

# Kfold

In [13]:
def get_train_data(train):
    Fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=Config.seed)
    for n, (train_index, val_index) in enumerate(Fold.split(train, train["Quality"])):
        train.loc[val_index, "fold"] = int(n)
    train["fold"] = train["fold"].astype(np.uint8)

    return train

train = get_train_data(train)

# Dataset

In [14]:
class BaseDataset(Dataset):
    def __init__(self, df):
        self.df = df

        self.sentence = df['sentence'].tolist()
        self.labels = df['Quality']

        self.tokenizer = Config.tokenizer
        self.encoded = self.tokenizer(self.sentence,
                                 return_tensors='pt',
                                 max_length = Config.text_max,
                                 padding='max_length',
                                 truncation = True, 
                                 add_special_tokens=True,
                                 return_attention_mask=True
                                 )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        input_ids = self.encoded['input_ids'][idx]
        attention_mask = self.encoded['attention_mask'][idx]
        label = torch.tensor(self.labels[idx])
        output = {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': label}
        return output

# DataLoader

In [15]:
def get_dataloader(train, fold):
    trn_idx = train[train["fold"] != fold].index
    val_idx = train[train["fold"] == fold].index

    train_folds = train.loc[trn_idx].reset_index(drop=True)
    valid_folds = train.loc[val_idx].reset_index(drop=True)

    train_dataset = BaseDataset(train_folds)
    valid_dataset = BaseDataset(valid_folds)

    train_loader = DataLoader(
            train_dataset,
            batch_size=32,
            shuffle=True,
            drop_last=True,
        )
    valid_loader = DataLoader(
            valid_dataset,
            batch_size=32,
            shuffle=False,
        )
    return train_loader, valid_loader

def get_testloader(test):
  test_dataset = BaseDataset(test)
  test_loader = DataLoader(
          test_dataset,
          batch_size=32,
          shuffle=False,
      )
  return test_loader

# Model

In [8]:
class SequenceClassification_pl(pl.LightningModule):

  def __init__(self, model_name, num_labels, lr):
    # model_name: Transformersのモデル名
    # num_labels: ラベルの数
    # lr: 学習率

    super().__init__()

    self.save_hyperparameters()
    
    self.bert_sc = Config.sc.from_pretrained(
        model_name,
        num_labels=num_labels
    )

  def forward(self, **inputs):
    return self.bert_sc(**inputs)

  def training_step(self, batch, batch_idx):
    output = self.bert_sc(**batch)
    loss = output.loss
    self.log('train_loss', loss)
    return loss

  def validation_step(self, batch, batch_idx):
    output = self.bert_sc(**batch)
    val_loss = output.loss
    self.log('val_loss', val_loss)

  def test_step(self, batch, batch_idx):
    labels = batch.pop('labels').detach().cpu().numpy() #labelsについて，GPU上のTensorではfbeta_scoreが受け付けてくれないため，CPUに移動させ，Numpyに変換
    output = self.bert_sc(**batch)
    labels_predicted = output.logits.detach().cpu().numpy().argmax(-1)  #同上

    accuracy = accuracy_score(labels, labels_predicted)
    recall = recall_score(labels, labels_predicted)
    precision = precision_score(labels, labels_predicted)
    f1 = f1_score(labels, labels_predicted)
    mcc = matthews_corrcoef(labels, labels_predicted)
    
    values = {'accuracy': accuracy, 'recall': recall,
              'precision': precision, 'f1': f1, 'matthews_corrcoef': mcc}
    self.log_dict(values)
  
  def configure_optimizers(self):
    optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams.lr)
    scheduler = {'scheduler': torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer), 
                 'monitor': 'val_loss'}
    return [optimizer], scheduler

# Run

In [17]:
model = SequenceClassification_pl(
    Config.model, 
    num_labels=2, 
    lr=2e-5)

Downloading:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'clas

In [18]:
checkpoint = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    save_weights_only=True,
    dirpath='model/',
)

early_stopping = pl.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    mode='min'
)

trainer = pl.Trainer(
    gpus=1,
    max_epochs=10,
    callbacks = [checkpoint, early_stopping]
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [19]:
dataloader_train, dataloader_val = get_dataloader(train, 1)

In [20]:
trainer.fit(model, dataloader_train, dataloader_val)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type                             | Params
-------------------------------------------------------------
0 | bert_sc | RobertaForSequenceClassification | 355 M 
-------------------------------------------------------------
355 M     Trainable params
0         Non-trainable params
355 M     Total params
1,421.447 Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

# Test

In [21]:
dataloader_test = get_testloader(test)

In [22]:
result = trainer.test(test_dataloaders=dataloader_test)

  "`trainer.test(test_dataloaders)` is deprecated in v1.4 and will be removed in v1.6."
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'accuracy': 0.7126296758651733,
 'f1': 0.8164318799972534,
 'matthews_corrcoef': 0.2747410535812378,
 'precision': 0.7014365792274475,
 'recall': 0.9868611097335815}
--------------------------------------------------------------------------------


# Examination

In [9]:
best_model = '/content/drive/MyDrive/GRI/MRPC/epoch=1-step=195.ckpt'
predict_model = SequenceClassification_pl.load_from_checkpoint(best_model)
bert_sc = predict_model.bert_sc.cuda()

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classi

In [18]:
def examine(text):
    encoded = Config.tokenizer(
        texts,
        padding='longest',
        return_tensors='pt'
    )

    encoded = {k: v.cuda() for k, v in encoded.items()}

    with torch.no_grad():
      output = bert_sc(**encoded)
    label_predicted = output.logits.detach().cpu().numpy().argmax(-1)

    for text, label in zip(texts, label_predicted.tolist()):
        print(f'"{text}"')
        if label:
            print('Equivalent')
        else:
            print('Inequivalent')
        print('----------------------')

In [20]:
texts = ["I'd like to play soccer. [SEP] [CLS] I want to play football.",
         "I do not think so. [SEP] [CLS] I agree.",
         "Reading books is thought to be good for mental health. [SEP] [CLS] He always turns up late.",
         "In a few year, people are able to travel moon without much money. [SEP] [CLS] Space travel will be affordable soon for any people."]

examine(texts)

"I'd like to play soccer. [SEP] [CLS] I want to play football."
Equivalent
----------------------
"I do not think so. [SEP] [CLS] I agree."
Equivalent
----------------------
"Reading books is thought to be good for mental health. [SEP] [CLS] He always turns up late."
Inequivalent
----------------------
"In a few year, people are able to travel moon without much money. [SEP] [CLS] Space travel will be affordable soon for any people."
Equivalent
----------------------
