In [1]:
# Mount Google Drive at /content/drive; later cells read CSVs from and save the
# checkpoint to the relative path 'drive/My Drive/data'.
# NOTE(review): those relative paths presumably rely on the working directory
# being /content -- confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#import os
#assert 'COLAB_TPU_ADDR' in os.environ, 'Make sure to select TPU from Edit > Notebook settings > Hardware accelerator'

In [3]:
#!pip install cloud-tpu-client==0.10 torch==2.0.0 torchvision==0.15.1 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-2.0-cp310-cp310-linux_x86_64.whl

In [4]:
#import torch_xla.core.xla_model as xm
#device = xm.xla_device()

In [5]:
import pandas as pd
from pathlib import Path

# Read the training CSV; `df` keeps the frame as loaded and `train_df` is an
# independent copy that is passed to the dataset / training code below.
df = pd.read_csv('drive/My Drive/data/train_df.csv')
train_df = df.copy()

In [6]:
# Read the validation CSV; `df1` keeps the frame as loaded and `val_df` is an
# independent copy that is passed to the dataset / training code below.
df1 = pd.read_csv('drive/My Drive/data/val_df.csv')
val_df = df1.copy()

In [7]:
# Read the test CSV; `df2` keeps the frame as loaded and `test_df` is an
# independent copy. Neither is used by the cells visible in this notebook.
df2 = pd.read_csv('drive/My Drive/data/test_df.csv')
test_df = df2.copy()

In [8]:
import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

In [9]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m61.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m73.4 MB/s[0m eta [36m0:00:00[0m
Colle

In [10]:
from transformers import AutoTokenizer
from transformers import AutoModel

In [11]:
# Name of the pretrained checkpoint handed to both `from_pretrained` calls below.
checkpoint = 'xlm-roberta-base'

In [12]:
# Load the tokenizer and the pretrained model for `checkpoint`.
# `tokenizer` is used by `tokenize`; `model` is wrapped by `xlm_network`,
# which reads its `.last_hidden_state` output.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [13]:
def tokenize(df, max_length=None):
  """Encode every (premise, hypothesis) row of `df` with the notebook-level `tokenizer`.

  Args:
    df: table exposing 'premise' and 'hypothesis' columns; rows are paired
      up in order.
    max_length: optional cap on the encoded length, forwarded to the
      tokenizer. When None the tokenizer falls back to its own default
      maximum for both padding and truncation.

  Returns:
    A list holding one tokenizer output per row, in row order. Each pair is
    encoded separately with return_tensors='pt'.
  """
  # padding='max_length' on its own only pads: a pair longer than the maximum
  # would come out longer than the rest and could not be stacked into a batch.
  # truncation=True keeps every example at the same fixed length.
  return [
      tokenizer(premise, hypothesis,
                padding='max_length',
                truncation=True,
                max_length=max_length,
                return_tensors='pt')
      for premise, hypothesis in zip(df['premise'], df['hypothesis'])
  ]

In [14]:
# Dataset class
class dataset(Dataset):
  """Pairs each tokenized (premise, hypothesis) row of a DataFrame with its label.

  Examples are addressed by position (0 .. len-1), independent of whatever
  index labels the source DataFrame carries.
  """

  def __init__(self,df):
    """Tokenize all rows of `df` up front and keep its 'label' column."""
    # One tokenizer output per row, produced by the notebook-level `tokenize`.
    self.texts = tokenize(df)
    self.labels = df['label']

  def classes(self):
    """Return the stored label column."""
    return self.labels

  def __len__(self):
    """Return the number of examples."""
    return len(self.labels)

  def __getitem__(self,idx):
    """Return `(encoded_text, label)` for the example at position `idx`."""
    # `.iloc` is positional. Plain `self.labels[idx]` would look `idx` up as an
    # index *label*, which raises or returns the wrong row as soon as the
    # DataFrame index is not exactly 0..n-1 (e.g. after filtering or shuffling),
    # while `self.texts` is always positional.
    return self.texts[idx],self.labels.iloc[idx]

# Build (and tokenize) the training and validation datasets.
# NOTE(review): `train_step` constructs its own `dataset(...)` objects from the
# DataFrames it receives, so these two instances are not used by the training
# call in this notebook and the tokenization work is repeated there.
train_dataset = dataset(train_df)
val_dataset = dataset(val_df)

In [15]:
# Network definition
class xlm_network(nn.Module):
  """Sentence-pair classifier: an encoder followed by a two-layer linear head.

  Args:
    encoder: module that is called with `input_ids` / `attention_mask`,
      returns an object with `last_hidden_state`, and exposes
      `config.hidden_size`. Defaults to the notebook-level `model`.
    num_labels: number of output classes (default 3).
  """

  def __init__(self, encoder=None, num_labels=3):
    super(xlm_network,self).__init__()
    self.xlm_roberta = model if encoder is None else encoder
    # Size the head from the encoder rather than hard-coding 768, so switching
    # `checkpoint` to a model with a different hidden width needs no edit here.
    hidden_size = self.xlm_roberta.config.hidden_size
    # Attribute names and layer order are unchanged, so previously saved
    # state_dicts ('linear.0.*', 'linear.1.*') still load.
    self.linear = nn.Sequential(nn.Linear(hidden_size,256),nn.Linear(256,num_labels))


  def forward(self,input_id,mask):
    """Return class logits for a batch.

    Args:
      input_id: token ids, forwarded to the encoder as `input_ids`.
      mask: attention mask, forwarded to the encoder as `attention_mask`.

    Returns:
      The linear head applied to the encoder's hidden state at the first
      sequence position (`last_hidden_state[:,0]`).
    """
    latent_state = self.xlm_roberta(input_ids= input_id, attention_mask=mask).last_hidden_state[:,0]
    return self.linear(latent_state)

In [16]:
# Instantiate the classifier; it wraps the notebook-level `model` loaded above.
xlm_model = xlm_network()

In [17]:
# Training loop

def train_step(train_data,val_data,optimizer,loss_fn,model,device,epochs,batch_size=2):
  """Fine-tune `model` for `epochs` epochs, printing train/val loss and accuracy after each.

  Args:
    train_data: passed to `dataset(...)` to build the training set.
    val_data: passed to `dataset(...)` to build the validation set.
    optimizer: optimizer whose `step()` is called once per training batch.
    loss_fn: callable `loss_fn(logits, labels)` returning a scalar tensor.
    model: module called as `model(input_id=..., mask=...)`; moved to `device`.
    device: device the model and every batch are moved to.
    epochs: number of passes over the training data.
    batch_size: examples per batch for both loaders (default 2, the value
      that was previously hard-coded).

  Returns:
    None. Metrics are printed: loss is averaged over batches, accuracy over
    examples.
  """
  train,val = dataset(train_data),dataset(val_data)
  # Split into batches; only the training loader is shuffled.
  train_dataloader = DataLoader(dataset=train,batch_size=batch_size,shuffle=True)
  val_dataloader = DataLoader(dataset=val,batch_size=batch_size)
  # Run on the requested device.
  model.to(device)

  def run_batch(X,y):
    """Move one batch to `device` and return `(logits, labels)`."""
    y = y.to(device)
    # Both tensors get the same squeeze(1) so they reach the model with the
    # same shape; previously only input_ids was squeezed and the mask was
    # passed with an extra dimension.
    # assumes each example carries a leading singleton dimension from
    # return_tensors='pt' -- squeeze(1) is a no-op otherwise.
    output = model(input_id=X['input_ids'].squeeze(1).to(device),
                   mask=X['attention_mask'].squeeze(1).to(device))
    return output,y

  # Epoch loop: one training pass followed by one validation pass.
  for epoch in range(epochs):
    model.train()
    train_acc,train_loss = 0,0
    for X,y in tqdm(train_dataloader):
      output,y = run_batch(X,y)

      batch_loss = loss_fn(output,y)
      train_loss += batch_loss.item()

      # Count of correct predictions in this batch.
      train_acc += (output.argmax(dim=1)==y).sum().item()

      model.zero_grad()
      batch_loss.backward()
      optimizer.step()

    train_acc = train_acc/len(train)
    train_loss = train_loss/len(train_dataloader)

    model.eval()
    # Loss and accuracy on the validation set (no gradients needed).
    val_loss,val_acc = 0,0
    with torch.no_grad():
      for X,y in tqdm(val_dataloader):
        output,y = run_batch(X,y)

        val_loss += loss_fn(output,y).item()
        val_acc +=(output.argmax(dim=1)==y).sum().item()

    val_loss = val_loss/len(val_dataloader)
    val_acc = val_acc/len(val)

    # The first field used to be labelled 'train_acc' although it prints the loss.
    print(f'EPOCH:{epoch}|train_loss:{train_loss:.4f}|train_acc:{train_acc:.4f}')
    print(f'EPOCH:{epoch}|val_loss:{val_loss:.4f}|val_acc:{val_acc:.4f}')

In [18]:
# Training configuration: fixed seed, epoch count, target device, loss and optimizer.
torch.manual_seed(42)
EPOCH = 10
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
loss_fn = nn.CrossEntropyLoss()
# Adam over every parameter of the classifier (encoder and head).
optimizer = torch.optim.Adam(params=xlm_model.parameters(), lr=1e-6)

In [19]:
# Run the full fine-tuning loop on the train/validation frames with the
# optimizer, loss, device and epoch count configured in the previous cell.
# Per-epoch metrics are printed by `train_step`.
train_step(train_data=train_df,
      val_data=val_df,
      optimizer=optimizer,
      loss_fn=loss_fn,
      model=xlm_model,
      device=device,
      epochs=EPOCH)

100%|██████████| 4542/4542 [21:13<00:00,  3.57it/s]
100%|██████████| 1518/1518 [01:38<00:00, 15.45it/s]


EPOCH:0|train_acc:1.0951|train_acc:0.3664
EPOCH:0|val_loss:1.0743|val_acc:0.4374


100%|██████████| 4542/4542 [21:17<00:00,  3.56it/s]
100%|██████████| 1518/1518 [01:38<00:00, 15.48it/s]


EPOCH:1|train_acc:1.0133|train_acc:0.5101
EPOCH:1|val_loss:0.8713|val_acc:0.6140


100%|██████████| 4542/4542 [21:17<00:00,  3.56it/s]
100%|██████████| 1518/1518 [01:37<00:00, 15.55it/s]


EPOCH:2|train_acc:0.8632|train_acc:0.6225
EPOCH:2|val_loss:0.7747|val_acc:0.6650


100%|██████████| 4542/4542 [21:16<00:00,  3.56it/s]
100%|██████████| 1518/1518 [01:37<00:00, 15.56it/s]


EPOCH:3|train_acc:0.7830|train_acc:0.6746
EPOCH:3|val_loss:0.7614|val_acc:0.6782


100%|██████████| 4542/4542 [21:15<00:00,  3.56it/s]
100%|██████████| 1518/1518 [01:38<00:00, 15.49it/s]


EPOCH:4|train_acc:0.7193|train_acc:0.7177
EPOCH:4|val_loss:0.7592|val_acc:0.6789


100%|██████████| 4542/4542 [21:15<00:00,  3.56it/s]
100%|██████████| 1518/1518 [01:37<00:00, 15.51it/s]


EPOCH:5|train_acc:0.6546|train_acc:0.7454
EPOCH:5|val_loss:0.7887|val_acc:0.6795


100%|██████████| 4542/4542 [21:15<00:00,  3.56it/s]
100%|██████████| 1518/1518 [01:37<00:00, 15.53it/s]


EPOCH:6|train_acc:0.6023|train_acc:0.7720
EPOCH:6|val_loss:0.7613|val_acc:0.7052


100%|██████████| 4542/4542 [21:15<00:00,  3.56it/s]
100%|██████████| 1518/1518 [01:37<00:00, 15.54it/s]


EPOCH:7|train_acc:0.5513|train_acc:0.7948
EPOCH:7|val_loss:0.7711|val_acc:0.7072


100%|██████████| 4542/4542 [21:15<00:00,  3.56it/s]
100%|██████████| 1518/1518 [01:37<00:00, 15.51it/s]


EPOCH:8|train_acc:0.5046|train_acc:0.8172
EPOCH:8|val_loss:0.8039|val_acc:0.7095


100%|██████████| 4542/4542 [21:15<00:00,  3.56it/s]
100%|██████████| 1518/1518 [01:37<00:00, 15.54it/s]


EPOCH:9|train_acc:0.4584|train_acc:0.8366
EPOCH:9|val_loss:0.8549|val_acc:0.7042


In [20]:
# Persist the fine-tuned weights (state_dict only) to Google Drive.
# NOTE(review): the file name says "large" and misspells "finetuning", while
# `checkpoint` is 'xlm-roberta-base' -- confirm the intended name before
# renaming, since other code may load this exact path.
torch.save(obj=xlm_model.state_dict(),
           f='drive/My Drive/data/xlmr_large_fintuing.pth')