# Imports

Need download transformers

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m77.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m70.2 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt

For training I want to use gpu, so I choose device "cuda" if it is available

In [3]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


# Dataset Overview

On drive I have dataset, downloaded from kaggle: [CommonLit Readability Prize](https://www.kaggle.com/c/commonlitreadabilityprize/overview)

In [4]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


My dataset saved in /content/drive/MyDrive/Hugging_Text_Classification/commonlitreadabilityprize

In [5]:
df = pd.read_csv('/content/drive/MyDrive/Hugging_Text_Classification/commonlitreadabilityprize/train.csv')
df.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [6]:
df_test = pd.read_csv('/content/drive/MyDrive/Hugging_Text_Classification/commonlitreadabilityprize/test.csv')
df_test.head()

Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would ke...
1,f0953f0a5,,,Dotty continued to go to Mrs. Gray's every nig...
2,0df072751,,,It was a bright and cheerful scene that greete...
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,Cell division is the process by which a parent...
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,Debugging is the process of finding and resolv...


# Initializing for Model

Firstly, I create classes for dataset loader

In [7]:
#creating dataset class
class Dataset_Preprocessing(Dataset):
    def __init__(self, texts, targets, tokenizer, max_length):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        target = self.targets.iloc[idx]
        encoding = self.tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
        return {'input_ids': encoding['input_ids'].squeeze(), 'attention_mask': encoding['attention_mask'].squeeze(), 'target': torch.tensor(target, dtype=torch.float)}

In [8]:
class Dataset_Preprocessing_Test(Dataset):
    def __init__(self, texts, tokenizer, ids, max_length):
        self.tokenizer = tokenizer
        self.ids = ids
        self.texts = texts
        self.max_length = max_length
        # self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=self.max_length)

    def __getitem__(self, idx):
        encodings = self.tokenizer(self.texts, padding=True, truncation=True, max_length=self.max_length)
        out_dic = {key: torch.tensor(val[idx]) for key, val in encodings.items()}
        out_dic["ids"] = self.ids[idx]
        return out_dic

    def __len__(self):
        return len(self.ids)

I use pretrained model, and add to last layer another Linear layer, because here I have linear regression task for the target(float numbers)

In [9]:
#creating model class
class Text_Model(nn.Module):
    def __init__(self, model_name):
        super(Text_Model, self).__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.linear = nn.Linear(self.model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # the last layer from pretrained
        pooler_output = outputs['pooler_output']
        return self.linear(pooler_output)

In [10]:
# training loop
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    running_loss = 0.0
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        target = batch['target'].to(device)
        output = model(input_ids, attention_mask)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(dataloader)

In [11]:
#validation loop
def evaluate(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            target = batch['target'].to(device)
            output = model(input_ids, attention_mask)
            loss = criterion(output, target)
            running_loss += loss.item()
    return running_loss / len(dataloader)

To start training and validation I need split train data

In [12]:
train_texts, val_texts, train_targets, val_targets = train_test_split(df['excerpt'], df['target'], test_size=0.2, random_state=42)

In [13]:
#initialize the tokenizer and create datasets
model_name = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_length = 256

train_dataset = Dataset_Preprocessing(train_texts, train_targets, tokenizer, max_length)
val_dataset = Dataset_Preprocessing(val_texts, val_targets, tokenizer, max_length)
test_dataset = Dataset_Preprocessing_Test(df_test["excerpt"].to_list(), tokenizer, df_test["id"].to_list(), max_length)


Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [14]:
#create dataloaders
batch_size = 32

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [15]:
for batch in train_dataloader:
    print(batch)
    break

{'input_ids': tensor([[   0,  133, 2166,  ...,    1,    1,    1],
        [   0, 1779,   51,  ..., 2901,    5,    2],
        [   0,  133, 1940,  ...,    1,    1,    1],
        ...,
        [   0, 2387, 1150,  ...,    1,    1,    1],
        [   0, 2847,    5,  ...,    1,    1,    1],
        [   0,  495, 5655,  ...,    1,    1,    1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'target': tensor([-1.8203, -0.3154, -3.0707, -1.1627, -1.2384, -2.0819, -0.2229, -0.5258,
         0.0314, -2.3024, -0.7979, -1.0776, -1.7894, -1.6665,  0.4046, -0.8320,
        -1.0401, -1.8170,  0.8772, -1.1013, -2.0578, -0.1079,  0.2555, -0.6988,
        -1.7891, -2.1068, -2.2514, -1.3315, -1.7349, -2.5426,  0.5720, -0.3915])}


# Model Training

In [16]:
model = Text_Model(model_name).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.MSELoss()

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Training the model
num_epochs = 10

for epoch in range(num_epochs):
    train_loss = train(model, train_dataloader, optimizer, criterion, device)
    val_loss = evaluate(model, val_dataloader, criterion, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1/10, Train Loss: 1.2172, Validation Loss: 1.0552
Epoch 2/10, Train Loss: 1.0837, Validation Loss: 1.1131
Epoch 3/10, Train Loss: 1.0862, Validation Loss: 1.0562
Epoch 4/10, Train Loss: 1.0961, Validation Loss: 1.0500
Epoch 5/10, Train Loss: 1.0797, Validation Loss: 1.0568
Epoch 6/10, Train Loss: 1.0792, Validation Loss: 1.0505
Epoch 7/10, Train Loss: 1.0766, Validation Loss: 1.0495
Epoch 8/10, Train Loss: 1.0880, Validation Loss: 1.0506
Epoch 9/10, Train Loss: 1.0772, Validation Loss: 1.0478
Epoch 10/10, Train Loss: 1.0782, Validation Loss: 1.0465


# Save and Use Trained Model

In [18]:
# save the model
torch.save(model.state_dict(), '/content/drive/MyDrive/Hugging_Text_Classification/commonlit_model.pth')

In [19]:
#load the model for evaluation
model.load_state_dict(torch.load('/content/drive/MyDrive/Hugging_Text_Classification/commonlit_model.pth'))

<All keys matched successfully>

In [20]:
val_loss = evaluate(model, val_dataloader, criterion, device)
print(f"Validation Loss: {val_loss:.4f}")

Validation Loss: 1.0465


In [21]:
def get_predictions(model, dataloader, device):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            output = model(input_ids, attention_mask)
            predictions.extend(output.squeeze().tolist())
    return predictions

val_predictions = get_predictions(model, val_dataloader, device)

Prediction for test/submit file

In [22]:
test_prediction = get_predictions(model, test_dataloader, device)

In [23]:
from tqdm import tqdm
preds = []
ids = []
model.eval()
with torch.no_grad():
    for idx, sample in enumerate(tqdm(test_dataloader, position=0, leave=True)):
        input_ids = sample['input_ids'].to(device)
        attention_mask = sample['attention_mask'].to(device)
        ids.extend(sample["ids"])
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds.extend([float(i) for i in outputs.squeeze()])


100%|██████████| 1/1 [00:00<00:00,  8.06it/s]


In [24]:
test_prediction

[-0.9395949840545654,
 -0.9432989954948425,
 -0.9275826811790466,
 -0.9481966495513916,
 -0.9487736821174622,
 -0.9415751695632935,
 -0.9024478197097778]