# CommonLit Readability Score Prediction

This notebook presents a machine learning approach to predict the readability of literary excerpts using the CommonLit Readability Prize dataset with RoBERTa transformer model from Hugging Face's Transformers library in a PyTorch framework. The goal is to determine how complex or easy to read each excerpt is based on its text content.


## Import Necessary Libraries

In [15]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import RobertaModel, RobertaTokenizer
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

## Load and Prepare Data

In [16]:
train_df = pd.read_csv('commonlitreadabilityprize/train.csv')

train_texts, val_texts, train_targets, val_targets = train_test_split(
    train_df['excerpt'], train_df['target'], test_size=0.1, random_state=42
)

train_texts.reset_index(drop=True, inplace=True)
val_texts.reset_index(drop=True, inplace=True)
train_targets.reset_index(drop=True, inplace=True)
val_targets.reset_index(drop=True, inplace=True)

## Dataset Preparation

Define a PyTorch `Dataset` class, `CommonLitDataset`, which helps in processing the text data for model training by tokenizing and encoding the excerpts.


In [17]:
class CommonLitDataset(Dataset):
    def __init__(self, excerpts, targets, tokenizer, max_len=256):
        self.excerpts = excerpts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.excerpts)

    def __getitem__(self, idx):
        excerpt = str(self.excerpts[idx])
        inputs = self.tokenizer.encode_plus(
            excerpt, add_special_tokens=True, max_length=self.max_len,
            padding='max_length', truncation=True, return_tensors='pt'
        )
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        if self.targets is not None:
            target = torch.tensor(self.targets[idx], dtype=torch.float32)
            return input_ids, attention_mask, target
        return input_ids, attention_mask

In [18]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

train_dataset = CommonLitDataset(train_texts, train_targets, tokenizer)
val_dataset = CommonLitDataset(val_texts, val_targets, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [19]:
class CommonLitModel(nn.Module):
    def __init__(self):
        super(CommonLitModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.roberta.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        output = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        output = self.drop(output.pooler_output)
        return self.out(output)

In [20]:
device = torch.device("cpu")
model = CommonLitModel()
model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CommonLitModel(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Laye

## Training and Evaluation Functions

Define the `train` and `evaluate` functions which will be used to train the model on the training data and evaluate it on the validation data, respectively.

In [14]:
def train(model, data_loader, optimizer, device):
    model.train()
    total_loss = 0
    for input_ids, attention_mask, targets in data_loader:
        input_ids, attention_mask, targets = input_ids.to(device), attention_mask.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask).squeeze()
        loss = nn.MSELoss()(outputs, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(data_loader)

def evaluate(model, data_loader, device):
    model.eval()
    total_loss = 0
    predictions = []
    actuals = []
    with torch.no_grad():
        for input_ids, attention_mask, targets in data_loader:
            input_ids, attention_mask, targets = input_ids.to(device), attention_mask.to(device), targets.to(device)
            outputs = model(input_ids, attention_mask).squeeze()
            loss = nn.MSELoss()(outputs, targets)
            total_loss += loss.item()
            predictions.extend(outputs.cpu().numpy())
            actuals.extend(targets.cpu().numpy())
    rmse = sqrt(mean_squared_error(actuals, predictions))
    return total_loss / len(data_loader), rmse

In [7]:
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, device)
    val_loss, val_rmse = evaluate(model, val_loader, device)
    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Validation RMSE: {val_rmse:.4f}")

Epoch 1, Train Loss: 0.6027, Validation RMSE: 0.5906
Epoch 2, Train Loss: 0.3080, Validation RMSE: 0.5800
Epoch 3, Train Loss: 0.2097, Validation RMSE: 0.6336


## Generate Predictions for Test Data

Load the test data, use the trained model to predict readability scores, and generate a submission file as per the competition's format.

In [9]:
test_df = pd.read_csv('commonlitreadabilityprize/test.csv')

test_dataset = CommonLitDataset(test_df['excerpt'], None, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

def predict(model, data_loader, device):
    model.eval()
    predictions = []
    with torch.no_grad():
        for input_ids, attention_mask in data_loader:
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
            outputs = model(input_ids, attention_mask).squeeze()
            predictions.extend(outputs.cpu().numpy())
    return predictions

test_predictions = predict(model, test_loader, device)

In [11]:
sample_submission = pd.read_csv('commonlitreadabilityprize/sample_submission.csv')

sample_submission['target'] = test_predictions

sample_submission.to_csv("submission.csv", index=False)