In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch 
from torch import nn



## Possible Future Work

1. [LGBM + Deberta](https://www.kaggle.com/code/siddhvr/commonlit-ess-lgbm-autocorrect-deberta-v3-tuned#Dataload)
2. [LGBM + Deberta 2](https://www.kaggle.com/code/tsunotsuno/debertav3-lgbm-no-autocorrect)

In [2]:
# parameters for training

EPOCHS = 5
model_name = '/kaggle/input/bert-base-uncased'
num_labels = 2
learning_rate = 0.01
BATCH_SIZE = 20

# If there are GPUs available, use the first one 
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [3]:
summaries = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv')
prompts = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv')

summaries.shape, prompts.shape

((7165, 5), (4, 4))

In [4]:
train = summaries.merge(prompts, on = 'prompt_id', how = 'inner')
train.drop(['student_id', 'prompt_id'], axis = 1, inplace = True)

In [5]:
train['text_len'] = train['text'].apply(lambda x: len(x.split()))
train['prompt_len'] = train['prompt_text'].apply(lambda x: len(x.split()))
train['length_ratio'] = train['text_len'] / train['prompt_len']

In [6]:
import re

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    
    return text

In [7]:
train['text'] = train['text'].apply(lambda x: preprocess_text(x))
train['prompt_question'] = train['prompt_question'].apply(lambda x: preprocess_text(x))
train['prompt_text'] = train['prompt_text'].apply(lambda x: preprocess_text(x))
train['prompt_title'] = train['prompt_title'].apply(lambda x: preprocess_text(x))

In [8]:
from transformers import AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained(model_name)

In [9]:
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, texts, feature_cols, targets, tokenizer, max_length):
        self.texts = texts
        self.feature_cols = feature_cols
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        
        encoding = self.tokenizer.encode_plus(text,
                                             add_special_tokens = True,
                                             max_length = self.max_length,
                                             return_token_type_ids = False,
                                             padding = 'max_length',
                                             truncation = True,
                                             return_attention_mask = True,
                                             return_tensors = 'pt')
        
        item = {'input_ids': encoding['input_ids'].flatten(),
               'attention_mask': encoding['attention_mask'].flatten(),
               'feature_cols': torch.tensor(self.feature_cols[idx], dtype=torch.float), 
               'target': torch.tensor(self.targets[idx], dtype=torch.float)}
        
        return item

In [10]:
feature_cols = ['text_len', 'prompt_len', 'length_ratio']
targets = ['content', 'wording']
MAX_LENGTH = 512
feature_col_size = len(feature_cols)

dataset = TextDataset(texts = train['text'],
                     feature_cols = train[feature_cols].values,
                     targets = train[targets].values,
                     tokenizer = tokenizer,
                     max_length = MAX_LENGTH)

## Modelling

In [11]:
from transformers import AutoModelForSequenceClassification

In [12]:
class CustomModel(nn.Module):
    def __init__(self, model_name, num_labels, feature_col_size):
        """Instantiate a model that can fit on the dataset"""
        
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = num_labels)
        self.numerics = nn.Linear(feature_col_size, 16)
        self.final_layer = nn.Linear(16 + num_labels, num_labels)
        
    def forward(self, input_ids, attention_mask, feature_cols):
        text_output = self.model(input_ids = input_ids, attention_mask = attention_mask)
        numerics = self.numerics(feature_cols)
        concat_features = torch.cat([text_output.logits, numerics], dim = 1)
        
        final_output = self.final_layer(concat_features)
        
        return final_output

In [13]:
model = CustomModel(model_name, num_labels, feature_col_size)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CustomModel(
  (model): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bi

In [14]:
# loss and optimizer

from torch.optim import Adam
from torch.nn import MSELoss


optimizer = Adam(model.parameters(), lr = learning_rate)
loss_function = MSELoss()

In [15]:
# split the dataset to train and validation dataset

from sklearn.model_selection import train_test_split

train_dataset, val_dataset = train_test_split(dataset, test_size = 0.2, shuffle = True, random_state = 42)

In [16]:
print(len(train_dataset), len(val_dataset))

5732 1433


In [17]:
# load the dataset using pytorch's dataloader tool

from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE)
val_loader = DataLoader(val_dataset, batch_size = BATCH_SIZE)

In [18]:
def train_step(train_loader):
    """The training loop for the dataset"""
    
    for step, batch in enumerate(train_loader):
        epochal_loss = 0
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        feature_cols = batch['feature_cols'].to(device)
        targets = batch['target'].to(device)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids, attention_mask, feature_cols)
        loss = loss_function(outputs, targets)
        epochal_loss += loss
        loss.backward()
        
        optimizer.step()
        
        if step % 50 == 0:
            print(f"Epoch {epoch + 1} Step {step} Loss {loss.item()}")
            
    print(f"Epoch {epoch + 1} Train Loss: {epochal_loss/len(train_loader)}")

In [19]:
def val_step(val_loader):
    """The validation loop"""
    
    with torch.no_grad():
        for step, batch in enumerate(val_loader):
            epochal_loss = 0
            
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            feature_cols = batch['feature_cols'].to(device)
            targets = batch['target'].to(device)
            
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask, feature_cols)
            loss = loss_function(outputs, targets)
            epochal_loss += loss
        
        print(f"Epoch {epoch + 1} Validation Loss: {epochal_loss/len(val_loader)}")

In [20]:
import time

for epoch in range(EPOCHS):
    start = time.time()
    
    train_step(train_loader)
    val_step(val_loader)
    print(f"Total time for training epoch {epoch + 1}: {time.time() - start}s")
    print('\n')

Epoch 1 Step 0 Loss 35007.8515625
Epoch 1 Step 50 Loss 97.30899810791016
Epoch 1 Step 100 Loss 32.684730529785156
Epoch 1 Step 150 Loss 1.8288627862930298
Epoch 1 Step 200 Loss 0.6277834177017212
Epoch 1 Step 250 Loss 0.5309807658195496
Epoch 1 Train Loss: 0.0021536957938224077
Epoch 1 Validation Loss: 0.011408806778490543
Total time for training epoch 1: 309.76351833343506s


Epoch 2 Step 0 Loss 0.8010343909263611
Epoch 2 Step 50 Loss 0.48241645097732544
Epoch 2 Step 100 Loss 0.9379791617393494
Epoch 2 Step 150 Loss 1.0201877355575562
Epoch 2 Step 200 Loss 0.5293674468994141
Epoch 2 Step 250 Loss 0.4820473790168762
Epoch 2 Train Loss: 0.001962867099791765
Epoch 2 Validation Loss: 0.011274609714746475
Total time for training epoch 2: 308.59387969970703s


Epoch 3 Step 0 Loss 0.8681197166442871
Epoch 3 Step 50 Loss 0.36797699332237244
Epoch 3 Step 100 Loss 0.8551939129829407
Epoch 3 Step 150 Loss 1.224487066268921
Epoch 3 Step 200 Loss 0.44947120547294617
Epoch 3 Step 250 Loss 0.4987812

## Evaluation

In [21]:
test_summaries = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv')
test_prompt = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv')

student_ids = test_summaries['student_id'].values.tolist()

In [22]:
# feature engineering on the test set

test = test_summaries.merge(test_prompt, on = 'prompt_id', how = 'inner')
test.drop(['student_id', 'prompt_id'], axis = 1, inplace = True)

test['text_len'] = test['text'].apply(lambda x: len(x.split()))
test['prompt_len'] = test['prompt_text'].apply(lambda x: len(x.split()))
test['length_ratio'] = test['text_len'] / train['prompt_len']

In [23]:
test['text'] = test['text'].apply(lambda x: preprocess_text(x))
test['prompt_question'] = test['prompt_question'].apply(lambda x: preprocess_text(x))
test['prompt_text'] = test['prompt_text'].apply(lambda x: preprocess_text(x))
test['prompt_title'] = test['prompt_title'].apply(lambda x: preprocess_text(x))

In [24]:
class TestTextDataset(Dataset):
    def __init__(self, texts, feature_cols, tokenizer, max_length):
        self.texts = texts
        self.feature_cols = feature_cols
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        
        encoding = self.tokenizer.encode_plus(text,
                                             add_special_tokens = True,
                                             max_length = self.max_length,
                                             return_token_type_ids = False,
                                             padding = 'max_length',
                                             truncation = True,
                                             return_attention_mask = True,
                                             return_tensors = 'pt')
        
        item = {'input_ids': encoding['input_ids'].flatten(),
               'attention_mask': encoding['attention_mask'].flatten(),
               'feature_cols': torch.tensor(self.feature_cols[idx], dtype=torch.float)
               }
        
        return item

In [25]:
test_dataset = TestTextDataset(test['text'],
                              test[feature_cols].values,
                              tokenizer,
                              MAX_LENGTH)
test_loader = DataLoader(test_dataset)

In [26]:
preds = []

for batch in test_loader:
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        feature_cols = batch['feature_cols'].to(device)
        
        outputs = model(input_ids, attention_mask, feature_cols)
        preds.extend(outputs.cpu().numpy())

## Submission

In [27]:
submission = pd.DataFrame({
    'student_id': student_ids,
    'content': [pred[0] for pred in preds],
    'wording': [pred[1] for pred in preds]
})

In [28]:
def is_valid_float(x):
    return isinstance(x, float) and x == x  # This checks that x is not NaN since NaN != NaN in Python.

cols_to_check = ['wording', 'content']
submission[cols_to_check] = submission[cols_to_check].applymap(lambda x: x if is_valid_float(x) else 0.0)


In [29]:
submission

Unnamed: 0,student_id,content,wording
0,000000ffffff,-0.784851,-0.053023
1,111111eeeeee,-0.784851,-0.053023
2,222222cccccc,-0.784851,-0.053023
3,333333dddddd,-0.784851,-0.053023


In [30]:
submission.to_csv('submission.csv', index = False)