In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch 
from torch import nn

The goal of this notebook is to provide a preliminary EDA and a baseline modelling approach for the [Competition](https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview), while also learning from other people's work; the notebooks I drew on are linked at the end of this notebook. I will make a separate, competition-focused notebook for submission, because this one cannot be submitted (it accesses the internet) and I don't want to change any of the code in this notebook.

## Config

In [None]:
# parameters for training

EPOCHS = 5
model_name = 'bert-base-uncased'
num_labels = 2  # one regression output each for `content` and `wording`
# Fine-tuning a pretrained transformer needs a small step size: the BERT
# authors recommend values in the 2e-5 .. 5e-5 range. The previous value
# (0.01) is several hundred times larger and overwrites the pretrained weights
# within the first few updates.
learning_rate = 2e-5
BATCH_SIZE = 20

# If there are GPUs available, use the first one 
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

## Loading the data and EDA

In [None]:
# Locations of the competition's training files on Kaggle.
summaries_path = '/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv'
prompts_path = '/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv'

summaries = pd.read_csv(summaries_path)
prompts = pd.read_csv(prompts_path)

# Show (rows, columns) of both tables as the cell output.
(summaries.shape, prompts.shape)

In [None]:
summaries.head(10)

In [None]:
summaries['prompt_id'].nunique()

From this we see that there are only 4 distinct `prompt_id` values. Let's look at the `prompts_train` dataset to see what they are about.

In [None]:
prompts.head()

Let's look at the `prompt_text` to understand the dataset better.

In [None]:
prompts['prompt_text'].iloc[0]

In [None]:
prompts['prompt_question'].iloc[0]

In [None]:
# Summaries written for prompt '39c16e', reduced to the text and its two scores.
score_columns = ['text', 'content', 'wording']
texts = summaries.loc[summaries['prompt_id'] == '39c16e', score_columns].values.tolist()

# Print the first three summaries together with their scores.
for summary_text, content_score, wording_score in texts[:3]:
    print(f"Text: {summary_text}")
    print(f"Content score: {content_score}")
    print(f"Wording score: {wording_score}")
    print('\n')

And for the second `prompt_id` in the prompt dataset

In [None]:
prompts['prompt_text'].iloc[1]

In [None]:
prompts['prompt_question'].iloc[1]

In [None]:
# Same inspection as before, this time for prompt '3b9047'.
is_second_prompt = summaries['prompt_id'] == '3b9047'
texts = summaries.loc[is_second_prompt, ['text', 'content', 'wording']].values.tolist()

for row in texts[:3]:
    summary_text, content_score, wording_score = row
    print(f"Text: {summary_text}")
    print(f"Content score: {content_score}")
    print(f"Wording score: {wording_score}")
    print('\n')

We have seen a couple of samples from the dataset. Let's dive into the `content and wording scores` from the train dataset.

In [None]:
summaries[['content', 'wording']].describe()

We see that the values of both `content` and `wording` range from about -2 to about 5.

Let's take a look at the entry with the lowest content scores as well as its prompt question.

In [None]:
# All summaries that share the minimum content score.
lowest_content = summaries['content'].min()
entries = summaries.loc[summaries['content'] == lowest_content, ['prompt_id', 'text']].values.tolist()

for prompt_id, summary_text in entries:
    # Look up the question that this summary was answering.
    matching_questions = prompts.loc[prompts['prompt_id'] == prompt_id, 'prompt_question'].values.tolist()

    print(f"Prompt question: {matching_questions[0]}")

    print(f"Summarized Text: {summary_text}")

    print('\n')

and the entry(s) with the max content score...

In [None]:
# All summaries that share the maximum content score.
highest_content = summaries['content'].max()
top_rows = summaries.loc[summaries['content'] == highest_content, ['prompt_id', 'text']]
entries = top_rows.values.tolist()

for prompt_id, summary_text in entries:
    # Question belonging to the prompt this summary was written for.
    question_series = prompts.loc[prompts['prompt_id'] == prompt_id, 'prompt_question']
    prompt_question = question_series.values.tolist()[0]

    print(f"Prompt question: {prompt_question}")

    print(f"Summarized Text: {summary_text}")

    print('\n')

As we can see, the entries with the highest content score have richer detail than those with the lowest content score. Since the content score is highly correlated with the wording score (see the correlation table in the Feature Engineering section), there's no need to repeat this check for the wording score.

Let's merge the summaries with the prompts.

In [None]:
# Attach the prompt columns to every summary, then discard the identifier
# columns, which are not used as model inputs.
train = (
    summaries
    .merge(prompts, on='prompt_id', how='inner')
    .drop(columns=['student_id', 'prompt_id'])
)

train.head(10)

## Feature Engineering

In [None]:
# Whitespace-separated word counts of the summary and of the prompt passage.
train['text_len'] = [len(summary.split()) for summary in train['text']]
train['prompt_len'] = [len(passage.split()) for passage in train['prompt_text']]

# Summary length relative to the length of the passage being summarised.
train['length_ratio'] = train['text_len'].div(train['prompt_len'])

train.head(10)

In [None]:
# Histogram of summary word counts; displot returns a grid holding a single axes.
word_count_grid = sns.displot(train['text_len'])
word_count_ax = word_count_grid.ax
word_count_ax.set_title('Word Frequency Distribution in the dataset')
word_count_ax.set_xlabel('No. of words')
word_count_ax.set_ylabel('Frequency')

In [None]:
# Pairwise correlation between the two target scores and the length features.
corr_columns = ['content', 'wording', 'text_len', 'length_ratio']
corr = train[corr_columns].corr()

corr

As we see, a higher word count in the summary is highly correlated with better content and wording scores. Let's look at the `text` of the entries with the most and the fewest words.

In [None]:
# Summaries whose word count equals the maximum in the training set.
longest_len = train['text_len'].max()
entry = train.loc[train['text_len'] == longest_len, ['text', 'content', 'wording']].values.tolist()

for summary_text, content_score, wording_score in entry:
    print(f"Text: {summary_text}")
    print(f"Content score: {content_score}")
    print(f"Wording score: {wording_score}")

    print('\n')

In this case, while the summary gives a lot of detail and has a high content score, its wording score is quite poor.

In [None]:
# Summaries whose word count equals the minimum in the training set.
shortest_len = train['text_len'].min()
is_shortest = train['text_len'] == shortest_len
entry = train.loc[is_shortest, ['text', 'content', 'wording']].values.tolist()

for row in entry:
    summary_text, content_score, wording_score = row
    print(f"Text: {summary_text}")
    print(f"Content score: {content_score}")
    print(f"Wording score: {wording_score}")

    print('\n')

#### Misspell check

This was taken from this [notebook](https://www.kaggle.com/code/vassylkorzh/feature-engineering/notebook). 

In [None]:
!pip install pyspellchecker

In [None]:
from spellchecker import SpellChecker

In [None]:
import nltk

In [None]:
spellchecker = SpellChecker()


def get_misspelled_count(text):
    """Return how many alphabetic tokens of ``text`` the spell checker does not know.

    The text is tokenised with ``nltk.word_tokenize`` and the tokens are passed
    to ``spellchecker.unknown``. Only purely alphabetic results are counted, so
    punctuation and tokens containing digits are ignored.
    """
    words = nltk.word_tokenize(text)
    unknown_words = spellchecker.unknown(words)

    return sum(1 for word in unknown_words if word.isalpha())

In [None]:
# Number of unknown (misspelled) words in each summary.
train['misspelled'] = train['text'].apply(get_misspelled_count)

In [None]:
train.head(10)

In [None]:
# Correlation table again, now including the misspelling count.
engineered_and_targets = ['text_len', 'length_ratio', 'misspelled', 'content', 'wording']
corr = train[engineered_and_targets].corr()

corr

[More Feature Engineering ideas](https://www.kaggle.com/code/vassylkorzh/feature-engineering/notebook)

## Text Preprocessing

In [None]:
from transformers import AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
from torch.utils.data import Dataset

class TextDataset(Dataset):
    """Dataset yielding a tokenised summary with its numeric features and targets.

    Args:
        texts: iterable of summary strings (e.g. a pandas Series).
        feature_cols: positionally indexable rows of numeric features, one row
            per text (e.g. a 2-D numpy array).
        targets: positionally indexable rows of target scores, one row per text.
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer).
        max_length: length every text is padded / truncated to.
    """

    def __init__(self, texts, feature_cols, targets, tokenizer, max_length):
        # Materialise the texts as a list so that indexing is positional.
        # A pandas Series is indexed by label, which only agrees with the
        # positional rows of `feature_cols` / `targets` when the Series has a
        # default 0..n-1 index; otherwise texts and features would not match
        # (or the lookup would raise KeyError).
        self.texts = list(texts)
        self.feature_cols = feature_cols
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        """Number of samples."""
        return len(self.texts)
    
    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict of tensors.

        Keys: ``input_ids`` and ``attention_mask`` (tokenizer output, flattened
        to 1-D), ``feature_cols`` and ``target`` (float tensors).
        """
        text = self.texts[idx]
        
        encoding = self.tokenizer.encode_plus(text,
                                             add_special_tokens = True,
                                             max_length = self.max_length,
                                             return_token_type_ids = False,
                                             padding = 'max_length',
                                             truncation = True,
                                             return_attention_mask = True,
                                             return_tensors = 'pt')
        
        item = {'input_ids': encoding['input_ids'].flatten(),
               'attention_mask': encoding['attention_mask'].flatten(),
               'feature_cols': torch.tensor(self.feature_cols[idx], dtype=torch.float), 
               'target': torch.tensor(self.targets[idx], dtype=torch.float)}
        
        return item

In [None]:
train.head()

In [None]:
# Columns fed to the model as numeric features, and the two scores to predict.
MAX_LENGTH = 512
targets = ['content', 'wording']
feature_cols = ['text_len', 'prompt_len', 'length_ratio', 'misspelled']
feature_col_size = len(feature_cols)

# Features and targets are passed as plain numpy arrays.
feature_matrix = train[feature_cols].to_numpy()
target_matrix = train[targets].to_numpy()

dataset = TextDataset(
    texts=train['text'],
    feature_cols=feature_matrix,
    targets=target_matrix,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)

In [None]:
dataset[1]

## Modelling

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
class CustomModel(nn.Module):
    """Combine a sequence-classification transformer with numeric features.

    The transformer's logits are concatenated with a 16-unit linear projection
    of the numeric features, and a final linear layer maps the result to
    ``num_labels`` outputs.
    """

    def __init__(self, model_name, num_labels, feature_col_size):
        """Build the pretrained text model and the two linear layers.

        Args:
            model_name: checkpoint name passed to ``from_pretrained``.
            num_labels: number of outputs of the text model and of this model.
            feature_col_size: number of numeric features per sample.
        """
        super().__init__()
        numeric_hidden_size = 16

        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
        )
        self.numerics = nn.Linear(feature_col_size, numeric_hidden_size)
        self.final_layer = nn.Linear(numeric_hidden_size + num_labels, num_labels)

    def forward(self, input_ids, attention_mask, feature_cols):
        """Return the model output for one batch of token ids, masks and features."""
        text_logits = self.model(input_ids=input_ids, attention_mask=attention_mask).logits
        numeric_projection = self.numerics(feature_cols)

        # Text logits first, numeric projection second, joined along the feature axis.
        fused = torch.cat((text_logits, numeric_projection), dim=1)

        return self.final_layer(fused)

In [None]:
# Build the combined text + numeric-feature model, then move its parameters to
# the device selected in the config cell.
model = CustomModel(model_name, num_labels, feature_col_size)
model.to(device)

In [None]:
# loss and optimizer

from torch.optim import Adam
from torch.nn import MSELoss


# Mean-squared error between predicted and true scores, minimised with Adam
# over every parameter of the model.
loss_function = MSELoss()
optimizer = Adam(params=model.parameters(), lr=learning_rate)

In [None]:
# split the dataset to train and validation dataset

from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation; the seed fixes the split.
VALIDATION_FRACTION = 0.2
SPLIT_SEED = 42

train_dataset, val_dataset = train_test_split(
    dataset,
    test_size=VALIDATION_FRACTION,
    shuffle=True,
    random_state=SPLIT_SEED,
)

In [None]:
print(len(train_dataset), len(val_dataset))

In [None]:
# load the dataset using pytorch's dataloader tool

from torch.utils.data import DataLoader

# Reshuffle the training samples every epoch so that the batches differ
# between epochs; the validation order is irrelevant and stays fixed.
train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = BATCH_SIZE, shuffle = False)

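Each epoch takes `ceil(len(train_dataset) / BATCH_SIZE)` training steps. From the sizes printed above, this is about 286 full batches (plus a final partial batch if the size is not a multiple of `BATCH_SIZE`).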

In [None]:
def train_step(train_loader):
    """Train the model for one epoch and return the mean batch loss.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_function`` and
    ``device``; the notebook-level ``epoch`` is read only for log messages.

    Args:
        train_loader: iterable of batches, each a dict with the keys
            ``input_ids``, ``attention_mask``, ``feature_cols`` and ``target``.

    Returns:
        Mean of the per-batch losses as a float (``nan`` if there were no batches).
    """
    # Make sure training-only layers (e.g. dropout) are active, even if a
    # previous validation / prediction pass switched the model to eval mode.
    model.train()

    # Accumulated once per epoch, outside the loop; resetting it per batch
    # would make the reported epoch loss reflect the last batch only.
    running_loss = 0.0
    num_batches = 0

    for step, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        feature_cols = batch['feature_cols'].to(device)
        targets = batch['target'].to(device)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids, attention_mask, feature_cols)
        loss = loss_function(outputs, targets)
        loss.backward()
        
        optimizer.step()

        # .item() detaches the value, so the autograd graph of every batch is
        # not kept alive by the running sum.
        batch_loss = loss.item()
        running_loss += batch_loss
        num_batches += 1
        
        if step % 50 == 0:
            print(f"Epoch {epoch + 1} Step {step} Loss {batch_loss}")

    mean_loss = running_loss / num_batches if num_batches else float('nan')
    print(f"Epoch {epoch + 1} Train Loss: {mean_loss}")

    return mean_loss

In [None]:
def val_step(val_loader):
    """Evaluate the model on ``val_loader`` and return the mean batch loss.

    Uses the notebook-level ``model``, ``loss_function`` and ``device``; the
    notebook-level ``epoch`` is read only for the log message. No gradients are
    computed and no parameters are updated.

    Args:
        val_loader: iterable of batches, each a dict with the keys
            ``input_ids``, ``attention_mask``, ``feature_cols`` and ``target``.

    Returns:
        Mean of the per-batch losses as a float (``nan`` if there were no batches).
    """
    # Evaluation mode disables dropout, so the validation loss is deterministic
    # and comparable between epochs.
    model.eval()

    # Accumulated across all batches; must not be reset inside the loop.
    running_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for step, batch in enumerate(val_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            feature_cols = batch['feature_cols'].to(device)
            targets = batch['target'].to(device)
            
            outputs = model(input_ids, attention_mask, feature_cols)
            loss = loss_function(outputs, targets)
            running_loss += loss.item()
            num_batches += 1

    mean_loss = running_loss / num_batches if num_batches else float('nan')
    print(f"Epoch {epoch + 1} Validation Loss: {mean_loss}")
    print('\n')

    return mean_loss

In [None]:
import time

# `epoch` is deliberately a notebook-level name: train_step and val_step read
# it for their log messages.
for epoch in range(EPOCHS):
    epoch_started_at = time.time()

    # One pass over the training data, then one over the validation data.
    train_step(train_loader)
    val_step(val_loader)

    elapsed_seconds = time.time() - epoch_started_at
    print(f"Total time for training epoch {epoch + 1}: {elapsed_seconds}s")
    print('\n')

## Evaluation

In [None]:
test_summaries = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv')
test_prompt = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv')


# feature engineering on the test set

# A left join keeps exactly one row per test summary, in the order of
# `test_summaries`, so row i of `test` always belongs to row i of
# `test_summaries` (whose `student_id` column is used for the submission).
# `validate` raises if a prompt_id occurs more than once in `test_prompt`,
# which would duplicate summary rows.
test = test_summaries.merge(test_prompt, on = 'prompt_id', how = 'left', validate = 'many_to_one')
test.drop(['student_id', 'prompt_id'], axis = 1, inplace = True)

test['text_len'] = test['text'].apply(lambda x: len(x.split()))
test['prompt_len'] = test['prompt_text'].apply(lambda x: len(x.split()))
# The ratio must use the test prompt lengths; dividing by train['prompt_len']
# would pair each test summary with an unrelated training row (aligned by
# index) and yield NaN wherever the indices do not overlap.
test['length_ratio'] = test['text_len'] / test['prompt_len']

test['misspelled'] = test['text'].apply(lambda x: get_misspelled_count(x))

In [None]:
class TestTextDataset(Dataset):
    """Dataset yielding a tokenised summary with its numeric features (no targets).

    Args:
        texts: iterable of summary strings (e.g. a pandas Series).
        feature_cols: positionally indexable rows of numeric features, one row
            per text (e.g. a 2-D numpy array).
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer).
        max_length: length every text is padded / truncated to.
    """

    def __init__(self, texts, feature_cols, tokenizer, max_length):
        # Materialise the texts as a list so that indexing is positional.
        # A pandas Series is indexed by label, which only agrees with the
        # positional rows of `feature_cols` when the Series has a default
        # 0..n-1 index; otherwise texts and features would not match (or the
        # lookup would raise KeyError).
        self.texts = list(texts)
        self.feature_cols = feature_cols
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        """Number of samples."""
        return len(self.texts)
    
    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict of tensors.

        Keys: ``input_ids`` and ``attention_mask`` (tokenizer output, flattened
        to 1-D) and ``feature_cols`` (float tensor).
        """
        text = self.texts[idx]
        
        encoding = self.tokenizer.encode_plus(text,
                                             add_special_tokens = True,
                                             max_length = self.max_length,
                                             return_token_type_ids = False,
                                             padding = 'max_length',
                                             truncation = True,
                                             return_attention_mask = True,
                                             return_tensors = 'pt')
        
        item = {'input_ids': encoding['input_ids'].flatten(),
               'attention_mask': encoding['attention_mask'].flatten(),
               'feature_cols': torch.tensor(self.feature_cols[idx], dtype=torch.float)
               }
        
        return item

In [None]:
test_dataset = TestTextDataset(test['text'],
                              test[feature_cols].values,
                              tokenizer,
                              MAX_LENGTH)
# Predict in batches instead of one sample at a time (the DataLoader default);
# shuffle stays off so that predictions keep the order of the test rows.
test_loader = DataLoader(test_dataset, batch_size = BATCH_SIZE, shuffle = False)

In [None]:
preds = []

# Evaluation mode disables dropout; without it the predictions are noisy and
# change from run to run.
model.eval()

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Deliberately not named `feature_cols`: at notebook level that name
        # holds the list of feature column names, and overwriting it with a
        # tensor breaks any later re-run of the cells that index with it.
        batch_features = batch['feature_cols'].to(device)

        outputs = model(input_ids, attention_mask, batch_features)
        # One array of predicted scores per test row.
        preds.extend(outputs.cpu().numpy())

In [None]:
preds

## Submission

In [None]:
preds[0]

In [None]:
# Each prediction holds the content score first and the wording score second.
content_scores = [scores[0] for scores in preds]
wording_scores = [scores[1] for scores in preds]

submission = pd.DataFrame({
    'student_id': test_summaries['student_id'],
    'content': content_scores,
    'wording': wording_scores,
})

In [None]:
submission

In [None]:
submission.to_csv('submission.csv', index = False)

## Resources

1. [Beginner Friendly BERT](https://www.kaggle.com/code/suraj520/beginner-friendly-bert)
2. [Deberta + Additional features](https://www.kaggle.com/code/suraj520/beginner-friendly-bert)