In [1]:
!pip install "/kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl"
!pip install "/kaggle/input/autocorrect/autocorrect-2.6.1.tar"

Processing /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2
Processing /kaggle/input/autocorrect/autocorrect-2.6.1.tar
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: autocorrect
  Building wheel for autocorrect (setup.py) ... [?25ldone
[?25h  Created wheel for autocorrect: filename=autocorrect-2.6.1-py3-none-any.whl size=622363 sha256=ed4af99cfe413db17f43469239acbf159cc771b0a7968c5ff4faea24d40a3212
  Stored in directory: /root/.cache/pip/wheels/db/69/42/0fb0421d2fe70d195a04665edc760cfe5fd341d7bb8d8e0aaa
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-2.6.1


In [2]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch 
from torch import nn
import re
import nltk
from nltk.corpus import stopwords
from torch.utils.data import Dataset
from autocorrect import Speller



In [3]:
summaries = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv')
prompts = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv')

summaries.shape, prompts.shape

((7165, 5), (4, 4))

## Config

In [4]:
# parameters for training

EPOCHS = 5
model_name = '/kaggle/input/bert-base-uncased'
num_labels = 2
learning_rate = 0.001
BATCH_SIZE = 20
MAX_LENGTH = 512

# If there are GPUs available, use the first one 
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [5]:
STOP_WORDS = set(stopwords.words('english'))

## Feature Engineering

### Utilty Functions

In [6]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    
    return text

def is_valid_float(x):
    return isinstance(x, float) and x == x  # This checks that x is not NaN since NaN != NaN in Python.

In [7]:
from spellchecker import SpellChecker
spellchecker = SpellChecker()

def get_misspelled_count(text):
    tokens = nltk.word_tokenize(text)
    misspelled = [token for token in spellchecker.unknown(tokens) if token.isalpha()]
    
    return len(misspelled)

In [8]:
def quotes_count(row):
    summary = row['text']
    text = row['prompt_text']
    quotes_from_summary = re.findall(r'"([^"]*)"', summary)
    
    if len(quotes_from_summary)>0:
        return [quote in text for quote in quotes_from_summary].count(True)
    else:
        return 0

In [9]:
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

def ngrams(token, n):
    ngrams = zip(*[token[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

def ngrams_co_occurence(row, n):
    original_tokens = row['prompt_tokens']
    summary_tokens = row['summary_tokens']
    
    original_ngrams = set(ngrams(original_tokens, n))
    summary_ngrams = set(ngrams(summary_tokens, n))
    
    common_ngrams = original_ngrams.intersection(summary_ngrams)
    return len(common_ngrams)

In [10]:
def check_is_stop_word(word):
    return word in STOP_WORDS

In [11]:
def word_overlap_count(row):
    """ intersection(prompt_text, text) """    
    prompt_words = row['prompt_tokens']
    summary_words = row['summary_tokens']
    if STOP_WORDS:
        prompt_words = list(filter(check_is_stop_word, prompt_words))
        summary_words = list(filter(check_is_stop_word, summary_words))
    return len(set(prompt_words).intersection(set(summary_words)))

### For the train dataset

In [12]:
train = summaries.merge(prompts, on = 'prompt_id', how = 'inner')
train.drop(['student_id', 'prompt_id'], axis = 1, inplace = True)

In [13]:
train['text_len'] = train['text'].apply(lambda x: len(x.split()))
train['prompt_len'] = train['prompt_text'].apply(lambda x: len(x.split()))
train['length_ratio'] = train['text_len'] / train['prompt_len']

In [14]:
train['text'] = train['text'].apply(lambda x: preprocess_text(x))
train['prompt_question'] = train['prompt_question'].apply(lambda x: preprocess_text(x))
train['prompt_text'] = train['prompt_text'].apply(lambda x: preprocess_text(x))
train['prompt_title'] = train['prompt_title'].apply(lambda x: preprocess_text(x))

train['misspelled'] = train['text'].apply(lambda x: get_misspelled_count(x))

In [15]:
train['prompt_tokens'] = train['prompt_text'].apply(lambda x: word_tokenize(x))
train['summary_tokens'] = train['text'].apply(lambda x: word_tokenize(x))

train['word_overlap_count'] = train.apply(word_overlap_count, axis = 1)
train['bigram_overlap_count'] = train.apply(ngrams_co_occurence, args=(2,), axis = 1)
train['trigram_overlap_count'] = train.apply(ngrams_co_occurence, args=(3,), axis = 1)

train['bigram_overlap_ratio'] = train['bigram_overlap_count'] / (train['text_len'] - 1)
train['trigram_overlap_ratio'] = train['trigram_overlap_count'] / (train['text_len'] - 2)

train.drop(['prompt_tokens', 'summary_tokens'], axis = 1, inplace = True)

### For the test dataset

In [16]:
test_summaries = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv')
test_prompt = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv')

student_ids = test_summaries['student_id'].values.tolist()

In [17]:
test = test_summaries.merge(test_prompt, on = 'prompt_id', how = 'inner')
test.drop(['student_id', 'prompt_id'], axis = 1, inplace = True)

In [18]:
test['text_len'] = test['text'].apply(lambda x: len(x.split()))
test['prompt_len'] = test['prompt_text'].apply(lambda x: len(x.split()))
test['length_ratio'] = test['text_len'] / train['prompt_len']

In [19]:
test['text'] = test['text'].apply(lambda x: preprocess_text(x))
test['prompt_question'] = test['prompt_question'].apply(lambda x: preprocess_text(x))
test['prompt_text'] = test['prompt_text'].apply(lambda x: preprocess_text(x))
test['prompt_title'] = test['prompt_title'].apply(lambda x: preprocess_text(x))

test['misspelled'] = test['text'].apply(lambda x: get_misspelled_count(x))

In [20]:
test['prompt_tokens'] = test['prompt_text'].apply(lambda x: word_tokenize(x))
test['summary_tokens'] = test['text'].apply(lambda x: word_tokenize(x))

test['word_overlap_count'] = test.apply(word_overlap_count, axis = 1)
test['bigram_overlap_count'] = test.apply(ngrams_co_occurence, args=(2,), axis = 1)
test['trigram_overlap_count'] = test.apply(ngrams_co_occurence, args=(3,), axis = 1)

test['bigram_overlap_ratio'] = test['bigram_overlap_count'] / (test['text_len'] - 1)
test['trigram_overlap_ratio'] = test['trigram_overlap_count'] / (test['text_len'] - 2)

test.drop(['prompt_tokens', 'summary_tokens'], axis = 1, inplace = True)

### Tokenization

In [21]:
from transformers import AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained(model_name)

In [22]:
sep = tokenizer.sep_token

In [23]:
train['inputs'] = train['prompt_question'] + sep + train['text']
test['inputs'] = test['prompt_question'] + sep + test['text']

In [24]:
# sanity check 

train.head()

Unnamed: 0,text,content,wording,prompt_question,prompt_title,prompt_text,text_len,prompt_len,length_ratio,misspelled,word_overlap_count,bigram_overlap_count,trigram_overlap_count,bigram_overlap_ratio,trigram_overlap_ratio,inputs
0,the third wave was an experimentto see how peo...,0.205683,0.380538,summarize how the third wave developed over su...,the third wave,background the third wave experiment took pl...,61,596,0.102349,2,15,7,2,0.116667,0.033898,summarize how the third wave developed over su...
1,the third wave developed rapidly because the ...,3.272894,3.219757,summarize how the third wave developed over su...,the third wave,background the third wave experiment took pl...,203,596,0.340604,13,27,27,8,0.133663,0.039801,summarize how the third wave developed over su...
2,the third wave only started as an experiment w...,0.205683,0.380538,summarize how the third wave developed over su...,the third wave,background the third wave experiment took pl...,60,596,0.100671,3,15,16,7,0.271186,0.12069,summarize how the third wave developed over su...
3,the experimen was orginally about how even whe...,0.567975,0.969062,summarize how the third wave developed over su...,the third wave,background the third wave experiment took pl...,76,596,0.127517,4,18,21,9,0.28,0.121622,summarize how the third wave developed over su...
4,the third wave developed so quickly due to the...,-0.910596,-0.081769,summarize how the third wave developed over su...,the third wave,background the third wave experiment took pl...,27,596,0.045302,2,9,6,1,0.230769,0.04,summarize how the third wave developed over su...


In [25]:
class TextDataset(Dataset):
    def __init__(self, texts, feature_cols, tokenizer, max_length, targets = None, is_train = False):
        self.texts = texts
        self.feature_cols = feature_cols
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_train = is_train
        
        if self.is_train:
            self.targets = targets
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        
        encoding = self.tokenizer.encode_plus(text,
                                             add_special_tokens = True,
                                             max_length = self.max_length,
                                             return_token_type_ids = False,
                                             padding = 'max_length',
                                             truncation = True,
                                             return_attention_mask = True,
                                             return_tensors = 'pt')
        if self.is_train:
            item = {'input_ids': encoding['input_ids'].flatten(),
                    'attention_mask': encoding['attention_mask'].flatten(),
                   'feature_cols': torch.tensor(self.feature_cols[idx], dtype=torch.float), 
                   'target': torch.tensor(self.targets[idx], dtype=torch.float)}
        else:
            item = {'input_ids': encoding['input_ids'].flatten(),
                   'attention_mask': encoding['attention_mask'].flatten(),
                   'feature_cols': torch.tensor(self.feature_cols[idx], dtype=torch.float)}
        
        return item

In [26]:
feature_cols = ['text_len', 'prompt_len', 'length_ratio', 'misspelled',
                'word_overlap_count', 'bigram_overlap_count', 'trigram_overlap_count',
               'bigram_overlap_ratio', 'trigram_overlap_ratio']
targets = ['content', 'wording']
feature_col_size = len(feature_cols)

The `MAX_LENGTH` feature is guided by my work in the [EDA notebook](https://github.com/Akorex/Natural-Language-Processing/blob/main/Kaggle%20Competitions/commonlit-comp-starter.ipynb).

In [27]:
train_dataset = TextDataset(texts = train['inputs'], 
                            feature_cols = train[feature_cols].values, 
                            tokenizer = tokenizer, 
                            max_length = MAX_LENGTH,
                            targets = train[targets].values,
                           is_train = True)

In [28]:
test_dataset = TextDataset(texts = test['inputs'],
                          feature_cols = test[feature_cols].values,
                          tokenizer = tokenizer,
                          max_length = MAX_LENGTH,
                          is_train = False)

In [29]:
# split the dataset to train and validation dataset

from sklearn.model_selection import train_test_split

train_dataset, val_dataset = train_test_split(train_dataset, test_size = 0.1, shuffle = True, random_state = 42)

In [30]:
# load the dataset using pytorch's dataloader tool

from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE)
val_loader = DataLoader(val_dataset, batch_size = BATCH_SIZE)
test_loader = DataLoader(test_dataset)

## Modelling

In [31]:
from transformers import AutoModelForSequenceClassification

In [32]:
class CustomModel(nn.Module):
    def __init__(self, model_name, num_labels, feature_col_size):
        """Instantiate a model that can fit on the dataset"""
        
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = num_labels)
        self.numerics = nn.Linear(feature_col_size, 16)
        self.final_layer = nn.Linear(16 + num_labels, num_labels)
        
    def forward(self, input_ids, attention_mask, feature_cols):
        text_output = self.model(input_ids = input_ids, attention_mask = attention_mask)
        numerics = self.numerics(feature_cols)
        concat_features = torch.cat([text_output.logits, numerics], dim = 1)
        
        final_output = self.final_layer(concat_features)
        
        return final_output

In [33]:
model = CustomModel(model_name, num_labels, feature_col_size)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CustomModel(
  (model): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bi

In [34]:
# loss and optimizer

from torch.optim import Adam
from torch.nn import MSELoss


optimizer = Adam(model.parameters(), lr = learning_rate)
loss_function = MSELoss()

In [35]:
def train_step(train_loader):
    """The training loop for the dataset"""
    
    for step, batch in enumerate(train_loader):
        epochal_loss = 0
        steps = 0
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        feature_cols = batch['feature_cols'].to(device)
        targets = batch['target'].to(device)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids, attention_mask, feature_cols)
        loss = loss_function(outputs, targets)
        epochal_loss += loss
        steps += 1
        loss.backward()
        
        optimizer.step()
        
        if step % 50 == 0:
            print(f"Epoch {epoch + 1} Step {step} Loss {loss.item()}")
            
    print(f"Epoch {epoch + 1} Train Loss: {epochal_loss/steps}")

In [36]:
def val_step(val_loader):
    """The validation loop"""
    
    with torch.no_grad():
        for step, batch in enumerate(val_loader):
            epochal_loss = 0
            steps = 0
            
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            feature_cols = batch['feature_cols'].to(device)
            targets = batch['target'].to(device)
            
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask, feature_cols)
            loss = loss_function(outputs, targets)
            epochal_loss += loss
            steps += 1
        
        print(f"Epoch {epoch + 1} Validation Loss: {epochal_loss/steps}")


In [37]:
import time

for epoch in range(EPOCHS):
    start = time.time()
    
    train_step(train_loader)
    val_step(val_loader)
    print(f"Total time for training epoch {epoch + 1}: {time.time() - start}s")
    print('\n')

Epoch 1 Step 0 Loss 1965.4530029296875
Epoch 1 Step 50 Loss 131.23826599121094
Epoch 1 Step 100 Loss 29.7383975982666
Epoch 1 Step 150 Loss 25.536453247070312
Epoch 1 Step 200 Loss 12.18255615234375
Epoch 1 Step 250 Loss 4.197801113128662
Epoch 1 Step 300 Loss 1.2913610935211182
Epoch 1 Train Loss: 1.9974167346954346
Epoch 1 Validation Loss: 2.6117513179779053
Total time for training epoch 1: 331.2814335823059s


Epoch 2 Step 0 Loss 1.9019527435302734
Epoch 2 Step 50 Loss 2.2115108966827393
Epoch 2 Step 100 Loss 3.3882617950439453
Epoch 2 Step 150 Loss 1.191575050354004
Epoch 2 Step 200 Loss 1.1557422876358032
Epoch 2 Step 250 Loss 0.6739975810050964
Epoch 2 Step 300 Loss 0.5298209190368652
Epoch 2 Train Loss: 0.5751464366912842
Epoch 2 Validation Loss: 0.6558099389076233
Total time for training epoch 2: 329.3616201877594s


Epoch 3 Step 0 Loss 0.7243967056274414
Epoch 3 Step 50 Loss 0.9273635745048523
Epoch 3 Step 100 Loss 0.9164897203445435
Epoch 3 Step 150 Loss 0.6905603408813477
Ep

In [38]:
preds = []

for batch in test_loader:
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        feature_cols = batch['feature_cols'].to(device)
        
        outputs = model(input_ids, attention_mask, feature_cols)
        preds.extend(outputs.cpu().numpy())

## Submission

In [39]:
submission = pd.DataFrame({
    'student_id': student_ids,
    'content': [pred[0] for pred in preds],
    'wording': [pred[1] for pred in preds]
})


In [40]:
cols_to_check = ['wording', 'content']
submission[cols_to_check] = submission[cols_to_check].applymap(lambda x: x if is_valid_float(x) else 0.0)

In [41]:
submission

Unnamed: 0,student_id,content,wording
0,000000ffffff,-1.366251,-1.537591
1,111111eeeeee,-1.366251,-1.537591
2,222222cccccc,-1.366251,-1.537591
3,333333dddddd,-1.366251,-1.537591


In [42]:
submission.to_csv('submission.csv', index = False)