The idea of this notebook is to improve the score obtained from [Experiment 1](https://github.com/Akorex/Natural-Language-Processing/blob/main/Kaggle%20Competitions/commonlit-experiment-1.ipynb) and [the Baseline notebook](https://www.kaggle.com/code/adewoleakorede/baseline-submission) on the Kaggle board. Ideas in this notebook is majorly inspired by earlier work and this [Kaggle notebook](https://www.kaggle.com/code/siddhvr/commonlit-ess-lgbm-autocorrect-deberta-v3-tuned)

In [1]:
!pip install "/kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl"

Processing /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2


In [2]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch 
from torch import nn
import re
import nltk
from nltk.corpus import stopwords
from torch.utils.data import Dataset
from spellchecker import SpellChecker



In [3]:
summaries = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv')
prompts = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv')

summaries.shape, prompts.shape

((7165, 5), (4, 4))

## Config

In [4]:
# parameters for training

EPOCHS = 5
model_name = '/kaggle/input/bert-base-uncased'
num_labels = 2
learning_rate = 0.001
BATCH_SIZE = 20
MAX_LENGTH = 256
NUM_FOLDS = 5

# If there are GPUs available, use the first one 
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

## Feature Engineering

In [5]:
STOP_WORDS = set(stopwords.words('english'))

spellchecker = SpellChecker()

### Utility Functions

In [6]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    
    return text

def is_valid_float(x):
    return isinstance(x, float) and x == x  # This checks that x is not NaN since NaN != NaN in Python.


def get_misspelled_count(text):
    tokens = nltk.word_tokenize(text)
    misspelled = [token for token in spellchecker.unknown(tokens) if token.isalpha()]
    
    return len(misspelled)

In [7]:
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

def ngrams(token, n):
    ngrams = zip(*[token[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

def ngrams_co_occurence(row, n):
    original_tokens = row['prompt_tokens']
    summary_tokens = row['summary_tokens']
    
    original_ngrams = set(ngrams(original_tokens, n))
    summary_ngrams = set(ngrams(summary_tokens, n))
    
    common_ngrams = original_ngrams.intersection(summary_ngrams)
    return len(common_ngrams)

In [8]:
def check_is_stop_word(word):
    return word in STOP_WORDS

In [9]:
def word_overlap_count(row):
    """ intersection(prompt_text, text) """    
    prompt_words = row['prompt_tokens']
    summary_words = row['summary_tokens']
    if STOP_WORDS:
        prompt_words = list(filter(check_is_stop_word, prompt_words))
        summary_words = list(filter(check_is_stop_word, summary_words))
    return len(set(prompt_words).intersection(set(summary_words)))

### For the train dataset

In [10]:
train = summaries.merge(prompts, on = 'prompt_id', how = 'inner')
train.drop(['student_id', 'prompt_id'], axis = 1, inplace = True)

train['text_len'] = train['text'].apply(lambda x: len(x.split()))
train['prompt_len'] = train['prompt_text'].apply(lambda x: len(x.split()))
train['length_ratio'] = train['text_len'] / train['prompt_len']

In [11]:
train['text'] = train['text'].apply(lambda x: preprocess_text(x))
train['prompt_question'] = train['prompt_question'].apply(lambda x: preprocess_text(x))
train['prompt_text'] = train['prompt_text'].apply(lambda x: preprocess_text(x))
train['prompt_title'] = train['prompt_title'].apply(lambda x: preprocess_text(x))

train['misspelled'] = train['text'].apply(lambda x: get_misspelled_count(x))

In [12]:
train['prompt_tokens'] = train['prompt_text'].apply(lambda x: word_tokenize(x))
train['summary_tokens'] = train['text'].apply(lambda x: word_tokenize(x))

train['word_overlap_count'] = train.apply(word_overlap_count, axis = 1)
train['bigram_overlap_count'] = train.apply(ngrams_co_occurence, args=(2,), axis = 1)
train['trigram_overlap_count'] = train.apply(ngrams_co_occurence, args=(3,), axis = 1)

train['bigram_overlap_ratio'] = train['bigram_overlap_count'] / (train['text_len'] - 1)
train['trigram_overlap_ratio'] = train['trigram_overlap_count'] / (train['text_len'] - 2)

train.drop(['prompt_tokens', 'summary_tokens'], axis = 1, inplace = True)

### For the test dataset

In [13]:
test_summaries = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv')
test_prompt = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv')

student_ids = test_summaries['student_id'].values.tolist()

test = test_summaries.merge(test_prompt, on = 'prompt_id', how = 'inner')
test.drop(['student_id', 'prompt_id'], axis = 1, inplace = True)

In [14]:
test['text_len'] = test['text'].apply(lambda x: len(x.split()))
test['prompt_len'] = test['prompt_text'].apply(lambda x: len(x.split()))
test['length_ratio'] = test['text_len'] / train['prompt_len']

test['text'] = test['text'].apply(lambda x: preprocess_text(x))
test['prompt_question'] = test['prompt_question'].apply(lambda x: preprocess_text(x))
test['prompt_text'] = test['prompt_text'].apply(lambda x: preprocess_text(x))
test['prompt_title'] = test['prompt_title'].apply(lambda x: preprocess_text(x))

test['misspelled'] = test['text'].apply(lambda x: get_misspelled_count(x))

In [15]:
test['prompt_tokens'] = test['prompt_text'].apply(lambda x: word_tokenize(x))
test['summary_tokens'] = test['text'].apply(lambda x: word_tokenize(x))

test['word_overlap_count'] = test.apply(word_overlap_count, axis = 1)
test['bigram_overlap_count'] = test.apply(ngrams_co_occurence, args=(2,), axis = 1)
test['trigram_overlap_count'] = test.apply(ngrams_co_occurence, args=(3,), axis = 1)

test['bigram_overlap_ratio'] = test['bigram_overlap_count'] / (test['text_len'] - 1)
test['trigram_overlap_ratio'] = test['trigram_overlap_count'] / (test['text_len'] - 2)

test.drop(['prompt_tokens', 'summary_tokens'], axis = 1, inplace = True)

### Tokenization

In [16]:
from transformers import AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained(model_name)

In [17]:
sep = tokenizer.sep_token

train['inputs'] = train['prompt_question'] + sep + train['text']
test['inputs'] = test['prompt_question'] + sep + test['text']

## Loading the dataset

In [18]:
class TextDataset(Dataset):
    def __init__(self, texts, feature_cols, tokenizer, max_length, targets = None, is_train = False):
        self.texts = texts
        self.feature_cols = feature_cols
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_train = is_train
        
        if self.is_train:
            self.targets = targets
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        
        encoding = self.tokenizer.encode_plus(text,
                                             add_special_tokens = True,
                                             max_length = self.max_length,
                                             return_token_type_ids = False,
                                             padding = 'max_length',
                                             truncation = True,
                                             return_attention_mask = True,
                                             return_tensors = 'pt')
        if self.is_train:
            item = {'input_ids': encoding['input_ids'].flatten(),
                    'attention_mask': encoding['attention_mask'].flatten(),
                   'feature_cols': torch.tensor(self.feature_cols[idx], dtype=torch.float), 
                   'target': torch.tensor(self.targets[idx], dtype=torch.float)}
        else:
            item = {'input_ids': encoding['input_ids'].flatten(),
                   'attention_mask': encoding['attention_mask'].flatten(),
                   'feature_cols': torch.tensor(self.feature_cols[idx], dtype=torch.float)}
        
        return item

In [19]:
feature_cols = ['text_len', 'prompt_len', 'length_ratio', 'misspelled',
                'word_overlap_count', 'bigram_overlap_count', 'trigram_overlap_count',
               'bigram_overlap_ratio', 'trigram_overlap_ratio']

targets = ['content', 'wording']
feature_col_size = len(feature_cols)

In [20]:
train_dataset = TextDataset(texts = train['inputs'], 
                            feature_cols = train[feature_cols].values, 
                            tokenizer = tokenizer, 
                            max_length = MAX_LENGTH,
                            targets = train[targets].values,
                           is_train = True)

test_dataset = TextDataset(texts = test['inputs'],
                          feature_cols = test[feature_cols].values,
                          tokenizer = tokenizer,
                          max_length = MAX_LENGTH,
                          is_train = False)

In [21]:
# load the dataset using pytorch's dataloader tool
from torch.utils.data import DataLoader

test_loader = DataLoader(test_dataset)

### Modelling

### LLM Regressor

I have decided to add cross-validation strategy to the training process of the model from Experiment 1

In [22]:
from transformers import AutoModelForSequenceClassification
from sklearn.model_selection import KFold

In [23]:
class CustomModel(nn.Module):
    def __init__(self, model_name, num_labels, feature_col_size):
        """Instantiate a model that can fit on the dataset"""
        
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = num_labels)
        self.numerics = nn.Linear(feature_col_size, 16)
        self.final_layer = nn.Linear(16 + num_labels, num_labels)
        
    def forward(self, input_ids, attention_mask, feature_cols):
        text_output = self.model(input_ids = input_ids, attention_mask = attention_mask)
        numerics = self.numerics(feature_cols)
        concat_features = torch.cat([text_output.logits, numerics], dim = 1)
        
        final_output = self.final_layer(concat_features)
        
        return final_output

In [24]:
# loss and optimizer

from torch.optim import Adam
from torch.nn import MSELoss

loss_function = MSELoss()

In [25]:
def train_step(train_loader):
    """The training loop for the dataset"""
    
    for step, batch in enumerate(train_loader):
        epochal_loss = 0
        steps = 0
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        feature_cols = batch['feature_cols'].to(device)
        targets = batch['target'].to(device)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids, attention_mask, feature_cols)
        loss = loss_function(outputs, targets)
        epochal_loss += loss
        steps += 1
        
        loss.backward()
        optimizer.step()
        
        if step % 50 == 0:
            print(f"Epoch {epoch + 1} Step {step} Loss {loss.item()}")
            
    print(f"Epoch {epoch + 1} Train Loss: {epochal_loss/steps}")

In [26]:
def val_step(val_loader):
    """The validation loop"""
    
    with torch.no_grad():
        for step, batch in enumerate(val_loader):
            epochal_loss = 0
            steps = 0
            
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            feature_cols = batch['feature_cols'].to(device)
            targets = batch['target'].to(device)
            
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask, feature_cols)
            epochal_loss += loss_function(outputs, targets)
            steps += 1
            
        avg_val_loss = epochal_loss/steps
        print(f"Epoch {epoch + 1} Validation Loss: {avg_val_loss}")
        
    return avg_val_loss

In [27]:
import time

best_model = None
best_val_loss = float('inf')
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)

for fold, (train_index, val_index) in enumerate(kf.split(train_dataset)):
    print(f"Fold {fold + 1} - Train Indices: {train_index}, Val Indices: {val_index}")
    train_subset = torch.utils.data.Subset(train_dataset, train_index)
    val_subset = torch.utils.data.Subset(train_dataset, val_index)
    
    train_loader = DataLoader(train_subset, batch_size = BATCH_SIZE, shuffle = True)
    val_loader = DataLoader(val_subset, batch_size = BATCH_SIZE)
    
    model = CustomModel(model_name, num_labels, feature_col_size)
    model.to(device)
    
    optimizer = Adam(model.parameters(), lr = learning_rate)
    
    for epoch in range(EPOCHS):
        start = time.time()
        
        train_step(train_loader = train_loader)
        avg_val_loss = val_step(val_loader = val_loader)
        
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_model = model.state_dict()
        
        print(f"Total time for training epoch {epoch + 1}: {time.time() - start}s")
        print('\n')

Fold 1 - Train Indices: [   0    1    2 ... 7161 7163 7164], Val Indices: [   8   14   17 ... 7157 7159 7162]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 Step 0 Loss 6244.92529296875
Epoch 1 Step 50 Loss 37.08399200439453
Epoch 1 Step 100 Loss 10.058191299438477
Epoch 1 Step 150 Loss 6.962139129638672
Epoch 1 Step 200 Loss 7.272191047668457
Epoch 1 Step 250 Loss 10.922537803649902
Epoch 1 Train Loss: 6.830264568328857
Epoch 1 Validation Loss: 2.7088258266448975
Total time for training epoch 1: 149.25971484184265s


Epoch 2 Step 0 Loss 6.016249179840088
Epoch 2 Step 50 Loss 6.373027324676514
Epoch 2 Step 100 Loss 4.445347785949707
Epoch 2 Step 150 Loss 2.167292356491089
Epoch 2 Step 200 Loss 2.2981863021850586
Epoch 2 Step 250 Loss 1.6112492084503174
Epoch 2 Train Loss: 1.6405367851257324
Epoch 2 Validation Loss: 1.0973751544952393
Total time for training epoch 2: 147.86746335029602s


Epoch 3 Step 0 Loss 2.4855878353118896
Epoch 3 Step 50 Loss 1.2881046533584595
Epoch 3 Step 100 Loss 0.9404897689819336
Epoch 3 Step 150 Loss 1.4881151914596558
Epoch 3 Step 200 Loss 1.0762569904327393
Epoch 3 Step 250 Loss 1.1293630599975586
Epoch

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 Step 0 Loss 11569.0869140625
Epoch 1 Step 50 Loss 239.02114868164062
Epoch 1 Step 100 Loss 55.7779655456543
Epoch 1 Step 150 Loss 21.54164695739746
Epoch 1 Step 200 Loss 23.619998931884766
Epoch 1 Step 250 Loss 9.282065391540527
Epoch 1 Train Loss: 9.125484466552734
Epoch 1 Validation Loss: 1.0876491069793701
Total time for training epoch 1: 148.93122625350952s


Epoch 2 Step 0 Loss 13.822662353515625
Epoch 2 Step 50 Loss 7.173825263977051
Epoch 2 Step 100 Loss 7.3341240882873535
Epoch 2 Step 150 Loss 4.512558937072754
Epoch 2 Step 200 Loss 3.654104232788086
Epoch 2 Step 250 Loss 2.5600335597991943
Epoch 2 Train Loss: 2.165461540222168
Epoch 2 Validation Loss: 0.6472029089927673
Total time for training epoch 2: 148.9727008342743s


Epoch 3 Step 0 Loss 2.2703800201416016
Epoch 3 Step 50 Loss 1.7765941619873047
Epoch 3 Step 100 Loss 1.259765386581421
Epoch 3 Step 150 Loss 1.2509292364120483
Epoch 3 Step 200 Loss 1.6565626859664917
Epoch 3 Step 250 Loss 4.878007411956787
Epoch 3 T

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 Step 0 Loss 705.8566284179688
Epoch 1 Step 50 Loss 4.936731338500977
Epoch 1 Step 100 Loss 1.4909604787826538
Epoch 1 Step 150 Loss 0.4773823320865631
Epoch 1 Step 200 Loss 0.7201003432273865
Epoch 1 Step 250 Loss 0.614048182964325
Epoch 1 Train Loss: 0.7321339845657349
Epoch 1 Validation Loss: 0.711833119392395
Total time for training epoch 1: 149.4302523136139s


Epoch 2 Step 0 Loss 0.5137926936149597
Epoch 2 Step 50 Loss 0.4345259368419647
Epoch 2 Step 100 Loss 0.3886605203151703
Epoch 2 Step 150 Loss 0.36364051699638367
Epoch 2 Step 200 Loss 0.5983284115791321
Epoch 2 Step 250 Loss 0.3910214602947235
Epoch 2 Train Loss: 0.38537728786468506
Epoch 2 Validation Loss: 0.6030328869819641
Total time for training epoch 2: 149.42120361328125s


Epoch 3 Step 0 Loss 0.4606975018978119
Epoch 3 Step 50 Loss 0.372419536113739
Epoch 3 Step 100 Loss 0.45793938636779785
Epoch 3 Step 150 Loss 0.48611971735954285
Epoch 3 Step 200 Loss 0.40141841769218445
Epoch 3 Step 250 Loss 0.3297095298767

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 Step 0 Loss 18067.51953125
Epoch 1 Step 50 Loss 358.7878112792969
Epoch 1 Step 100 Loss 94.67247772216797
Epoch 1 Step 150 Loss 69.97544860839844
Epoch 1 Step 200 Loss 15.170097351074219
Epoch 1 Step 250 Loss 22.52130126953125
Epoch 1 Train Loss: 22.859966278076172
Epoch 1 Validation Loss: 6.443268299102783
Total time for training epoch 1: 149.32205367088318s


Epoch 2 Step 0 Loss 18.337970733642578
Epoch 2 Step 50 Loss 12.207226753234863
Epoch 2 Step 100 Loss 12.697649002075195
Epoch 2 Step 150 Loss 10.027022361755371
Epoch 2 Step 200 Loss 10.943381309509277
Epoch 2 Step 250 Loss 7.662580966949463
Epoch 2 Train Loss: 4.088892936706543
Epoch 2 Validation Loss: 2.4055023193359375
Total time for training epoch 2: 149.23307490348816s


Epoch 3 Step 0 Loss 3.822578191757202
Epoch 3 Step 50 Loss 4.33130407333374
Epoch 3 Step 100 Loss 2.7564494609832764
Epoch 3 Step 150 Loss 2.3577280044555664
Epoch 3 Step 200 Loss 1.269821047782898
Epoch 3 Step 250 Loss 1.292689323425293
Epoch 3 Tra

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 Step 0 Loss 1881.94921875
Epoch 1 Step 50 Loss 66.45091247558594
Epoch 1 Step 100 Loss 12.326605796813965
Epoch 1 Step 150 Loss 4.466763973236084
Epoch 1 Step 200 Loss 3.495629072189331
Epoch 1 Step 250 Loss 2.1115715503692627
Epoch 1 Train Loss: 1.2123820781707764
Epoch 1 Validation Loss: 1.5921350717544556
Total time for training epoch 1: 148.8720920085907s


Epoch 2 Step 0 Loss 1.7758249044418335
Epoch 2 Step 50 Loss 1.6658114194869995
Epoch 2 Step 100 Loss 1.7134370803833008
Epoch 2 Step 150 Loss 0.839093804359436
Epoch 2 Step 200 Loss 0.7574621438980103
Epoch 2 Step 250 Loss 0.6621996164321899
Epoch 2 Train Loss: 0.6009676456451416
Epoch 2 Validation Loss: 0.5863232612609863
Total time for training epoch 2: 149.14570236206055s


Epoch 3 Step 0 Loss 0.7721171379089355
Epoch 3 Step 50 Loss 0.9878345727920532
Epoch 3 Step 100 Loss 0.4535995423793793
Epoch 3 Step 150 Loss 0.43723687529563904
Epoch 3 Step 200 Loss 0.48605093359947205
Epoch 3 Step 250 Loss 0.5418254137039185
Epo

### Predictions

In [28]:
# Create an instance of your CustomModel
model = CustomModel(model_name, num_labels, feature_col_size)
model.to(device)


model.load_state_dict(best_model)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [29]:
preds = []

for batch in test_loader:
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        feature_cols = batch['feature_cols'].to(device)
        
        outputs = model(input_ids, attention_mask, feature_cols)
        preds.extend(outputs.cpu().numpy())

In [30]:
submission_llm = pd.DataFrame({
    'student_id': student_ids,
    'content_5': [pred[0] for pred in preds],
    'wording_5': [pred[1] for pred in preds]
})

cols_to_check = ['wording_5', 'content_5']
submission_llm[cols_to_check] = submission_llm[cols_to_check].applymap(lambda x: x if is_valid_float(x) else 0.0)

In [31]:
submission_llm.head()

Unnamed: 0,student_id,content_5,wording_5
0,000000ffffff,0.139513,-1.091022
1,111111eeeeee,0.139513,-1.091022
2,222222cccccc,0.139513,-1.091022
3,333333dddddd,0.139513,-1.091022


### LGBM Model

We can also train an LGBM model on the numerical features on the dataset to make predictions. Before continuing, let's split the given dataset into train and validation

In [32]:
feature_cols = ['text_len', 'prompt_len', 'length_ratio', 'misspelled',
                'word_overlap_count', 'bigram_overlap_count', 'trigram_overlap_count',
               'bigram_overlap_ratio', 'trigram_overlap_ratio']

targets = ['content', 'wording']

In [33]:
X_test = test[feature_cols]

In [34]:
params = {
    'verbose': -1,
    'boosting_type': 'gbdt',
    'random_state': 42,
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.048,
    'max_depth': 4,  #3
    'lambda_l1': 0.0,
    'lambda_l2': 0.011
}

In [35]:
from lightgbm import LGBMRegressor
import lightgbm as lgb
from collections import OrderedDict

predictions_dict = OrderedDict()

for fold, (train_index, val_index) in enumerate(kf.split(train)):
    print(f"\nFold {fold + 1} - Train Indices: {train_index}, Val Indices: {val_index}")
    train_cv = train.iloc[train_index]
    val_cv = train.iloc[val_index]
    
    for target in targets:
        predictions = []
        print(f"\nTraining for {target}")
        X_train_cv = train_cv[feature_cols]
        X_val_cv = val_cv[feature_cols]
        
        y_train_cv = train_cv[target]
        y_val_cv = val_cv[target]
        
        dtrain = lgb.Dataset(X_train_cv, label = y_train_cv)
        dval = lgb.Dataset(X_val_cv, label = y_val_cv)
        
        evaluation_results = {}
        model = lgb.train(params, num_boost_round=10000, valid_names=['train', 'valid'],
                  train_set=dtrain,valid_sets=dval,
                  callbacks=[lgb.early_stopping(stopping_rounds=30, verbose=True),lgb.log_evaluation(100),
                             lgb.callback.record_evaluation(evaluation_results)],)
        pred = model.predict(X_test)
        predictions.extend(pred)
        predictions_dict[f"{target}_{fold}"] = predictions


Fold 1 - Train Indices: [   0    1    2 ... 7161 7163 7164], Val Indices: [   8   14   17 ... 7157 7159 7162]

Training for content
Training until validation scores don't improve for 30 rounds
[100]	train's rmse: 0.446375
Early stopping, best iteration is:
[125]	train's rmse: 0.445928

Training for wording
Training until validation scores don't improve for 30 rounds
[100]	train's rmse: 0.59066
Early stopping, best iteration is:
[143]	train's rmse: 0.589227

Fold 2 - Train Indices: [   0    1    2 ... 7162 7163 7164], Val Indices: [  12   15   26 ... 7152 7160 7161]

Training for content
Training until validation scores don't improve for 30 rounds
[100]	train's rmse: 0.445534
[200]	train's rmse: 0.443594
Early stopping, best iteration is:
[195]	train's rmse: 0.443439

Training for wording
Training until validation scores don't improve for 30 rounds
[100]	train's rmse: 0.6071
[200]	train's rmse: 0.601333
Early stopping, best iteration is:
[256]	train's rmse: 0.60005

Fold 3 - Train Indi

In [36]:
submission_lgbm = pd.DataFrame(predictions_dict)
submission_lgbm.head()

Unnamed: 0,content_0,wording_0,content_1,wording_1,content_2,wording_2,content_3,wording_3,content_4,wording_4
0,-1.396739,-0.900027,-1.362828,-0.988698,-1.406861,-0.855387,-1.351256,-1.132858,-1.368172,-0.991326
1,-1.396739,-0.900027,-1.362828,-0.988698,-1.406861,-0.855387,-1.351256,-1.132858,-1.368172,-0.991326
2,-1.396739,-0.900027,-1.362828,-0.988698,-1.406861,-0.855387,-1.351256,-1.132858,-1.368172,-0.991326
3,-1.396739,-0.900027,-1.362828,-0.988698,-1.406861,-0.855387,-1.351256,-1.132858,-1.368172,-0.991326


## Submission

In [37]:
submission = pd.concat([submission_lgbm, submission_llm], axis = 1)

submission.head()

Unnamed: 0,content_0,wording_0,content_1,wording_1,content_2,wording_2,content_3,wording_3,content_4,wording_4,student_id,content_5,wording_5
0,-1.396739,-0.900027,-1.362828,-0.988698,-1.406861,-0.855387,-1.351256,-1.132858,-1.368172,-0.991326,000000ffffff,0.139513,-1.091022
1,-1.396739,-0.900027,-1.362828,-0.988698,-1.406861,-0.855387,-1.351256,-1.132858,-1.368172,-0.991326,111111eeeeee,0.139513,-1.091022
2,-1.396739,-0.900027,-1.362828,-0.988698,-1.406861,-0.855387,-1.351256,-1.132858,-1.368172,-0.991326,222222cccccc,0.139513,-1.091022
3,-1.396739,-0.900027,-1.362828,-0.988698,-1.406861,-0.855387,-1.351256,-1.132858,-1.368172,-0.991326,333333dddddd,0.139513,-1.091022


In [38]:
content_keys = ['content_0', 'content_1', 'content_2', 'content_3', 'content_4', 'content_5']
wording_keys = ['wording_0', 'wording_1', 'wording_2', 'wording_3', 'wording_4', 'wording_5']

In [39]:
submission['content'] = submission[content_keys].mean(axis=1)
submission['wording'] = submission[wording_keys].mean(axis = 1)

submission.drop(columns = content_keys, axis = 1, inplace = True)
submission.drop(columns = wording_keys, axis = 1, inplace = True)

submission

Unnamed: 0,student_id,content,wording
0,000000ffffff,-1.124391,-0.99322
1,111111eeeeee,-1.124391,-0.99322
2,222222cccccc,-1.124391,-0.99322
3,333333dddddd,-1.124391,-0.99322


In [40]:
submission.to_csv('submission.csv', index = False)