In [1]:
#https://www.kaggle.com/tolgadincer/continuous-target-stratification
import re
import os
import numpy as np
import pandas as pd
import random
import math
import tensorflow as tf
import tensorflow_addons as tfa
import logging
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold
from tensorflow.keras import backend as K
from transformers import RobertaTokenizer, TFRobertaModel
from kaggle_datasets import KaggleDatasets
tf.get_logger().setLevel(logging.ERROR)
from kaggle_datasets import KaggleDatasets

In [2]:
# Configurations
EPOCHS = 100#30#10#70#150
# Batch size
BATCH_SIZE = 16#24
# Seed
SEED = 123
# Learning rate
LR = 2e-5 #0.000040
# Weight decay
WD = LR/EPOCHS
epsilon=1e-6

# Verbosity
VERBOSE = 2
# Number of folds for training
FOLDS = 5
#groups
n_grp=len(pd.read_csv('../input/commonlitreadabilityprize/train.csv'))
path_aug = '../input/data-aug-commonlit-readability-lang-translation/augmented_data.csv'
# Max length
MAX_LEN = 256 #300#250

# Get the trained model we want to use
MODEL = 'roberta-base'

# Let's load our model tokenizer
tokenizer = RobertaTokenizer.from_pretrained(MODEL)

# For tf.dataset
AUTO = tf.data.experimental.AUTOTUNE

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [3]:
# def create_folds(df, n_s=5, n_grp=None):
#     df['Fold'] = -1
    
#     if n_grp is None:
#         skf = KFold(n_splits=n_s)
#         target = df.target
#     else:
#         skf = StratifiedKFold(n_splits=n_s)
#         df['grp'] = pd.cut(df.target, n_grp, labels=False)
#         target = df.grp
    
#     for fold_no, (t, v) in enumerate(skf.split(target, target)):
#         df.loc[v, 'Fold'] = fold_no
#     return df

In [4]:
# Function to seed everything
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)

# This function tokenize the text according to a transformers model tokenizer
def regular_encode(texts, tokenizer, maxlen = MAX_LEN):
    enc_di = tokenizer.batch_encode_plus(
        texts,
        padding = 'max_length',
        truncation = True,
        max_length = maxlen,
    )
    
    return np.array(enc_di['input_ids'])

# This function encode our training sentences
def encode_texts(x_train, x_val, MAX_LEN):
    x_train = regular_encode(x_train.tolist(), tokenizer, maxlen = MAX_LEN)
    x_val = regular_encode(x_val.tolist(), tokenizer, maxlen = MAX_LEN)
    return x_train, x_val

# Function to transform arrays to tensors
def transform_to_tensors(x_train, x_val, y_train, y_val):
    
    train_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_train, y_train))
        .repeat()
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )
    
    valid_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_val, y_val))
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )
    
    return train_dataset, valid_dataset

# Function to build our model
def build_roberta_base_model(max_len = MAX_LEN):
    transformer = TFRobertaModel.from_pretrained(MODEL)
    input_word_ids = tf.keras.layers.Input(shape = (max_len, ), dtype = tf.int32, name = 'input_word_ids')
    sequence_output = transformer(input_word_ids)[0]
    # We only need the cls_token, resulting in a 2d array
    cls_token = sequence_output[:, 0, :]
    output = tf.keras.layers.Dense(1, activation = 'linear', dtype = 'float32')(cls_token)
    model = tf.keras.models.Model(inputs = [input_word_ids], outputs = [output])
    model.compile(#optimizer = tf.keras.optimizers.Adam(lr = LR),#tfa.optimizers.AdamW(learning_rate = LR),#tf.keras.optimizers.Adam(lr = LR),
        optimizer = tfa.optimizers.AdamW(learning_rate = LR, weight_decay=WD, epsilon= epsilon),          
        loss = [tf.keras.losses.MeanSquaredError()],
                  metrics = [tf.keras.metrics.RootMeanSquaredError()])
    return model

# Function to train and evaluate our model
def train_and_evaluate():
    
    # Read our training data
    df = pd.read_csv(path_aug)
    df['target'] = df['final_target']
    df['excerpt'] = df['final_excerpt']
    # Seed everything
    seed_everything(SEED)
    
    # Initiate kfold object with shuffle and a specific seed
#     kfold = KFold(n_splits = FOLDS, shuffle = True, random_state = SEED)
    kfold = StratifiedKFold(n_splits=FOLDS, shuffle = True, random_state = SEED)
    df['grp'] = pd.cut(df.target, n_grp, labels=False)
    target = df.grp
    # Create out of folds array to store predictions
    oof_predictions = np.zeros(len(df))
    
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(df,target)):
        print('\n')
        print('-'*50)
        print(f'Training fold {fold + 1}')
        K.clear_session()
        # Get text features and target
        x_train, x_val = df['excerpt'].iloc[trn_ind], df['excerpt'].iloc[val_ind]
        y_train, y_val = df['target'].iloc[trn_ind].values, df['target'].iloc[val_ind].values
        # Encode our text with Roberta tokenizer
        x_train, x_val = encode_texts(x_train, x_val, MAX_LEN)
        # Function to transform our numpy array to a tf Dataset
        train_dataset, valid_dataset = transform_to_tensors(x_train, x_val, y_train, y_val)
        # Build model
        model = build_roberta_base_model(max_len = MAX_LEN)
        # Model checkpoint
        checkpoint = tf.keras.callbacks.ModelCheckpoint(f'Roberta_Base_{SEED}_{fold + 1}.h5', 
                                                        monitor = 'val_root_mean_squared_error', 
                                                        verbose = VERBOSE, 
                                                        save_best_only = True,
                                                        save_weights_only = True, 
                                                        mode = 'min')
        steps = x_train.shape[0] // (BATCH_SIZE * 16)
        # Training phase
        history = model.fit(train_dataset,
                            batch_size = BATCH_SIZE,
                            epochs = EPOCHS,
                            verbose = VERBOSE,
                            callbacks = [checkpoint],
                            validation_data = valid_dataset,
                            steps_per_epoch = steps)
        
        
        # Load best epoch weights
        model.load_weights(f'Roberta_Base_{SEED}_{fold + 1}.h5')
        # Predict validation set to save them in the out of folds array
        val_pred = model.predict(valid_dataset)
        oof_predictions[val_ind] = val_pred.reshape(-1)
        
    print('\n')
    print('-'*50)
    # Calculate out of folds root mean squared error
    oof_rmse = np.sqrt(mean_squared_error(df['target'], oof_predictions))
    print(f'Our out of folds RMSE is {oof_rmse}')
    

train_and_evaluate()





--------------------------------------------------
Training fold 1


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/657M [00:00<?, ?B/s]

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/100
17/17 - 45s - loss: 1.2928 - root_mean_squared_error: 1.1370 - val_loss: 0.8557 - val_root_mean_squared_error: 0.9251

Epoch 00001: val_root_mean_squared_error improved from inf to 0.92506, saving model to Roberta_Base_123_1.h5
Epoch 2/100
17/17 - 21s - loss: 0.7077 - root_mean_squared_error: 0.8412 - val_loss: 0.5356 - val_root_mean_squared_error: 0.7318

Epoch 00002: val_root_mean_squared_error improved from 0.92506 to 0.73182, saving model to Roberta_Base_123_1.h5
Epoch 3/100
17/17 - 21s - loss: 0.7603 - root_mean_squared_error: 0.8719 - val_loss: 0.5667 - val_root_mean_squared_error: 0.7528

Epoch 00003: val_root_mean_squared_error did not improve from 0.73182
Epoch 4/100
17/17 - 21s - loss: 0.4723 - root_mean_squared_error: 0.6872 - val_loss: 0.3697 - val_root_mean_squared_error: 0.6080

Epoch 00004: val_root_mean_squared_error improved from 0.73182 to 0.60800, saving model to Roberta_Base_123_1.h5
Epoch 5/100
17/17 - 21s - loss: 0.7272 - root_mean_squared_error: 0.852

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/100
17/17 - 43s - loss: 1.2024 - root_mean_squared_error: 1.0966 - val_loss: 0.9997 - val_root_mean_squared_error: 0.9998

Epoch 00001: val_root_mean_squared_error improved from inf to 0.99984, saving model to Roberta_Base_123_2.h5
Epoch 2/100
17/17 - 21s - loss: 0.9272 - root_mean_squared_error: 0.9629 - val_loss: 0.5483 - val_root_mean_squared_error: 0.7405

Epoch 00002: val_root_mean_squared_error improved from 0.99984 to 0.74051, saving model to Roberta_Base_123_2.h5
Epoch 3/100
17/17 - 21s - loss: 0.5725 - root_mean_squared_error: 0.7567 - val_loss: 0.5671 - val_root_mean_squared_error: 0.7531

Epoch 00003: val_root_mean_squared_error did not improve from 0.74051
Epoch 4/100
17/17 - 21s - loss: 0.4987 - root_mean_squared_error: 0.7062 - val_loss: 0.4256 - val_root_mean_squared_error: 0.6524

Epoch 00004: val_root_mean_squared_error improved from 0.74051 to 0.65240, saving model to Roberta_Base_123_2.h5
Epoch 5/100
17/17 - 21s - loss: 0.4967 - root_mean_squared_error: 0.704

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/100
17/17 - 44s - loss: 1.3246 - root_mean_squared_error: 1.1509 - val_loss: 0.8830 - val_root_mean_squared_error: 0.9397

Epoch 00001: val_root_mean_squared_error improved from inf to 0.93969, saving model to Roberta_Base_123_3.h5
Epoch 2/100
17/17 - 21s - loss: 1.0052 - root_mean_squared_error: 1.0026 - val_loss: 0.7483 - val_root_mean_squared_error: 0.8650

Epoch 00002: val_root_mean_squared_error improved from 0.93969 to 0.86502, saving model to Roberta_Base_123_3.h5
Epoch 3/100
17/17 - 21s - loss: 0.7434 - root_mean_squared_error: 0.8622 - val_loss: 0.6538 - val_root_mean_squared_error: 0.8086

Epoch 00003: val_root_mean_squared_error improved from 0.86502 to 0.80858, saving model to Roberta_Base_123_3.h5
Epoch 4/100
17/17 - 21s - loss: 0.5087 - root_mean_squared_error: 0.7132 - val_loss: 0.4286 - val_root_mean_squared_error: 0.6547

Epoch 00004: val_root_mean_squared_error improved from 0.80858 to 0.65469, saving model to Roberta_Base_123_3.h5
Epoch 5/100
17/17 - 21s - lo

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/100
17/17 - 43s - loss: 1.2461 - root_mean_squared_error: 1.1163 - val_loss: 0.9610 - val_root_mean_squared_error: 0.9803

Epoch 00001: val_root_mean_squared_error improved from inf to 0.98032, saving model to Roberta_Base_123_4.h5
Epoch 2/100
17/17 - 21s - loss: 0.8566 - root_mean_squared_error: 0.9255 - val_loss: 0.7007 - val_root_mean_squared_error: 0.8371

Epoch 00002: val_root_mean_squared_error improved from 0.98032 to 0.83706, saving model to Roberta_Base_123_4.h5
Epoch 3/100
17/17 - 21s - loss: 0.6123 - root_mean_squared_error: 0.7825 - val_loss: 0.4719 - val_root_mean_squared_error: 0.6870

Epoch 00003: val_root_mean_squared_error improved from 0.83706 to 0.68696, saving model to Roberta_Base_123_4.h5
Epoch 4/100
17/17 - 21s - loss: 0.6079 - root_mean_squared_error: 0.7797 - val_loss: 0.5706 - val_root_mean_squared_error: 0.7554

Epoch 00004: val_root_mean_squared_error did not improve from 0.68696
Epoch 5/100
17/17 - 21s - loss: 0.4379 - root_mean_squared_error: 0.661

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/100
17/17 - 44s - loss: 1.4132 - root_mean_squared_error: 1.1888 - val_loss: 1.1429 - val_root_mean_squared_error: 1.0691

Epoch 00001: val_root_mean_squared_error improved from inf to 1.06906, saving model to Roberta_Base_123_5.h5
Epoch 2/100
17/17 - 21s - loss: 0.7601 - root_mean_squared_error: 0.8718 - val_loss: 0.6403 - val_root_mean_squared_error: 0.8002

Epoch 00002: val_root_mean_squared_error improved from 1.06906 to 0.80019, saving model to Roberta_Base_123_5.h5
Epoch 3/100
17/17 - 21s - loss: 0.7425 - root_mean_squared_error: 0.8617 - val_loss: 0.5419 - val_root_mean_squared_error: 0.7361

Epoch 00003: val_root_mean_squared_error improved from 0.80019 to 0.73612, saving model to Roberta_Base_123_5.h5
Epoch 4/100
17/17 - 21s - loss: 0.7968 - root_mean_squared_error: 0.8926 - val_loss: 0.5073 - val_root_mean_squared_error: 0.7122

Epoch 00004: val_root_mean_squared_error improved from 0.73612 to 0.71225, saving model to Roberta_Base_123_5.h5
Epoch 5/100
17/17 - 21s - lo