In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install the two spell-checking packages from archive files stored under /kaggle/input.
!pip install "/kaggle/input/autocorrect/autocorrect-2.6.1.tar"
!pip install "/kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl"

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_text as text
import torch
import spacy
from tqdm import tqdm
from spacy.tokens import Doc
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker
import re
from autocorrect import Speller
from string import punctuation as punc
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from textblob import TextBlob
import math
from collections import Counter
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import words
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras import backend as K
from transformers import AutoTokenizer,TFAutoModel, pipeline
from scipy.special import softmax
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

class NLTKTokenizer(object):
    """Callable tokenizer that splits text with NLTK and returns a spaCy ``Doc``.

    An instance can be assigned to ``nlp.tokenizer`` so that the rest of the
    spaCy pipeline runs on NLTK's ``word_tokenize`` tokens.
    """

    def __init__(self, vocab):
        # Vocabulary passed to every Doc created by this tokenizer.
        self.vocab = vocab

    def __call__(self, text):
        """Tokenize ``text`` with ``word_tokenize`` and wrap the tokens in a ``Doc``."""
        return Doc(self.vocab, words=word_tokenize(text))


# Load the spaCy pipeline used to parse summaries and prompt texts.
nlp = spacy.load("en_core_web_lg")
# Set spaCy's tokenizer to the custom tokenizer
nlp.tokenizer = NLTKTokenizer(nlp.vocab)
# `checker` flags unknown tokens for the misspelling features;
# `speller` autocorrects summary text before it is embedded.
checker = SpellChecker()
speller = Speller(lang='en')
# Reference word list used by the `error` / `error_ratio` features.
english = set(words.words())

In [None]:
# Competition data: student summaries and their prompts, for both the train and test splits.
summary_df = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv')
prompt_df = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv')
test_summary = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv')
test_prompts = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv')

In [None]:
# Directory of the locally stored DeBERTa-v3-base checkpoint.
SAVED_MODEL_DIR = '/kaggle/input/huggingfacedebertav3variants/deberta-v3-base'
# Module-level tokenizer/model pair used by compute_tensor() to embed summaries.
tokenizer = AutoTokenizer.from_pretrained(SAVED_MODEL_DIR)
# from_pt=True asks transformers to build the TensorFlow model from PyTorch weights.
model = TFAutoModel.from_pretrained(SAVED_MODEL_DIR,from_pt=True)

# fe = pipeline('feature-extraction', tokenizer=tokenizer,model=model,torch_dtype=torch.float16,device=0)

In [None]:
def extract_lemmas(text):
    """Return the lower-cased lemmas of all non-stop-word tokens, joined by spaces.

    The text is parsed with the module-level ``nlp`` pipeline.
    """
    return ' '.join(
        tok.lemma_.lower()
        for tok in nlp(text)
        if not tok.is_stop
    )


def count_punc(text):
    """Count the characters of ``text`` that are in ``string.punctuation``."""
    return sum(1 for ch in text if ch in punc)

# Overlap Score

def tree_depth_from_token(token):
    """Return the depth of the subtree rooted at ``token``.

    A token without children has depth 1; otherwise the depth is one more
    than the deepest child subtree.
    """
    child_depths = [tree_depth_from_token(child) for child in token.children]
    # default=0 makes a leaf evaluate to 1 + 0.
    return 1 + max(child_depths, default=0)

def average_dependency_tree_depth(doc):
    """Average the subtree depths of all tokens whose dependency label is 'ROOT'.

    Returns 0 when ``doc`` contains no ROOT token.
    """
    root_depths = [tree_depth_from_token(tok) for tok in doc if tok.dep_ == 'ROOT']
    if len(root_depths) == 0:
        return 0
    return sum(root_depths) / len(root_depths)

def process_text(doc):
    """Count stop words and the nouns, verbs and adverbs among non-stop tokens.

    Returns:
        tuple: ``(stops, num_nouns, num_verbs, num_adverbs)``.
    """
    stops = 0
    pos_tally = Counter()
    for token in doc:
        if token.is_stop:
            stops += 1
        else:
            # Part-of-speech tags are only tallied for non-stop tokens.
            pos_tally[token.pos_] += 1

    # Reading a missing key from a Counter yields 0.
    return stops, pos_tally["NOUN"], pos_tally["VERB"], pos_tally["ADV"]


def syllable_count(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Each run of consecutive vowels (a, e, i, o, u, y) counts once, a final
    'e' that follows a non-vowel is discounted, and the result is never
    lower than 1.
    """
    lowered = word.lower()
    vowel_chars = "aeiouy"

    groups = 0
    previous_was_vowel = False
    for ch in lowered:
        is_vowel = ch in vowel_chars
        # Only the first vowel of a run opens a new group.
        if is_vowel and not previous_was_vowel:
            groups += 1
        previous_was_vowel = is_vowel

    # Trailing 'e' not preceded by a vowel is treated as silent.
    if lowered.endswith("e"):
        if len(lowered) == 1 or lowered[-2] not in vowel_chars:
            groups -= 1

    return groups if groups > 1 else 1

def extract_features(row, docs, context_docs):
    """Build the hand-crafted feature dictionary for one summary/prompt pair.

    Args:
        row: One row of the merged summaries/prompts frame. ``row.name`` is
            used as the position into ``docs`` and ``context_docs``; the
            ``'text'`` and ``'prompt_text'`` fields are read directly.
        docs: Parsed spaCy documents of the summaries, indexable by ``row.name``.
        context_docs: Parsed spaCy documents of the prompt texts, indexable by
            ``row.name``.

    Returns:
        dict: Feature name -> numeric value. Key order is fixed, so a frame
        built from these dicts has a stable column order.

    Notes:
        Empty summaries/prompts are tolerated: zero denominators fall back
        to 1 and sentence-length extrema default to 0.
    """
    idx = row.name
    doc = docs[idx]
    context = context_docs[idx]

    # Word Tokenize
    tokens = [tok.text.lower() for tok in doc]
    context_tokens = [tok.text.lower() for tok in context]

    # Capital Error: sentences that do not start with an upper-case character.
    # Slicing ([:1]) instead of indexing keeps an empty sentence from raising.
    capital_error = sum(1 for sent in doc.sents if not sent.text[:1].isupper())

    # 1. Length of the summary
    length = len(tokens)

    # 2. Number of unique words
    unique_words = len(set(tokens))

    # 3. Counters. The denominators fall back to 1 so ratios stay defined
    # for an empty summary or an empty prompt.
    total_tokens = len(tokens) if length > 0 else 1
    context_total_tokens = len(context_tokens) if len(context_tokens) > 0 else 1
    word_counts = Counter(tokens)
    once = sum(1 for word, count in word_counts.items() if count == 1)
    twice = sum(1 for word, count in word_counts.items() if count == 2)

    # 4. Named entities
    entities = set(ent.text.lower() for ent in doc.ents)
    context_entities = set(ent.text.lower() for ent in context.ents)
    num_entities = len(entities)
    context_num_entities = len(context_entities)

    # Calculate the overlap between the two sets
    overlap_entities = context_entities.intersection(entities)

    # Coverage score: (number of overlapping entities) / (number of entities in original text)
    coverage_score = len(overlap_entities) / len(context_entities) if len(context_entities) > 0 else 0
    num_overlap_entities = len(overlap_entities)

    named_entity_ratio = float(num_entities/total_tokens)
    context_entity_ratio = float(context_num_entities/context_total_tokens)

    # 5. Average word length
    avg_word_len = round(sum([len(token) for token in tokens])/total_tokens, ndigits=4)
    context_avg_word_len = round(sum([len(token) for token in context_tokens])/context_total_tokens, ndigits=4)

    # 6. Summary polarity / subjectivity and out-of-vocabulary count
    text_blob = TextBlob(row['text'])
    context_blob = TextBlob(row['prompt_text'])
    sentiment_polarity = text_blob.sentiment.polarity
    subjectivity = text_blob.subjectivity
    context_subjectivity = context_blob.subjectivity
    context_polarity = context_blob.sentiment.polarity
    error = sum(1 for token in tokens if token not in english)
    error_ratio = error/total_tokens

    # 7. Stopwords verbs and nouns
    stops,num_nouns, num_verbs, num_adverbs = process_text(doc)
    context_stops,context_nouns, context_verbs, context_adverbs = process_text(context)

    # 8. Numerical entities
    num_numerical_entities = len([ent for ent in doc.ents if ent.label_ == "CARDINAL"])

    # 9. Sentence count and sentence-length statistics (lengths are in characters)
    sentences = [sent.text for sent in doc.sents]
    sentence = len(sentences) if len(sentences) > 0 else 1
    avg_sentence = float(sentence / total_tokens)
    avg_unique_sentence = float(sentence /unique_words) if unique_words > 0 else 0.0

    avg_sentence_length = sum(len(sent) for sent in sentences) / sentence
    # default=0 avoids a ValueError when the document has no sentences.
    max_sentence_length = max((len(sent) for sent in sentences), default=0)
    min_sentence_length = min((len(sent) for sent in sentences), default=0)
    std_sentence_length = (sum((len(sent) - avg_sentence_length)**2 for sent in sentences) / sentence)**0.5

    # 10. Tree depth and relative word frequencies
    average_tree_depth = average_dependency_tree_depth(doc)
    prob_word = [count/total_tokens for word, count in word_counts.items()]
    results = {
        "nsubj": 0,
        "amod": 0,
        "advmod": 0,
        "xcomp": 0,
        "acomp": 0,
        "past": 0,
        "present": 0
    }

    num_syllables = 0

    summary_punc = 0
    for token in doc:
        if "VERB" == token.pos_:
            if "VBD" == token.tag_ or "VBN" == token.tag_:
                results["past"] += 1
            elif token.tag_ in {"VBG", "VBP", "VBZ"}:
                results["present"] += 1
        # Check if the token's dependency is one of the desired dependencies
        if token.dep_ in results:
            results[token.dep_] += 1

        if token.is_alpha:
            num_syllables += syllable_count(token.text.lower())

        if token.is_punct:
            # Count each punctuation token (the previous `+= 0` left this feature constant at 0).
            summary_punc += 1

    # Unigram similarity
    summary_set = set(tokens) if length > 0 else set()
    context_set = set(context_tokens) if len(context_tokens) > 0 else set()
    intersection = len(context_set.intersection(summary_set))
    difference = len(context_set.difference(summary_set))
    union = len(context_set.union(summary_set))
    precision = float(intersection /unique_words) if unique_words > 0 else 0.0
    recall = float(intersection / len(context_set)) if len(context_set) > 0 else 0.0
    jaccard_similarity = float(intersection/union) if union > 0 else 0.0
    overlap_score = intersection/float(min(len(context_set), len(summary_set)) + 1e-11)
    f1_score = float(2 * ((precision * recall)/(precision + recall + 1e-11)))

    # Bigram Similarity
    answer_bigrams = set(list(ngrams(tokens, 2))) if length > 1 else set()
    context_bigrams = set(list(ngrams(context_tokens, 2))) if len(context_tokens) > 1 else set()
    intersection_bigram = len(answer_bigrams.intersection(context_bigrams))
    union_bigram = len(answer_bigrams.union(context_bigrams))
    precision_bigram = intersection_bigram /len(answer_bigrams) if len(answer_bigrams) > 0 else 0.0
    recall_bigram = intersection_bigram / len(context_bigrams) if len(context_bigrams) > 0 else 0.0
    jaccard_bigram = intersection_bigram/union_bigram if union_bigram > 0 else 0.0
    bigram_overlap = intersection_bigram/float(min(len(context_bigrams), len(answer_bigrams)) + 1e-11)
    f1_bigram = 2 * ((precision_bigram * recall_bigram)/(precision_bigram + recall_bigram + 1e-11))

    # Trigram Similarity
    answer_trigrams = set(list(ngrams(tokens, 3))) if length > 1 else set()
    context_trigrams = set(list(ngrams(context_tokens, 3))) if len(context_tokens) > 1 else set()
    intersection_trigram = len(answer_trigrams.intersection(context_trigrams))
    union_trigram = len(answer_trigrams.union(context_trigrams))
    precision_trigram = intersection_trigram / len(answer_trigrams) if len(answer_trigrams) > 0 else 0.0
    recall_trigram = intersection_trigram / len(context_trigrams) if len(context_trigrams) > 0 else 0.0
    jaccard_trigram = intersection_trigram/union_trigram if union_trigram > 0 else 0.0
    trigram_overlap = intersection_trigram/float(min(len(context_trigrams), len(answer_trigrams)) + 1e-11)
    f1_trigram = 2 * ((precision_trigram * recall_trigram)/(precision_trigram + recall_trigram + 1e-11))

    # Readability: ASL = tokens per sentence, ASW = syllables per token
    ASL = float(total_tokens / sentence)
    ASW = float(num_syllables / total_tokens)

    flesch_reading_ease = 206.835 - (1.015 * ASL) - (84.6 * ASW)
    flesch_kincaid = (0.39 * ASL) + (11.8 * ASW) - 15.59

    # Automated Reading Index
    characters = len("".join(tokens))
    summary_ari = 4.71 * (characters / total_tokens) + 0.5 * (total_tokens / sentence) - 21.43

    # Coleman-Liau Index Readability
    # Where: L is the average number of letters per 100 words, S is the average number of sentences per 100 words
    L = (characters / total_tokens) * 100
    S = (sentence / total_tokens) * 100
    coleman_index = (0.0588 * L) - (0.296 * S) - 15.8

    # Misspelt tokens (alphabetic tokens unknown to the spell checker) and quoted spans
    mis_tokens = [token for token in checker.unknown(tokens) if token.isalpha()]
    mispell_ratio = len(mis_tokens)/total_tokens
    quotes = len(re.findall(r'"(.*?)"|\'(.*?)\'|“(.*?)”|‘(.*?)’|«(.*?)»|‹(.*?)›', row['text']))

    # Document-vector similarity between prompt and summary
    source_vector = context.vector
    answer_vector = doc.vector

    embedding_similarity, euclidean, pearson = compute_similarity_score(source_vector, answer_vector)

    # math.log(0) raises, so an empty summary (no unique words) uses 0.0 instead.
    log_unique = math.log(unique_words) if unique_words > 0 else 0.0
    log_total = math.log(total_tokens)

    # Organizing features in a dictionary
    features = {
        'length': length,
        'num_chars': len(row['text']),
        "mispelt_tokens": len(mis_tokens),
        "capital_error": capital_error,
        "mispell_ratio": mispell_ratio,
        "quotes": quotes,
        "intersection_bigram": intersection_bigram,
        "union_bigram": union_bigram,
        "jaccard_bigram": jaccard_bigram,
        'recall_bigram': recall_bigram,
        "precision_bigram": precision_bigram,
        "f1_bigram": f1_bigram,
        'bigram_overlap': bigram_overlap,
        'sentence': sentence,
        'avg_sentence_length': avg_sentence_length,
        'max_sentence_length': max_sentence_length,
        'min_sentence_length':min_sentence_length,
        "std_sentence_length": std_sentence_length,
        "avg_sentence": avg_sentence,
        'avg_unique_sentence':avg_unique_sentence,
        "context_avg_word_len": context_avg_word_len,
        "error": error,
        'error_ratio': error_ratio,
        'unique_words': unique_words,
        'num_entities': num_entities,
        "context_num_entities": context_num_entities,
        'coverage_score': coverage_score,
        'num_overlap_entities': num_overlap_entities,
        'avg_word_len': avg_word_len,
        'intersection_trigram': intersection_trigram,
        'union_trigram': union_trigram,
        'recall_trigram': recall_trigram,
        'precision_trigram': precision_trigram,
        'jaccard_trigram': jaccard_trigram,
        'trigram_overlap': trigram_overlap,
        'f1_trigram': f1_trigram,
        "stops": stops,
        "num_nouns": num_nouns,
        "num_verbs": num_verbs,
        "num_adverbs": num_adverbs,
        "context_nouns": context_nouns,
        "context_stops": context_stops,
        'context_verbs': context_verbs,
        'context_adverbs': context_adverbs,
        "num_numerical_entities": num_numerical_entities,
        "sentiment_polarity": sentiment_polarity,
        'context_polarity': context_polarity,
        "nsubj": results['nsubj'],
        "amod": results['amod'],
        "advmod": results['advmod'],
        "xcomp": results['xcomp'],
        "acomp": results['acomp'],
        "past": results["past"],
        "present": results["present"],
        "average_dependency_tree_depth": average_tree_depth,
        "TTR": unique_words / (total_tokens + 1e-8),
        "RTTR": unique_words / (math.sqrt(total_tokens) + 1e-8),
        "CTTR": unique_words / (math.sqrt(total_tokens / 2) + 1e-8),
        # MATTR would require a window-based approach and is thus more involved
        "Herdan's C": log_unique / (log_total + 1e-8),
        "Dugast's U": log_total**2 / ((log_total - log_unique) + 1e-8),
        "Honoré's H": 100 * log_total / (1 - once/(unique_words + 1e-8)),
        "Entropy": -sum(p * math.log(p) for p in prob_word),
        "Sichel’s S": twice,
        'named_entity_ratio': named_entity_ratio,
        'context_entity_ratio': context_entity_ratio,
        'flesch_reading_ease': flesch_reading_ease,
        'flesch_kincaid': flesch_kincaid,
        'summary_ari':summary_ari,
        'coleman_index': coleman_index,
        "Simpson’s D": sum((count/total_tokens)**2 for count in word_counts.values()),
        'intersection': intersection,
        "subjectivity": subjectivity,
        "context_subjectivity": context_subjectivity,
        'difference': difference,
        'union': union,
        'overlap_score': overlap_score,
        'recall': recall,
        'precision': precision,
        'f1_score': f1_score,
        'jaccard_similarity': jaccard_similarity,
        'embedding_similarity': embedding_similarity,
        'euclidean': euclidean,
        'pearson': pearson,
        'summary_punc': summary_punc,
    }

    return features

def compute_similarity_score(source_vector, answer_vector):
    """Compare two vectors with three measures.

    Returns:
        tuple: ``(cosine similarity, Euclidean distance, Pearson correlation)``.
    """
    # The sklearn pairwise helpers take collections of row vectors, so each vector is wrapped in a list.
    source_rows = [source_vector]
    answer_rows = [answer_vector]

    cosine = cosine_similarity(source_rows, answer_rows)[0][0]
    distance = euclidean_distances(source_rows, answer_rows)[0][0]

    # Off-diagonal entry of the 2x2 correlation matrix.
    correlation_matrix = np.corrcoef(source_vector.ravel(), answer_vector.ravel())
    correlation = correlation_matrix[0, 1]

    return cosine, distance, correlation

def compute_statistical_measures(embedding):
    """Summarise each row of a 2-D embedding matrix with 12 statistics.

    Args:
        embedding: 2-D numpy array; every statistic is taken along axis 1.

    Returns:
        numpy.ndarray: float32 array with one row per input row and 12
        columns, in this order: range, sum, mean, std, variance, count of
        negative entries, fraction of negative entries, negative/positive
        count ratio, and the 0.25, 0.5, 0.75 and 1.0 quantiles.
    """
    # Use the actual row width instead of a hard-coded 768 so the negative
    # fraction is correct for any embedding size.
    embedding_dim = embedding.shape[1]

    negatives_count = np.sum(embedding < 0, axis=1)
    negatives_count_ratio = negatives_count / embedding_dim
    positives_count = np.sum(embedding > 0, axis=1)
    # NOTE(review): a row with no positive entries yields inf/nan here (numpy
    # emits a warning rather than raising) - confirm this cannot happen upstream.
    neg_pos_ratio = negatives_count / positives_count
    embed_sum = np.sum(embedding, axis=1)
    embed_mean = np.mean(embedding, axis=1)
    embed_std = np.std(embedding, axis=1)
    embed_var = np.var(embedding, axis=1)
    quantile_1 = np.quantile(embedding, 0.25, axis=1)
    quantile_2 = np.quantile(embedding, 0.5, axis=1)
    quantile_3 = np.quantile(embedding, 0.75, axis=1)
    quantile_4 = np.quantile(embedding, 1, axis=1)
    embed_range = np.max(embedding, axis=1) - np.min(embedding, axis=1)

    # Stack as (12, n_rows) and transpose so each input row gets one output row.
    output = np.asarray([embed_range, embed_sum, embed_mean, embed_std, embed_var,
                         negatives_count, negatives_count_ratio, neg_pos_ratio,
                         quantile_1, quantile_2, quantile_3, quantile_4], dtype=np.float32).T
    return output

def compute_tensor(texts, max_len=256):
    """Embed a batch of texts with the module-level tokenizer and model.

    Args:
        texts: List of strings to embed.
        max_len: Sequence length every text is padded/truncated to.
            Defaults to 256, the value that was previously hard-coded.

    Returns:
        The model's ``last_hidden_state`` tensor for the batch.
    """
    outputs = tokenizer(texts, max_length=max_len, padding='max_length',
                        truncation=True, return_tensors='tf')
    tensor = model(**outputs)['last_hidden_state']
    return tensor


def compute_scores(batch_texts, max_len=256):
    """Turn a batch of texts into one embedding-derived feature row per text.

    Args:
        batch_texts: List of strings.
        max_len: Token length forwarded to ``compute_tensor`` (it was
            previously accepted but ignored, so 256 was always used).

    Returns:
        numpy.ndarray: For each text, the mean-pooled embedding followed by
        the 12 ``compute_statistical_measures`` values of the max-pooled,
        mean-pooled and first-token embeddings.
    """
    tensor = compute_tensor(batch_texts, max_len=max_len)

    # Pooling runs over every sequence position, padding included: no
    # attention mask is applied here.
    # `.numpy()` replaces the deprecated `.cpu().numpy()` chain.
    embed_max = tf.reduce_max(tensor, axis=1).numpy()
    embed_mean = tf.reduce_mean(tensor, axis=1).numpy()
    embed_first = tensor[:, 0, :].numpy()

    max_features = compute_statistical_measures(embed_max)
    mean_features = compute_statistical_measures(embed_mean)
    first_features = compute_statistical_measures(embed_first)
    features_batch = np.hstack((embed_mean, max_features, mean_features, first_features))
    return features_batch

In [None]:
def preprocess_data(summary_df, prompt_df, BATCH_SIZE=8, max_len=256, include_targets=None):
    """Build the content and wording feature matrices for a set of summaries.

    Args:
        summary_df: Summaries frame; must provide ``prompt_id``, ``text`` and
            ``student_id`` (plus ``content``/``wording`` when targets are used).
        prompt_df: Prompts frame; must provide ``prompt_id`` and ``prompt_text``.
        BATCH_SIZE: Number of summaries embedded per model call.
        max_len: Token length passed to ``compute_scores``.
        include_targets: Whether to append the target as the LAST column of
            each matrix (``content`` for the content matrix, ``wording`` for
            the wording matrix). ``None`` (default) appends them exactly when
            both columns exist in the merged frame, so labelled training data
            behaves as before and unlabelled data no longer raises.

    Returns:
        tuple: ``(content_bag, wording_bag, student_ids)``. Each matrix holds
        the embedding features, then the hand-crafted features, then
        (optionally) the target column.

    Note:
        This notebook defines ``preprocess_data`` again in a later cell; the
        auto-detected ``include_targets`` lets one definition serve both the
        labelled and unlabelled cases.
    """
    train_df = summary_df.merge(right=prompt_df, how='inner', on='prompt_id')

    if include_targets is None:
        include_targets = {'content', 'wording'}.issubset(train_df.columns)

    print('processing text embeddings......')
    docs = list(nlp.pipe(train_df['text']))
    context_docs = list(nlp.pipe(train_df['prompt_text']))

    # Hand-crafted features; each target uses its own subset of columns.
    print('extracting features.....')
    # Columns excluded from the content feature set.
    drop_cols = [
                "mispelt_tokens","mispell_ratio",'error_ratio',
                'context_nouns','context_entity_ratio',
                'context_verbs', 'context_adverbs','context_subjectivity'
    ]
    # Columns excluded from the wording feature set.
    data_cols = ['context_subjectivity', 'difference', 'recall',
        'union','union_trigram', 'recall_trigram',
        'union_bigram', 'recall_bigram',
        'context_nouns','context_entity_ratio',
        'context_verbs', 'context_adverbs']

    data = pd.DataFrame(train_df.apply(extract_features, args=(docs, context_docs), axis=1).tolist())
    content_data = data.drop(drop_cols, axis=1)
    data = data.drop(data_cols, axis=1)

    print('processing features.....')
    # Embedding features, computed batch by batch on spell-corrected text.
    scores = []
    num_samples = len(train_df)
    for start_idx in tqdm(range(0, num_samples, BATCH_SIZE), desc="processing feature extraction"):
        end_idx = min(start_idx + BATCH_SIZE, num_samples)
        batch_texts = train_df.iloc[start_idx:end_idx]['text'].apply(speller).tolist()

        with tf.device('/GPU:0'):
            scores_batch = compute_scores(batch_texts, max_len=max_len)
            scores.extend(scores_batch)

            K.clear_session()

    bert_features = np.asarray(scores, dtype=np.float32)

    content_parts = [bert_features, content_data.values]
    wording_parts = [bert_features, data.values]
    if include_targets:
        # Target goes last: split_data() treats the final column as y.
        content_parts.append(train_df[['content']].values)
        wording_parts.append(train_df[['wording']].values)

    content_bag = np.hstack(content_parts)
    wording_bag = np.hstack(wording_parts)

    print('collecting outputs.........')
    student_ids = train_df['student_id'].values

    return content_bag, wording_bag, student_ids

In [None]:
import time

# Build the training feature matrices and report how long preprocessing took.
start_time = time.time()

content_bag, wording_bag, student_ids = preprocess_data(summary_df,prompt_df,BATCH_SIZE=32,
                                                        max_len=128)

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Preprocessed function executed in: {elapsed_time:.2f} seconds")

In [None]:
def split_data(df):
    """Split a feature matrix whose last column is the target into train/test parts.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` from a shuffled 90/10
        split with a fixed random state.
    """
    features = df[:, :-1]
    target = df[:, -1]
    parts = train_test_split(
        features,
        target,
        test_size=0.1,
        shuffle=True,
        random_state=11,
    )
    return tuple(parts)

In [None]:
def mcrmse(y_true, y_pred):
    """Mean column-wise RMSE: RMSE per column (axis 0), averaged over the columns."""
    squared_error = (y_true - y_pred) ** 2
    rmse_per_column = np.sqrt(np.mean(squared_error, axis=0))
    return np.mean(rmse_per_column)

### Train Model

In [None]:
from sklearn.pipeline import make_pipeline
def train_model(df):
    """Train 20 cross-validated LightGBM pipelines and return the best one.

    Args:
        df: 2-D array whose last column is the regression target; it is
            split by ``split_data`` into a training part and a hold-out part.

    Returns:
        tuple: ``(best_model, y_test, y_preds)`` - the fitted pipeline with
        the lowest hold-out RMSE, the hold-out targets, and that pipeline's
        hold-out predictions.

    Note:
        The returned model is selected on the same hold-out set its score is
        reported on, so 'Best Model Score' is an optimistic estimate.
    """
    X_train,X_test,y_train,y_test = split_data(df)
    start_time = time.time()

    best_params = {
        'learning_rate': 0.01,
        'n_estimators': 600,
        'max_depth': 11,
        'subsample': 0.6,
        'colsample_bytree': 0.6,
        'objective': 'regression',
        'metric': 'rmse'
    }

    best_models = []
    fold_scores = []
    kfold = KFold(n_splits=20, shuffle=True, random_state=64)
    # total= gives the progress bar a known length (kfold.split is a generator).
    for train_idx, val_idx in tqdm(kfold.split(X_train, y_train),
                                   total=kfold.get_n_splits(), desc='training model'):
        X, y = X_train[train_idx], y_train[train_idx]
        val_X, val_y = X_train[val_idx], y_train[val_idx]
        # Named fold_pipeline so it does not shadow the imported `pipeline`.
        fold_pipeline = make_pipeline(
            MinMaxScaler(feature_range=(-10,10)),
            lgb.LGBMRegressor(**best_params, random_state=64)
        )

        fold_pipeline.fit(X, y)
        fold_scores.append(np.sqrt(mean_squared_error(val_y, fold_pipeline.predict(val_X))))
        best_models.append(fold_pipeline)

    # RMSE on each fold's held-out part, averaged over the folds.
    print(f'Train Mean Score: {np.mean(fold_scores):.4f}')

    # Score every fold model on the hold-out set once and keep the
    # predictions so the winner does not have to predict again.
    holdout_preds = [fold_model.predict(X_test) for fold_model in best_models]
    best_scores = [np.sqrt(mean_squared_error(y_test, preds)) for preds in holdout_preds]
    print(f'Evaluation Mean Score: {np.mean(best_scores):.4f}')

    best_model_idx = int(np.argmin(best_scores))
    best_model = best_models[best_model_idx]
    y_preds = holdout_preds[best_model_idx]
    score = best_scores[best_model_idx]

    end_time = time.time()
    elapsed_time = end_time - start_time
    # Message corrected: this timer covers model training, not preprocessing.
    print(f"Training executed in: {elapsed_time:.2f} seconds")
    print(f'Best Model Score: {score:.4f}')

    return best_model, y_test, y_preds

In [None]:
# Train the content-score model; keep its hold-out targets and predictions for the combined metric.
content_model, content_test, content_preds = train_model(content_bag)

In [None]:
# Train the wording-score model; keep its hold-out targets and predictions for the combined metric.
wording_model, wording_test, wording_preds  = train_model(wording_bag)

In [None]:
# Stack the two targets side by side as columns (content, wording) for the column-wise metric.
y_test = np.hstack((content_test.reshape((-1,1)), wording_test.reshape((-1,1))))
preds = np.hstack((content_preds.reshape((-1,1)), wording_preds.reshape((-1,1))))

In [None]:
# Mean column-wise RMSE over the content and wording hold-out predictions.
mcrmse(y_test, preds)

<!-- 0.3911, 0.5052
0.4485718981897328 -->

Best hold-out scores — content RMSE: 0.3977, wording RMSE: 0.5024, MCRMSE: 0.4500634205747211

In [None]:
# from joblib import dump

# # Save the pipeline to a file
# dump(content_model, 'content_model.joblib')
# dump(wording_model, 'wording_model.joblib')

### Preprocessing for test data

In [None]:
K.clear_session()
def preprocess_data(summary_df, prompt_df, BATCH_SIZE=8, max_len=256, include_targets=None):
    """Build the content and wording feature matrices for a set of summaries.

    This cell redefines ``preprocess_data`` from an earlier cell. With
    ``include_targets=None`` the function decides from the data whether to
    append target columns, so re-running the training cells after this cell
    still produces matrices whose last column is the target.

    Args:
        summary_df: Summaries frame; must provide ``prompt_id``, ``text`` and
            ``student_id`` (plus ``content``/``wording`` when targets are used).
        prompt_df: Prompts frame; must provide ``prompt_id`` and ``prompt_text``.
        BATCH_SIZE: Number of summaries embedded per model call.
        max_len: Token length passed to ``compute_scores``.
        include_targets: Whether to append the target as the LAST column of
            each matrix. ``None`` (default) appends them exactly when both
            ``content`` and ``wording`` exist in the merged frame; pass
            ``False`` to force feature-only matrices.

    Returns:
        tuple: ``(content_bag, wording_bag, student_ids)``. Each matrix holds
        the embedding features, then the hand-crafted features, then
        (optionally) the target column.
    """
    train_df = summary_df.merge(right=prompt_df, how='inner', on='prompt_id')

    if include_targets is None:
        include_targets = {'content', 'wording'}.issubset(train_df.columns)

    print('processing text embeddings......')
    docs = list(nlp.pipe(train_df['text']))
    context_docs = list(nlp.pipe(train_df['prompt_text']))

    # Hand-crafted features; each target uses its own subset of columns.
    print('extracting features.....')
    # Columns excluded from the content feature set.
    drop_cols = [
                "mispelt_tokens","mispell_ratio",'error_ratio',
                'context_nouns','context_entity_ratio',
                'context_verbs', 'context_adverbs','context_subjectivity'
    ]
    # Columns excluded from the wording feature set.
    data_cols = ['context_subjectivity', 'difference', 'recall',
        'union','union_trigram', 'recall_trigram',
        'union_bigram', 'recall_bigram',
        'context_nouns','context_entity_ratio',
        'context_verbs', 'context_adverbs']

    data = pd.DataFrame(train_df.apply(extract_features, args=(docs, context_docs), axis=1).tolist())
    content_data = data.drop(drop_cols, axis=1)
    data = data.drop(data_cols, axis=1)

    print('processing features.....')
    # Embedding features, computed batch by batch on spell-corrected text.
    scores = []
    num_samples = len(train_df)
    for start_idx in tqdm(range(0, num_samples, BATCH_SIZE), desc="processing feature extraction"):
        end_idx = min(start_idx + BATCH_SIZE, num_samples)
        batch_texts = train_df.iloc[start_idx:end_idx]['text'].apply(speller).tolist()

        with tf.device('/GPU:0'):
            scores_batch = compute_scores(batch_texts, max_len=max_len)
            scores.extend(scores_batch)

            K.clear_session()

    bert_features = np.asarray(scores, dtype=np.float32)

    content_parts = [bert_features, content_data.values]
    wording_parts = [bert_features, data.values]
    if include_targets:
        # Target goes last: split_data() treats the final column as y.
        content_parts.append(train_df[['content']].values)
        wording_parts.append(train_df[['wording']].values)

    content_bag = np.hstack(content_parts)
    wording_bag = np.hstack(wording_parts)

    print('collecting outputs.........')
    student_ids = train_df['student_id'].values

    return content_bag, wording_bag, student_ids

In [None]:
# Build the feature matrices and student ids for the test split.
content, wording,ids = preprocess_data(test_summary, test_prompts,BATCH_SIZE=8,max_len=128)

In [None]:
# Predict both targets as column vectors and join them as (content, wording).
cont_preds = content_model.predict(content).reshape((-1,1))
word_preds = wording_model.predict(wording).reshape((-1,1))
y_preds = np.hstack((cont_preds, word_preds))

In [None]:
# Index by student id, then move the index into a regular `student_id` column.
df_test = pd.DataFrame(y_preds,columns=['content','wording'],index=ids).reset_index()
df_test = df_test.rename({'index':'student_id'},axis=1)

In [None]:
def is_valid_float(x):
    """Return a truthy value only when ``x`` is a float that is not NaN."""
    if not isinstance(x, float):
        return False
    # NaN is the only float that does not compare equal to itself.
    return x == x

cols_to_check = ['wording', 'content']
# Replace every prediction that is not a valid (non-NaN) float with 0.0.
# Column-wise Series.map is used because DataFrame.applymap is deprecated
# since pandas 2.1, while Series.map also works on older pandas versions.
df_test[cols_to_check] = df_test[cols_to_check].apply(
    lambda column: column.map(lambda x: x if is_valid_float(x) else 0.0)
)

df_test.to_csv("submission.csv", index=False)