# 1. Import libraries

In [None]:
!pip install "./spellchecker/pyspellchecker-0.7.2-py3-none-any.whl"

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import gc
import torch
import numpy as np
import re
import nltk
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
from transformers import Trainer, TrainingArguments
from pathlib import Path
from tqdm import tqdm
from lightgbm import LGBMClassifier
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import preprocess_string, strip_multiple_whitespaces, strip_numeric, strip_punctuation
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from spellchecker import SpellChecker

# 2. Loading data

In [None]:
# Read both competition CSV files, then wrap each frame in a Hugging Face Dataset.
train_csv = pd.read_csv("./learning-agency-lab-automated-essay-scoring-2/train.csv")
test_csv = pd.read_csv("./learning-agency-lab-automated-essay-scoring-2/test.csv")
train_data = Dataset.from_pandas(train_csv)
test_data = Dataset.from_pandas(test_csv)

In [None]:
# Pandas views of both splits; the hand-crafted text features are built on these.
train_data_df, test_data_df = train_data.to_pandas(), test_data.to_pandas()

# 3. Loading the pre-trained DeBERTa model fine-tuned on the same data

In [None]:
# Fine-tuned DeBERTa checkpoint, published as the Kaggle dataset
# https://www.kaggle.com/datasets/diacious/deberta-scoring
MODEL_PATH = "./deberta-scoring/output/checkpoint-5188"
# Token budget per essay: `tokenize_texts` truncates and pads every input to this length.
MAX_LENGTH = 1024

In [None]:
# Distinct scores seen in the training set and a zero-based class-id mapping
# (class id = score - 1).
# NOTE: the `id2label` / `label2id` built here are overwritten at the end of this
# cell by the mappings stored in the checkpoint config; they only matter if the
# commented-out `from_pretrained(..., id2label=..., label2id=...)` call is restored.
labels = set(train_data['score'])
id2label = {l-1: l for l in labels}
label2id = {v: k for k, v in id2label.items()}

# Load the tokenizer and the sequence-classification model from the local checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model =  AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
# Alternative load that passes explicit label mappings (kept for reference, not executed):
#model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH,
#                                                          id2label=id2label,
#                                                          label2id=label2id,
#                                                         ignore_mismatched_sizes=True)

# Use the label mappings that were saved together with the checkpoint.
id2label = model.config.id2label
label2id = model.config.label2id

In [None]:
def tokenize_texts(data):
    """Tokenize the ``full_text`` field of a batch with the module-level tokenizer.

    Every essay is truncated and padded to exactly ``MAX_LENGTH`` tokens and
    special tokens are added.

    Args:
        data: mapping (e.g. a ``datasets`` batch) exposing a ``'full_text'`` entry.

    Returns:
        Whatever the tokenizer call returns for the given texts.
    """
    tokenizer_options = dict(
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
    )
    return tokenizer(data['full_text'], **tokenizer_options)

In [None]:
# Tokenize both splits in batches.
test_data_1 = test_data.map(tokenize_texts, batched=True)

# For the training split the score column additionally becomes a zero-based `labels` column.
train_data_1 = (
    train_data
    .map(tokenize_texts, batched=True)
    .rename_column('score', 'labels')
    .map(lambda row: {'labels': row['labels'] - 1})
)

In [None]:
def regulize_bert_params(model, freeze_layers=6, freeze_embedding=False):
    """Toggle gradient tracking on parts of a DeBERTa-style model, in place.

    Args:
        model: object exposing ``deberta.embeddings`` and ``deberta.encoder.layer``.
        freeze_layers: number of leading encoder layers whose parameters get
            ``requires_grad = False``. Later layers are left untouched.
        freeze_embedding: when truthy the embedding parameters are frozen,
            otherwise they are explicitly set to trainable.

    Returns:
        The same ``model`` object that was passed in.
    """
    embeddings_trainable = not freeze_embedding
    for weight in model.deberta.embeddings.parameters():
        weight.requires_grad = embeddings_trainable

    frozen_blocks = model.deberta.encoder.layer[:freeze_layers]
    for block in frozen_blocks:
        for weight in block.parameters():
            weight.requires_grad = False

    return model

In [None]:
# Freeze the embeddings and the first 24 encoder layers of the loaded model.
freeze_options = {'freeze_layers': 24, 'freeze_embedding': True}
model = regulize_bert_params(model, **freeze_options)
# Handle to the underlying backbone (without the classification head).
model_base = model.base_model

# 3.1. Getting features from the DeBERTa model

In [None]:
# Use the GPU only when one is actually usable at runtime.
# `torch.backends.cuda.is_built()` merely reports that this PyTorch build was
# compiled with CUDA support; on a machine without a working GPU it can still be
# True, and `model.to('cuda')` would then fail.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Inference only: switch the model to evaluation mode.
model.eval()
def get_predictions(data):
    """Run the classification model on one batch and return its logits.

    Args:
        data: mapping of column name -> tensor; must contain ``input_ids``,
            ``attention_mask`` and ``token_type_ids``. Every value is moved to
            ``device`` before the forward pass.

    Returns:
        ``{'deberta_out': logits}`` with the logits moved back to the CPU so no
        GPU memory stays referenced by the mapped dataset.
    """
    data = {k: v.to(device) for k, v in data.items()}

    # Gradients are not needed for feature extraction.
    with torch.no_grad():
        res = model(input_ids=data['input_ids'],
                    attention_mask=data['attention_mask'],
                    token_type_ids=data['token_type_ids']).logits

    return {'deberta_out': res.cpu()}

In [None]:
# Keep only the tokenizer outputs and serve them as torch tensors on access.
test_data_2 = test_data_1.remove_columns(['essay_id', 'full_text'])
test_data_2.set_transform(
    lambda batch: {name: torch.tensor(values) for name, values in batch.items()}
)

# Release unused memory before the forward passes, then score the test essays in batches of 16.
gc.collect()
torch.cuda.empty_cache()
test_data_3 = test_data_2.map(get_predictions, batched=True, batch_size=16)

In [None]:
# One feature column per model output; the 6 is hard-coded here.
# NOTE(review): presumably one logit per score class 1..6 — confirm it matches the checkpoint's number of labels.
features_names = [f'deberta_feat_{i}' for i in range(6)]

# Test-side features: essay ids plus the `deberta_out` values computed above.
deberta_features_test = pd.DataFrame(test_data_1['essay_id'], columns=['essay_id'])
deberta_features_test[features_names] = test_data_3['deberta_out']

# Train-side features are read from a precomputed CSV instead of being recomputed here.
# NOTE(review): assumed to contain `essay_id` plus the same `deberta_feat_*` columns — it is merged on `essay_id` later.
deberta_features_train = pd.read_csv("./deberta_features/deberta_features_logits.csv")

In [None]:
deberta_features_train.head()

In [None]:
deberta_features_test.head()

# 4. Feature Engineering

In [None]:
# Ordered cleaning pipeline applied by `preprocess_text`; each entry maps str -> str.
text_transforms = [lambda x: x.lower(),
                   # Drop URLs only. The previous pattern "https?.+" was greedy up to the
                   # end of the line and therefore deleted all text following a URL.
                   lambda x: re.sub(r"https?\S+", '', x),
                   # Drop HTML-like tags.
                   lambda x: re.sub(r'<.*?>', '', x),
                   # Replace non-breaking spaces with regular spaces.
                   lambda x: x.replace(u'\xa0',' '),
                   remove_stopwords, 
                   strip_multiple_whitespaces, 
                   strip_numeric, 
                   strip_punctuation,
                   lambda x: x.strip()
                  ]

def preprocess_text(text):
    """Return ``text`` after running it through every step of ``text_transforms`` in order."""
    cleaned = text
    for step in text_transforms:
        cleaned = step(cleaned)
    return cleaned


def texts_preprocessing(df):
    """Add derived text columns to ``df`` in place (nothing is returned).

    Columns written, in this order:
        full_text_pr      -- cleaned version of ``full_text``
        paragraphs        -- ``full_text`` split on blank lines (``'\\n\\n'``)
        sentences_per_par -- per paragraph, its sentences, each cleaned
        words_per_par     -- per paragraph, the tokens of the cleaned paragraph
        words_per_par_all -- per paragraph, the tokens of the raw paragraph
    """
    def clean_sentences(paragraphs):
        # Sentence-split the raw paragraph first, then clean every sentence.
        return [[preprocess_text(sentence) for sentence in sent_tokenize(paragraph)]
                for paragraph in paragraphs]

    def clean_words(paragraphs):
        return [word_tokenize(preprocess_text(paragraph)) for paragraph in paragraphs]

    def raw_words(paragraphs):
        return [word_tokenize(paragraph) for paragraph in paragraphs]

    df['full_text_pr'] = df['full_text'].apply(preprocess_text)
    df['paragraphs'] = df['full_text'].apply(lambda text: text.split('\n\n'))
    df['sentences_per_par'] = df['paragraphs'].apply(clean_sentences)
    df['words_per_par'] = df['paragraphs'].apply(clean_words)
    df['words_per_par_all'] = df['paragraphs'].apply(raw_words)
    
# English stop words as a set for fast membership tests.
stopwords_set = set(stopwords.words('english'))
# (suffix, aggregator) pairs used to summarise per-essay lists of values.
# NOTE: the builtin max/min raise ValueError on an empty sequence, so an essay
# that yields no words or sentences would make `make_features` fail.
statistics_func = [('max', max), ('min', min), ('mean', np.mean)]
spellchecker = SpellChecker()

def make_features(df):
    """Build the hand-crafted per-essay feature table.

    Args:
        df: frame already processed by `texts_preprocessing`, i.e. containing
            `full_text`, `full_text_pr`, `paragraphs`, `sentences_per_par`,
            `words_per_par` and `words_per_par_all`.

    Returns:
        A new DataFrame with one row per essay. Besides scalar features it also
        holds list-valued columns (`punct_count_per_par`,
        `sentences_count_per_par`, `words_count_per_par`, `paragraphs_len`,
        `words_len_all`, `sentences_len_all`) that `get_paragraphs_features`
        later buckets and drops. Columns are created in statement order.
    """
    texts_features = pd.DataFrame()
    
    # Character length of the cleaned text.
    texts_features['text_len'] = df['full_text_pr'].apply(lambda x: len(x))

    # Punctuation = any character that is neither a word character nor whitespace (raw text).
    texts_features['punct_count'] = df['full_text'].apply(lambda x: len(re.findall(r'[^\w\s]', x)))
    texts_features['punct_count_per_par'] = df['paragraphs'].apply(lambda x: [len(re.findall(r'[^\w\s]', par)) for par in x])
    
    texts_features['sentences_count'] = df['sentences_per_par'].apply(lambda x: sum([len(par) for par in x]))
    texts_features['sentences_count_per_par'] = df['sentences_per_par'].apply(lambda x: [len(par) for par in x])
    
    # Word counts are based on the cleaned tokens (`words_per_par`).
    texts_features['words_count'] = df['words_per_par'].apply(lambda x: sum([len(par) for par in x]))
    texts_features['words_count_per_par'] = df['words_per_par'].apply(lambda x: [len(par) for par in x])
    
    # Paragraph length in characters of the raw paragraph text.
    texts_features['paragraphs_len'] = df['paragraphs'].apply(lambda x: [len(par) for par in x])
    texts_features['paragraphs_count'] = df['paragraphs'].apply(lambda x: len(x))
    
    # Flat lists of word / cleaned-sentence character lengths over the whole essay.
    texts_features['words_len_all'] = df['words_per_par'].apply(lambda x: [len(word) for par in x if len(par) > 0 for word in par])
    texts_features['sentences_len_all'] = df['sentences_per_par'].apply(lambda x: [len(sent) for par in x for sent in par])
    
    texts_features['n_unique_words'] = df['words_per_par'].apply(lambda x: len(set([word for par in x for word in par])))
    # Stop words are counted on the raw tokens without lower-casing.
    # NOTE(review): capitalised stop words are presumably not matched — confirm this is intended.
    texts_features['n_stopwords'] = df['words_per_par_all'].apply(lambda x: sum([word in stopwords_set for par in x for word in par]))
    # NOTE(review): `spellchecker.unknown` presumably returns the distinct unknown words, so this counts unique misspellings — confirm.
    texts_features['n_misspelled_words'] = df['words_per_par'].apply(lambda x: len(spellchecker.unknown([word for par in x for word in par])))
    
    # max / min / mean summaries of the per-essay lists.
    for func_name, func in statistics_func:
        texts_features[f'punct_count_per_par_{func_name}'] = texts_features['punct_count_per_par'].apply(func)
        texts_features[f'sentences_len_{func_name}'] = df['sentences_per_par'].apply(lambda x: func([len(sent) for par in x for sent in par]))
        texts_features[f'sentences_count_{func_name}'] = df['sentences_per_par'].apply(lambda x: func([len(par) for par in x]))
        texts_features[f'words_len_{func_name}'] = df['words_per_par'].apply(lambda x: func([len(word) for par in x for word in par]))
        texts_features[f'words_count_{func_name}'] = df['words_per_par'].apply(lambda x: func([len(par) for par in x]))
        texts_features[f'paragraphs_len_{func_name}'] = df['paragraphs'].apply(lambda x: func([len(par) for par in x]))
        
    return texts_features


# Add the derived text columns to both frames in place ...
for split_df in (train_data_df, test_data_df):
    texts_preprocessing(split_df)

# ... then build one feature table per split.
texts_features_train, texts_features_test = (
    make_features(frame) for frame in (train_data_df, test_data_df)
)

In [None]:
# Ascending bucket edges for the list-valued feature columns. For each list,
# `get_features_bounds` counts per essay how many values fall in
# (previous edge, edge] (starting from 0) plus how many exceed the last edge.
punct_bound = [10, 25, 40, 55, 75, 90, 100]
sentence_count_bounds = [5, 15, 20, 25, 30, 35, 50]
sentence_len_bounds = [20, 50, 60, 74, 85, 100, 1000]
words_count_bounds = [80, 100, 150, 200, 400, 500]
paragraphs_len_bounds = [30, 100, 300, 450, 550, 1000, 1500]
words_len_bounds = [5, 7, 9, 12, 20]

# `bounds` and `col_names` are paired positionally (zipped in
# `get_paragraphs_features`), so both lists must keep the same order.
bounds = [punct_bound,
         sentence_count_bounds,
         words_count_bounds,
         paragraphs_len_bounds,
         words_len_bounds,
         sentence_len_bounds]

# (prefix of the generated bucket columns, list-valued source column)
col_names = [('punct_per_par_bound', 'punct_count_per_par'),
            ('sentences_count_per_par_bound', 'sentences_count_per_par'),
            ('words_count_per_par_bound', 'words_count_per_par'),
            ('paragraphs_len_bound', 'paragraphs_len'),
            ('words_len_all_bound', 'words_len_all'),
            ('sentences_len_all_bound', 'sentences_len_all')]


def get_features_bounds(df, name, col, bounds):
    """Add bucket-count columns for a list-valued column, modifying ``df`` in place.

    For consecutive edges ``(lower, upper]`` (the first lower edge is 0) a column
    ``{name}_{lower}_{upper}`` stores how many elements of each row's list fall
    into that interval; ``{name}_{last}`` stores how many exceed the last edge.

    Args:
        df: frame holding ``col``; receives the new columns.
        name: prefix for the generated column names.
        col: name of the column whose cells are iterables of numbers.
        bounds: non-empty ascending sequence of bucket edges.
    """
    lower_edges = [0] + list(bounds[:-1])
    for lower_bound, upper_bound in zip(lower_edges, bounds):
        df[f'{name}_{lower_bound}_{upper_bound}'] = df[col].apply(
            lambda values: sum(1 for el in values if lower_bound < el <= upper_bound)
        )

    # Overflow bucket: everything strictly above the last edge.
    df[f'{name}_{bounds[-1]}'] = df[col].apply(
        lambda values: sum(1 for el in values if el > bounds[-1])
    )
        
def get_paragraphs_features(features):
    """Bucket every list-valued column named in ``col_names`` and drop the originals.

    The bucket columns are added to ``features`` in place; the returned frame is
    a new one without the list-valued source columns.
    """
    for bound, (name, col) in zip(bounds, col_names):
        get_features_bounds(features, name, col, bound)

    source_columns = [source for _, source in col_names]
    return features.drop(source_columns, axis=1)
    
    
# Replace the list-valued columns by their bucket counts for both splits.
# NOTE: not idempotent — the source columns are dropped, so re-running this cell raises.
texts_features_train, texts_features_test = (
    get_paragraphs_features(table)
    for table in (texts_features_train, texts_features_test)
)

In [None]:
from nltk.probability import FreqDist

def get_prob_cum_sum(df):
    """Map every word to the cumulative relative frequency of all words up to it.

    Words from ``df['words_per_par']`` (per essay: list of paragraphs, each a list
    of tokens) are counted, sorted by ascending count (ties keep first-seen
    order) and assigned the running sum of their relative frequencies. Rare
    words therefore get values close to 0, the most frequent word gets ~1.

    The total token count is computed once. Calling ``FreqDist.freq`` per word
    re-sums all counts on every call, which made the original loop quadratic in
    the vocabulary size.

    Args:
        df: frame with a ``words_per_par`` column.

    Returns:
        dict ``word -> cumulative frequency``, in ascending-count order; empty
        when there are no words.
    """
    from collections import Counter

    counts = Counter(word for essay in df['words_per_par'].to_list() for par in essay for word in par)
    total = sum(counts.values())

    cum_sum_dict = {}
    running = 0.0
    # `sorted` is stable, so equally frequent words keep their first-seen order.
    for word, count in sorted(counts.items(), key=lambda item: item[1]):
        running = running + count / total
        cum_sum_dict[word] = running

    return cum_sum_dict

# Upper edges of the cumulative-frequency bands; the first band starts at 0.
words_freq_bounds = [0.05, 0.2, 0.5, 0.75, 0.95, 1]
def add_freq_bounds(df, features, cum_sum):
    """Count, per essay, how many words fall into each cumulative-frequency band.

    For every band ``(lower, upper]`` a column ``num_words_lower_{lower}_{upper}``
    is written into ``features`` (in place). Words missing from ``cum_sum`` are
    treated as 0 and therefore land in no band.

    Args:
        df: frame with a ``words_per_par`` column (lists of token lists).
        features: frame that receives the new columns.
        cum_sum: mapping ``word -> cumulative relative frequency``.
    """
    lower_edges = [0] + words_freq_bounds[:-1]
    for lower_bound, upper_bound in zip(lower_edges, words_freq_bounds):

        def words_in_band(paragraphs):
            return sum(
                1
                for par in paragraphs
                for word in par
                if lower_bound < cum_sum.get(word, 0) <= upper_bound
            )

        features[f'num_words_lower_{lower_bound}_{upper_bound}'] = df['words_per_par'].apply(words_in_band)
        
        
# Word frequencies come from the training essays only and are reused for the test split.
cum_sum = get_prob_cum_sum(train_data_df)
for split_df, split_features in (
    (train_data_df, texts_features_train),
    (test_data_df, texts_features_test),
):
    add_freq_bounds(split_df, split_features, cum_sum)

In [None]:
# Attach the essay ids and join the DeBERTa features on them (inner join).
texts_features_train = (
    texts_features_train
    .assign(essay_id=train_data_df['essay_id'])
    .merge(deberta_features_train, on='essay_id', how='inner')
)

texts_features_test = (
    texts_features_test
    .assign(essay_id=test_data_df['essay_id'])
    .merge(deberta_features_test, on='essay_id', how='inner')
)

In [None]:
# Attach the target by essay id rather than by row position: after the inner
# merge the row index is renumbered, so a positional assignment would silently
# pair features with the wrong scores whenever the merge dropped or duplicated rows.
score_by_essay = train_data_df.drop_duplicates('essay_id').set_index('essay_id')['score']
texts_features_train['score'] = texts_features_train['essay_id'].map(score_by_essay)
texts_features_train = texts_features_train.drop(['essay_id'], axis=1)

texts_features_test = texts_features_test.drop(['essay_id'], axis=1)
texts_features_train.head()

In [None]:
# NLTK English stop words as a list, passed to the "cleaned text" vectorizer below.
stopwords_list = stopwords.words('english')
# Vectorizer with identity tokenizer and preprocessor, n-grams of length 1..3.
# NOTE(review): the documents passed in are raw strings, so the identity tokenizer
# presumably yields single characters, making this a character n-gram model despite
# analyzer='word'. `strip_accents` is presumably ignored with a custom preprocessor — confirm.
tf_idf_char_level = TfidfVectorizer(
            tokenizer=lambda x: x,
            preprocessor=lambda x: x,
            token_pattern=None,
            strip_accents='unicode',
            analyzer = 'word',
            ngram_range=(1,3),
            min_df=0.05,
            max_df=0.95,
            sublinear_tf=True,
)

# Word unigrams on the cleaned text (`full_text_pr`), with a tighter min_df/max_df window.
tf_idf_word_level_clened_text = TfidfVectorizer(
    strip_accents='ascii',
    analyzer = 'word',
    ngram_range=(1,1),
    min_df=0.15,
    max_df=0.85,
    sublinear_tf=True,
    stop_words=stopwords_list,
)

# Word unigrams on the raw text. The same object is also bound to `word_vectorizer`,
# a name that is not referenced anywhere else in the visible notebook.
tf_idf_word_level_full_text = word_vectorizer = TfidfVectorizer(
    strip_accents='ascii',
    analyzer = 'word',
    ngram_range=(1,1),
    min_df=0.03,
    max_df=0.95,
    sublinear_tf=True
)

# Fit on the training essays only; the test essays are transformed with the fitted vocabularies later.
X_chars = tf_idf_char_level.fit_transform(train_data_df['full_text'])
X_words_full = tf_idf_word_level_full_text.fit_transform(train_data_df['full_text'])
X_words_cleaned = tf_idf_word_level_clened_text.fit_transform(train_data_df['full_text_pr'])

In [None]:
# Dense frames for the three TF-IDF matrices. The first keeps its integer column
# labels; the two word-level frames get prefixed positional names.
df_chars = pd.DataFrame(X_chars.toarray())

tfid_w_columns = [f'tfid_w_{i}' for i in range(X_words_full.shape[1])]
df_words = pd.DataFrame(X_words_full.toarray(), columns=tfid_w_columns)

tf_idf_w_cl_columns = [f'tfid_w_cl_{i}' for i in range(X_words_cleaned.shape[1])]
df_words_cleaned = pd.DataFrame(X_words_cleaned.toarray(), columns=tf_idf_w_cl_columns)

# Final training table: TF-IDF blocks followed by the hand-crafted / DeBERTa features.
data = pd.concat([df_chars, df_words, df_words_cleaned, texts_features_train], axis=1)

In [None]:
# Transform the test essays with the vectorizers fitted on the training essays.
X_chars_test = tf_idf_char_level.transform(test_data_df['full_text'])
X_words_full_test = tf_idf_word_level_full_text.transform(test_data_df['full_text'])
X_words_cleaned_test = tf_idf_word_level_clened_text.transform(test_data_df['full_text_pr'])

# Same column layout as on the training side.
df_chars_test = pd.DataFrame(X_chars_test.toarray())

tfid_w_columns_test = [f'tfid_w_{i}' for i in range(X_words_full_test.shape[1])]
df_words_test = pd.DataFrame(X_words_full_test.toarray(), columns=tfid_w_columns_test)

tf_idf_w_cl_columns_test = [f'tfid_w_cl_{i}' for i in range(X_words_cleaned_test.shape[1])]
df_words_cleaned_test = pd.DataFrame(X_words_cleaned_test.toarray(), columns=tf_idf_w_cl_columns_test)

# NOTE: this rebinds `test_data` (previously the Hugging Face Dataset) to the test feature table.
test_data = pd.concat([df_chars_test, df_words_test, df_words_cleaned_test, texts_features_test], axis=1)

In [None]:
test_data.head()

In [None]:
data.head()

# 5. Training the LightGBM model with a custom loss

In [None]:
from lightgbm import LGBMClassifier, LGBMRegressor
import lightgbm as lgb

In [None]:
# Loss and metric from https://www.kaggle.com/code/rsakata/optimize-qwk-by-lgb/notebook#QWK-objective
def quadratic_weighted_kappa(y_true, y_pred):
    """Evaluation metric: quadratic weighted kappa on the original score scale.

    Both inputs are shifted by the module-level offset ``a``; predictions are
    additionally clipped to [1, 6] before rounding.

    Returns:
        Tuple ``('QWK', value, True)``; the final ``True`` marks the metric as
        higher-is-better.
    """
    true_scores = (y_true + a).round()
    predicted_scores = (y_pred + a).clip(1, 6).round()
    kappa = cohen_kappa_score(true_scores, predicted_scores, weights="quadratic")
    return 'QWK', kappa, True

def qwk_obj(y_true, y_pred):
    """Custom objective: gradient and hessian of the ratio ``f / g``.

    ``f`` is half the squared error between the clipped predictions and the
    labels; ``g`` is half the sum of the squared deviation of the predictions
    from ``a`` plus ``b``. Both use the module-level constants ``a`` and ``b``.

    Returns:
        ``(grad, hess)`` where ``hess`` is a vector of ones.
    """
    labels = y_true + a
    preds = (y_pred + a).clip(1, 6)

    residual = preds - labels   # derivative of f w.r.t. preds
    centered = preds - a        # derivative of g w.r.t. preds

    f = 1/2*np.sum(residual**2)
    g = 1/2*np.sum(centered**2+b)

    n_samples = len(labels)
    grad = (residual/g - f*centered/g**2)*n_samples
    hess = np.ones(n_samples)
    return grad, hess

def qwk_param_calc(y):
    """Return the mean of ``y`` and ``E[y^2] - mean^2``, each rounded to 4 decimals."""
    mean_value = y.mean()
    second_moment = (y ** 2).mean()
    spread = second_moment - mean_value**2
    return np.round(mean_value, 4), np.round(spread, 4)

# Constants used by the QWK objective and metric.
# NOTE(review): presumably the output of `qwk_param_calc` on the training scores — confirm.
a = 2.998
b = 1.092

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, cohen_kappa_score
from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold

# Fixed seed so the fold assignment (and therefore the trained models) is reproducible.
SEED = 42

callbacks = [
    lgb.log_evaluation(period=25), 
    lgb.early_stopping(stopping_rounds=200,first_metric_only=True)
]

models = []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# The last column of `data` is the score; everything before it is a feature.
X_all, y_all = data.iloc[:, :-1], data.iloc[:, -1]

for train_idx, val_idx in skf.split(X_all, y_all):
    X_train, X_val = X_all.iloc[train_idx], X_all.iloc[val_idx]

    # Centre the targets around `a`, as the custom objective/metric expect.
    # New Series are built instead of using in-place `-=` on slices of `data`.
    y_train = y_all.iloc[train_idx] - a
    y_val = y_all.iloc[val_idx] - a

    # A separate name keeps the global `model` (the DeBERTa network used by
    # `get_predictions`) intact.
    regressor = LGBMRegressor(
            objective = qwk_obj, metrics = 'None', learning_rate = 0.1, max_depth = 5,
            num_leaves = 10, colsample_bytree=0.5, reg_alpha = 0.1, reg_lambda = 0.8,
            n_estimators=1024, verbosity = - 1
        )

    lgb_model = regressor.fit(
            X_train, y_train,
            eval_names=['train', 'valid'],
            eval_set=[(X_train, y_train), (X_val, y_val)],
            eval_metric=quadratic_weighted_kappa,
            callbacks=callbacks
        )
    
    models.append(lgb_model)

# 6. Making predictions

In [None]:
# Predict with every fold model and shift back to the original score scale.
# The loop variable is deliberately not called `model`, so the global DeBERTa
# model is not overwritten.
preds = [fold_model.predict(test_data) + a for fold_model in models]

# Combine the fold predictions in a separate frame. Writing them into `test_data`
# would change its feature columns and break `predict` when this cell is re-run.
fold_predictions = pd.DataFrame(
    {f"score_pred_{i}": pred for i, pred in enumerate(preds)},
    index=test_data.index,
)

# Average over however many models were trained, clip to the valid score range
# 1..6 (as the evaluation metric does) and round to the nearest integer score.
mean_prediction = fold_predictions.mean(axis=1).clip(1, 6)
test_data_df["score"] = np.round(mean_prediction, 0).astype('int32')

In [None]:
# Write the essay ids and their predicted scores to submission.csv, without the index column.
test_data_df[["essay_id", "score"]].to_csv("submission.csv", index=False)

In [None]:
# Feature importances of the most recently trained fold model (`lgb_model`),
# labelled with the feature column names and sorted ascending.
importances = lgb_model.feature_importances_
feature_columns = data.columns[:-1]
feature_importances = pd.Series(
    importances,
    index=feature_columns,
    name='Features Importance',
).sort_values()

# Horizontal bar chart of the 20 largest importances.
feature_importances.tail(20).to_frame().plot.barh(figsize=(10, 10))