In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
!pip install "/kaggle/input/autocorrect/autocorrect-2.6.1.tar"
!pip install "/kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl"
!pip install "/kaggle/input/pyphen-0100/Pyphen-0.10.0-py3-none-any.whl"

Processing /kaggle/input/autocorrect/autocorrect-2.6.1.tar
  Preparing metadata (setup.py) ... [?25l- \ done
[?25hBuilding wheels for collected packages: autocorrect
  Building wheel for autocorrect (setup.py) ... [?25l- \ done
[?25h  Created wheel for autocorrect: filename=autocorrect-2.6.1-py3-none-any.whl size=622383 sha256=013446ce1f1f04d742d3606ff3079cc02d602500499a45d24b3556f5e6675181
  Stored in directory: /root/.cache/pip/wheels/db/69/42/0fb0421d2fe70d195a04665edc760cfe5fd341d7bb8d8e0aaa
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-2.6.1
Processing /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2
Processing /kaggle/input/pyphen-0100/Pyphen-0.10.0-py3-none-any.whl
Installing collected packages: Pyphen
Successfully installed Pyphen-0.10.0


### Meta Data Cleansing


In [3]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import logging

# Initialize logging
logging.basicConfig(level=logging.INFO)

# Load Spacy model
nlp = spacy.load('en_core_web_sm')

class FeatureEngineering:
    
    def __init__(self, df):
        self.df = df
        self.df['grade'].fillna(0, inplace=True)  # Fill NA values in 'grade' with 0

    def classify_author(self, author):
        doc = nlp(author)
        for ent in doc.ents:
            if ent.label_ == 'PERSON':
                return 'person'
        return 'org'

    def encode_author_type(self):
        self.df['author_type'] = self.df['author'].apply(self.classify_author)
        le = LabelEncoder()
        self.df['author_type'] = le.fit_transform(self.df['author_type'])

    def frequency_encoding(self):
        logging.info("Applying Frequency Encoding on 'author'")
        self.df['author_frequency'] = self.df['author'].map(self.df['author'].value_counts())

    def one_hot_encoding(self):
        logging.info("Applying One-Hot Encoding on 'genre'")
        onehot_encoder = OneHotEncoder(sparse=False)
        genre_onehot = onehot_encoder.fit_transform(self.df[['genre']])
        df_onehot = pd.DataFrame(genre_onehot, columns=onehot_encoder.get_feature_names_out(['genre']))
        self.df = pd.concat([self.df, df_onehot], axis=1)

    def feature_scaling(self):
        logging.info("Applying Feature Scaling on 'lexile'")
        scaler = StandardScaler()
        self.df['lexile_scaled'] = scaler.fit_transform(self.df[['lexile']])

    def transform(self):
        self.encode_author_type()
        self.frequency_encoding()
#         self.one_hot_encoding()
        self.feature_scaling()
        return self.df

# Initialize FeatureEngineering class and apply transformations
prompt_grade = pd.read_csv(r'/kaggle/input/commonlit-texts/commonlit_texts.csv')
feature_engineer = FeatureEngineering(prompt_grade)
transformed_df = feature_engineer.transform()

# Display the transformed DataFrame
prompt_grade = transformed_df

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [4]:
prompt_grade

Unnamed: 0,title,author,description,grade,genre,lexile,path,is_prose,date,intro,excerpt,license,notes,author_type,author_frequency,lexile_scaled
0,"(love song, with two goldfish)",Grace Chua,The speaker describes a love story between two...,8,Poem,,/en/texts/love-song-with-two-goldfish,0,2003,Grace Chua is an award-winning journalist whos...,"[1]\n(He's a drifter,\nA “drifter” is a person...","""(love song, with two goldfish)"" from\n Quarte...",A “drifter” is a person who is continually mov...,1,2,
1,10 Things You Can Do to Avoid Fraud,Federal Trade Commission,The Federal Trade Commission discusses what pe...,10,Informational Text,1100.0,/en/texts/10-things-you-can-do-to-avoid-fraud,1,,Whether they come in the form of an email clai...,[1]\nInternational scam artists use clever sch...,"""10 Things You Can Do to Avoid Fraud"" by Feder...",to illegally obtain money from someone\nFoil\n...,0,1,0.463568
2,"100 years ago: An election, a virus and a cry ...",Michael E. Ruane,"In 2020, a news reporter takes a look back at ...",11,Informational Text,1140.0,/en/texts/100-years-ago-an-election-a-virus-an...,1,2020,Michael E. Ruane is a general assignment repor...,[1]\nA critical election loomed. The country w...,"""100 years ago: An election, a virus and a cry...",significant decline in economic activity\nStri...,1,1,0.625670
3,13 Concussions,Casey Cochran,"In this article, a former college football pla...",9,Informational Text,810.0,/en/texts/13-concussions,1,2016,Casey Cochran played college football at the U...,[1]\nIt was a beautiful night in late August. ...,"""13 Concussions"" from \nThe Players' Tribune\n...",Brigham Young University - the Cougar is their...,1,1,-0.711674
4,The 13th,Shelley Walden,"When the COVID-19 lockdown begins, a superstit...",6,Short Story,700.0,/en/texts/the-13th,1,2021,Superstitions exist in many different cultures...,[1]\nAs Angela stared out the school bus windo...,"""The 13th"" by Shelley Walden. Copyright © 202...",Coincidence\n \n(noun) : \nan event that happe...,1,2,-1.157455
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2394,"Yul Kwon, From Bullying Target to Reality TV Star",NPR Staff,Yul Kwon reflects on his decision to join the ...,9,News,1200.0,/en/texts/yul-kwon-from-bullying-target-to-rea...,1,2012,Yul Kwon's early life was mired with a host of...,[1]\nYul Kwon first earned his game-changer st...,"©2012 National Public Radio, Inc. News report ...",Phi Beta Kappa is an honorary society of colle...,0,9,0.868824
2395,Yusuf and the Great Big Brownie Mistake,Aisha Saeed,A boy burns the brownies he bakes for his fami...,5,Short Story,680.0,/en/texts/yusuf-and-the-great-big-brownie-mistake,1,2020,Aisha Saeed is the New York Times-bestselling ...,[1]\nEid lights twinkled along the curved entr...,From ONCE UPON AN EID edited by S. K. Ali and ...,Traditional\n \n(adjective) : \nof the ways of...,1,1,-1.238506
2396,Zap It!,Tracy Vonder Brink,Tracy Vonder Brink explains the science behind...,4,Informational Text,640.0,/en/texts/zap-it,1,2022,"In this informational text, Tracy Vonder Brink...","[1]\nIn 1946, a scientist named Percy Spencer ...","""Zap It!"" by Tracy Vonder Brink. Copyright © 2...",Energy\n \n(noun) : \nthe power needed to do w...,1,25,-1.400609
2397,Zebra and Wasp,Clare Mishica,A zebra helps a wasp trapped in a spider web.,3,Fable,540.0,/en/texts/zebra-and-wasp,1,2017,Clare Mishica has written for \nHighlights\n. ...,"[1]\nOne sunny morning, Zebra visited the rive...",All Highlights material is copyrighted by High...,Wail\n \n(verb) : \nto cry out in pain\na larg...,0,4,-1.805864


In [5]:
prompt_grade['description'].unique()

array(['The speaker describes a love story between two goldfish in a fish bowl.',
       'The Federal Trade Commission discusses what people can do to protect themselves against fraud.',
       'In 2020, a news reporter takes a look back at the difficult events Americans faced during the Roaring Twenties.',
       ...,
       'Tracy Vonder Brink explains the science behind how microwave ovens work.',
       'A zebra helps a wasp trapped in a spider web.',
       'Kathryn Hulick discusses real-life zombies and the organisms that control them.'],
      dtype=object)

In [6]:
keep_columns = ['title','author','description','grade','genre','lexile','lexile_scaled','is_prose','author_type','author_frequency']
prompt_grade = prompt_grade[keep_columns]

In [7]:
from typing import List
import numpy as np
import pandas as pd
import warnings
import logging
import os
import gc
import shutil
import json
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import get_polynomial_decay_schedule_with_warmup,get_cosine_schedule_with_warmup,get_linear_schedule_with_warmup
from transformers import DataCollatorWithPadding,DataCollatorForTokenClassification
from datasets import Dataset,load_dataset, load_from_disk
from transformers import TrainingArguments, Trainer
from datasets import load_metric, disable_progress_bar
from sklearn.metrics import mean_squared_error
import torch
from torch.nn.parameter import Parameter
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob
import optuna
import optuna.integration.lightgbm as lgb
import pyphen
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import Counter
from nltk import ne_chunk, word_tokenize, pos_tag
from nltk.sentiment import SentimentIntensityAnalyzer
from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs
from datasets import concatenate_datasets,load_dataset,load_from_disk
import spacy
import re
from autocorrect import Speller
from spellchecker import SpellChecker
import lightgbm as lgb
import time
import collections
from termcolor import colored
warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
disable_progress_bar()
tqdm.pandas()

def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
seed_everything(seed=42)



In [8]:
class CFG:
    model_name="/kaggle/input/debertav3base"
    preproc_type=4
    pooling="ClsPooling"
    learning_rate=1.5e-5
    warmup_ratio=0.01
    weight_decay=0.02
    hidden_dropout_prob=0.005
    attention_probs_dropout_prob=0.005
    num_layers_to_freeze=99
    num_train_epochs=5
    n_splits=4
    batch_size=12
    random_seed=42
    save_steps=75
    max_length=512
    adjustment_factor= 0.5

## Joining prmpt and meta data

In [9]:
def preprocess_and_join(df1, df2, df1_title_col, df2_title_col, grade_col):
    # Copy dataframes to avoid modifying the originals
    df1 = df1.copy()
    df2 = df2.copy()

    # Preprocess titles
    df1[df1_title_col] = df1[df1_title_col].str.replace('"', '').str.strip()
    df2[df2_title_col] = df2[df2_title_col].str.replace('"', '').str.strip()

    df2 = df2.drop_duplicates(subset=df2_title_col, keep='first')

    merged_df = df1.merge(df2, how='left', left_on=df1_title_col, right_on=df2_title_col)
    
    merged_df[grade_col] = merged_df[grade_col].fillna(0)
    merged_df[grade_col] = merged_df[grade_col].astype(int).astype('category')

 
    return merged_df

DATA_DIR = "/kaggle/input/commonlit-evaluate-student-summaries/"
prompts_train = pd.read_csv(DATA_DIR + "prompts_train.csv")
prompts_test = pd.read_csv(DATA_DIR + "prompts_test.csv")
summaries_train = pd.read_csv(DATA_DIR + "summaries_train.csv")
summaries_test = pd.read_csv(DATA_DIR + "summaries_test.csv")
sample_submission = pd.read_csv(DATA_DIR + "sample_submission.csv")
# prompt_grade = pd.read_csv(r'/kaggle/input/litess-titles/all_titles.csv')
prompts_train = preprocess_and_join(prompts_train, prompt_grade, 'prompt_title', 'title', 'grade')
prompts_test = preprocess_and_join(prompts_test, prompt_grade, 'prompt_title', 'title', 'grade')

In [10]:
prompts_train

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,title,author,description,grade,genre,lexile,lexile_scaled,is_prose,author_type,author_frequency
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,On Tragedy,Aristotle,"This excerpt from Aristotle's famous work ""Poe...",9,Philosophy,1070.0,0.341991,1,1,2
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,Egyptian Social Structure,USHistory.org,This informational text describes the social s...,7,Informational Text,890.0,-0.387469,1,0,42
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,The Third Wave,CommonLit Staff,"In 1967, a history teacher's social experiment...",9,Informational Text,1260.0,1.111977,1,0,24
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",Excerpt from The Jungle,Upton Sinclair,"In this disturbing piece of political fiction,...",11,Fiction - General,1400.0,1.679335,1,0,1


In [11]:
prompts_test

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,title,author,description,grade,genre,lexile,lexile_scaled,is_prose,author_type,author_frequency
0,abc123,Summarize...,Example Title 1,Heading\nText...,,,,0,,,,,,
1,def789,Summarize...,Example Title 2,Heading\nText...,,,,0,,,,,,


In [12]:
dic = pyphen.Pyphen(lang='en')
sid = SentimentIntensityAnalyzer()

class Preprocessor:
    def __init__(self, 
                model_name: str,
                ) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained(f"{model_name}")
        self.twd = TreebankWordDetokenizer()
        self.STOP_WORDS = set(stopwords.words('english'))
        
        self.spacy_ner_model = spacy.load('en_core_web_sm',)
        self.speller = Speller(lang='en')
        self.spellchecker = SpellChecker() 
        
    def calculate_text_similarity(self, row):
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform([row['prompt_text'], row['text']])
        return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2]).flatten()[0]
    
    def sentiment_analysis(self, text):
        analysis = TextBlob(text)
        return analysis.sentiment.polarity, analysis.sentiment.subjectivity
    
    def word_overlap_count(self, row):
        """ intersection(prompt_text, text) """        
        def check_is_stop_word(word):
            return word in self.STOP_WORDS
        
        prompt_words = row['prompt_tokens']
        summary_words = row['summary_tokens']
        if self.STOP_WORDS:
            prompt_words = list(filter(check_is_stop_word, prompt_words))
            summary_words = list(filter(check_is_stop_word, summary_words))
        return len(set(prompt_words).intersection(set(summary_words)))
            
    def ngrams(self, token, n):
        ngrams = zip(*[token[i:] for i in range(n)])
        return [" ".join(ngram) for ngram in ngrams]

    def ngram_co_occurrence(self, row, n: int) -> int:
        original_tokens = row['prompt_tokens']
        summary_tokens = row['summary_tokens']

        # Generate n-grams for the original text and summary
        original_ngrams = set(self.ngrams(original_tokens, n))
        summary_ngrams = set(self.ngrams(summary_tokens, n))

        # Calculate the number of common n-grams
        common_ngrams = original_ngrams.intersection(summary_ngrams)
        return len(common_ngrams)
    
    def ner_overlap_count(self, row, mode:str):
        model = self.spacy_ner_model
        def clean_ners(ner_list):
            return set([(ner[0].lower(), ner[1]) for ner in ner_list])
        prompt = model(row['prompt_text'])
        summary = model(row['text'])

        if "spacy" in str(model):
            prompt_ner = set([(token.text, token.label_) for token in prompt.ents])
            summary_ner = set([(token.text, token.label_) for token in summary.ents])
        elif "stanza" in str(model):
            prompt_ner = set([(token.text, token.type) for token in prompt.ents])
            summary_ner = set([(token.text, token.type) for token in summary.ents])
        else:
            raise Exception("Model not supported")

        prompt_ner = clean_ners(prompt_ner)
        summary_ner = clean_ners(summary_ner)

        intersecting_ners = prompt_ner.intersection(summary_ner)
        
        ner_dict = dict(Counter([ner[1] for ner in intersecting_ners]))
        
        if mode == "train":
            return ner_dict
        elif mode == "test":
            return {key: ner_dict.get(key) for key in self.ner_keys}

    
    def quotes_count(self, row):
        summary = row['text']
        text = row['prompt_text']
        quotes_from_summary = re.findall(r'"([^"]*)"', summary)
        if len(quotes_from_summary)>0:
            return [quote in text for quote in quotes_from_summary].count(True)
        else:
            return 0

    def spelling(self, text):
        
        wordlist=text.split()
        amount_miss = len(list(self.spellchecker.unknown(wordlist)))

        return amount_miss
    
    def calculate_unique_words(self,text):
        unique_words = set(text.split())
        return len(unique_words)
    
    def add_spelling_dictionary(self, tokens: List[str]) -> List[str]:
        """dictionary update for pyspell checker and autocorrect"""
        self.spellchecker.word_frequency.load_words(tokens)
        self.speller.nlp_data.update({token:1000 for token in tokens})
        
    def calculate_pos_ratios(self , text):
        pos_tags = pos_tag(nltk.word_tokenize(text))
        pos_counts = Counter(tag for word, tag in pos_tags)
        total_words = len(pos_tags)
        ratios = {tag: count / total_words for tag, count in pos_counts.items()}
        return ratios
    
    def calculate_punctuation_ratios(self,text):
        total_chars = len(text)
        punctuation_counts = Counter(char for char in text if char in '.,!?;:"()[]{}')
        ratios = {char: count / total_chars for char, count in punctuation_counts.items()}
        return ratios
    
    def calculate_keyword_density(self,row):
        keywords = set(row['prompt_text'].split())
        text_words = row['text'].split()
        keyword_count = sum(1 for word in text_words if word in keywords)
        return keyword_count / len(text_words)
    
    def count_syllables(self,word):
        hyphenated_word = dic.inserted(word)
        return len(hyphenated_word.split('-'))

    def flesch_reading_ease_manual(self,text):
        total_sentences = len(TextBlob(text).sentences)
        total_words = len(TextBlob(text).words)
        total_syllables = sum(self.count_syllables(word) for word in TextBlob(text).words)

        if total_sentences == 0 or total_words == 0:
            return 0

        flesch_score = 206.835 - 1.015 * (total_words / total_sentences) - 84.6 * (total_syllables / total_words)
        return flesch_score
    
    def flesch_kincaid_grade_level(self, text):
        total_sentences = len(TextBlob(text).sentences)
        total_words = len(TextBlob(text).words)
        total_syllables = sum(self.count_syllables(word) for word in TextBlob(text).words)

        if total_sentences == 0 or total_words == 0:
            return 0

        fk_grade = 0.39 * (total_words / total_sentences) + 11.8 * (total_syllables / total_words) - 15.59
        return fk_grade
    
    def gunning_fog(self, text):
        total_sentences = len(TextBlob(text).sentences)
        total_words = len(TextBlob(text).words)
        complex_words = sum(1 for word in TextBlob(text).words if self.count_syllables(word) > 2)

        if total_sentences == 0 or total_words == 0:
            return 0

        fog_index = 0.4 * ((total_words / total_sentences) + 100 * (complex_words / total_words))
        return fog_index
    
    def calculate_sentiment_scores(self,text):
        sentiment_scores = sid.polarity_scores(text)
        return sentiment_scores
    
    def count_difficult_words(self, text, syllable_threshold=3):
        words = TextBlob(text).words
        difficult_words_count = sum(1 for word in words if self.count_syllables(word) >= syllable_threshold)
        return difficult_words_count


    
    def run(self, 
            prompts: pd.DataFrame,
            summaries:pd.DataFrame,
            mode:str
        ) -> pd.DataFrame:
        
        # before merge preprocess
        prompts["prompt_length"] = prompts["prompt_text"].apply(
            lambda x: len(word_tokenize(x))
        )
        prompts["prompt_tokens"] = prompts["prompt_text"].apply(
            lambda x: word_tokenize(x)
        )

        summaries["summary_length"] = summaries["text"].apply(
            lambda x: len(word_tokenize(x))
        )
        summaries["summary_tokens"] = summaries["text"].apply(
            lambda x: word_tokenize(x)
        )
        
        # Add prompt tokens into spelling checker dictionary
        prompts["prompt_tokens"].apply(
            lambda x: self.add_spelling_dictionary(x)
        )
        
        prompts['gunning_fog_prompt'] = prompts['prompt_text'].apply(self.gunning_fog)
        prompts['flesch_kincaid_grade_level_prompt'] = prompts['prompt_text'].apply(self.flesch_kincaid_grade_level)
        prompts['flesch_reading_ease_prompt'] = prompts['prompt_text'].apply(self.flesch_reading_ease_manual)
      
        
        # count misspelling
        summaries["splling_err_num"] = summaries["text"].progress_apply(self.spelling)
        
        # merge prompts and summaries
        input_df = summaries.merge(prompts, how="left", on="prompt_id")
        input_df['flesch_reading_ease'] = input_df['text'].apply(self.flesch_reading_ease_manual)
        input_df['word_count'] = input_df['text'].apply(lambda x: len(x.split()))
        input_df['sentence_length'] = input_df['text'].apply(lambda x: len(x.split('.')))
        input_df['vocabulary_richness'] = input_df['text'].apply(lambda x: len(set(x.split())))

        input_df['word_count2'] = [len(t.split(' ')) for t in input_df.text]
        input_df['num_unq_words']=[len(list(set(x.lower().split(' ')))) for x in input_df.text]
        input_df['num_chars']= [len(x) for x in input_df.text]

        # Additional features
        input_df['avg_word_length'] = input_df['text'].apply(lambda x: np.mean([len(word) for word in x.split()]))
        input_df['comma_count'] = input_df['text'].apply(lambda x: x.count(','))
        input_df['semicolon_count'] = input_df['text'].apply(lambda x: x.count(';'))

        # after merge preprocess
        input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']
        
        input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_count, axis=1)
        input_df['bigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence,args=(2,), axis=1 
        )
        input_df['bigram_overlap_ratio'] = input_df['bigram_overlap_count'] / (input_df['summary_length'] - 1)
        
        input_df['trigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(3,), axis=1
        )
        input_df['trigram_overlap_ratio'] = input_df['trigram_overlap_count'] / (input_df['summary_length'] - 2)
        
        input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)
        
        input_df['exclamation_count'] = input_df['text'].apply(lambda x: x.count('!'))
        input_df['question_count'] = input_df['text'].apply(lambda x: x.count('?'))
        input_df['pos_ratios'] = input_df['text'].apply(self.calculate_pos_ratios)

        # Convert the dictionary of POS ratios into a single value (mean)
        input_df['pos_mean'] = input_df['pos_ratios'].apply(lambda x: np.mean(list(x.values())))
        input_df['punctuation_ratios'] = input_df['text'].apply(self.calculate_punctuation_ratios)

        # Convert the dictionary of punctuation ratios into a single value (sum)
        input_df['punctuation_sum'] = input_df['punctuation_ratios'].apply(lambda x: np.sum(list(x.values())))
        input_df['keyword_density'] = input_df.apply(self.calculate_keyword_density, axis=1)
        input_df['jaccard_similarity'] = input_df.apply(lambda row: len(set(word_tokenize(row['prompt_text'])) & set(word_tokenize(row['text']))) / len(set(word_tokenize(row['prompt_text'])) | set(word_tokenize(row['text']))), axis=1)
        tqdm.pandas(desc="Performing Sentiment Analysis")
        input_df[['sentiment_polarity', 'sentiment_subjectivity']] = input_df['text'].progress_apply(
            lambda x: pd.Series(self.sentiment_analysis(x))
        )
        tqdm.pandas(desc="Calculating Text Similarity")
        input_df['text_similarity'] = input_df.progress_apply(self.calculate_text_similarity, axis=1)
        #Calculate sentiment scores for each row
        input_df['sentiment_scores'] = input_df['text'].apply(self.calculate_sentiment_scores)
        
        input_df['gunning_fog'] = input_df['text'].apply(self.gunning_fog)
        input_df['flesch_kincaid_grade_level'] = input_df['text'].apply(self.flesch_kincaid_grade_level)
        input_df['count_difficult_words'] = input_df['text'].apply(self.count_difficult_words)

        # Convert sentiment_scores into individual columns
        sentiment_columns = pd.DataFrame(list(input_df['sentiment_scores']))
        input_df = pd.concat([input_df, sentiment_columns], axis=1)
        input_df['sentiment_scores_prompt'] = input_df['prompt_text'].apply(self.calculate_sentiment_scores)
        # Convert sentiment_scores_prompt into individual columns
        sentiment_columns_prompt = pd.DataFrame(list(input_df['sentiment_scores_prompt']))
        sentiment_columns_prompt.columns = [col +'_prompt' for col in sentiment_columns_prompt.columns]
        input_df = pd.concat([input_df, sentiment_columns_prompt], axis=1)
        columns =  ['pos_ratios', 'sentiment_scores', 'punctuation_ratios', 'sentiment_scores_prompt']
        cols_to_drop = [col for col in columns if col in input_df.columns]
        if cols_to_drop:
            input_df = input_df.drop(columns=cols_to_drop)
        
        print(cols_to_drop)
        return input_df.drop(columns=["summary_tokens", "prompt_tokens"])
    
preprocessor = Preprocessor(model_name=CFG.model_name)

## Group by grade instead of prompt_id

In [13]:
train = preprocessor.run(prompts_train, summaries_train, mode="train")
test = preprocessor.run(prompts_test, summaries_test, mode="test")

# Calculate the number of unique groups
n_unique_groups = train["grade"].nunique()

# Set n_splits to be the smaller of CFG.n_splits and the number of unique groups
n_splits = min(CFG.n_splits, n_unique_groups)
CFG.n_splits = n_splits

gkf = GroupKFold(n_splits=n_splits)
for i, (_, val_index) in enumerate(gkf.split(train, groups=train["grade"])):
    train.loc[val_index, "fold"] = i

100%|██████████| 7165/7165 [00:00<00:00, 8350.51it/s]
100%|██████████| 7165/7165 [00:00<00:00, 9213.43it/s]
100%|██████████| 7165/7165 [00:01<00:00, 5164.98it/s]
100%|██████████| 7165/7165 [00:01<00:00, 4443.31it/s]
100%|██████████| 7165/7165 [00:00<00:00, 81607.23it/s]
Performing Sentiment Analysis: 100%|██████████| 7165/7165 [00:05<00:00, 1418.45it/s]
Calculating Text Similarity: 100%|██████████| 7165/7165 [00:23<00:00, 302.73it/s]


['pos_ratios', 'sentiment_scores', 'punctuation_ratios', 'sentiment_scores_prompt']


Calculating Text Similarity: 100%|██████████| 4/4 [00:00<00:00, 6662.91it/s]
Calculating Text Similarity: 100%|██████████| 4/4 [00:00<00:00, 1724.63it/s]
Calculating Text Similarity: 100%|██████████| 4/4 [00:00<00:00, 2435.01it/s]
Calculating Text Similarity: 100%|██████████| 4/4 [00:00<00:00, 2287.28it/s]
Calculating Text Similarity: 100%|██████████| 4/4 [00:00<00:00, 2048.75it/s]
Performing Sentiment Analysis: 100%|██████████| 4/4 [00:00<00:00, 1106.82it/s]
Calculating Text Similarity: 100%|██████████| 4/4 [00:00<00:00, 243.96it/s]

['pos_ratios', 'sentiment_scores', 'punctuation_ratios', 'sentiment_scores_prompt']





In [14]:
train

Unnamed: 0,student_id,prompt_id,text,content,wording,summary_length,splling_err_num,prompt_question,prompt_title,prompt_text,...,count_difficult_words,neg,neu,pos,compound,neg_prompt,neu_prompt,pos_prompt,compound_prompt,fold
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,64,5,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,...,6,0.033,0.832,0.135,0.7845,0.027,0.873,0.100,0.9915,0.0
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,54,2,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",...,0,0.000,0.946,0.054,0.4310,0.086,0.879,0.035,-0.9949,2.0
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,269,32,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,...,16,0.047,0.814,0.139,0.9725,0.063,0.845,0.092,0.9283,1.0
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,28,5,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,...,3,0.000,1.000,0.000,0.0000,0.063,0.845,0.092,0.9283,1.0
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,232,29,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,...,13,0.000,0.896,0.104,0.9696,0.027,0.873,0.100,0.9915,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7160,ff7c7e70df07,ebad26,They used all sorts of chemical concoctions to...,0.205683,0.380538,76,9,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",...,3,0.031,0.881,0.088,0.4601,0.086,0.879,0.035,-0.9949,2.0
7161,ffc34d056498,3b9047,The lowest classes are slaves and farmers slav...,-0.308448,0.048171,49,7,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,...,2,0.167,0.784,0.049,-0.6808,0.063,0.845,0.092,0.9283,1.0
7162,ffd1576d2e1b,3b9047,they sorta made people start workin...,-1.408180,-0.493603,51,10,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,...,2,0.000,0.939,0.061,0.4404,0.063,0.845,0.092,0.9283,1.0
7163,ffe4a98093b2,39c16e,An ideal tragety has three elements that make ...,-0.393310,0.627128,63,5,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,...,0,0.105,0.527,0.368,0.9715,0.229,0.628,0.143,-0.9969,0.0


In [15]:
test

Unnamed: 0,student_id,prompt_id,text,summary_length,splling_err_num,prompt_question,prompt_title,prompt_text,title,author,...,flesch_kincaid_grade_level,count_difficult_words,neg,neu,pos,compound,neg_prompt,neu_prompt,pos_prompt,compound_prompt
0,000000ffffff,abc123,Example text 1,3,0,Summarize...,Example Title 1,Heading\nText...,,,...,1.313333,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,111111eeeeee,def789,Example text 2,3,0,Summarize...,Example Title 2,Heading\nText...,,,...,1.313333,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,222222cccccc,abc123,Example text 3,3,0,Summarize...,Example Title 1,Heading\nText...,,,...,1.313333,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,333333dddddd,def789,Example text 4,3,0,Summarize...,Example Title 2,Heading\nText...,,,...,1.313333,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [16]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions, squared=False)
    return {"rmse": rmse}

def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """
    preds, labels = eval_pred

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }

def compt_score(content_true, content_pred, wording_true, wording_pred):
    content_score = mean_squared_error(content_true, content_pred)**(1/2)
    wording_score = mean_squared_error(wording_true, wording_pred)**(1/2)
    return (content_score + wording_score)/2

def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True    
    
seed_everything(seed=42)

# Bertav3large

In [17]:
def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]#.detach().to('cpu').numpy()
        y_pred = y_preds[:,i]#.detach().to('cpu').numpy()
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def score_loss(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return {
        'mcrmse_score' : mcrmse_score,
        'Content_score' : scores[0],
        'Wording_score' : scores[1]
    }

In [18]:
oof_bert_large = pd.read_csv('/kaggle/input/commonlit-deberta-v3-large-models/oof_df.csv')
print(oof_bert_large.shape)
oof_bert_large.head()

(7165, 4)


Unnamed: 0,content,wording,pred_content,pred_wording
0,1.658802,-0.09315,0.836136,0.151125
1,-1.638511,-0.911973,-1.300229,-0.71605
2,0.205682,0.380538,0.19242,0.089422
3,-1.355562,-0.955801,-1.039375,-0.897786
4,0.205682,0.380538,-0.006736,0.227909


In [19]:
s = score_loss(np.array(oof_bert_large[['content' , 'wording']]) , np.array(oof_bert_large[['pred_content' , 'pred_wording']]))
s

{'mcrmse_score': 0.46615662019200665,
 'Content_score': 0.39647407605816687,
 'Wording_score': 0.5358391643258464}

In [20]:
class cfg:
    select = 'large'
    model_name = f'/kaggle/input/deberta-v3-{select}/deberta-v3-{select}'
    only_model_name = f'deberta-v3-{select}'
    fold = 4
    batch_size = 32
    freezing = True
    max_len = 1024
    pooling = 'GemText'
    path = '/kaggle/input/commonlit-deberta-v3-large-models/'
    #path = '/kaggle/input/commonlit-deberta-hidden-layers-mean/'
    #path = '/kaggle/input/commonlit-baseline/'
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    seed = 42
cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.path+'tokenizer/')
cfg.device , cfg.model_name

(device(type='cuda'), '/kaggle/input/deberta-v3-large/deberta-v3-large')

In [21]:
def get_logger(filename='Inference'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def set_seed(seed=42):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(cfg.seed)


LOGGER.info(f"max_len: {cfg.max_len}")
LOGGER.info(f"batch_size: {cfg.batch_size}")
LOGGER.info(f"Model name: {cfg.only_model_name}")

In [22]:
prompts_test1 = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv')
summary_test1 = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv')
submission1 = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv')

test1 = prompts_test1.merge(summary_test1, on="prompt_id")
test1

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text
0,abc123,Summarize...,Example Title 1,Heading\nText...,000000ffffff,Example text 1
1,abc123,Summarize...,Example Title 1,Heading\nText...,222222cccccc,Example text 3
2,def789,Summarize...,Example Title 2,Heading\nText...,111111eeeeee,Example text 2
3,def789,Summarize...,Example Title 2,Heading\nText...,333333dddddd,Example text 4


In [23]:
test1['full_text']=test1['prompt_question'] +" " + cfg.tokenizer.sep_token +" "+test1['text']
test1['full_text']

0    Summarize... [SEP] Example text 1
1    Summarize... [SEP] Example text 3
2    Summarize... [SEP] Example text 2
3    Summarize... [SEP] Example text 4
Name: full_text, dtype: object

In [24]:
def odd_layer_freeze(module):
    for i in range(1,24,2):
        for n,p in module.encoder.layer[i].named_parameters():
            p.requires_grad = False
            
def even_layer_freeze(module):
    for i in range(0,24,2):
        for n,p in module.encoder.layer[i].named_parameters():
            p.requires_grad = False
            
def top_half_layer_freeze(module):
    for i in range(0,13,1):
        for n,p in module.encoder.layer[i].named_parameters():
            p.requires_grad = False

def bottom_half_layer_freeze(module):
    for i in range(13,14,1):
        for n,p in module.encoder.layer[i].named_parameters():
            p.requires_grad = False

In [25]:
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    
    
class GeMText(nn.Module):
    def __init__(self, dim = 1, p=3, eps=1e-6):
        super(GeMText, self).__init__()
        self.dim = dim
        self.p = Parameter(torch.ones(1) * p)
        self.eps = eps
        self.feat_mult = 1

    def forward(self, last_hidden_state, attention_mask):
        attention_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.shape)
        x = (last_hidden_state.clamp(min=self.eps) * attention_mask_expanded).pow(self.p).sum(self.dim)
        ret = x / attention_mask_expanded.sum(self.dim).clip(min=self.eps)
        ret = ret.pow(1 / self.p)
        return ret



def get_pooling_layer():
    if cfg.pooling == 'Mean':
        return MeanPooling()
    
    elif cfg.pooling == 'Max':
        return MaxPooling()
    
    elif cfg.pooling == 'MeanMax':
        return MeanMaxPooling()
    
    elif cfg.pooling == 'GemText':
        return GeMText()


print(get_pooling_layer())

GeMText()


In [26]:
class BaselineModel1(nn.Module):
    def __init__(self, model_name ):
        super(BaselineModel1, self).__init__()
        
        self.model = AutoModel.from_pretrained(cfg.model_name)
        self.config = AutoConfig.from_pretrained(cfg.model_name)
        #self.drop = nn.Dropout(p=0.2)
        self.pooler = get_pooling_layer()

        if cfg.pooling == 'MeanMax':
            self.fc = nn.Linear(2*self.config.hidden_size, 2)
        else:
            self.fc = nn.Linear(self.config.hidden_size, 2)
            
        
        self._init_weights(self.fc)
        
        if cfg.freezing:
            top_half_layer_freeze(self.model)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
           
    def forward(self, ids, mask):
        out = self.model(input_ids=ids,attention_mask=mask,
                         output_hidden_states=False)
        out = self.pooler(out.last_hidden_state, mask)
        #out = self.drop(out)
        outputs = self.fc(out)
        return outputs

In [27]:
class TestDataset(Dataset):
    def __init__(self,df):
        self.df = df
        self.tokenizer = cfg.tokenizer
        self.max_len = cfg.max_len
        self.pq = df['prompt_question'].values
        self.text = df['text'].values
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self , index):
        pq   =   self.pq[index]
        text =   self.text[index]
        full_text = pq+" " + self.tokenizer.sep_token +" "+text
        inputs = self.tokenizer.encode_plus(
                        full_text,
                        truncation=True,
                        add_special_tokens=True,
                        max_length=self.max_len,
                        padding='max_length'
                        
                    )
        
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        
        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'attention_mask': torch.tensor(mask, dtype=torch.long)
        }
    
def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:, :mask_len]
    return inputs

In [28]:
@torch.no_grad()
def test_run(model , loader):   
    model.eval()
       
    preds = []
    bar = tqdm(enumerate(loader), total=len(loader))
    for idx , data in bar:
        inputs = collate(data)
        ids   =  inputs['input_ids'].to(cfg.device, dtype = torch.long)
        mask  =  inputs['attention_mask'].to(cfg.device, dtype = torch.long)
        y_preds = model(ids , mask)
        preds.append(y_preds.to('cpu').numpy())
    
    predictions = np.concatenate(preds)
    
    return predictions
    

In [29]:
test_dataset = TestDataset(test1)
test_loader = DataLoader(test_dataset , batch_size=cfg.batch_size ,num_workers=2, shuffle=False, pin_memory=True)

In [30]:
final_preds = []
for fold in range(cfg.fold):
    print('******** fold' , fold , '********')
    
    model  = BaselineModel1(cfg.model_name).to(cfg.device)
    model.load_state_dict(torch.load(f"/kaggle/input/commonlit-deberta-v3-large-models/deberta-v3-{cfg.select}_Fold_{fold}.pth", map_location=torch.device('cpu')))
    preds = test_run(model, test_loader)
    final_preds.append(preds)
    del model ; gc.collect()
    torch.cuda.empty_cache()
final_preds_ = np.mean(final_preds, axis=0)

******** fold 0 ********


100%|██████████| 1/1 [00:02<00:00,  2.32s/it]


******** fold 1 ********


100%|██████████| 1/1 [00:00<00:00,  5.29it/s]


******** fold 2 ********


100%|██████████| 1/1 [00:00<00:00,  4.38it/s]


******** fold 3 ********


100%|██████████| 1/1 [00:00<00:00,  4.92it/s]


In [31]:
## НАС ИНТЕРЕСУЮТ
final_preds_

array([[-1.409746 , -1.039957 ],
       [-1.4451523, -1.0264542],
       [-1.4189626, -1.0212438],
       [-1.4166303, -1.0149112]], dtype=float32)

In [32]:
final_preds_test_bert_large = pd.DataFrame(data=final_preds_, columns=["content_pred_debertav3large", "wording_pred_debertav3large"])

# Bert3vbase

In [33]:
from typing import List
import numpy as np
import pandas as pd
import warnings
import logging
import os
import shutil
import json
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset,load_dataset, load_from_disk
from transformers import TrainingArguments, Trainer
from datasets import load_metric, disable_progress_bar
from sklearn.metrics import mean_squared_error
import torch
from sklearn.model_selection import KFold, GroupKFold
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob
import optuna
import optuna.integration.lightgbm as lgb
import pyphen
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import Counter
from nltk import ne_chunk, word_tokenize, pos_tag
from nltk.sentiment import SentimentIntensityAnalyzer

import spacy
import re
from autocorrect import Speller
from spellchecker import SpellChecker
import lightgbm as lgb

warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
disable_progress_bar()
tqdm.pandas()

def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
seed_everything(seed=42)

In [34]:
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from datasets import Dataset

# Пулинги, такие как MaxPooling, ClsPooling и GeMPooling, 
# Широко используются в нейронных сетях для уменьшения размерности данных и извлечения наиболее важных признаков

# выполняет усреднение эмбеддингов последнего скрытого состояния из last_hidden_state
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
    # last_hidden_state - последний скрытый состояние, 
    # attention_mask - маска внимания
    
    # Метод forward является обязательным в классах,которые наследуются от nn.Module в библиотеке PyTorch. 
    # В этом методе определяется последовательность операций,которые модель должна выполнить при прямом проходе (forward pass).
    
    def forward(self, last_hidden_state, attention_mask):
        # используя unsqueeze и expand, маска внимания расширяется до того же размера, что и last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        
        # clamp используется для предотвращения деления на ноль
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    
class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e9
        max_embeddings, _ = torch.max(embeddings, dim = 1)
        return max_embeddings
    
#  выполняет операцию пулинга только для первого токена последнего скрытого состояния (last_hidden_state)    
class ClsPooling(nn.Module):
    def __init__(self):
        super(ClsPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        return last_hidden_state[:,0,:]
    
class MeanMax(nn.Module):
    def __init__(self):
        super(MeanMax, self).__init__()
        
        self.mean_pooler = MeanPooling()
        self.max_pooler  = MaxPooling()
        
    def forward(self, last_hidden_state, attention_mask):
        mean_pooler = self.mean_pooler( last_hidden_state ,attention_mask )
        max_pooler =  self.max_pooler( last_hidden_state ,attention_mask )
        out = torch.concat([mean_pooler ,max_pooler ] , 1)
        return out

# операция Generalized Mean Pooling (GeM) для текстовых данных   
class GeMText(nn.Module):
    def __init__(self, dim = 1, p=3, eps=1e-6):
        super(GeMText, self).__init__()
        # размерность
        self.dim = dim
        # показатель степени (p) 
        self.p = Parameter(torch.ones(1) * p)
        # и эпсилон (eps)
        self.eps = eps
        self.feat_mult = 1

    def forward(self, last_hidden_state, attention_mask):
        attention_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.shape)
        # вычисление формулы GeM
        x = (last_hidden_state.clamp(min=self.eps) * attention_mask_expanded).pow(self.p).sum(self.dim)
        ret = x / attention_mask_expanded.sum(self.dim).clip(min=self.eps)
        ret = ret.pow(1 / self.p)
        return ret
    
def get_pooling_layer(pooling):
    if pooling == 'Mean':
        return MeanPooling()
    
    elif pooling == 'Max':
        return MaxPooling()
    
    elif pooling == 'MeanMax':
        return MeanMax()
    
    elif pooling == 'GemText':
        return GeMText()
    
    elif pooling == 'ClsPooling':
        return ClsPooling()

In [35]:
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers.modeling_outputs import SequenceClassifierOutput 

class BaselineModel(nn.Module):
    def __init__(self, config, model_name="bert", ft=True):
        super(BaselineModel, self).__init__()
        if ft:
            self.model = AutoModel.from_pretrained(model_name)
        else:
            self.model = AutoModel.from_config(config)
        self.config = config
        self.problem_type = "regression"
        self.num_labels = 2
        self.pooler = get_pooling_layer(config.pooling)
            
        if config.pooling == 'MeanMax':
            # cоздается полносвязный линейный слой nn.Linear с размером входного пространства, 
            # равным удвоенному значению self.config.hidden_size, 
            # и размером выходного пространства, равным 2.
            self.fc = nn.Linear(2*self.config.hidden_size, 2)
        else:
            self.fc = nn.Linear(self.config.hidden_size, 2)
    
    # это входные параметры
    def forward(
        self,
        # input_ids- идентификаторы токенов входной последовательности.
        input_ids=None,
        attention_mask=None,
        # token_type_ids-идентификаторы типов токенов, которые используются для разделения последовательностей, если модель поддерживает множественные последовательности.
        token_type_ids=None, 
        # position_ids- идентификаторы позиций токенов в последовательности.
        position_ids=None,        
        #маска для скрытия определенных "heads" в модели (значение None означает использование всех "heads")
        head_mask=None,        
        #предварительно вычисленное вложение токенов, которое можно использовать вместо input_ids
        inputs_embeds=None,       
        #метки целевых значений для задачи обучения с учителем (например, классификации).
        labels=None,
        # флаг, указывающий модели сохранять сохранять внимание в выходных значениях.
        output_attentions=None,
        # флаг, указывающий модели сохранять скрытые состояния в выходных значениях
        output_hidden_states=False,
        # флаг, указывающий, следует ли возвращать выходные значения в виде словаря.
        return_dict=True):
        
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
#             head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        #  outputs.last_hidden_state - contextual embeddings входного текста
        # self.pooler - объединяет эмбеддинги для получения представления фиксированного размера
        out = self.pooler(outputs.last_hidden_state, attention_mask)
        # линейный слой для получения конечных выходных логитов
        logits = self.fc(out)

        loss = None
        if labels is not None:
            if self.problem_type is None:
                if self.num_labels == 1:
                    self.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.problem_type = "single_label_classification"
                else:
                    self.problem_type = "multi_label_classification"

            if self.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
                
        # если значение переменной return_dict равно False, то возвращается кортеж (logits,) с добавленными значениями из outputs начиная с индекса 2    
        # если loss не равно None, то возвращается кортеж ((loss,) + output), в противном случае возвращается кортеж output.
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output
        
        # Если значение переменной return_dict равно True, то функция возвращает объект класса SequenceClassifierOutput с аргументами loss, logits, hidden_states и attentions
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [36]:
# функция обучения батча
def train_n_infer(train,
                  val,
                  model_name="roberta-base",
                  preproc_type=1,
                  batch_size=4,
                  learning_rate=5e-5,
                  warmup_ratio=0,
                  weight_decay=0.01,
                  hidden_dropout_prob=0.0,
                  attention_probs_dropout_prob=0.0,
                  num_layers_to_freeze=0,
                  num_train_epochs=2,
                  save_steps=100,
                  random_seed=42,
                  max_length=512,
                  model_dir="bert"):
    
    
    # отбираем нужные нам поля из датасета
    train_content = train[["text", "prompt_question", "prompt_text", 'prompt_title',"content", "wording"]]
    val_content = val[["text", "prompt_question", "prompt_text", 'prompt_title', "content", "wording"]]
    test_ = test[["text", "prompt_question", "prompt_text", 'prompt_title']]
    
    
    # импортируем токенайзер и конфиг модели. Конфиг изменяем под задачу регрессии, тк будем использовать Trainer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model_config = AutoConfig.from_pretrained(model_name)
    # эти параметры config используются Trainer для дотюнивания модели
    model_config.update({
        "pooling": CFG.pooling,
        'num_labels': 2,
        'problem_type': 'regression'})
        
    seed_everything(seed=42) 
    # импортируем модель
    model_content = AutoModelForSequenceClassification.from_pretrained(model_name, config=model_config)
    # пользовательская реализация модели
    model_content = BaselineModel(config=model_config, model_name=model_name) 
    
    # фризим num_layers_to_freeze в модели
    for name, param in list(model_content.named_parameters())[: num_layers_to_freeze]:
        param.requires_grad = False
    
    # импортируем датасет из pandas в промежуточный формат
    train_dataset_content = Dataset.from_pandas(train_content, preserve_index=False) 
    val_dataset_content = Dataset.from_pandas(val_content, preserve_index=False) 
    
    # токенизация текста, обычной выборки
    def tokenize_function(examples):
        # это токен-разделитель, tokenizer уже инициализирован
        sep_token = tokenizer.sep_token
        labels = [examples["content"], examples["wording"]]
        
        if preproc_type == 1:
            example = examples["text"]
        elif preproc_type == 2:
            # объединение строки "prompt_question", sep_token и строки "text" из examples
            example = f"{examples['prompt_question']} {sep_token} {examples['text']}"

        elif preproc_type == 3:
            example = f"{examples['prompt_question']} {sep_token} {examples['text']}{sep_token} {examples['prompt_text']}"
        
        elif preproc_type == 4:
            example = f"{examples['prompt_title']} {sep_token} {examples['prompt_question']} {sep_token} {examples['text']}"
        elif preproc_type == 5:
            example = f"{examples['prompt_question']} {sep_token} {examples['prompt_title']}{sep_token} {examples['text']}"
        elif preproc_type == 6:
            example = f"{examples['text']} {sep_token} {examples['prompt_text ']}"
        # tokenizer уже инициализирован
        tokenized = tokenizer(example,
                         padding=False,
                         truncation=True,
                         max_length=max_length)       
        
        #tokenized.update({feature: value for feature, value in zip(numerical_features, numerical_values)})
    
        
        return {
            **tokenized,
            "labels": labels,       
        }
    
    # токенизация текста, тестовой выборки
    def tokenize_function_test(examples):
        sep_token = tokenizer.sep_token
 
        if preproc_type == 1:
            example = examples["text"]
        elif preproc_type == 2:
            example = f"{examples['prompt_question']} {sep_token} {examples['text']}"
        elif preproc_type == 3:
            example = f"{examples['prompt_question']} {sep_token} {examples['text']} {sep_token} {examples['prompt_text']}"
        elif preproc_type == 4:
            example = f"{examples['prompt_title']} {sep_token} {examples['prompt_question']} {sep_token} {examples['text']}"
        elif preproc_type == 5:
            example = f"{examples['prompt_question']} {sep_token} {examples['prompt_title']}{sep_token} {examples['text']}"
        elif preproc_type == 6:
            example = f"{examples['text']} {sep_token} {examples['prompt_text ']}"
         # example: текстовая строка для токенизации
        tokenized = tokenizer(example,
                        # padding=False: отключает добавление заполнения
                         padding=False,        
                       # truncation=True: разрешает обрезку текста, если он превышает максимальную длину
                         truncation=True,     
                        # max_length: максимальная длина токенизированной последовательности.
                         max_length=max_length)
        return tokenized
    
    # Токенизируем трейн сразу,чтобы не делать это при обучении
    # это уже датасет, к которому применен токенайзер
    train_tokenized_datasets_content = train_dataset_content.map(tokenize_function, batched=False)
    val_tokenized_datasets_content = val_dataset_content.map(tokenize_function, batched=False)
  
    # Токенизируем тест (аналогично)
    test_dataset = Dataset.from_pandas(test_, preserve_index=False) 
    test_tokenized_dataset = test_dataset.map(tokenize_function_test, batched=False)
    
    #test_tokenized_dataset['length_ratio']=test_tokenized_dataset['text_length'] / test_tokenized_dataset['prompt_length']
    
    # создаем итератор с токенизатором
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    # model_dir=bert
    model_dir = model_dir
    
    # инициализируем аргументы трейнера, передав model_dir и другие данные из конфига
    training_args = TrainingArguments(
        # model_dir=bert
        output_dir=model_dir,
        load_best_model_at_end=True,
        learning_rate=learning_rate,
        warmup_ratio=warmup_ratio,
#       gradient_accumulation_steps=64,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=8,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        report_to='none',
        greater_is_better=False,
        save_strategy="steps",
        evaluation_strategy="steps",
        eval_steps=save_steps,
        save_steps=save_steps,
        metric_for_best_model="mcrmse",
        save_total_limit=1
    )
       # создание трейнера 
    #  ВАЖНО- model_content(инициализирован раньше и содержит CFG) помещается в  trainer_content 
    trainer_content = Trainer(
        model=model_content,
        args=training_args,
        train_dataset=train_tokenized_datasets_content,
        eval_dataset=val_tokenized_datasets_content,
        tokenizer=tokenizer,
        compute_metrics=compute_mcrmse,
        data_collator=data_collator)
   
    # СТАРТУЕМ ОБУЧЕНИЕ
    trainer_content.train()
    best_check = os.listdir(model_dir)[0]
    
    # производим inference на тестовую выборку (model_content)
    model_content = BaselineModel(config=model_config, ft=False)
    model_content.load_state_dict(torch.load(f"{model_dir}/{best_check}/pytorch_model.bin"))
    model_content.eval()

    # задает конфигурацию для тестирования модели
    test_args = TrainingArguments(
        #  путь к директории, в которой будут сохранены результаты тестирования модели
        output_dir=model_dir,
        # False означает, что тестирование модели не будет сопровождаться обучением
        do_train = False,
        do_predict = True,
        # размер пакета (batch size) для оценки модели на каждом устройстве
        per_device_eval_batch_size = 4,  
        # False означает, что последний неполный пакет данных не будет отброшен при оценке модели.
        dataloader_drop_last = False,
    )
    # init trainer
    infer_content = Trainer(
                  model = model_content, 
                  tokenizer=tokenizer,
                  data_collator=data_collator,
                  args = test_args)
    # обученную модель натравливаем на тестовые данные
    val_results_content = infer_content.predict(val_tokenized_datasets_content)[0]
    test_results_content = infer_content.predict(test_tokenized_dataset)[0]
    
    # сохраняем модель
#     model_content.save_pretrained(model_dir)

    torch.save(model_content.state_dict(), f"{model_dir}/pytorch_model.bin")
    tokenizer.save_pretrained(model_dir)
    with open(f"{model_dir}/config.json", "w") as file:
        json.dump(model_config.to_dict(), file)
    
    shutil.rmtree(f"{model_dir}/{best_check}")

    return val_results_content, test_results_content

In [37]:
train.head(1)

Unnamed: 0,student_id,prompt_id,text,content,wording,summary_length,splling_err_num,prompt_question,prompt_title,prompt_text,...,count_difficult_words,neg,neu,pos,compound,neg_prompt,neu_prompt,pos_prompt,compound_prompt,fold
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,64,5,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,...,6,0.033,0.832,0.135,0.7845,0.027,0.873,0.1,0.9915,0.0


In [38]:
test

Unnamed: 0,student_id,prompt_id,text,summary_length,splling_err_num,prompt_question,prompt_title,prompt_text,title,author,...,flesch_kincaid_grade_level,count_difficult_words,neg,neu,pos,compound,neg_prompt,neu_prompt,pos_prompt,compound_prompt
0,000000ffffff,abc123,Example text 1,3,0,Summarize...,Example Title 1,Heading\nText...,,,...,1.313333,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,111111eeeeee,def789,Example text 2,3,0,Summarize...,Example Title 2,Heading\nText...,,,...,1.313333,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,222222cccccc,abc123,Example text 3,3,0,Summarize...,Example Title 1,Heading\nText...,,,...,1.313333,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,333333dddddd,def789,Example text 4,3,0,Summarize...,Example Title 2,Heading\nText...,,,...,1.313333,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [39]:
def get_oof_pred_n_test(train,
                        model_name="/kaggle/input/berttiny",
                        preproc_type=1,
                        n_splits=3,
                        batch_size=4,
                        learning_rate=5e-5,
                        warmup_ratio=0,
                        hidden_dropout_prob=0.0,
                        attention_probs_dropout_prob=0.0,
                        num_layers_to_freeze=0,
                        weight_decay=0.01,
                        num_train_epochs=2,
                        random_seed=42,
                        save_steps=100,
                        max_length=512
                       ):
    kfl = GroupKFold(n_splits=n_splits)
    oof_content = np.zeros((len(train), 2))
    test_pred_content = np.zeros((len(test), 2))

    model_name_ = model_name.split("/")[-1]
    if os.path.exists(model_name_):
        shutil.rmtree(model_name_)
    os.mkdir(model_name_)
    
    pred_dict=[]
    for i, (train_indx, val_indx) in enumerate(kfl.split(train, groups=train["prompt_id"])):
        print(f"fold {i}:")
        
        train_ = train.iloc[train_indx]
        val_ = train.iloc[val_indx]
        val_res_content, test_res_content  = train_n_infer(train_,
                                                           val_,
                                                           model_name=model_name,
                                                           preproc_type=preproc_type,
                                                           batch_size=batch_size,
                                                           learning_rate=learning_rate,
                                                           warmup_ratio=warmup_ratio,
                                                           hidden_dropout_prob=hidden_dropout_prob,
                                                           attention_probs_dropout_prob=attention_probs_dropout_prob,
                                                           num_layers_to_freeze=num_layers_to_freeze,
                                                           weight_decay=weight_decay,
                                                           num_train_epochs=num_train_epochs,
                                                           save_steps=save_steps,
                                                           max_length=max_length,
                                                           random_seed=random_seed,
                                                           model_dir=f"{model_name_}/fold_{i}"
                                                           )

        oof_content[val_indx] += val_res_content
        test_pred_content = test_pred_content+test_res_content/n_splits
        
        pred_dict.append(test_pred_content)       
    
    oof_train = pd.DataFrame(oof_content, columns=[f"content_pred_{model_name_}", f"wording_pred_{model_name_}"])
    test_pred = pd.DataFrame(test_pred_content, columns=[f"content_pred_{model_name_}", f"wording_pred_{model_name_}"])
    
    display('test_pred',test_pred )
    
    train.loc[:, [f"content_pred_{model_name_}", f"wording_pred_{model_name_}" ] ] = oof_content
    #test=pd.concat([test, test_pred[[f"content_pred_{model_name_}", f"wording_pred_{model_name_}"]]], axis=1)
    
    
    cv_metric = compute_mcrmse((oof_train.values, train[["content", "wording"]]))
    cv_metric["preproc_type"] = preproc_type
    print(f"cv mcrmse: {cv_metric}")
    with open(f"{model_name_}/cv_metric.json", "w") as outfile:
        json.dump(cv_metric, outfile)
    oof_train.to_csv(f"{model_name_}/oof_train.csv", index=False)
    
    return oof_train, pred_dict, test_pred


In [40]:
oof_train, pred_dict, test_pred  = get_oof_pred_n_test(train,
                                            model_name=CFG.model_name,
                                            preproc_type=CFG.preproc_type,
                                            learning_rate=CFG.learning_rate,
                                            warmup_ratio=CFG.warmup_ratio,
                                            hidden_dropout_prob=CFG.hidden_dropout_prob,
                                            attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
                                            num_layers_to_freeze=CFG.num_layers_to_freeze,
                                            weight_decay=CFG.weight_decay,
                                            num_train_epochs=CFG.num_train_epochs,
                                            n_splits=CFG.n_splits,
                                            batch_size=CFG.batch_size,
                                            random_seed=CFG.random_seed,
                                            save_steps=CFG.save_steps,
                                            max_length=CFG.max_length
                                           )

test=pd.concat([test, test_pred[['content_pred_debertav3base', 'wording_pred_debertav3base']]], axis=1)

fold 0:


Step,Training Loss,Validation Loss,Content Rmse,Wording Rmse,Mcrmse
75,No log,0.444445,0.56623,0.753838,0.660034
150,No log,0.330486,0.510638,0.632629,0.571634
225,No log,0.293478,0.447347,0.621961,0.534654
300,No log,0.278754,0.460047,0.588103,0.524075
375,No log,0.248177,0.40587,0.575868,0.490869
450,No log,0.32122,0.419784,0.682803,0.551294
525,0.425200,0.243258,0.433594,0.546364,0.489979
600,0.425200,0.2775,0.409735,0.622187,0.515961
675,0.425200,0.26423,0.473515,0.551582,0.512549
750,0.425200,0.28303,0.423461,0.621886,0.522673


fold 1:


Step,Training Loss,Validation Loss,Content Rmse,Wording Rmse,Mcrmse
75,No log,0.599278,0.687909,0.851667,0.769788
150,No log,0.976132,0.828362,1.125203,0.976782
225,No log,0.757827,0.70743,1.00757,0.8575
300,No log,0.55227,0.585351,0.872871,0.729111
375,No log,0.647895,0.622232,0.953214,0.787723
450,No log,0.531379,0.594192,0.842433,0.718313
525,0.395600,0.544913,0.595119,0.857706,0.726412
600,0.395600,0.50684,0.522585,0.860572,0.691579
675,0.395600,0.475758,0.556611,0.801062,0.678836
750,0.395600,0.526155,0.54968,0.866118,0.707899


fold 2:


Step,Training Loss,Validation Loss,Content Rmse,Wording Rmse,Mcrmse
75,No log,0.568231,0.617272,0.869158,0.743215
150,No log,0.540975,0.664079,0.800593,0.732336
225,No log,0.437319,0.569496,0.74183,0.655663
300,No log,0.424212,0.578901,0.716448,0.647675
375,No log,0.423773,0.559758,0.730902,0.64533
450,No log,0.461308,0.618956,0.734513,0.676735
525,0.408900,0.384622,0.542801,0.68892,0.61586
600,0.408900,0.415901,0.587855,0.697302,0.642578
675,0.408900,0.383384,0.52401,0.701556,0.612783
750,0.408900,0.409173,0.593443,0.682768,0.638105


'test_pred'

Unnamed: 0,content_pred_debertav3base,wording_pred_debertav3base
0,-1.749442,-1.373426
1,-1.739721,-1.36867
2,-1.775637,-1.398801
3,-1.763253,-1.384465


cv mcrmse: {'content_rmse': 0.5082982027308606, 'wording_rmse': 0.6490997096034177, 'mcrmse': 0.5786989561671392, 'preproc_type': 4}


In [41]:
test=pd.concat([test, final_preds_test_bert_large], axis=1)

oof_bert_large=oof_bert_large.rename(columns={"pred_content": "content_pred_debertav3large",
                                              "pred_wording": "wording_pred_debertav3large"})

train=pd.concat([train, oof_bert_large[['content_pred_debertav3large', 
                                        'wording_pred_debertav3large']]], axis=1)

In [42]:
sample_submission["content"] = test_pred.values[:, 0]
sample_submission["wording"] = test_pred.values[:, 1]

sample_submission

Unnamed: 0,student_id,content,wording
0,000000ffffff,-1.749442,-1.373426
1,111111eeeeee,-1.739721,-1.36867
2,222222cccccc,-1.775637,-1.398801
3,333333dddddd,-1.763253,-1.384465


In [43]:
train

Unnamed: 0,student_id,prompt_id,text,content,wording,summary_length,splling_err_num,prompt_question,prompt_title,prompt_text,...,compound,neg_prompt,neu_prompt,pos_prompt,compound_prompt,fold,content_pred_debertav3base,wording_pred_debertav3base,content_pred_debertav3large,wording_pred_debertav3large
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,64,5,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,...,0.7845,0.027,0.873,0.100,0.9915,0.0,-0.095889,0.499332,0.836136,0.151125
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,54,2,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",...,0.4310,0.086,0.879,0.035,-0.9949,2.0,-0.731909,-0.289967,-1.300229,-0.716050
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,269,32,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,...,0.9725,0.063,0.845,0.092,0.9283,1.0,2.155892,2.109928,0.192420,0.089422
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,28,5,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,...,0.0000,0.063,0.845,0.092,0.9283,1.0,-0.731285,-0.838597,-1.039375,-0.897786
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,232,29,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,...,0.9696,0.027,0.873,0.100,0.9915,0.0,1.709265,1.803383,-0.006736,0.227909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7160,ff7c7e70df07,ebad26,They used all sorts of chemical concoctions to...,0.205683,0.380538,76,9,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",...,0.4601,0.086,0.879,0.035,-0.9949,2.0,-0.288010,-0.267796,-0.363451,-0.328194
7161,ffc34d056498,3b9047,The lowest classes are slaves and farmers slav...,-0.308448,0.048171,49,7,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,...,-0.6808,0.063,0.845,0.092,0.9283,1.0,-0.153699,-0.074170,-0.835059,-0.137106
7162,ffd1576d2e1b,3b9047,they sorta made people start workin...,-1.408180,-0.493603,51,10,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,...,0.4404,0.063,0.845,0.092,0.9283,1.0,-0.618377,-0.463432,0.335535,0.958109
7163,ffe4a98093b2,39c16e,An ideal tragety has three elements that make ...,-0.393310,0.627128,63,5,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,...,0.9715,0.229,0.628,0.143,-0.9969,0.0,0.030218,0.400325,-0.360416,-0.371727


In [44]:
test

Unnamed: 0,student_id,prompt_id,text,summary_length,splling_err_num,prompt_question,prompt_title,prompt_text,title,author,...,pos,compound,neg_prompt,neu_prompt,pos_prompt,compound_prompt,content_pred_debertav3base,wording_pred_debertav3base,content_pred_debertav3large,wording_pred_debertav3large
0,000000ffffff,abc123,Example text 1,3,0,Summarize...,Example Title 1,Heading\nText...,,,...,0.0,0.0,0.0,1.0,0.0,0.0,-1.749442,-1.373426,-1.409746,-1.039957
1,111111eeeeee,def789,Example text 2,3,0,Summarize...,Example Title 2,Heading\nText...,,,...,0.0,0.0,0.0,1.0,0.0,0.0,-1.739721,-1.36867,-1.445152,-1.026454
2,222222cccccc,abc123,Example text 3,3,0,Summarize...,Example Title 1,Heading\nText...,,,...,0.0,0.0,0.0,1.0,0.0,0.0,-1.775637,-1.398801,-1.418963,-1.021244
3,333333dddddd,def789,Example text 4,3,0,Summarize...,Example Title 2,Heading\nText...,,,...,0.0,0.0,0.0,1.0,0.0,0.0,-1.763253,-1.384465,-1.41663,-1.014911


## Boosting

In [45]:
targets = ["content", "wording"]

drop_columns = ["fold", "student_id", "prompt_id", "text",
                "prompt_question", "prompt_title", 
                "prompt_text","title", "author", "description", "genre"
               ] + targets

import optuna
import lightgbm as lgb

def objective(trial, X_train_cv, y_train_cv, X_eval_cv, y_eval_cv):
    dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
    dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)
    max_depth = trial.suggest_int('max_depth', 2, 10) # 2, 10
    params = {
        'boosting_type': 'goss', # 'gbdt',
        'random_state': 0, # 42,
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1), # 0.01, 0.1
        'max_depth': max_depth,
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 2.5), # 10.0
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 2.5), # 10.0
        'num_leaves': trial.suggest_int('num_leaves', 2, 2**max_depth - 1),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9), # added
        'min_child_weight': trial.suggest_float('min_child_weight', 0.5, 5), # added
        'n_estimators': trial.suggest_int('n_estimators', 50, 350), # added
        'verbosity': -1  # Add this line to suppress warnings and info messages
    }

    evaluation_results = {}
    model = lgb.train(params,
                      num_boost_round=70000, # 10000
                      valid_names=['train', 'valid'],
                      train_set=dtrain,
                      valid_sets=dval,
                      verbose_eval=1000,
                      early_stopping_rounds=80, # 30
                      callbacks=[lgb.record_evaluation(evaluation_results)])

    # Use the last metric for early stopping
    evals_result = model.best_score
    last_metric = list(evals_result.values())[-1]
    trial.set_user_attr('best_model', model)  # Save the model in the trial
    return last_metric[list(last_metric.keys())[-1]]

model_dict = {}

for target in targets:
    models = []
    
    for fold in range(CFG.n_splits):
        X_train_cv = train[train["fold"] != fold].drop(columns=drop_columns)
        y_train_cv = train[train["fold"] != fold][target]

        X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
        y_eval_cv = train[train["fold"] == fold][target]

        study = optuna.create_study(direction='minimize')
        study.optimize(lambda trial: objective(trial, X_train_cv, y_train_cv, X_eval_cv, y_eval_cv), n_trials=1000) # 100
        print('FOLD: ', fold)
        print('Best trial: score {}, params {}'.format(study.best_value, study.best_params))

        best_model = study.trials[study.best_trial.number].user_attrs['best_model']
        models.append(best_model)
    
    model_dict[target] = models

Training until validation scores don't improve for 80 rounds
Early stopping, best iteration is:
[88]	train's rmse: 0.471688
Training until validation scores don't improve for 80 rounds
Did not meet early stopping. Best iteration is:
[195]	train's rmse: 0.564102
Training until validation scores don't improve for 80 rounds
Did not meet early stopping. Best iteration is:
[300]	train's rmse: 0.457959
Training until validation scores don't improve for 80 rounds
Early stopping, best iteration is:
[47]	train's rmse: 0.465537
Training until validation scores don't improve for 80 rounds
Did not meet early stopping. Best iteration is:
[345]	train's rmse: 0.462964
Training until validation scores don't improve for 80 rounds
Did not meet early stopping. Best iteration is:
[243]	train's rmse: 0.456761
Training until validation scores don't improve for 80 rounds
Early stopping, best iteration is:
[44]	train's rmse: 0.472095
Training until validation scores don't improve for 80 rounds
Did not meet ea

In [46]:
model_dict

{'content': [<lightgbm.basic.Booster at 0x78fafb102440>,
  <lightgbm.basic.Booster at 0x78fafa6ea9e0>,
  <lightgbm.basic.Booster at 0x78fafb1830d0>],
 'wording': [<lightgbm.basic.Booster at 0x78fafb331000>,
  <lightgbm.basic.Booster at 0x78fafa9c5b70>,
  <lightgbm.basic.Booster at 0x78fafb3c20e0>]}

In [47]:
rmses = []

for target in targets:
    models = model_dict[target]

    preds = []
    trues = []
    
    for fold, model in enumerate(models):
        X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
        y_eval_cv = train[train["fold"] == fold][target]

        pred = model.predict(X_eval_cv)

        trues.extend(y_eval_cv)
        preds.extend(pred)
        
    rmse = np.sqrt(mean_squared_error(trues, preds))
    print(f"{target}_rmse : {rmse}")
    rmses = rmses + [rmse]

print(f"mcrmse : {sum(rmses) / len(rmses)}")

content_rmse : 0.45844892064052306
wording_rmse : 0.5876285245851536
mcrmse : 0.5230387226128383


# Submission

In [48]:
drop_columns = [
                #"fold", 
                "student_id", "prompt_id", "text",
                "prompt_question", "prompt_title", 
                "prompt_text","title", "author", "description", "genre"]

In [49]:
pred_dict = {}
for target in targets:
    models = model_dict[target]
    preds = []

    for fold, model in enumerate(models):
        X_eval_cv = test.drop(columns=drop_columns)
    
        pred = model.predict(X_eval_cv)
        preds.append(pred)
    
    pred_dict[target] = preds
    
    
for target in targets:
    preds = pred_dict[target]
    for i, pred in enumerate(preds):
        test[f"{target}_pred_{i}"] = pred

    medians = test[[f'{target}_pred_{fold}' for fold in range(CFG.n_splits)]].median(axis=1)

    std_devs = test[[f'{target}_pred_{fold}' for fold in range(CFG.n_splits)]].std(axis=1)
    
    adjusted_medians = medians + (CFG.adjustment_factor * std_devs)

    test[target] = adjusted_medians

    display(test)

Unnamed: 0,student_id,prompt_id,text,summary_length,splling_err_num,prompt_question,prompt_title,prompt_text,title,author,...,pos_prompt,compound_prompt,content_pred_debertav3base,wording_pred_debertav3base,content_pred_debertav3large,wording_pred_debertav3large,content_pred_0,content_pred_1,content_pred_2,content
0,000000ffffff,abc123,Example text 1,3,0,Summarize...,Example Title 1,Heading\nText...,,,...,0.0,0.0,-1.749442,-1.373426,-1.409746,-1.039957,-1.598788,-1.727969,-1.269791,-1.480669
1,111111eeeeee,def789,Example text 2,3,0,Summarize...,Example Title 2,Heading\nText...,,,...,0.0,0.0,-1.739721,-1.36867,-1.445152,-1.026454,-1.598788,-1.727969,-1.269791,-1.480669
2,222222cccccc,abc123,Example text 3,3,0,Summarize...,Example Title 1,Heading\nText...,,,...,0.0,0.0,-1.775637,-1.398801,-1.418963,-1.021244,-1.598788,-1.727969,-1.269791,-1.480669
3,333333dddddd,def789,Example text 4,3,0,Summarize...,Example Title 2,Heading\nText...,,,...,0.0,0.0,-1.763253,-1.384465,-1.41663,-1.014911,-1.598788,-1.727969,-1.269791,-1.480669


Unnamed: 0,student_id,prompt_id,text,summary_length,splling_err_num,prompt_question,prompt_title,prompt_text,title,author,...,content_pred_debertav3large,wording_pred_debertav3large,content_pred_0,content_pred_1,content_pred_2,content,wording_pred_0,wording_pred_1,wording_pred_2,wording
0,000000ffffff,abc123,Example text 1,3,0,Summarize...,Example Title 1,Heading\nText...,,,...,-1.409746,-1.039957,-1.598788,-1.727969,-1.269791,-1.480669,-1.338332,-0.742696,-1.421624,-1.15319
1,111111eeeeee,def789,Example text 2,3,0,Summarize...,Example Title 2,Heading\nText...,,,...,-1.445152,-1.026454,-1.598788,-1.727969,-1.269791,-1.480669,-1.338332,-0.742696,-1.421624,-1.15319
2,222222cccccc,abc123,Example text 3,3,0,Summarize...,Example Title 1,Heading\nText...,,,...,-1.418963,-1.021244,-1.598788,-1.727969,-1.269791,-1.480669,-1.338332,-0.742696,-1.421624,-1.15319
3,333333dddddd,def789,Example text 4,3,0,Summarize...,Example Title 2,Heading\nText...,,,...,-1.41663,-1.014911,-1.598788,-1.727969,-1.269791,-1.480669,-1.338332,-0.742696,-1.421624,-1.15319


In [50]:
test[["student_id", "content", "wording"]].to_csv("submission.csv", index=False)