In [1]:
import numpy as np 
import pandas as pd 
import torch
import random
import torch.nn as nn
import os
from torch.utils.data import Dataset,DataLoader
from transformers import AutoTokenizer,AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split



In [2]:
data_path = "/kaggle/input/commonlit-evaluate-student-summaries/"

prompts_train = pd.read_csv(data_path + "prompts_train.csv")
prompts_test = pd.read_csv(data_path + "prompts_test.csv")
summaries_train = pd.read_csv(data_path + "summaries_train.csv")
summaries_test = pd.read_csv(data_path + "summaries_test.csv")
sample_submission = pd.read_csv(data_path + "sample_submission.csv")

train = summaries_train.merge(prompts_train, on="prompt_id")
test = summaries_test.merge(prompts_test, on="prompt_id")

In [3]:
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/debertav3base")

model = AutoModelForSequenceClassification.from_pretrained("/kaggle/input/debertav3base")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/debertav3base and are newly initialized: ['pooler.dense.weight', 'classifier.weight', 'classifier.bias', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# EDA

In [4]:
# Feature Engeneering
# Text Word Count
train['text_word_count'] = train['text'].apply(lambda x: len(x.split()))

train['summary_text_word_count'] = train['prompt_text'].apply(lambda x: len(x.split()))

train['summary_to_original_ratio'] = train['text_word_count'] / train['summary_text_word_count']

# Test
test['text_word_count'] = test['text'].apply(lambda x: len(x.split()))

test['summary_text_word_count'] = test['prompt_text'].apply(lambda x: len(x.split()))

test['summary_to_original_ratio'] = test['text_word_count'] / test['summary_text_word_count']

In [5]:
#from transformers import pipeline
# Sentiment Analysis
# Obtendremos la diferencia en el tono entre el resumen y el texto original

#sentiment_analysis = pipeline('sentiment-analysis') # device = 0, con CUDA

#def get_sentiment(text):
    #result = sentiment_analysis(text)
    #if result[0]['label'] = 'POSITIVE':
        #return result[0]['score']
    #else:
        #return -result[0]['score']
    
#train['sentiment_text'] = train['text'].apply(get_sentiment)
#train['sentiment_prompt'] = train['prompt_text'].apply(get_sentiment)
#train['sentiment_difference'] = train['sentiment_prompt'] - train['sentiment_text']

In [6]:
class DatasetSummary(Dataset):
    def __init__(self,data,tokenizer):
        super().__init__()
        self.data = data
        self.tokenizer = tokenizer
        
        self.features = self.data[['summary_to_original_ratio', 'text_word_count']].values
        self.text = self.data["text"].tolist()
        self.text = self.get_token(self.text)                
        
    def __getitem__(self,index):
        input_ids = self.text['input_ids'][index]
        attention_mask = self.text['attention_mask'][index]
        
        if 'content' not in self.data.columns:
            return {'input_ids':input_ids,
                   'attention_mask':attention_mask,
                   'features' : torch.tensor(self.features[index], dtype=torch.float32)}
        else:
            content = self.data["content"].tolist()[index]
            wording = self.data["wording"].tolist()[index]

            return {'input_ids' : input_ids,
                    'attention_mask': attention_mask,
                    'content' : content,
                    'wording' : wording,
                    'features' : torch.tensor(self.features[index], dtype=torch.float32)}
            
            
    def __len__(self):
        return len(self.data['text'])
    
    def get_token(self,text):
        return self.tokenizer.batch_encode_plus(text,
                                         padding=True,
                                         truncation=True,
                                         max_length=512,
                                         return_tensors="pt")


In [7]:
batch_size = 12

target = ['content','wording']
datas = ['text', 'text_word_count', 'summary_to_original_ratio']

data = train.loc[:,datas]
label = train.loc[:,target]

train_data,val_data,train_label,val_label = train_test_split(data,label,test_size=0.2,random_state=42)

train_data = pd.concat([train_data,train_label],axis=1)
val_data = pd.concat([val_data,val_label],axis=1)

train_dataset = DatasetSummary(train_data,tokenizer)
train_loader = DataLoader(train_dataset,shuffle=False,batch_size=batch_size)
    
val_dataset = DatasetSummary(val_data,tokenizer)
val_loader = DataLoader(val_dataset,shuffle=False,batch_size=batch_size)

In [8]:
next(iter(train_loader))

{'input_ids': tensor([[    1, 17246,   262,  ...,     0,     0,     0],
         [    1,   279,   728,  ...,     0,     0,     0],
         [    1,   279, 52789,  ...,     0,     0,     0],
         ...,
         [    1,   450,   338,  ...,     0,     0,     0],
         [    1,   816, 11647,  ...,     0,     0,     0],
         [    1,  9339,   728,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'content': tensor([-0.9106, -0.6643,  0.3884,  0.7507, -1.3325,  0.2057, -1.6385, -0.7826,
         -0.3933,  0.3884,  1.5797,  0.2915], dtype=torch.float64),
 'wording': tensor([-0.0818, -0.5107, -0.7180, -0.1295, -1.0056,  0.3805, -0.9120, -0.2460,
          0.6271, -0.7180,  1.7133,  1.0426], dtype=torch.float64),
 'features': tensor([[7.1429e-02, 6.9000e+01],
      

In [9]:
class Deberta(nn.Module):
    def __init__(self, deberta, feature_dim):
        super(Deberta, self).__init__()
        
        self.deberta = deberta
        
        self.feature_backbone = nn.Sequential(
            nn.Linear(feature_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU()
        )
        
        self.classifier = nn.Sequential(
            nn.Linear(2 + 32, 64),  # Ajustando la dimensión de entrada según el output de feature_backbone y deberta
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 2)
        )
        
    def forward(self, input_ids, attention_mask, features):
        x = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        x = x[0].type(torch.float32)
        feature = self.feature_backbone(features)
        
        # Fusionamos
        combined = torch.cat([x, feature], dim=1)
        combined = self.classifier(combined)
        return combined

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Deberta(model, 2).to(device)
optim = torch.optim.Adam(model.parameters(),lr=1.5e-5)
criterion = nn.MSELoss()

In [11]:
#epochs = 30

#model.train()

#for epoch in range(epochs):
    #running_loss = 0
    #step = 0
    #for data in train_loader:
        #input_ids = data['input_ids'].to(device)
        #attention_mask = data['attention_mask'].to(device)
        #content = data['content'].type(torch.float32).to(device)
        #wording = data['wording'].type(torch.float32).to(device)
        #features = data['features'].type(torch.float32).to(device)

        #optim.zero_grad()
        #outputs = model(input_ids, attention_mask, features)
        #loss = criterion(outputs[:,0],content) + criterion(outputs[:,1],wording)
        #loss.backward()
        #optim.step()
        #if step % 500 == 0:
            #print("Epoch {}, Step {}, Loss: {}".format(epoch+1, step, loss.item()))

        #running_loss += loss.item()
        #step = step + 1

    #print(f"Epoch {epoch+1} Loss: {running_loss / len(train_loader)}")
        
    #model.eval()
    #with torch.no_grad():
        #val_loss = 0.0
        #step = 0
        #for data in val_loader:
            #input_ids = data['input_ids'].to(device)
            #attention_mask = data['attention_mask'].to(device)
            #content = data['content'].type(torch.float32).to(device)
            #wording = data['wording'].type(torch.float32).to(device)
            #features = data['features'].type(torch.float32).to(device)
            
            #outputs = model(input_ids,attention_mask, features)
            #val_loss+=criterion(outputs[:,0],content)+criterion(outputs[:,1],wording)
                
        #print(f"Validation Loss: {val_loss / len(val_loader)}")
    #model.train()


In [12]:
#model.eval()
#predict = []

#test_dataset = DatasetSummary(test,tokenizer)
#test_loader = DataLoader(test_dataset,shuffle=False,batch_size=batch_size)

#with torch.no_grad():
    #for data in test_loader:
        #input_ids = data['input_ids'].to(device)
        #attention_mask = data['attention_mask'].to(device)
        #features = data['features'].to(device)
        
        #outputs = model(input_ids,attention_mask,features)
        #predict.extend(outputs.cpu().numpy())

In [13]:
#submission = pd.DataFrame({
    #'student_id':test['student_id'],
    #'content':[pred[0] for pred in predict],
    #'wording':[pred[1] for pred in predict]
#}) 
#submission.to_csv('submission.csv',index=False)

In [14]:
#submission