In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.notebook import tqdm

# Install the embedding dependencies from local paths under /kaggle/input
# (an attached dataset) rather than by package name.
!pip install /kaggle/input/insructor-base-wg/InstructorEmbedding-1.0.1-py2.py3-none-any.whl
# The sentence-transformers source tree is copied to /tmp and installed from there.
# NOTE(review): presumably because /kaggle/input is read-only and the build needs a
# writable directory — confirm.
!cp -r /kaggle/input/insructor-base-wg/sentence-transformers-2.2.2/sentence-transformers-2.2.2 /tmp
!pip install /tmp/sentence-transformers-2.2.2

In [None]:
import warnings
from InstructorEmbedding import INSTRUCTOR

# Suppress every Python warning from here on (this changes the global warning filters).
warnings.filterwarnings("ignore")

# Directory of the INSTRUCTOR checkpoint inside the attached dataset.
instructor_path = '/kaggle/input/insructor-base-wg/instructor-base/kaggle/working/instructor-base'

# Text embedding model; emmbedd_df calls its encode() on [instruction, text] pairs.
model_emb = INSTRUCTOR(instructor_path)

In [None]:
import torch.nn as nn
import torch
from torch.utils.data import DataLoader, Dataset

def getDevice():
    """Pick the torch device to run on: CUDA first, then Apple MPS, then CPU.

    Returns:
        torch.device: ``cuda:0`` when CUDA is available; otherwise ``mps`` when
        the MPS backend is both available and built; otherwise ``cpu``.
    """
    if torch.cuda.is_available():
        return torch.device("cuda:0")

    # Look the backend up defensively: torch builds without MPS support do not
    # expose ``torch.backends.mps``, and a plain attribute access would raise
    # AttributeError instead of falling back to the CPU.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available() and mps_backend.is_built():
        return torch.device("mps")

    return torch.device("cpu")


# Device shared by the model setup and inference cells below.
device = getDevice()

import torch.nn.functional as F

class CrossAttentionMergeModel(nn.Module):
    """MLP head that maps a triple of embeddings to two output scores.

    Despite the name, the module contains no attention layer: the three
    embeddings are flattened, two cosine similarities (first vs. second and
    first vs. third embedding) are appended, and the result goes through a
    three-layer fully connected network.

    Args:
        hidden_size: Dimensionality of each of the three input embeddings.
    """

    def __init__(self, hidden_size):
        super().__init__()

        self.hidden_size = hidden_size
        # The attribute name ``linear`` and the layer order define the
        # state_dict keys ("linear.0.weight", ...); keep them unchanged so
        # checkpoints saved for this module continue to load.
        self.linear = nn.Sequential(
            nn.Linear(hidden_size * 3 + 2, hidden_size), nn.ReLU(),
            nn.Linear(hidden_size, 128), nn.ReLU(),
            nn.Linear(128, 2),
        )

    def forward(self, inputs):
        """Score a batch of embedding triples.

        Args:
            inputs: Tensor indexed as ``[batch, 3, hidden_size]``.

        Returns:
            Tensor of shape ``(batch, 2)``.
        """
        first, second, third = inputs[:, 0, :], inputs[:, 1, :], inputs[:, 2, :]

        sim_first_second = F.cosine_similarity(first, second, dim=-1).reshape(-1, 1)
        sim_first_third = F.cosine_similarity(first, third, dim=-1).reshape(-1, 1)

        # reshape (not view) so that non-contiguous inputs, e.g. the result of a
        # transpose or permute, are accepted instead of raising a RuntimeError.
        flat = inputs.reshape(-1, self.hidden_size * 3)

        features = torch.cat([flat, sim_first_second, sim_first_third], dim=-1)

        return self.linear(features)

In [None]:
# Common prefix of the per-fold checkpoint files; emmbedd_df appends "_f{k}.pt" to it.
model_path = '/kaggle/input/model-weitghs-lr/att_model'

# Scoring head for 768-dimensional embeddings. It is created with freshly
# initialised weights here; the fold weights are loaded inside emmbedd_df.
model_pre = CrossAttentionMergeModel(768)
model_pre.to(device)
model_pre.eval()

In [None]:
class RawDataset(Dataset):
    """Dataset over indexable features ``X`` with optional targets ``Y``.

    Each item is a dict with the feature entry converted to a tensor under
    ``'inputs'`` and the matching target (or a ``[0, 0]`` placeholder array
    when no targets were supplied) under ``'targets'``.
    """

    def __init__(self, X, Y=None):
        self.X, self.Y = X, Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sample = self.X[idx]

        # Without targets every item carries the same zero placeholder.
        labels = np.array([0, 0]) if self.Y is None else self.Y[idx]

        return {
            'inputs': torch.from_numpy(sample),
            'targets': labels
        }

# Instruction prefixes; emmbedd_df pairs each one with a text before calling encode().
# NOTE(review): these strings (including the spelling "Tittle") are part of the
# encoder input. Presumably they must match what was used when the scoring
# checkpoints were trained — confirm before editing them.
instruction_big   = "Represent the Tittle and Text for Summary:"
instruction_small = "Represent the Student Summary:"
instruction_quest = "Represent the question:"

def emmbedd_df(df, model_emb, model_pre, SPLITS=10):
    """Embed every row of ``df`` and score it with the fold-averaged head.

    For each row three [instruction, text] pairs are encoded with
    ``model_emb``: the student summary, the prompt question, and the prompt
    title plus prompt text. The three embeddings are passed to ``model_pre``
    once per fold checkpoint and the fold outputs are averaged.

    Args:
        df: DataFrame with the columns ``text``, ``prompt_question``,
            ``prompt_title`` and ``prompt_text``. Rows are read by position,
            so any index is accepted.
        model_emb: Embedding model exposing ``eval()`` and ``encode(pairs)``.
        model_pre: Scoring module. Its weights are overwritten with each fold
            checkpoint; after the call it holds the last fold's weights.
        SPLITS: Number of fold checkpoints, read from
            ``model_path + f"_f{k}.pt"`` for ``k`` in ``range(SPLITS)``.

    Returns:
        list of numpy arrays, one per processed batch, each holding the
        fold-averaged outputs for the rows of that batch in row order. An
        empty ``df`` yields an empty list.
    """
    output_list = []

    batch = []
    batch_size = 64
    n_rows = len(df)

    model_emb.eval()
    model_pre.eval()

    if n_rows == 0:
        return output_list

    # Pull the columns out once and index them by position; this does not
    # depend on the frame having a 0..n-1 index.
    summaries = df['text'].tolist()
    questions = df['prompt_question'].tolist()
    titles = df['prompt_title'].tolist()
    prompt_texts = df['prompt_text'].tolist()

    # Read every fold checkpoint from disk once instead of once per batch.
    fold_states = [
        torch.load(model_path + f"_f{k}.pt", map_location=device)
        for k in range(SPLITS)
    ]

    with torch.no_grad():
        for i in tqdm(range(n_rows), 'emedding df'):
            # Each row is paired with its own prompt text; taking the text
            # from row 0 for every row would pair all summaries with the
            # first prompt's text.
            # NOTE(review): if the fold checkpoints were fit on features built
            # from row 0's text, confirm this against the training pipeline.
            fulltext = f"{titles[i]}\n {prompt_texts[i]}"

            batch += [
                [instruction_small, summaries[i]],
                [instruction_quest, questions[i]],
                [instruction_big, fulltext],
            ]

            # Flush at every multiple of batch_size (so the first batch holds
            # batch_size + 1 rows) and once more at the last row.
            if (i % batch_size == 0 and i > 0) or i >= n_rows - 1:
                encoded = model_emb.encode(batch)
                # Three consecutive embeddings belong to one row.
                embeddings = encoded.reshape(-1, 3, encoded.shape[-1])
                inputs = torch.from_numpy(embeddings).to(device=device)

                outputs = 0

                for state in fold_states:
                    model_pre.load_state_dict(state)
                    outputs += model_pre(inputs).cpu().numpy()

                outputs /= SPLITS

                output_list.append(outputs)
                batch.clear()

    return output_list

In [None]:
# Read the summaries test file
summariesTest = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv')

# Read the prompts test file
promptsTest = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv')


# Test set: attach the prompt columns to each summary via prompt_id (pandas'
# default inner join) and replace missing values with the string 'Empty'.
dfTest = summariesTest.merge(promptsTest,on="prompt_id").fillna('Empty')


# The separate frames are not used after the merge.
del summariesTest
del promptsTest

print ('encoding data to submmit')

# List of per-batch prediction arrays, in dfTest row order.
final_predictions = emmbedd_df(dfTest, model_emb, model_pre)

# Drop the references to both models; nothing below uses them.
del model_emb
del model_pre

In [None]:
# emmbedd_df returns a list with one prediction array per batch; merge them
# into a single array. The isinstance guard keeps this cell safe to re-run:
# concatenating the already-merged 2-D array along axis 0 would treat its rows
# as separate 1-D arrays and flatten it.
if isinstance(final_predictions, list):
    final_predictions = np.concatenate(final_predictions, axis=0)

In [None]:
# Names given to the two prediction columns.
# NOTE(review): assumes output column 0 is 'content' and column 1 is 'wording' —
# confirm against the order used when the checkpoints were trained.
target_columns = ['content', 'wording']

# Attach the predictions to the test rows (row order matches dfTest).
dfTest[target_columns] = final_predictions

# The array is no longer needed once its values live in dfTest.
del final_predictions

In [None]:
# Build the submission from the sample file: discard its placeholder score
# columns, bring in the predicted scores by student_id (left join, so every
# sample row is kept), then bound both scores to the range [-2.0, 5].
subdmission = (
    pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv')
    .drop(columns=target_columns)
    .merge(dfTest[['student_id'] + target_columns], on='student_id', how='left')
    .assign(
        content=lambda frame: frame['content'].clip(lower=-2.0, upper=5),
        wording=lambda frame: frame['wording'].clip(lower=-2.0, upper=5),
    )
)

In [None]:
# Write the submission to submission.csv in the working directory, without the index column.
subdmission.to_csv('submission.csv', index=False)

In [None]:
# Completion message for the notebook log.
print('submission saved')