In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Data preprocessing

In [None]:
# Directory holding the competition CSV files. Keep the trailing slash:
# file names are appended to this value by plain string concatenation.
COMMON_PATH = "/kaggle/input/commonlit-evaluate-student-summaries/"

In [None]:
# Read the training summaries and preview the first rows.
summary_df = pd.read_csv(f"{COMMON_PATH}summaries_train.csv")
summary_df.head()

In [None]:
# Read the training prompts and preview the first rows.
prompt_df = pd.read_csv(f"{COMMON_PATH}prompts_train.csv")
prompt_df.head()

In [None]:
# Join every summary with its prompt row on the shared prompt_id key.
df = pd.merge(summary_df, prompt_df, how='inner', on="prompt_id")
df.head()

In [None]:
# Remove the identifier columns from the training frame.
# Rebinding `df` (instead of dropping in place) together with errors='ignore'
# keeps this cell idempotent: an in-place drop raises KeyError when the cell
# is run a second time, because the columns are already gone.
df = df.drop(columns=['student_id', 'prompt_id'], errors='ignore')
df.head()

## Tokenizer and pretrained BERT model

In [None]:
from transformers import AutoTokenizer, AutoModel

In [None]:
# Checkpoint directory, given as a path relative to the notebook's working directory.
model_path_or_name = '../input/huggingface-bert/bert-base-cased'

# The tokenizer and the encoder are both loaded from that same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path_or_name)
bert_model = AutoModel.from_pretrained(model_path_or_name)

In [None]:
# Register '[PAD]' as the tokenizer's padding token.
# NOTE(review): BERT checkpoints normally already define '[PAD]', so this is
# presumably a no-op safeguard — confirm against the loaded tokenizer.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [None]:
# Sanity check: show how the tokenizer splits a sample sentence.
tokenizer.tokenize("Hello, is it me you're looking for?")

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

In [None]:
class RawDataset(Dataset):
    """Map-style dataset that turns one dataframe row into a tokenized prompt.

    Each item is built by filling a fixed text template with the prompt
    question, the prompt title and text, and the student's summary, and then
    tokenizing the result.

    Args:
        df: DataFrame with the columns 'text', 'prompt_question',
            'prompt_title' and 'prompt_text'. The columns 'content' and
            'wording' are optional; when both are present they are used as
            the regression targets.
        tokenizer: object exposing ``encode_plus(...)`` that returns a mapping
            with an 'input_ids' tensor (presumably a Hugging Face tokenizer).
        max_length: value forwarded to the tokenizer as ``max_length``
            (may be None).
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        # Template filled, in order, with: question, prompt title + text, summary.
        self.data_text_format = """
            Task: Question-Conditioned Text Summarization

            Question: "{}"

            Given the following text, provide a concise summary that answers the question:

            Text: "{}"

            Summary: {}

            ---
            """
        
    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.df)
        
    def __getitem__(self, idx):
        """Return ``{'input_ids': 1-D tensor, 'targets': float tensor of size 2}``.

        Raises:
            IndexError: if ``idx`` is outside ``range(len(self))``.
            ValueError: if a target column is present but not numeric.
        """
        # Positional lookup: works whatever the dataframe's index labels are
        # (e.g. after filtering rows) and raises IndexError past the end.
        row = self.df.iloc[idx]

        text = row['text']
        # Skip the first 10 characters of the question.
        # NOTE(review): presumably strips a fixed leading phrase — confirm.
        question = row['prompt_question'][10:]
        fulltext = row['prompt_title'] + '\n' + row['prompt_text']
        
        # Unlabeled data has no target columns and gets placeholder zeros.
        # Labeled data is converted explicitly, so malformed labels surface
        # as an error instead of silently becoming zeros.
        if 'content' in row.index and 'wording' in row.index:
            target = [float(row['content']), float(row['wording'])]
        else:
            target = [0.0, 0.0]
        
        content = self.data_text_format.format(question, fulltext, text)
        
        # NOTE(review): padding=True pads to the longest sequence in the call;
        # with a single text per call this presumably adds no padding, so items
        # can differ in length — confirm before relying on default batching.
        inputs = self.tokenizer.encode_plus(content, truncation = True, padding=True, max_length=self.max_length, return_tensors='pt')
        
        # Drop only the leading batch dimension; a bare squeeze() would also
        # collapse a length-1 sequence into a 0-d tensor.
        input_ids = inputs['input_ids'].squeeze(0)
        
        return {
            'input_ids': input_ids,
            'targets': torch.tensor(target, dtype=torch.float)
        }
    

### Let's try to see the distribution of the number of tokens

In [None]:
# Tokenize every row with max_length=None and record each sequence length,
# then plot the distribution of those lengths.
dataset = RawDataset(df, tokenizer, None)
lens = []
for sample_idx in range(len(dataset)):
    lens.append(dataset[sample_idx]['input_ids'].shape[0])

import matplotlib.pyplot as plt
plt.hist(lens, density=True)

In [None]:
# Sanity check: the dataset exposes one item per dataframe row.
len(dataset), len(df)

In [None]:
# Maximum number of tokens passed to the tokenizer for each example.
# NOTE(review): presumably chosen to match BERT's 512-position input limit — confirm.
MAX_LEN = 512

In [None]:
# Rebuild the dataset with max_length=MAX_LEN and inspect one example's token count.
dataset = RawDataset(df, tokenizer, MAX_LEN)
dataset[2]['input_ids'].shape

In [None]:
# Recompute the token-length distribution for whatever `dataset` currently holds.
lens = []
for sample_idx in range(len(dataset)):
    lens.append(dataset[sample_idx]['input_ids'].shape[0])

import matplotlib.pyplot as plt
plt.hist(lens, density=True)

## Deep Learning Model

In [None]:
def get_device():
    """Pick the torch device to run on.

    Returns:
        torch.device: "cuda:0" when CUDA is available, otherwise "mps" when
        the MPS backend is both available and built, otherwise "cpu".
    """
    if torch.cuda.is_available():
        return torch.device("cuda:0")

    # torch.backends.mps does not exist on PyTorch builds that predate the MPS
    # backend; look it up defensively so those builds fall through to the CPU
    # instead of raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available() and mps_backend.is_built():
        return torch.device("mps")

    return torch.device("cpu")

device = get_device()
device

In [None]:
# Display the loaded encoder so its layer structure can be inspected.
bert_model

## Freezing layers from BERT

In [None]:
# Freeze the pretrained encoder: mark every one of its weights as not requiring gradients.
for encoder_weight in bert_model.parameters():
    encoder_weight.requires_grad = False

In [None]:
import torch.nn as nn 

class RegressionModel(nn.Module):
    """Encoder followed by a linear head that predicts two scores.

    Args:
        model: encoder module whose call returns an object with a
            ``last_hidden_state`` tensor. The head's input width is read from
            ``model.config.hidden_size`` when available and falls back to 768
            otherwise.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
        # Size the head from the encoder's config rather than assuming 768, so
        # encoders with a different hidden width work without edits.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 768)
        self.linear = nn.Linear(hidden_size, 2)
        
    def forward(self, x):
        """Return a two-column tensor of predictions for the token ids ``x``.

        Assumes ``x`` is (batch, seq) — TODO confirm against the data loader.
        """
        hidden_states = self.model(x).last_hidden_state
        # Keep only the hidden state at the final sequence position.
        # NOTE(review): BERT-style encoders are usually pooled at position 0
        # ([CLS]); confirm that the last position is intended here.
        final_position = hidden_states[:, -1, :]
        return self.linear(final_position)

# Wrap the BERT encoder with the regression head and move the whole model to the selected device.
model = RegressionModel(bert_model).to(device=device)

## Training / Transfer Learning

In [None]:
from tqdm.notebook import tqdm

In [None]:
num_epochs = 3

# Training data: every row of `df`, tokenized with max_length=MAX_LEN and
# served in shuffled batches of 64.
dataset = RawDataset(df, tokenizer, max_length=MAX_LEN)
data_loader = DataLoader(dataset, shuffle=True, batch_size=64)

# Mean-squared-error objective, optimized with Adam over the model's parameters.
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

model.train()

for epoch in range(num_epochs):
    iterr = tqdm(data_loader, desc=f"epoch {epoch+1}/{num_epochs}")
    for batch in iterr:
        inputs = batch['input_ids'].to(device=device)
        targets = batch['targets'].to(device=device)

        # Clear gradients left over from the previous step before the new backward pass.
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

## Inference

In [None]:
# Show the directory that the test files below are read from.
COMMON_PATH

In [None]:
# Load the test summaries and their prompts, then join them on prompt_id.
summary_test = pd.read_csv(COMMON_PATH + 'summaries_test.csv')
prompt_test = pd.read_csv(COMMON_PATH + 'prompts_test.csv')

df_test = summary_test.merge(prompt_test, on="prompt_id", how='inner')

# Inference data: shuffle stays off so predictions come out in df_test's row order.
dataset = RawDataset(df_test, tokenizer, max_length=MAX_LEN)
data_loader = DataLoader(dataset, batch_size=64, shuffle=False)

# Put the model in evaluation mode before predicting.
model.eval()

In [None]:
# Preview the merged test frame.
df_test.head()

In [None]:
# Show which device inference will run on.
device

In [None]:
# Run the model over the inference loader and collect one output tensor per batch.
predictions = []

# Inference only: no_grad skips building the autograd graph.
with torch.no_grad():
    iterr = tqdm(data_loader)
    for batch in iterr:
        inputs = batch['input_ids'].to(device=device)

        outputs = model(inputs)
        # Move results to the CPU so they can later be concatenated and converted to NumPy.
        predictions.append(outputs.cpu())

# Report only the number of batches; printing every tensor would flood the output.
len(predictions)

In [None]:
# Number of batches that were predicted.
len(predictions)

In [None]:
# Peek at the shapes of the first two batches. Slicing (rather than indexing
# [0] and [1]) avoids an IndexError when inference produced a single batch.
tuple(batch_output.shape for batch_output in predictions[:2])

In [None]:
# Join the per-batch outputs along dimension 0 into a single tensor and check its shape.
concatenated_predictions = torch.cat(predictions, dim=0)
concatenated_predictions.shape

In [None]:
# Build the submission: the test frame without its prompt/text columns, plus
# the two predicted scores. drop() and assign() both return new frames, so
# df_test itself is left unmodified.
prediction_array = concatenated_predictions.numpy()
if len(prediction_array) != len(df_test):
    raise ValueError(
        f"expected one prediction per test row, got {len(prediction_array)} "
        f"predictions for {len(df_test)} rows"
    )

submission = (
    df_test
    .drop(['prompt_id', 'text', 'prompt_question', 'prompt_title', 'prompt_text'], axis=1)
    .assign(content=prediction_array[:, 0], wording=prediction_array[:, 1])
)
submission

In [None]:
# Write the submission to 'submission.csv' in the current directory, without the dataframe index.
submission.to_csv('submission.csv' ,index=False)