In [None]:
import pandas as pd
import numpy as np
import random 
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset


from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM 

In [None]:
SEED = 0

# Seed every RNG the notebook can draw from. NumPy's global generator is
# seeded as well so that NumPy-backed randomness is repeatable between runs.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# NOTE(review): `input` shadows the Python builtin of the same name; the name is
# kept because later cells read it.
input = '/kaggle/input/title-generation'
output ='/kaggle/working'


# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
df = pd.read_csv(f'{input}/train.csv')

# 97% of the rows go to training, the remainder to validation.
TRAIN_SPLIT = int(len(df) * 0.97)

# DataFrame.sample returns a new frame, so the result has to be assigned back
# for the shuffle to take effect; random_state makes the split reproducible.
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

train_iter = df.iloc[:TRAIN_SPLIT]
val_iter = df.iloc[TRAIN_SPLIT:]


In [None]:
# Histogram of training-abstract lengths, counted in whitespace-separated words.
abstract_words = train_iter["abstract"].str.split()
a = abstract_words.agg(len)
a.hist()

In [None]:
# Checkpoint name shared by the tokenizer and the seq2seq model.
model_checkpoint ='t5-small'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Batch collator that is passed as `collate_fn` to the DataLoaders.
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

In [None]:
# Only the listed T5 checkpoints get the "summarize: " task prefix; any other
# checkpoint receives the abstract text unchanged.
prefix = (
    "summarize: "
    if model_checkpoint in {"t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"}
    else ""
)

def preprocess_function(examples: pd.DataFrame):
    """Tokenize abstracts and titles into features for the seq2seq model.

    Reads the "abstract" and "title" columns of `examples`. Every abstract is
    prepended with the module-level `prefix` and truncated to 400 tokens;
    titles are truncated to 128 tokens. The title token ids are stored under
    the "labels" key of the returned encoding.
    """
    prefixed_abstracts = []
    for abstract in examples["abstract"].values:
        prefixed_abstracts.append(prefix + abstract)
    encoded = tokenizer(prefixed_abstracts, max_length=400, truncation=True)

    titles = list(examples["title"].values)
    # NOTE(review): newer transformers releases deprecate as_target_tokenizer in
    # favour of the `text_target=` argument - confirm against the installed version.
    with tokenizer.as_target_tokenizer():
        encoded_titles = tokenizer(titles, max_length=128, truncation=True)

    encoded["labels"] = encoded_titles["input_ids"]
    return encoded

class Sec2SecDataset(Dataset):
    """Map-style dataset over the tokenized features built by `preprocess_function`."""

    def __init__(self, data):
        super().__init__()
        # The whole frame is tokenized once, when the dataset is constructed.
        self.data = preprocess_function(data)

    def __len__(self):
        # Every feature column has one entry per example, so the first one gives the size.
        columns = list(self.data.values())
        return len(columns[0])

    def __getitem__(self, idx):
        # Collect the idx-th entry of every feature column into one example dict.
        sample = {}
        for key, column in self.data.items():
            sample[key] = column[idx]
        return sample

In [None]:
train_dataset = Sec2SecDataset(train_iter)
val_dataset = Sec2SecDataset(val_iter)

# Training batches are drawn in random order; the validation loader keeps the
# original row order so decoded predictions stay aligned with their rows.
train_dataloader = DataLoader(train_dataset, batch_size=20, shuffle=True, collate_fn=data_collator)
val_dataloader = DataLoader(val_dataset, batch_size=20, collate_fn=data_collator)

In [None]:
# Put the model on the target device before the optimizer captures its parameters.
model = model.to(device)

# torch.optim.AdamW replaces the AdamW class from transformers.optimization,
# which is deprecated upstream. eps=1e-6 keeps the epsilon that class used by
# default, so the optimisation hyper-parameters stay the same.
optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-9, eps=1e-6)

# Resume from the previously fine-tuned weights.
model.load_state_dict(torch.load('/kaggle/input/titlegeneration-transformers/title-gen_model.pt', map_location=device))

In [None]:
from IPython.display import clear_output
import matplotlib.pyplot as plt

EVAL_EVERY = 500  # run a validation pass every N training steps
PLOT_EVERY = 50   # redraw the loss curves every N training steps


def _summed_validation_loss(model, dataloader, device):
    """Return the sum of per-batch losses of `model` over `dataloader`, without gradients."""
    model.eval()
    total = 0.0
    with torch.no_grad():
        for val_batch in dataloader:
            val_batch = {k: v.to(device) for k, v in val_batch.items()}
            total += model(**val_batch).loss.item()
    return total


def _plot_losses(train_losses, val_steps, val_losses):
    """Redraw the per-batch training loss and the periodic validation loss in place."""
    clear_output(True)
    plt.plot(train_losses, label="train")
    plt.plot(val_steps, val_losses, label="validation")
    plt.title("Training loss")
    plt.xlabel("Batch")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()


# Per-batch training losses (for the live plot) and their running sum.
train_loss_set = []
train_loss = 0

# Mean validation loss at every evaluation point and the step it was measured at.
val_loss_set = []
x_val_set = []

# Summed validation loss of the most recent evaluation; defined up front so the
# final report cannot hit an unbound name.
val_loss = 0
last_step = -1

for step, batch in enumerate(train_dataloader):
    # Evaluation switches the model to eval mode, so training mode is restored every step.
    model.train()

    # Move the batch to the device the model lives on.
    batch = {k: v.to(device) for k, v in batch.items()}

    # Without zero_grad() the gradients of successive batches would accumulate.
    optimizer.zero_grad()

    # Forward pass, backward pass, parameter update.
    outputs = model(**batch)
    outputs.loss.backward()
    optimizer.step()

    # Read the scalar loss once and reuse it for both the history and the running sum.
    step_loss = outputs.loss.item()
    train_loss_set.append(step_loss)
    train_loss += step_loss
    last_step = step

    # Periodic evaluation on the validation split.
    if step % EVAL_EVERY == 0:
        val_loss = _summed_validation_loss(model, val_dataloader, device)
        val_loss_set.append(val_loss / len(val_dataloader))
        x_val_set.append(step)

    # Redrawing the whole history on every batch gets slower as the history grows,
    # so the figure is only refreshed every PLOT_EVERY steps.
    if step % PLOT_EVERY == 0:
        _plot_losses(train_loss_set, x_val_set, val_loss_set)

# Evaluate the final weights unless the last step already did, so the reported
# validation loss is not left over from an earlier step.
if last_step >= 0 and last_step % EVAL_EVERY != 0:
    val_loss = _summed_validation_loss(model, val_dataloader, device)
    val_loss_set.append(val_loss / len(val_dataloader))
    x_val_set.append(last_step)

_plot_losses(train_loss_set, x_val_set, val_loss_set)

print("Loss на обучающей выборке: {0:.5f}".format(train_loss / len(train_dataloader)))
print("Loss на валидационной выборке: {0:.5f}".format(val_loss / len(val_dataloader)))

In [None]:
# Persist only the model weights (its state dict), using a relative path.
trained_weights = model.state_dict()
torch.save(trained_weights, 'title-gen_model.pt')

In [None]:
from transformers import LogitsProcessorList, LogitsProcessor

class MyLogProcessor(LogitsProcessor):
    """Push early generation steps towards tokens that occur in the encoder input.

    While the decoder sequence is shorter than 4 tokens, every vocabulary entry
    that does not appear in the corresponding encoder input has its score set
    to -100.

    Parameters:
        encoder_inputs: 2-D tensor of encoder token ids, one row per example.
        num_beams (int): number of beams per example; the score matrix is
            assumed to hold the beams of one example in consecutive rows
            (TODO confirm against the generation code in use).
        vocab_size (int): width of the score matrix.
    """

    def __init__(self, encoder_inputs, num_beams=5, vocab_size=32128):
        batch_size = encoder_inputs.shape[0]
        allowed = torch.zeros(
            (batch_size * num_beams, vocab_size),
            dtype=torch.bool,
            device=encoder_inputs.device,
        )
        for i in range(batch_size):
            # Example i owns rows [i * num_beams, (i + 1) * num_beams); the previous
            # slice i:i+5 overlapped neighbouring examples and left later rows unset.
            allowed[i * num_beams:(i + 1) * num_beams, encoder_inputs[i]] = True
        # True where the token id occurs in that row's encoder input.
        self.encoder_inputs_mask = allowed

    def __call__(self, input_ids, scores):
        if input_ids.shape[1] < 4:
            disallowed = ~self.encoder_inputs_mask.to(scores.device)
            scores = scores.masked_fill(disallowed, -100)
        return scores
        

def predict_with_model(model, iterator, tokenizer, device=None, num_beams=5, max_length=30):
    """Generate text for every batch of `iterator` with deterministic beam search.

    Parameters:
        model: seq2seq model exposing `generate`, `eval` and `device`.
        iterator: iterable of batches (dicts of tensors) containing
            "input_ids", "attention_mask" and "decoder_input_ids".
        tokenizer: tokenizer used to decode generated, reference and input ids.
        device: device to run on. Defaults to the model's current device; when
            given, the model is moved to it.
        num_beams (int): number of beams used for generation.
        max_length (int): maximum length of a generated sequence.

    Returns:
        Tuple `(candidates, references, inputs)` of decoded strings. If the run
        is interrupted with Ctrl-C, the results collected so far are returned.
    """
    if device is None:
        device = model.device
    else:
        # Honour the requested device instead of silently ignoring the argument.
        model.to(device)
    model.eval()

    # Created before the loop so an early interrupt still has something to return.
    condidate_corpus = []
    ref_corpus = []
    inputs = []

    try:
        for batch in tqdm(iterator):
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                out = model.generate(input_ids=batch["input_ids"],
                                     attention_mask=batch["attention_mask"],
                                     do_sample=False,
                                     num_beams=num_beams,
                                     max_length=max_length)
            condidate_corpus += tokenizer.batch_decode(out, skip_special_tokens=True)
            ref_corpus += tokenizer.batch_decode(batch['decoder_input_ids'], skip_special_tokens=True)
            inputs += tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)
    except KeyboardInterrupt:
        # Deliberate best effort: fall through and return the partial results.
        pass

    return condidate_corpus, ref_corpus, inputs
        

In [None]:
# Decode generated titles, reference titles and inputs for the validation split.
condidat_corpus, ref_corpus, inputs = predict_with_model(
    model=model,
    iterator=val_dataloader,
    tokenizer=tokenizer,
)

In [None]:
from torchtext.data.metrics import bleu_score

# Whitespace-tokenize the predictions; every prediction is paired with a
# single-element list holding its tokenized reference.
pred = [candidate.split() for candidate in condidat_corpus]
ref = [[reference.split()] for reference in ref_corpus]

# BLEU over 1- to 3-grams with the given n-gram weights.
bleu_score(pred, ref, max_n=3, weights=[0.34, 0.33, 0.33])

In [None]:
# Print up to 30 (input, reference, prediction) triples; the bound is capped so
# a shorter prediction list does not raise IndexError.
for i in range(min(30, len(inputs))):
    print(inputs[i], '\n')
    print(ref_corpus[i])
    print(condidat_corpus[i])
    print('')

In [None]:
# Load the test abstracts and attach a placeholder title column, which is
# overwritten with the generated titles in a later cell.
submission_data = (
    pd.read_csv('/kaggle/input/title-generation/test.csv')
    .assign(title='title title')
)

In [None]:
# Display the test frame with the placeholder titles.
submission_data

In [None]:

# Generate a title for every test abstract and write the predictions to disk.
test_dataset = Sec2SecDataset(submission_data)
test_iterator = DataLoader(dataset=test_dataset, batch_size=10, collate_fn=data_collator)

# Only the generated titles are needed; references and decoded inputs are ignored.
predictions = predict_with_model(model, test_iterator, tokenizer)
titles = predictions[0]

submission_data['title'] = titles
submission_data.to_csv('predicted_titles.csv', index=False)

In [None]:
# Display the test frame, whose "title" column now holds the generated titles.
submission_data

In [None]:
# One row per distinct training abstract, indexed by the abstract text.
known_titles = train_iter.drop_duplicates(subset=['abstract']).set_index('abstract')

# After the join the generated title is "title_pred" and the training title is "title".
res = submission_data.join(known_titles, on='abstract', lsuffix='_pred').reset_index(drop=True)

# Keep the training title where the abstract was found; otherwise use the generated one.
res['title'] = res['title'].fillna(res['title_pred'])
res = res.drop(columns='title_pred')

res.to_csv('predicted_titles.csv', index=False)
res

In [None]:
import string
from nltk.util import ngrams
import numpy as np
import pandas as pd
import pickle


def generate_csv(input_file='predicted_titles.csv',
                 output_file='submission.csv',
                 voc_file='/kaggle/input/title-generation/vocs.pkl'):
    '''
    Generates file in format required for submitting result to Kaggle

    For every row the title is lower-cased, stripped of punctuation and split
    into words; bigrams and trigrams joined with "_" are added, and one
    "Id,Predict" line is written per entry of that row's vocabulary (1 when the
    entry occurs in the title, 0 otherwise). Ids run continuously over all rows.

    A missing (NaN) title is treated as an empty title, so its row still
    produces one zero line per vocabulary entry and the Ids stay aligned.
    A row that has no vocabulary is reported and skipped.

    Parameters:
        input_file (str) : path to csv file with your predicted titles.
                           Should have two fields: abstract and title
        output_file (str) : path to output submission file
        voc_file (str) : path to vocs.pkl file
    '''
    data = pd.read_csv(input_file)

    # NOTE(review): pickle.load can execute arbitrary code - only load a trusted file.
    with open(voc_file, 'rb') as voc_handle:
        vocs = pickle.load(voc_handle)

    # Translation table that deletes every ASCII punctuation character.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    output_idx = 0
    # The output file is opened once instead of being reopened for every row.
    with open(output_file, 'w') as res_file:
        res_file.write('Id,Predict\n')

        for row_idx, row in data.iterrows():
            try:
                VOCAB_stoi = vocs[row_idx]
            except (KeyError, IndexError):
                # Without the vocabulary the number of lines for this row is unknown.
                print('No vocabulary for row {0}; row skipped'.format(row_idx))
                continue

            title = row['title']
            title = '' if pd.isna(title) else str(title)

            trg = title.translate(strip_punctuation).lower().split()
            if len(trg) < 2:
                # Duplicate a single word so that at least one bigram exists.
                trg *= 2

            # Bigrams and trigrams of consecutive words; built as a complete list
            # before `trg` is extended so the extension does not feed back into it.
            ngram_tokens = ['_'.join(gram)
                            for gram in list(zip(trg, trg[1:])) + list(zip(trg, trg[1:], trg[2:]))]
            trg.extend(ngram_tokens)

            trg_intersection = set(VOCAB_stoi.keys()).intersection(trg)
            trg_vec = np.zeros(len(VOCAB_stoi))
            for word in trg_intersection:
                trg_vec[VOCAB_stoi[word]] = 1

            for is_word in trg_vec:
                res_file.write('{0},{1}\n'.format(output_idx, int(is_word)))
                output_idx += 1

# Build submission.csv from predicted_titles.csv using the default paths.
generate_csv()