In [None]:
pip install rouge_score

In [None]:
pip install transformers

In [None]:
pip install evaluate

In [None]:
# Mount Google Drive at /content/gdrive; the training csv and model checkpoint
# paths used below live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Import all necessary libraries
import transformers
from datasets import Dataset, DatasetDict
from evaluate import load
import numpy as np
import pandas as pd
import torch
import nltk
# Fetch the NLTK 'punkt' tokenizer data (skipped if it is already present)
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\danii\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Load training data (the csv's 'abstract' and 'title' columns are used below)
data = pd.read_csv('/content/gdrive/MyDrive/Title_generation/train.csv')
# Load the ROUGE metric through the evaluate library
metric = load('rouge')
# Path to the model checkpoint directory on the mounted drive
model_checkpoints = '/content/gdrive/MyDrive/Title_generation/my_model'
# Load the tokenizer stored with that checkpoint
tokenizer = transformers.AutoTokenizer.from_pretrained(model_checkpoints)

In [None]:
# Get max length (in tokens) of the abstracts.
# Series.max() on a string column returns the lexicographically greatest abstract,
# not the longest one, so tokenize every abstract and take the largest token count.
max(map(len, tokenizer(data.abstract.tolist())['input_ids']))

186

In [None]:
# Get max length (in tokens) of the titles.
# Series.max() on a string column returns the lexicographically greatest title,
# not the longest one, so tokenize every title and take the largest token count.
max(map(len, tokenizer(data.title.tolist())['input_ids']))

19

In [None]:
# Split data into train and validation sets: rows before position 125000 are
# used for training, the remaining rows for validation (no shuffling).
# NOTE(review): the last line rebinds `data` from a DataFrame to a DatasetDict,
# so this cell cannot be re-run without first reloading the csv.
train = Dataset.from_pandas(data.iloc[:125000])
val = Dataset.from_pandas(data.iloc[125000:])
# Bundle both splits into a single DatasetDict
data = DatasetDict({'train': train, 'validation': val})

In [None]:
# Display the DatasetDict (splits, their columns and row counts)
data

DatasetDict({
    train: Dataset({
        features: ['abstract', 'title'],
        num_rows: 120000
    })
    validation: Dataset({
        features: ['abstract', 'title'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['abstract', 'title'],
        num_rows: 5000
    })
})

In [None]:
# Set max input and target lengths (in tokens); preprocess_data pads/truncates
# abstracts to max_input and titles to max_target
max_input = 256
max_target = 32

In [None]:
def preprocess_data(data_to_process):
    """Tokenize a batch of abstracts and titles into model inputs with labels.

    Parameters:
        data_to_process: batch with 'abstract' and 'title' entries, each a
            sequence of strings (the form passed by ``map(..., batched=True)``).

    Returns:
        The tokenizer output for the abstracts with an added 'labels' entry
        holding the token ids of the titles.
    """
    # Tokenize the abstracts, padded/truncated to max_input tokens
    model_inputs = tokenizer(list(data_to_process['abstract']), max_length=max_input,
                             padding='max_length', truncation=True)
    # Tokenize the titles as targets; `text_target` replaces the deprecated
    # `as_target_tokenizer()` context manager
    targets = tokenizer(text_target=list(data_to_process['title']), max_length=max_target,
                        padding='max_length', truncation=True)
    # NOTE(review): labels keep the tokenizer's pad token id for padding, so padded
    # positions presumably still count towards the loss — confirm this is intended
    # (masking them with -100 would also require compute_rouge to map -100 back).
    model_inputs['labels'] = targets['input_ids']

    return model_inputs

In [None]:
# Tokenize every split in batches with preprocess_data and drop the raw text columns
tokenized_data = data.map(preprocess_data, batched=True, remove_columns=['abstract', 'title'])

  0%|          | 0/120 [00:00<?, ?ba/s]



  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

In [None]:
# Load the seq2seq model from the checkpoint path defined above
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model_checkpoints)

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [None]:
# Initialize the collator that assembles tokenized examples into batches for the seq2seq model
collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Set batch size (used per device for both training and evaluation)
batch_size = 18

In [None]:
def compute_rouge(pred):
    """Compute ROUGE scores (as percentages) and the mean generated length.

    Parameters:
        pred: pair ``(predictions, labels)`` of token-id arrays, as passed by
            the trainer to ``compute_metrics``.

    Returns:
        dict mapping each ROUGE score name to its value * 100, plus 'gen_len'
        (mean number of non-padding tokens per prediction); all values are
        rounded to 4 decimals.
    """
    # Get model prediction and target labels tokens
    predictions, labels = pred
    # Some models return a tuple whose first element holds the generated ids
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored/padded positions and cannot be decoded, so map it
    # back to the pad token before decoding
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode predictions and labels
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute model's performance and scale the scores to percentages
    scores = metric.compute(predictions=decoded_predictions, references=decoded_labels, use_stemmer=True)
    scores = {name: value * 100 for name, value in scores.items()}

    # Average number of non-padding tokens per generated sequence
    generated_lengths = [np.count_nonzero(sequence != pad_id) for sequence in predictions]
    scores['gen_len'] = np.mean(generated_lengths)

    return {name: round(value, 4) for name, value in scores.items()}

In [None]:
# Define the training arguments
args = transformers.Seq2SeqTrainingArguments(
    output_dir='conversation-summ',            # directory for checkpoints
    evaluation_strategy='epoch',               # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=2,             # 2 * batch_size examples per device per optimizer step
    weight_decay=0.01,
    save_total_limit=2,                        # keep at most two checkpoints
    num_train_epochs=1,
    predict_with_generate=True,                # evaluate/predict with generate() so ROUGE sees generated text
    eval_accumulation_steps=1,
)

In [None]:
# Define the trainer: model, training arguments, tokenized splits, collator and metric
trainer = transformers.Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['validation'],
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_rouge,
)

In [None]:
# Train the model on the 'train' split with the arguments defined above
trainer.train()

In [None]:
# Save the trained model to the mounted drive
trainer.save_model("/content/gdrive/MyDrive/Title_generation/my_model_2_epoch")

In [None]:
# Load test data (path is relative to the notebook's working directory)
test_data = pd.read_csv('test.csv')

In [None]:
# Get all abstracts of the test set as an array
texts = test_data.abstract.values

In [None]:
# Generate a title for every abstract with a single batched prediction pass.
# Calling trainer.predict once per abstract restarts the whole prediction loop
# for every row; tokenizing everything first and predicting once avoids that.
if len(texts):
    # Tokenize each text exactly as during training
    encoded_texts = [
        tokenizer(text, max_length=max_input, padding='max_length', truncation=True)
        for text in texts
    ]
    # Generate titles for all texts
    generated_ids = trainer.predict(encoded_texts).predictions
    # Prediction buffers may be padded with -100, which cannot be decoded
    generated_ids = np.where(generated_ids != -100, generated_ids, tokenizer.pad_token_id)
    # Replace texts with decoded titles (in place, as before).
    # NOTE(review): `texts` comes from test_data.abstract.values, so this write
    # may also overwrite the DataFrame column — confirm that is intended.
    texts[:] = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

In [None]:
# Load model and tokenizer from a local directory.
# Note: this rebinds the `model` and `tokenizer` names used by the cells above.
path = './my_model'
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(path)
tokenizer = transformers.AutoTokenizer.from_pretrained(path)

In [None]:
# Load the test abstracts and the stored predictions.
# NOTE(review): no visible cell writes 'predictions.csv' — confirm where it is produced.
test = pd.read_csv('test.csv')
pred = pd.read_csv('predictions.csv')

In [None]:
# Original abstracts from the test set
abstracts = test.abstract.values
# NOTE(review): titles are read from the 'abstract' column of predictions.csv —
# presumably that column holds the generated titles; confirm.
titles = pred.abstract.values

In [None]:
# Save original texts and generated titles into .csv format
submission_df = pd.DataFrame({'abstract': abstracts, 'title': titles})
submission_df.to_csv('predicted_titles.csv', index=False)

In [None]:
# Generate kaggle submission
import string
from nltk.util import ngrams
import numpy as np
import pandas as pd
import pickle


def generate_csv(input_file='predicted_titles.csv',
                 output_file='submission.csv',
                 voc_file='vocs.pkl'):
    '''
    Generates file in format required for submitting result to Kaggle

    Every title is stripped of punctuation, lower-cased, split into words and
    extended with its bigrams and trigrams (words joined by "_"). For each row
    the matching vocabulary ``vocs[row index]`` (token -> position) is looked
    up and one "Id,Predict" line is written per vocabulary position, with
    Predict equal to 1 when that token occurs in the title and 0 otherwise.
    A missing/empty title yields all zeros for its row.

    Parameters:
        input_file (str) : path to csv file with your predicted titles.
                           Should have two fields: abstract and title
        output_file (str) : path to output submission file
        voc_file (str) : path to voc.pkl file
    '''
    data = pd.read_csv(input_file)
    # NOTE: pickle.load can execute arbitrary code — only load a vocs file
    # that comes from a trusted source.
    with open(voc_file, 'rb') as voc_handle:
        vocs = pickle.load(voc_handle)

    strip_punctuation = str.maketrans('', '', string.punctuation)

    # Open the output once instead of re-opening it in append mode for every row
    with open(output_file, 'w') as res_file:
        res_file.write('Id,Predict\n')

        output_idx = 0
        for row_idx, title in zip(data.index, data['title']):
            # An empty predicted title is read back from csv as NaN (a float);
            # treat anything that is not a string as an empty title
            if not isinstance(title, str):
                title = ''
            words = title.translate(strip_punctuation).lower().split()
            # Consecutive word pairs and triples, joined by "_"
            bigrams = list(zip(words, words[1:]))
            trigrams = list(zip(words, words[1:], words[2:]))
            trg = words + ['_'.join(ngram) for ngram in bigrams + trigrams]

            VOCAB_stoi = vocs[row_idx]
            trg_intersection = set(VOCAB_stoi.keys()).intersection(trg)
            trg_vec = np.zeros(len(VOCAB_stoi))

            for word in trg_intersection:
                trg_vec[VOCAB_stoi[word]] = 1

            for is_word in trg_vec:
                res_file.write('{0},{1}\n'.format(output_idx, int(is_word)))
                output_idx += 1


# Build submission.csv from predicted_titles.csv and vocs.pkl (the default paths)
generate_csv()