In [1]:
# from transformers import EncoderDecoderModel
from transformers import EncoderDecoderConfig, EncoderDecoderModel
from transformers import AutoConfig
from transformers import AutoTokenizer
import torch
from transformers import RobertaForMaskedLM, AutoModel, RobertaModel,RobertaConfig
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1"
device = torch.device("cuda")
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
from transformers import Seq2SeqTrainer ,Seq2SeqTrainingArguments,DataCollatorForSeq2Seq,default_data_collator

In [2]:
model = EncoderDecoderModel.from_encoder_decoder_pretrained('../input/transformer-distilation-pre/roberta_base_6layers_student','../input/transformer-distilation-pre/roberta_base_6layers_student')

Some weights of RobertaForCausalLM were not initialized from the model checkpoint at ../input/transformer-distilation-pre/roberta_base_6layers_student and are newly initialized: ['encoder.layer.3.crossattention.self.query.bias', 'encoder.layer.3.crossattention.self.value.weight', 'encoder.layer.4.crossattention.self.query.bias', 'encoder.layer.4.crossattention.output.LayerNorm.weight', 'encoder.layer.3.crossattention.output.LayerNorm.bias', 'encoder.layer.2.crossattention.output.dense.bias', 'encoder.layer.5.crossattention.output.dense.bias', 'encoder.layer.4.crossattention.output.dense.weight', 'encoder.layer.4.crossattention.output.LayerNorm.bias', 'encoder.layer.0.crossattention.output.dense.weight', 'encoder.layer.5.crossattention.output.LayerNorm.weight', 'lm_head.layer_norm.weight', 'encoder.layer.0.crossattention.output.LayerNorm.weight', 'encoder.layer.1.crossattention.self.value.bias', 'encoder.layer.5.crossattention.self.value.weight', 'encoder.layer.1.crossattention.output.d

In [3]:
df = pd.read_csv('/kaggle/input/news-summary/news_summary_more.csv')
df.head(1)

# padding=True' 'truncation=True'

Unnamed: 0,headlines,text
0,upGrad learner switches to career in ML & Al w...,"Saurav Kant, an alumnus of upGrad and IIIT-B's..."


In [4]:
class SummaryDataset(Dataset):
    def __init__(self, df,tokenizer,):
        self.df = df
        self.tokenizer= tokenizer
        self.max_length1 = 512
        self.max_length2 = 32
    def __len__(self,):
        return len(self.df)
    def __getitem__(self,idx):
        text = self.df.text.iloc[idx]
        headlines = self.df.headlines.iloc[idx]
        inputs_text = self.tokenizer(text,padding='max_length',max_length=self.max_length1,truncation=True)
        headlines_text = self.tokenizer(headlines,padding='max_length',max_length=self.max_length2,truncation=True)
        out = {}
        out["input_ids"] = inputs_text.input_ids
        out["attention_mask"] = inputs_text.attention_mask
        out["decoder_input_ids"] = headlines_text.input_ids
        out["decoder_attention_mask"] = headlines_text.attention_mask
        out["labels"] = headlines_text.input_ids.copy()

        # because roberta automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`. 
        # We have to make sure that the PAD token is ignored
        out["labels"] = [-100 if token == tokenizer.pad_token_id else token for token  in out["labels"]]
        return out
    
    

In [5]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
# model = EncoderDecoderModel.from_encoder_decoder_pretrained("roberta-base","roberta-base")

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [6]:
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
# make sure vocab size is set correctly
model.config.vocab_size = model.config.encoder.vocab_size
# set beam search parameters
model.config.eos_token_id = tokenizer.sep_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.max_length = 32
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 5

In [7]:
model.load_state_dict(torch.load("/kaggle/input/roberta6lseq2seq/pytorch_model.bin"))

<All keys matched successfully>

In [8]:
# debug = True
# if debug:
#     df = df.sample(frac=0.1,random_state=200) 
train_df=df.sample(frac=0.9,random_state=200)
val_df=df.drop(train_df.index)

# train_dataset = SummaryDataset(train_df,tokenizer=tokenizer)
# val_dataset = SummaryDataset(val_df,tokenizer=tokenizer)

In [9]:
val_df = val_df.sample(100)

In [10]:
# val_df

In [11]:
train_dataset = SummaryDataset(train_df,tokenizer=tokenizer)
val_dataset = SummaryDataset(val_df,tokenizer=tokenizer)

In [12]:
training_args = Seq2SeqTrainingArguments(
    output_dir='RobertaBase6LSeq2Seq',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    evaluation_strategy="epoch",
    do_train=True,
    do_eval=True,
    save_strategy = "epoch",
    logging_steps=1024,
    load_best_model_at_end=True,
    warmup_steps=1024,  
    learning_rate = 5e-5,
    num_train_epochs = 5, #TRAIN_EPOCHS
    overwrite_output_dir=True,
    save_total_limit=2,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [13]:
!pip install datasets
!pip install rouge_score

[0mCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l- \ done
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24955 sha256=9c754d9a8b4ae441efdbd41beed4b5a180fbb2accac156f8d1538b999ec7876f
  Stored in directory: /root/.cache/pip/wheels/8e/6b/70/59daa7c90a238610e34bac5916e001fe3d9bb0ec59c8cf5518
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
[0m

In [14]:
import datasets
rouge = datasets.load_metric("rouge")

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [15]:
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [16]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [17]:
trainer = Seq2SeqTrainer(
    tokenizer=tokenizer,
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)


In [18]:
trainer.train()

***** Running training *****
  Num examples = 88561
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 27680
  Number of trainable parameters = 178472025
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge2 Precision,Rouge2 Recall,Rouge2 Fmeasure
1,2.6973,2.312825,0.2142,0.2016,0.2054
2,2.0488,1.951148,0.2451,0.2299,0.2348
3,1.6658,1.816726,0.2501,0.2395,0.2429
4,1.4182,1.744985,0.2574,0.2534,0.2541
5,1.2415,1.731204,0.2707,0.2661,0.2671


***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 32,
  "no_repeat_ngram_size": 3,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

  "You have modified the pretrained model configuration to control generation. This is a"
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 32,
  "no_repeat_ngram_size": 3,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 32,
  "no_repeat_ngram_size": 3,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_st

TrainOutput(global_step=27680, training_loss=2.164596729058062, metrics={'train_runtime': 20139.1608, 'train_samples_per_second': 21.987, 'train_steps_per_second': 1.374, 'total_flos': 1.366744019427072e+17, 'train_loss': 2.164596729058062, 'epoch': 5.0})

In [19]:
trainer.save_model('RobertaBaseSeq2Seq')

Saving model checkpoint to RobertaBaseSeq2Seq
Configuration saved in RobertaBaseSeq2Seq/config.json
Configuration saved in RobertaBaseSeq2Seq/generation_config.json
Model weights saved in RobertaBaseSeq2Seq/pytorch_model.bin
tokenizer config file saved in RobertaBaseSeq2Seq/tokenizer_config.json
Special tokens file saved in RobertaBaseSeq2Seq/special_tokens_map.json


In [20]:
# _model.encoder.save_pretrained("./encoder")
# _model.decoder.save_pretrained("./decoder")

# model = TFEncoderDecoderModel.from_encoder_decoder_pretrained(
#     "./encoder", "./decoder", encoder_from_pt=True, decoder_from_pt=True
# )
# # This is only for copying some specific attributes of this particular model.
# model.config = _model.config

In [21]:
text = val_df['text'].values[0]
summary = val_df['headlines'].values[0]
out = tokenizer(text,padding='max_length',max_length=512,return_tensors="pt").to('cuda')
generated_summary = tokenizer.decode(model.generate(**out)[0])
print(f'Genrated:{generated_summary}')
print(f'actual:{summary}')

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 32,
  "no_repeat_ngram_size": 3,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Genrated:<s><s>Michael Phelps, wife Nicole Phelps become parents for 2nd time</s>
actual:23-time Olympic champion Phelps becomes father for 2nd time


In [22]:
text = val_df['text'].values[2]
summary = val_df['headlines'].values[2]
out = tokenizer(text,padding='max_length',max_length=512,return_tensors="pt").to('cuda')
generated_summary = tokenizer.decode(model.generate(**out)[0])
print(f'Genrated:{generated_summary}')
print(f'actual:{summary}')

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 32,
  "no_repeat_ngram_size": 3,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Genrated:<s><s>I mean Justin Bieber was chewing gum: Rohit Roy</s>
actual:Bieber was chewing gum, drinking water while singing: Rohit


In [23]:
text = val_df['text'].values[3]
summary = val_df['headlines'].values[3]
out = tokenizer(text,padding='max_length',max_length=512,return_tensors="pt").to('cuda')
generated_summary = tokenizer.decode(model.generate(**out)[0])
print(f'Genrated:{generated_summary}')
print(f'actual:{summary}')

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 32,
  "no_repeat_ngram_size": 3,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Genrated:<s><s>Tamil film makers delete 'objectionable' scenes after AIADMK protests</s>
actual:Tamil film 'Sarkar' makers delete scenes after protest: Report


In [24]:
text = val_df['text'].values[4]
summary = val_df['headlines'].values[4]
out = tokenizer(text,padding='max_length',max_length=512,return_tensors="pt").to('cuda')
generated_summary = tokenizer.decode(model.generate(**out)[0])
print(f'Genrated:{generated_summary}')
print(f'actual:{summary}')

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 32,
  "no_repeat_ngram_size": 3,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Genrated:<s><s>80 outlets in east India shut down due to weak food: Bakshi</s>
actual:McDonald's outlets in east India shut due to supply crunch


In [25]:
text = val_df['text'].values[5]
summary = val_df['headlines'].values[5]
out = tokenizer(text,padding='max_length',max_length=512,return_tensors="pt").to('cuda')
generated_summary = tokenizer.decode(model.generate(**out)[0])
print(f'Genrated:{generated_summary}')
print(f'actual:{summary}')

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 32,
  "no_repeat_ngram_size": 3,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Genrated:<s><s>Delhi groom shoots at baraat, returns to Delhi wedding venue</s>
actual:Groom shot at in Delhi, returns to wedding with bullet in shoulder


In [26]:
text = val_df['text'].values[7]
summary = val_df['headlines'].values[7]
out = tokenizer(text,padding='max_length',max_length=512,return_tensors="pt").to('cuda')
generated_summary = tokenizer.decode(model.generate(**out)[0])
print(f'Genrated:{generated_summary}')
print(f'actual:{summary}')

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 32,
  "no_repeat_ngram_size": 3,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Genrated:<s><s>66-year-old woman sneaks onto plane without ticket, arrested</s>
actual:Woman sneaks past security, flies to London without ticket


In [27]:
text = val_df['text'].values[6]
summary = val_df['headlines'].values[6]
out = tokenizer(text,padding='max_length',max_length=512,return_tensors="pt").to('cuda')
generated_summary = tokenizer.decode(model.generate(**out)[0])
print(f'Genrated:{generated_summary}')
print(f'actual:{summary}')

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 32,
  "no_repeat_ngram_size": 3,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Genrated:<s><s>BJP beats CPI(M) in Tripura to form first-ever Tripura govt</s>
actual:BJP wins Tripura for 1st time, displaces CPI(M) after 25 yrs


In [28]:
text = val_df['text'].values[8]
summary = val_df['headlines'].values[8]
out = tokenizer(text,padding='max_length',max_length=512,return_tensors="pt").to('cuda')
generated_summary = tokenizer.decode(model.generate(**out)[0])
print(f'Genrated:{generated_summary}')
print(f'actual:{summary}')

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 32,
  "no_repeat_ngram_size": 3,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Genrated:<s><s>Religious women's honour held hostage for decades: Prez Kovind</s>
actual:Muslim women's dignity held hostage for decades: Prez Kovind


In [29]:
text = val_df['text'].values[9]
summary = val_df['headlines'].values[9]
out = tokenizer(text,padding='max_length',max_length=512,return_tensors="pt").to('cuda')
generated_summary = tokenizer.decode(model.generate(**out)[0])
print(f'Genrated:{generated_summary}')
print(f'actual:{summary}')

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 32,
  "no_repeat_ngram_size": 3,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Genrated:<s><s>Bengaluru Metro covers Hindi words on signboards</s>
actual:Hindi words on signboards at B'luru Metro stations covered 


In [30]:
text = val_df['text'].values[10]
summary = val_df['headlines'].values[10]
out = tokenizer(text,padding='max_length',max_length=512,return_tensors="pt").to('cuda')
generated_summary = tokenizer.decode(model.generate(**out)[0])
print(f'Genrated:{generated_summary}')
print(f'actual:{summary}')

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 32,
  "no_repeat_ngram_size": 3,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Genrated:<s><s>Not afraid of killing Gauri Lankesh's killers: Prakash Raj</s>
actual:Not afraid of threats from Lankesh's killers: Prakash Raj
