Load the Required Libraries

In [None]:
import numpy as np
import pandas as pd
import spacy

from collections import Counter, OrderedDict
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

Preparing the Dataset

In [None]:
### Installing keybert library to extract keywords from the sentences
!pip install keybert

In [None]:
from keybert import KeyBERT

In [None]:
# Load spaCy's `en_core_web_sm` pipeline. The parsing functions below call
# `nlp(sentence)` to tokenize each sentence and read every token's lemma.
nlp = spacy.load('en_core_web_sm')

In [None]:
### Loading Training dataset
# NOTE(review): absolute Google Drive path — presumably Drive is mounted in a
# step that is not part of this notebook; confirm before running on a fresh kernel.
path = '/content/drive/MyDrive/Stories.csv'
df= pd.read_csv(path)
# Show the column names as the cell output.
df.columns

In [None]:
 df.head()

In [None]:
# Keep only the title and the five sentence columns.
# NOTE(review): `storyid` is not selected here, while the parsing functions below
# read `.storyid` from every row, and the extraction cell passes `df` rather than
# `new_df` — confirm whether this frame is still needed.
new_df= df[['storytitle', 'sentence1', 'sentence2', 'sentence3','sentence4', 'sentence5']]

In [None]:
def parse_stories_texts(data, top_n=100000):
    """Extract one keyword per sentence for every story in ``data``.

    This performs the same extraction as ``parse_storylines``; only the name
    of the keyword column differs (``stories_text`` instead of ``storyline``).
    The work is delegated to ``parse_storylines`` so the two functions cannot
    drift apart.

    Args:
        data: DataFrame with ``storyid`` and ``sentence1`` .. ``sentence5``
            columns.
        top_n: Maximum number of keywords requested from KeyBERT per story.
            Defaults to 100000, the value that used to be hard-coded here.

    Returns:
        DataFrame with one row per story and the columns ``storyid``,
        ``stories_text`` (keywords joined by ``', '``) and ``length``
        (number of keywords).
    """
    storylines = parse_storylines(data, top_n=top_n)
    # Same rows and column order as before; only the keyword column is renamed.
    return storylines.rename(columns={'storyline': 'stories_text'})

def parse_storylines(stories, top_n=100000):
    """Build a keyword storyline for each story.

    For every row the five sentences are joined and scored with KeyBERT.
    Then, sentence by sentence, the highest-scoring token whose lemma has not
    been used yet is added to the storyline, so each sentence contributes at
    most one keyword.

    Args:
        stories: DataFrame with ``storyid`` and ``sentence1`` .. ``sentence5``
            columns.
        top_n: Passed to ``KeyBERT.extract_keywords`` as ``top_n``.

    Returns:
        DataFrame with the columns ``storyid``, ``storyline`` (lemmas joined by
        ``', '``) and ``length`` (number of lemmas in the storyline).
    """
    extractor = KeyBERT()
    sentence_fields = ('sentence1', 'sentence2', 'sentence3', 'sentence4', 'sentence5')
    records = []

    for story in tqdm(stories.itertuples(), total=len(stories)):
        story_sentences = [getattr(story, field) for field in sentence_fields]
        full_text = ' '.join(story_sentences)

        # keyword -> score lookup for the whole story
        keyword_scores = Counter(dict(extractor.extract_keywords(full_text, top_n=top_n)))

        # lemma -> 1-based index of the sentence that supplied it
        chosen = OrderedDict()
        for position, sentence in enumerate(story_sentences, 1):
            ranked = Counter()
            for tok in nlp(sentence):
                surface, lemma = tok.lower_, tok.lemma_
                # Skip tokens KeyBERT did not score and lemmas already taken
                # by an earlier sentence.
                if surface not in keyword_scores or lemma in chosen:
                    continue
                ranked[lemma] = keyword_scores[surface]
            if not ranked:
                continue
            (best_lemma, _), = ranked.most_common(1)
            chosen[best_lemma] = position

        records.append({
            'storyid': story.storyid,
            'storyline': ', '.join(chosen.keys()),
            'length': len(chosen),
        })

    return pd.DataFrame(records)

In [None]:
# Extract a storyline for every story in the dataset
# (runs KeyBERT and spaCy over each row of `df`).
stl_df = parse_storylines(df)

In [None]:
## Saving the processed file
# Written to Drive without the index; the training section reloads this same path.
stl_df.to_excel('/content/drive/MyDrive/stl_df.xlsx',index=False)

------------------------------------------------------------------------------ Training

#### Read train file and prepare final Training Data:

In [4]:
## Read train file and prepare final Training Data:

import pandas as pd

# Storylines saved by the preprocessing section (same path as the `to_excel` call).
stl_df= pd.read_excel('/content/drive/MyDrive/stl_df.xlsx')
# Original stories: id, title and the five sentences of each story.
orig_stories = pd.read_csv('/content/drive/MyDrive/Stories.csv')

In [5]:
stl_df.head(2)

Unnamed: 0,storyid,storyline,length
0,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd,"David, habit, eat, diet, week",5
1,0beabab2-fb49-460e-a6e6-f35a202e3348,"Tom, guest, punch, quickly, couch",5


In [6]:
orig_stories.head(2)

Unnamed: 0,storyid,storytitle,sentence1,sentence2,sentence3,sentence4,sentence5
0,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd,David Drops the Weight,David noticed he had put on a lot of weight re...,He examined his habits to try and figure out t...,He realized he'd been eating too much fast foo...,He stopped going to burger places and started ...,"After a few weeks, he started to feel much bet..."
1,0beabab2-fb49-460e-a6e6-f35a202e3348,Frustration,Tom had a very short temper.,One day a guest made him very angry.,He punched a hole in the wall of his house.,Tom's guest became afraid and left quickly.,Tom sat on his couch filled with regret about ...


In [7]:
### Mapping extracted keywords with original story data
# Join on `storyid`; `pd.merge` defaults to an inner join, so only stories present
# in both frames are kept.
final_df= pd.merge(orig_stories,stl_df, on= 'storyid')

In [8]:
final_df.head()

Unnamed: 0,storyid,storytitle,sentence1,sentence2,sentence3,sentence4,sentence5,storyline,length
0,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd,David Drops the Weight,David noticed he had put on a lot of weight re...,He examined his habits to try and figure out t...,He realized he'd been eating too much fast foo...,He stopped going to burger places and started ...,"After a few weeks, he started to feel much bet...","David, habit, eat, diet, week",5
1,0beabab2-fb49-460e-a6e6-f35a202e3348,Frustration,Tom had a very short temper.,One day a guest made him very angry.,He punched a hole in the wall of his house.,Tom's guest became afraid and left quickly.,Tom sat on his couch filled with regret about ...,"Tom, guest, punch, quickly, couch",5
2,87da1a22-df0b-410c-b186-439700b70ba6,Marcus Buys Khakis,Marcus needed clothing for a business casual e...,All of his clothes were either too formal or t...,He decided to buy a pair of khakis.,The pair he bought fit him perfectly.,Marcus was happy to have the right clothes for...,"Marcus, clothe, khakis, pair, event",5
3,2d16bcd6-692a-4fc0-8e7c-4a6f81d9efa9,Different Opinions,Bobby thought Bill should buy a trailer and ha...,Bill thought a truck would be better for what ...,Bobby pointed out two vehicles were much more ...,Bill was set in his ways with conventional thi...,He ended up buying the truck he wanted despite...,"Bobby, truck, vehicle, thinking, buy",5
4,c71bb23b-7731-4233-8298-76ba6886cee1,Overcoming shortcomings,John was a pastor with a very bad memory.,He tried to memorize his sermons many days in ...,He decided to learn to sing to overcome his ha...,He then made all his sermons into music and sa...,His congregation was delighted and so was he.,"pastor, sermon, sing, music, congregation",5


In [9]:
final_df.shape

(52665, 9)

Loading the saved processed data again to create the final target variable

In [10]:
## Build the final input/target columns [all five sentences joined per story]

final_df = orig_stories.merge(stl_df, on='storyid')

# target: the five sentences of each story joined with single spaces
final_df['target'] = final_df[
    ['sentence1', 'sentence2', 'sentence3', 'sentence4', 'sentence5']
].apply(" ".join, axis=1)

# input_text: "<storyline> <st0> <story title> <st1>"
final_df['input_text'] = (
    final_df['storyline'] + ' <st0> ' + final_df['storytitle'] + ' <st1>'
)
final_df['target_text'] = final_df['target']

# Only these two columns are carried into training.
fnl_df = final_df[['input_text', 'target_text']]

In [11]:
  fnl_df.head(2)

Unnamed: 0,input_text,target_text
0,"David, habit, eat, diet, week <st0> David Drop...",David noticed he had put on a lot of weight re...
1,"Tom, guest, punch, quickly, couch <st0> Frustr...",Tom had a very short temper. One day a guest m...


### Training Model

In [12]:
#### Installing Transformer library
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.2-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.2


In [13]:
## Splitting train and test data
from sklearn.model_selection import train_test_split

# Fix the seed so the 80/20 split is reproducible across kernel restarts
# (42 matches the seed used for the dataset shuffles later in the notebook).
train_df, test_df = train_test_split(fnl_df, test_size=0.2, random_state=42)
train_df.shape, test_df.shape

((42132, 2), (10533, 2))

### Changing the data format as per T5 Base architecture

In [14]:
## Changing the data format as per T5 Base architecture

# Wrap every input string in a {'input': ...} dict.
# NOTE(review): `new_inp` is not an existing column, so this assignment stores a
# plain Python list as an attribute on the DataFrame object instead of creating a
# column (pandas warns about this). A later cell indexes the list positionally —
# confirm this is intended before changing it.
train_df.new_inp= [{'input': val} for val in train_df.input_text]
test_df.new_inp= [{'input': val} for val in test_df.input_text]

  train_df.new_inp= [{'input': val} for val in train_df.input_text]
  test_df.new_inp= [{'input': val} for val in test_df.input_text]


In [15]:
# Wrap every target string in a {'ouput': ...} dict.
# The misspelled 'ouput' key is read under exactly that name by
# `preprocess_function`, so it must not be corrected here alone.
# NOTE(review): as with `new_inp`, this stores a list as an attribute on the
# DataFrame object rather than creating a column — confirm this is intended.
train_df.new_out= [{'ouput': val} for val in train_df.target_text]
test_df.new_out= [{'ouput': val} for val in test_df.target_text]

  train_df.new_out= [{'ouput': val} for val in train_df.target_text]
  test_df.new_out= [{'ouput': val} for val in test_df.target_text]


In [16]:
def _to_translation_records(frame):
    """Return one ``{'input': ..., 'ouput': ...}`` dict per row of ``frame``.

    ``frame`` must have ``input_text`` and ``target_text`` columns. The
    misspelled ``'ouput'`` key is kept on purpose: ``preprocess_function``
    reads the target text under exactly that key.
    """
    return [
        {'input': source, 'ouput': target}
        for source, target in zip(frame.input_text, frame.target_text)
    ]


# Built straight from the DataFrame columns (row order preserved), so this cell
# does not rely on the ad-hoc `new_inp` / `new_out` attributes set on the frames.
new_dct1 = _to_translation_records(train_df)
new_dct2 = _to_translation_records(test_df)

In [17]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-ma

In [18]:
from datasets import Dataset, DatasetDict

# Each split is a Dataset with a single 'translation' column that holds the
# {'input': ..., 'ouput': ...} records.
data_dict = dict(
    train=Dataset.from_dict({'translation': new_dct1}),
    validation=Dataset.from_dict({'translation': new_dct2}),
)

raw_data = DatasetDict(data_dict)

In [19]:
new_dct1[1]

{'input': 'Cari, drunk, tattoo, ink, cute <st0> Regret <st1>',
 'ouput': "Cari was on vacation in Mexico with her friends. She was a little drunk, but not noticeably, when they had an idea. They all decided to get matching tattoos of each others' initials! When Cari woke in the morning, she regretted getting inked so hastily. But, she consoled herself, at least the tattoo was small - and cute!"}

In [20]:
### Specifying input and output length

# Token limits passed as `max_length` (with truncation) in `preprocess_function`.
max_input_length = 256  # limit for the tokenized input text
max_target_length = 256  # limit for the tokenized target text (labels)

In [21]:
### Loading T5 Base Model tokenizer and model

# Name of the pretrained checkpoint; reused when the model itself is loaded.
model_t5 = "t5-base"
from transformers import AutoTokenizer
# NOTE(review): `early_stopping` and `max_length` look like generation settings
# rather than tokenizer options — confirm they have any effect here. The
# truncation lengths actually used are passed per call in `preprocess_function`.
tokenizer = AutoTokenizer.from_pretrained(model_t5,use_fast=False, early_stopping= False ,max_length= 300)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [22]:
## Pre-processing the raw data to create labels and attention_mask


def preprocess_function(examples):
    """Tokenize a batch of examples into model inputs and labels.

    Args:
        examples: Batch handed over by ``Dataset.map(..., batched=True)``.
            ``examples["translation"]`` is a list of dicts, each with an
            ``'input'`` string and an ``'ouput'`` string (the key is spelled
            that way where the records are built).

    Returns:
        The tokenizer output for the inputs (truncated to
        ``max_input_length``) with an added ``"labels"`` entry holding the
        target token ids (truncated to ``max_target_length``).
    """
    pairs = examples["translation"]
    inputs = [pair['input'] for pair in pairs]
    targets = [pair['ouput'] for pair in pairs]

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target=` is the replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs


# Apply the preprocessing to every split; with batched=True the function
# receives a batch (lists of examples) per call instead of a single example.
tokenized_datasets = raw_data.map(preprocess_function, batched=True)


Map:   0%|          | 0/42132 [00:00<?, ? examples/s]



Map:   0%|          | 0/10533 [00:00<?, ? examples/s]

In [23]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 42132
    })
    validation: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 10533
    })
})

In [24]:
### Selecting the dataset training and validation examples to train model

# Shuffle each split with a fixed seed, then keep the first 40,000 training and
# 4,000 validation rows. The hard-coded sizes assume each split has at least that
# many rows (the recorded splits have 42,132 and 10,533).
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(40000))
small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=42).select(range(4000))

In [25]:
# Name of the directory the training arguments use as output location.
model_name = 't5-base-stories40k'

In [26]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Load the pretrained seq2seq model named by `model_t5`.
# NOTE(review): `early_stopping` / `max_length` are presumably meant as generation
# defaults stored on the model configuration — confirm how the installed
# transformers version treats these extra keyword arguments.
model = AutoModelForSeq2SeqLM.from_pretrained(model_t5, early_stopping= False, max_length= 300)

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [27]:
batch_size = 16

# Checkpoints are written under the directory named by `model_name`;
# evaluation runs once per epoch.
args = Seq2SeqTrainingArguments(
    output_dir=model_name,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=10,
)



In [28]:
# Collator that assembles batches for the trainer. The tokenization step applies
# no padding, so inputs and labels are padded per batch here.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Wire model, arguments, data subsets, collator and tokenizer into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
)

# Run fine-tuning; the returned training summary is the cell output.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


In [None]:
# Save the trained model to 't5-base-stories40k' — the same directory name that
# `model_name` holds and that the training arguments use as output location.
trainer.save_model('t5-base-stories40k')

Testing Model [validation]

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint directory to load for inference.
# NOTE(review): with the training arguments shown in this notebook (40,000
# examples, batch size 16, 10 epochs) a single-device run would end at step
# 25,000, so this path presumably comes from a different run — confirm it exists.
checkpoint = 't5-base-stories40k/checkpoint-37500'

# Each entry is a one-element batch: "<keywords> <st0> <title> <st1>".
l= [['David, habbit, eat, diet, week <st0> David drops the weight <st1>'],
    ["Tom, guest, punch, quickly, couch <st0> Frustration <st1>"],
    ['shack, garden, supply, build, gardening <st0> Shack <st1>'],
    ['paster, sermin, sing, music, congregation <st0> Overcoming shortcomings <st1>']]

# Load tokenizer and model once instead of once per prompt, and load an actual
# model instance (the name was previously bound to the class itself).
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

for i in l:
    src_text = i
    encoded = tokenizer(src_text, return_tensors="pt", padding=True)
    # max_length=300 mirrors the value given when the base model was loaded for training.
    generated = model.generate(**encoded, max_length=300)
    print(tokenizer.batch_decode(generated, skip_special_tokens=True))

In [None]:
#### Model Output

['David is a very overweight man. He has a bad habit of eating unhealthy foods. David decides to stop eating unhealthy foods. David goes on a diet to lose weight. David loses the weight in a week.']
['Tom was at a party. He was seated next to a guest. The guest punched Tom. Tom quickly walked away. The guest sat on the couch.']
['I built a shack for my friend. I wanted to have a garden. I bought all the supplies. I built it. I had a great gardening shack.']
['The paster was a young man. He was a sermin. He sang a song. The music was good. The congregation was happy.']



Model Iteration records

In [None]:
################################################# 1st Run ############################################3
# 1st
# model_t5 = "t5-small"
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_t5,use_fast=False, early_stopping= True,
#       max_length= 300, num_beams= 4)
#2nd
# from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# model = AutoModelForSeq2SeqLM.from_pretrained(model_t5, early_stopping= True, max_length= 300, num_beams= 4)
# Input: ['costume, Joe, nice, buy, party <st0> Costume <st1>']
# Output: ['Joe wore a nice costume. Joe bought it. Joe bought it. Joe bought it. Joe bought it. Joe bought it. Joe bought it. Joe bought it.']


##################################################3 2nd Run ###########################################333
# 1st
# model_t5 = "t5-small"
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_t5,use_fast=False, early_stopping= True, max_length= 300)
#2nd
# from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# model = AutoModelForSeq2SeqLM.from_pretrained(model_t5, early_stopping= True, max_length= 300)

# Input: ['costume, Joe, nice, buy, party <st0> Costume <st1>']
# Output: ['Joe bought a costume from Joe. Joe bought it. Joe bought it. Joe bought it. Joe bought it. Joe bought it. 
  # Joe bought it. Joe bought it. Joe bought it. Joe bought it. Joe bought it. Joe bought it. Joe bought it. Joe bought it.']

  #############################################33333 3RD RUN ##########################

# 1st
# model_t5 = "t5-small"
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_t5,use_fast=False, early_stopping= False,
#       max_length= 300)
#2nd
# from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# model = AutoModelForSeq2SeqLM.from_pretrained(model_t5, early_stopping= False, max_length= 300)

# Input: ['costume, Joe, nice, buy, party <st0> Costume <st1>']
# Output: ['Joe bought a costume. Joe bought it. Joe bought it. Joe bought it. Joe bought it. Joe was a nice party party. Joe bought it.']

#################################################### 4th RUN #########################################333

# 1st
# model_t5 = "t5-base"
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_t5,use_fast=False, early_stopping= False,
#       max_length= 300)
#2nd
# from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# model = AutoModelForSeq2SeqLM.from_pretrained(model_t5, early_stopping= False, max_length= 300)

# Input: ['costume, Joe, nice, buy, party <st0> Costume <st1>']
# Output: ['Joe was going to a party. He wanted to wear a costume. He was so nice. He bought a nice one. He was so excited to go.']

#################################################### 5th RUN #########################################333


# 1st
# model_t5 = "t5-base"
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_t5,use_fast=False, early_stopping= True,
#       max_length= 300)
#2nd
# from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# model = AutoModelForSeq2SeqLM.from_pretrained(model_t5, early_stopping= True, max_length= 300)

# Input: ['costume, Joe, nice, buy, party <st0> Costume <st1>']
# Output: ['Joe was going to a party. He wanted to wear a costume. He was so nice. He bought a nice one. He was so happy.']

################################################## 6th run ##############################

# model_t5 = "t5-large"
# tokenizer = AutoTokenizer.from_pretrained(model_t5,use_fast=False, early_stopping= True,
# model = AutoModelForSeq2SeqLM.from_pretrained(model_t5, early_stopping= True, max_length= 300)
# Input: ['costume, Joe, nice, buy, party <st0> Costume <st1>']
# Output: 
