In [None]:
# import and load data
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import re
import pickle
import email
from tqdm import tqdm
import datetime
from dateutil import parser
import nltk
import tensorflow as tf
from sklearn.model_selection import train_test_split


!pip install -q gpt-2-simple
import gpt_2_simple as gpt2

import re
!pip install --upgrade --no-cache-dir gdown

!pip install simplet5
from simplet5 import SimpleT5

from transformers import (
    T5ForConditionalGeneration,
    MT5ForConditionalGeneration,
    ByT5Tokenizer,
    PreTrainedTokenizer,
    T5TokenizerFast as T5Tokenizer,
    MT5TokenizerFast as MT5Tokenizer,
)

### Data Preparation to feed in to GPT-2

In [None]:
#load gpt data
!gdown --id 1IvheqVMHPsYoA7o__azk0iedOYmt0ABi
with open('gpt_data.pickle','rb') as file:
    train_data_gpt,test_data_gpt = pickle.load(file)

Downloading...
From: https://drive.google.com/uc?id=1IvheqVMHPsYoA7o__azk0iedOYmt0ABi
To: /content/gpt_data.pickle
100% 8.70M/8.70M [00:00<00:00, 83.4MB/s]


In [None]:
train_data_gpt.head(10)

Unnamed: 0,Body
0,I take back my dog comment john
1,Please take a look at it You may find it usefu...
2,fyi sg
3,Taylor of ENA sent his erequest in last Thursd...
4,FYI anyone that deals with CAP please read the...
5,I not aware of the volume rate info you used i...
6,will be back today
7,can you please work up an offer for for Do not...
8,come on you are kidding right
9,I received the revised opinion from this morni...


In [None]:
train_data_gpt.to_csv('train_data_gpt.csv',index=False) # we need in file.csv format for gpt2 model

### Data Preparation to feed in to T5    
Load the dataset with Enc_seq and Dec_seq columns which will be also be useful in **Evaluation of GPT-mode**l 

In [2]:
# !gdown --id 1cvJp9HTZ5z6FvMl5Q7bCenWVbtqgzYCa
with open('Sequence_data.pickle', 'rb') as file:
    train_sequences,test_sequences = pickle.load(file)

In [3]:
train_sequences.head()

Unnamed: 0,enc_seq,dec_seq
0,I take back my dog,comment john
1,I take back my dog comment,john
2,Please take a look at,it You may find it useful Vince
3,Please take a look at it,You may find it useful Vince
4,Please take a look at it You,may find it useful Vince


### GPT-2 Modelling (Base-124 Million params)

In [None]:
gpt2.mount_gdrive()
gpt2.download_gpt2(model_name="124M")

Mounted at /content/drive


Fetching checkpoint: 1.05Mit [00:00, 402Mit/s]                                                      
Fetching encoder.json: 1.05Mit [00:00, 2.96Mit/s]
Fetching hparams.json: 1.05Mit [00:00, 533Mit/s]                                                    
Fetching model.ckpt.data-00000-of-00001: 498Mit [00:18, 26.9Mit/s]                                  
Fetching model.ckpt.index: 1.05Mit [00:00, 528Mit/s]                                                
Fetching model.ckpt.meta: 1.05Mit [00:00, 3.59Mit/s]
Fetching vocab.bpe: 1.05Mit [00:00, 3.72Mit/s]


In [None]:
# IF data is big, we may wanna encode the data first for GPT model

# gpt2.encode_csv('train_data_gpt.csv',out_path='csv_encoded.txt')  # add EOS AND BOS Tokens
# gpt2.encode_dataset('train_data_gpt.txt')  # encode data to numpy npz

# LOAD THE Partially-trained model
gpt2.copy_checkpoint_from_gdrive(run_name='run1')

In [None]:

sess = gpt2.start_tf_sess()

file_name = 'train_data_gpt.csv'

# # for faster dataloading in model uncomment below
# file_name = 'text_encoded.npz'

gpt2.finetune(sess,
              batch_size=2, # total of 2048 samples per batch
              dataset=file_name,
              model_name='124M',
              steps=2500,  # training-steps 
              restore_from='latest', # finetune from latest finetuned GPT-2
              overwrite=True,
              run_name='run1',
              sample_length=30,
              print_every=5,  # how much delayed iters-loss to print
              sample_every=200,  # Number of training-steps AFTER WHICH to print output examples
              save_every=500    # training-steps after which to save model
              )

Loading checkpoint models/124M/model.ckpt
INFO:tensorflow:Restoring parameters from models/124M/model.ckpt
Loading dataset...


100%|██████████| 1/1 [00:00<00:00,  5.71it/s]


dataset has 2675731 tokens
Training...
Saving checkpoint/run1/model-0
[5 | 27.13] loss=2.50 avg=2.50
[10 | 48.17] loss=2.23 avg=2.37
[15 | 69.64] loss=2.20 avg=2.31
[20 | 91.56] loss=2.16 avg=2.27
[25 | 113.99] loss=2.11 avg=2.24
[30 | 136.86] loss=1.97 avg=2.19
[35 | 159.51] loss=2.01 avg=2.17
[40 | 182.02] loss=1.88 avg=2.13
[45 | 204.54] loss=2.00 avg=2.11
[50 | 227.20] loss=1.94 avg=2.10
[55 | 249.87] loss=1.86 avg=2.07
[60 | 272.49] loss=2.06 avg=2.07
[65 | 295.08] loss=1.98 avg=2.07
[70 | 317.65] loss=1.89 avg=2.05
[75 | 340.28] loss=1.86 avg=2.04
[80 | 362.94] loss=1.79 avg=2.02
[85 | 385.64] loss=1.77 avg=2.01
[90 | 408.30] loss=1.92 avg=2.00
[95 | 430.94] loss=1.82 avg=1.99
[100 | 453.56] loss=1.89 avg=1.99
[105 | 476.16] loss=1.82 avg=1.98
[110 | 498.80] loss=1.80 avg=1.97
[115 | 521.45] loss=1.72 avg=1.96
[120 | 544.14] loss=1.80 avg=1.95
[125 | 566.82] loss=1.78 avg=1.94
[130 | 589.49] loss=1.80 avg=1.93
[135 | 612.17] loss=1.78 avg=1.93
[140 | 634.81] loss=1.72 avg=1.92
[1

In [None]:
gpt2.copy_checkpoint_to_gdrive(run_name='run1')

### GPT-2 Inferencing

In [4]:
gpt2.mount_gdrive()
gpt2.copy_checkpoint_from_gdrive(run_name='run1')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
sess = gpt2.start_tf_sess()
gpt2.load_gpt2(sess, run_name='run1')

Loading checkpoint checkpoint/run1/model-2500
INFO:tensorflow:Restoring parameters from checkpoint/run1/model-2500


In [31]:
# tf.keras.backend.clear_session()

In [6]:
def predict_sent(s,l=30):
    prefix="<|startoftext|> "+s
    p = gpt2.generate(sess,
                prefix=prefix,
                truncate="<|endoftext|>",
                length=l,
                run_name='run1',
                temperature=0.7,
                include_prefix=True,    # this isn't working (and is an open issue on the library's github), hence we truncate manually
                return_as_list=True
                )[0]
                
    p = p[len(prefix):]
    return p.strip()

In [None]:

# BLEU Score on data; we take just 100 samples as running GPT-2 is expensive 

# TRAIN DATA
MAX_LENGTH = 30
import nltk.translate.bleu_score as bleu
sample = train_sequences.sample(100,ignore_index=True,replace=False,random_state=0)
reference_inp = sample.enc_seq.str.lower().values.tolist()
reference_tar = sample.dec_seq.str.lower().values.tolist()
reference_tar = [[ele.strip().split()] for ele in reference_tar]  # changing to the format 'corpus_bleu' takes in
prediction = []
ct = 0
for sent in tqdm(reference_inp):
    # To avoid ram-usage-overflow issue 
    if ct%50==0:
        tf.keras.backend.clear_session()
        sess = gpt2.start_tf_sess()
        gpt2.load_gpt2(sess, run_name='run1')

    p = predict_sent(sent)
    prediction.append(p.lower().split())
    ct += 1

# sf = bleu.SmoothingFunction()  ,smoothing_function=sf.method1
print('\nThe Final BLEU score over 100 random train-samples is: {}'.format(bleu.corpus_bleu(reference_tar, prediction)))

tf.keras.backend.clear_session()

# TEST DATA
MAX_LENGTH = 30
import nltk.translate.bleu_score as bleu
sample = test_sequences.sample(100,ignore_index=True,replace=False,random_state=0)
reference_inp = sample.enc_seq.str.lower().values.tolist()
reference_tar = sample.dec_seq.str.lower().values.tolist()
reference_tar = [[ele.strip().split()] for ele in reference_tar]  # changing to the format 'corpus_bleu' takes in
prediction = []
ct = 0
for sent in tqdm(reference_inp):
    # To avoid ram-usage-overflow issue 
    if ct%50==0:
        tf.keras.backend.clear_session()
        sess = gpt2.start_tf_sess()
        gpt2.load_gpt2(sess, run_name='run1')

    p = predict_sent(sent)
    prediction.append(p.lower().split())
    ct += 1

# sf = bleu.SmoothingFunction()  ,smoothing_function=sf.method1
print('\nThe Final BLEU score over 100 random test-samples is: {}'.format(bleu.corpus_bleu(reference_tar, prediction)))


The Final BLEU score over 100 random train-samples is: 0.09694239354106254


  0%|          | 0/100 [00:00<?, ?it/s]

Loading checkpoint checkpoint/run1/model-2500
INFO:tensorflow:Restoring parameters from checkpoint/run1/model-2500


 50%|█████     | 50/100 [08:42<14:35, 17.52s/it]

Loading checkpoint checkpoint/run1/model-2500
INFO:tensorflow:Restoring parameters from checkpoint/run1/model-2500


100%|██████████| 100/100 [17:49<00:00, 10.70s/it]


The Final BLEU score over 100 random test-samples is: 0.17828866606183447





The Bleu-Score for GPT-2 (small) on a sample of *100* test-sentences after *2500* training-steps is **0.18** with categorical loss of **0.77**

In [None]:
# predict for sample sentences
def predict_samples(data,k=10):
    for _ in range(k):
        idx = np.random.choice(data.shape[0])
        input_sent = data.iloc[idx].enc_seq
        target_sent = data.iloc[idx].dec_seq
        print("Input-Sentence:\n",input_sent)
        print('='*130)
        print("Target-Sentence:\n",target_sent)
        print('='*130)
        p = predict_sent(input_sent)
        print("Predicted-Sentence:\n",p)
        print('x'*130)

predict_samples(train_sequences)
print()
print('-*'*130)
predict_samples(test_sequences)

Input-Sentence:
 Should not you and Rick
Target-Sentence:
 do that together so as not to have confusion
Predicted-Sentence:
 do it together as well
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Input-Sentence:
 I have confirmed the Vendor Number in Global Counterparty is we are still waiting for the
Target-Sentence:
 SAP Customer Job to complete
Predicted-Sentence:
 SAP Customer Job to complete
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Input-Sentence:
 HourAhead No ancillary schedules awarded No
Target-Sentence:
 variances detected LOG PARSING FILE PortlandWestDeskCalifornia SchedulingISO Final Schedules txt
Predicted-Sentence:
 variances detected LOG PARSING FILE PortlandWestDeskCalifornia SchedulingISO Final Schedules txt
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

### Latency-check-GPT

In [7]:
import time
def get_latency():
    t = []
    sample = test_sequences.sample(60,ignore_index=True,replace=False,random_state=0) # as more than 60 crashes the ram
    reference_inp = sample.enc_seq.str.lower().values.tolist()
    ct = 0
    for sent in tqdm(reference_inp):
        a = time.time()
        p = predict_sent(sent)
        b = time.time()
        ms = (b-a) * 1000 # time in milliseconds
        t.append(ms)
        ct += 1

    return t

In [8]:
t = get_latency()
np.mean(t),np.percentile(t,90),np.percentile(t,99)

100%|██████████| 60/60 [11:53<00:00, 11.88s/it]


(11882.10867245992, 19693.22018623352, 21115.62076807022)

### T5 text-to-text transformer Modelling (Base-220 Million params)

Training T5 transformer is very-hard and expensive, we use Kaggle kernels to finetune it hence no logs are available

In [None]:
# slight changes for data
train_sequences = train_sequences.rename(columns={"dec_seq":"target_text", "enc_seq":"source_text"})
test_sequences = test_sequences.rename(columns={"dec_seq":"target_text", "enc_seq":"source_text"})

print(train_sequences.shape,test_sequences.shape)

(735089, 2) (183306, 2)


In [None]:
# model = SimpleT5()
# model.from_pretrained(model_type="t5", model_name="t5-base")

# model.train(train_df=X_train,
#             eval_df=X_test, 
#             source_max_token_len=50, 
#             target_max_token_len=50, 
#             max_epochs=2, use_gpu=True,
#             batch_size=64)

Trained on Kaggle hence no logs are available
* The model is finetuned for 2 full epochs, after which it started *severly overfitting*.
* Final **train-loss = 1.95** & **val-loss = 2.47**.

### T5 Inference

In [4]:
# load the finetuned model 
import gdown
link = "https://drive.google.com/drive/folders/1NDyWGYICKcLNJMIZ0KJ5rYxLbcb_jJZD?usp=sharing"
gdown.download_folder(link,quiet=True)

model = SimpleT5()
model.load_model("t5","./outputs/simplet5-epoch-1-train-loss-1.954-val-loss-2.4739")

In [5]:
def predict_sent_t5(s,l=50):
    p = model.predict(s,max_length=l,num_beams=4,length_penalty=1.5)   # length penalty is set as T5 predictions are short      
    p = p[0].split()    
    return p

In [None]:
# calculate bleu score
import nltk.translate.bleu_score as bleu


#                                   TRAIN DATA BLEU SCORE
sample = train_sequences.sample(100,ignore_index=True,replace=False,random_state=0)
reference_inp = sample.source_text.str.lower().values.tolist()

reference_tar = sample.target_text.str.lower().values.tolist()
reference_tar = [[ele.strip().split()] for ele in reference_tar] 
prediction = []

for sent in tqdm(reference_inp):
    p = predict_sent_t5(sent)
    prediction.append(p)

print('\nThe Final BLEU score over 100 random Trining-samples is: {}'.format(bleu.corpus_bleu(reference_tar, prediction)))



#                                   TEST DATA BLEU SCORE


sample = test_sequences.sample(100,ignore_index=True,replace=False,random_state=0)
reference_inp = sample.source_text.str.lower().values.tolist()

reference_tar = sample.target_text.str.lower().values.tolist()
reference_tar = [[ele.strip().split()] for ele in reference_tar] 
prediction = []

for sent in tqdm(reference_inp):
    p = predict_sent_t5(sent)
    prediction.append(p)

print('\nThe Final BLEU score over 100 random Test-samples is: {}'.format(bleu.corpus_bleu(reference_tar, prediction)))

100%|██████████| 100/100 [01:23<00:00,  1.20it/s]



The Final BLEU score over 100 random Trining-samples is: 0.11110997078057672


100%|██████████| 100/100 [01:24<00:00,  1.19it/s]


The Final BLEU score over 100 random Test-samples is: 0.07522400762797224





In [None]:
def predict_samples(data,k=10):
    '''
    Predict a few sample outputs for T5 Model
    '''
    for _ in range(k):
        idx = np.random.choice(data.shape[0])
        input_sent = data.iloc[idx].source_text
        target_sent = data.iloc[idx].target_text
        print("Input-Sentence:\n",input_sent)
        print('='*130)
        print("Target-Sentence:\n",target_sent)
        print('='*130)
        p = ' '.join(predict_sent_t5(input_sent))
        print("Predicted-Sentence:\n",p)
        print('x'*130)

predict_samples(train_sequences)
print()
print('-*'*130)
predict_samples(test_sequences)

Input-Sentence:
 Susan Can you please set
Target-Sentence:
 up the attached book for the estate
Predicted-Sentence:
 up a call with to discuss
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Input-Sentence:
 The following expense report is ready for L Kimberly Status last changed
Target-Sentence:
 February CR Amount Due To approve this expense report click on the following link for Concur
Predicted-Sentence:
 To approve this expense report click on the following link for Concur
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Input-Sentence:
 just wanted to let you know i got back from cayman yesterday and it was low hi
Target-Sentence:
 it is going to rock
Predicted-Sentence:
 how was your workout
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

### Latency-check-T5

In [8]:
import time
def get_latency():
    t = []
    sample = test_sequences.sample(100,ignore_index=True,replace=False,random_state=0) # as more than 60 crashes the ram
    reference_inp = sample.enc_seq.str.lower().values.tolist()
    ct = 0
    for sent in tqdm(reference_inp):
        a = time.time()
        p = predict_sent_t5(sent)
        b = time.time()
        ms = (b-a) * 1000 # time in milliseconds
        t.append(ms)
        ct += 1

    return t

In [9]:
t = get_latency()
np.mean(t),np.percentile(t,90),np.percentile(t,99)

100%|██████████| 100/100 [01:28<00:00,  1.13it/s]


(882.7659273147583, 1364.3655061721804, 2461.0164952278215)

# END