In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

In [3]:
if torch.cuda.is_available():    

       
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
 
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: GeForce GTX 1050


In [4]:
!pip install transformers




In [5]:
import pandas as pd
import numpy as np

train=pd.read_csv("../dataset/train.csv")

In [6]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf

from nlpaug.util import Action

In [7]:
aug = naw.RandomWordAug(action="swap")
df = pd.DataFrame(columns=['text','intent'])
for i,row in train.iterrows():
  augmented_text = aug.augment(row['text'])
  df = df.append({'text': augmented_text,'intent': row['intent']}, ignore_index=True)


In [8]:
aug = naw.SynonymAug(aug_src="wordnet")
df1 = pd.DataFrame(columns=['text','intent'])
for i,row in train.iterrows():
  augmented_text = aug.augment(row['text'])
  df1 = df1.append({'text': augmented_text,'intent': row['intent']}, ignore_index=True)
train = train.append(df).reset_index(drop=True)
train = train.append(df1).reset_index(drop=True)

train = train.append(train).reset_index(drop=True)
train = train.append(train).reset_index(drop=True)
train.shape

sentences = train.text.values
labels = train.intent.values

In [9]:
from transformers import BertTokenizer

print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

print(' Original: ', sentences[0])

print('Tokenized: ', tokenizer.tokenize(sentences[0]))

print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))

max_len = 0

 
for sent in sentences:
 
    input_ids = tokenizer.encode(sent, add_special_tokens=True)
 
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

input_labels = []
 
k = 1;
mp = {}
mpp= {}
for sent in labels:
    mp[sent]=0;
for sent in labels:
    if(mp[sent] == 0):
        print(sent)
        mp[sent] = k
        mpp[k]=sent
        k = k + 1
    input_labels.append(mp[sent]-1)
print('Original: ', labels[0])
print('Token IDs:', input_labels[0])
print('Original: ', labels[1])
print('Token IDs:', input_labels[1])
print('Original: ', labels[2])
print('Token IDs:', input_labels[2])
print('Original: ', labels[3])
print('Token IDs:', input_labels[3])
print(k)

Loading BERT tokenizer...
 Original:   i want to take a  from delhi at 838 am and arrive in mumbai at 1110 in the morning
Tokenized:  ['i', 'want', 'to', 'take', 'a', 'from', 'delhi', 'at', '83', '##8', 'am', 'and', 'arrive', 'in', 'mumbai', 'at', '111', '##0', 'in', 'the', 'morning']
Token IDs:  [1045, 2215, 2000, 2202, 1037, 2013, 6768, 2012, 6640, 2620, 2572, 1998, 7180, 1999, 8955, 2012, 11118, 2692, 1999, 1996, 2851]
Max sentence length:  25
TrainAvailable
TrainFare
GetDistance
TrainRoute
Original:  TrainAvailable
Token IDs: 0
Original:  TrainAvailable
Token IDs: 0
Original:  TrainAvailable
Token IDs: 0
Original:  TrainAvailable
Token IDs: 0
5


In [10]:
input_ids = []
attention_masks = []
 
for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
                        sent,                      
                        add_special_tokens = True,  
                        max_length = 64,            
                        pad_to_max_length = True,
                        return_attention_mask = True,    
                        return_tensors = 'pt',      
                        truncation = True,
                   )
    
   
    input_ids.append(encoded_dict['input_ids'])
    
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(input_labels)

print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

"""# Split dataset in Train and Validation"""

from torch.utils.data import TensorDataset, random_split

dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 16

train_dataloader = DataLoader(
            train_dataset,   
            sampler = RandomSampler(train_dataset),  
            batch_size = batch_size  
        )
 
validation_dataloader = DataLoader(
            val_dataset,  
            sampler = SequentialSampler(val_dataset),  
            batch_size = batch_size 
        )
 

from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",  
    num_labels = k-1,    
    output_attentions = False,  
    output_hidden_states = False,
)

model.cuda()

params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

optimizer = AdamW(model.parameters(),
                  lr = 2e-5,  
                  eps = 1e-8  
                )

from transformers import get_linear_schedule_with_warmup
 
epochs = 4
 
total_steps = len(train_dataloader) * epochs
 
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,  
                                            num_training_steps = total_steps)

import numpy as np
 
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    ''' 
    elapsed_rounded = int(round((elapsed)))
     
    return str(datetime.timedelta(seconds=elapsed_rounded))


 

Original:   i want to take a  from delhi at 838 am and arrive in mumbai at 1110 in the morning
Token IDs: tensor([  101,  1045,  2215,  2000,  2202,  1037,  2013,  6768,  2012,  6640,
         2620,  2572,  1998,  7180,  1999,  8955,  2012, 11118,  2692,  1999,
         1996,  2851,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])
  302 training samples
   34 validation samples
The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                       

In [11]:
"""# Training Model"""

import random
import numpy as np
device = torch.device("cuda")  
seed_val = 42
 
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
 
training_stats = []
  
total_t0 = time.time()
  
for epoch_i in range(0, epochs):
    
   
 
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
  
    t0 = time.time()
  
    total_train_loss = 0
 
    model.train()
  
    for step, batch in enumerate(train_dataloader):
  
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
 
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
 
        model.zero_grad()        
 
       
        loss, logits = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)
 
     
        total_train_loss += loss.item()
 
        loss.backward()
 
       
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
 
        optimizer.step()
 
        scheduler.step()
 
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    training_time = format_time(time.time() - t0)
 
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
   
 
    print("")
    print("Running Validation...")
 
    t0 = time.time()
 
    model.eval()
 
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
 
    for batch in validation_dataloader:
        
     
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        
        with torch.no_grad():        
 
          
            (loss, logits) = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
         
        total_eval_loss += loss.item()
 
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
 
       
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        
 
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
 
    avg_val_loss = total_eval_loss / len(validation_dataloader)
  
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))
 
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
 
print("")
print("Training complete!")
 
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))




Training...

  Average training loss: 1.16
  Training epcoh took: 0:00:13

Running Validation...
  Accuracy: 0.69
  Validation Loss: 0.81
  Validation took: 0:00:00

Training...

  Average training loss: 0.74
  Training epcoh took: 0:00:13

Running Validation...
  Accuracy: 1.00
  Validation Loss: 0.54
  Validation took: 0:00:00

Training...

  Average training loss: 0.51
  Training epcoh took: 0:00:13

Running Validation...
  Accuracy: 1.00
  Validation Loss: 0.34
  Validation took: 0:00:00

Training...

  Average training loss: 0.36
  Training epcoh took: 0:00:13

Running Validation...
  Accuracy: 1.00
  Validation Loss: 0.27
  Validation took: 0:00:00

Training complete!
Total training took 0:00:54 (h:mm:ss)


In [12]:
# """# Prediction"""

sentences = [
  "show me some train from mumbai to delhi",
  "route of train from delhi to mumbai",
]
print(sentences)

input_ids = []
attention_masks = []

for sent in sentences:
   
    encoded_dict = tokenizer.encode_plus(
                        sent,                      
                        add_special_tokens = True, 
                        max_length = 64,          
                        pad_to_max_length = True,
                        return_attention_mask = True,  
                        return_tensors = 'pt', 
                        truncation=True,    
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
 

batch_size = 32  

prediction_data = TensorDataset(input_ids, attention_masks )
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

model.eval()

predictions , true_labels = [], []

for batch in prediction_dataloader:
 
  batch = tuple(t.to(device) for t in batch)
  
  b_input_ids, b_input_mask  = batch
  
  with torch.no_grad():
      
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

  logits = outputs[0]

  logits = logits.detach().cpu().numpy()
  
  predictions.append(logits)


print('    DONE.')

flat_predictions = np.concatenate(predictions, axis=0)
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

 
print(mpp[flat_predictions[0]+1])
print(mpp[flat_predictions[1]+1])


['show me some train from mumbai to delhi', 'route of train from delhi to mumbai']
Predicting labels for 2 test sentences...
    DONE.
TrainAvailable
TrainRoute


In [None]:
import os

output_dir = '../output/intent/'

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)


model_to_save = model.module if hasattr(model, 'module') else model  
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
 
model = BertForSequenceClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)
 
model.to(device)
 

Saving model to ../output/intent/
