In [1]:
# install
!pip install pytorch-pretrained-bert pytorch-nlp



In [2]:
# BERT imports
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline



In [3]:
# specify GPU device
import torch
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [4]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
#load the dataset from google drive  
import pandas as pd
df1=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/doc-chatbot/Emotion Phrases.csv',header=None)
df2=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/doc-chatbot/text_emotion.csv')


In [7]:
#assign the column names to dataset
df1.columns=['emotion','phrase']
#rename the colnames of df2
df2=df2.rename(columns={"sentiment": "emotion", "content": "phrase"})
#select only 2 columns from df2
df2=df2[['emotion','phrase']]
happy_df=df2[df2['emotion']=='happiness']

#concat the df1 and df2
frames = [df1, happy_df]

df = pd.concat(frames)
df.head()




Unnamed: 0,emotion,phrase
0,joy,[ On days when I feel close to my partner and ...
1,fear,Every time I imagine that someone I love or I ...
2,anger,When I had been obviously unjustly treated and...
3,sadness,When I think about the short time that we live...
4,disgust,At a gathering I found myself involuntarily si...


In [8]:
#map the lables to numbers and store it a dictionary
#possible_labels = df.sentiment.unique()
possible_labels = df.emotion.unique()
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

{'anger': 2,
 'disgust': 4,
 'fear': 1,
 'guilt': 6,
 'happiness': 7,
 'joy': 0,
 'sadness': 3,
 'shame': 5}

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gensim
nltk.download('stopwords')
nltk.download('wordnet')
stop=stopwords.words('english')
lem=WordNetLemmatizer()
#perform preprocessing
def clean(phrase):
  tokens=gensim.utils.simple_preprocess(phrase)

  #clean_tokens=[token for token in tokens if token not in stop]
  lemmed_tokens=[lem.lemmatize(word) for word in tokens]
  sentence=' '.join(lemmed_tokens)

  
  return sentence

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:

df['phrase']=df['phrase'].apply(lambda x:clean(x))
df.head()


Unnamed: 0,emotion,phrase
0,joy,on day when feel close to my partner and other...
1,fear,every time imagine that someone love or could ...
2,anger,when had been obviously unjustly treated and h...
3,sadness,when think about the short time that we live a...
4,disgust,at gathering found myself involuntarily sittin...


In [11]:

#replace the categorical labels with numarical
df['emotion'] = df.emotion.replace(label_dict)


In [12]:
train_sample=df.sample(frac=.65)
test_sample=df.sample(frac=.10)

In [13]:
phrase=train_sample.phrase.values
emotion=train_sample.emotion.values

In [14]:

# add special tokens for BERT to work properly
sentences = ["[CLS] " + sen + " [SEP]" for sen in phrase]
print(sentences[0])

# Tokenize with BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the first sentence:")
print (tokenized_texts[0])

[CLS] gi_ri_ja good morning it nice to see you [SEP]
Tokenize the first sentence:
['[CLS]', 'gi', '_', 'ri', '_', 'ja', 'good', 'morning', 'it', 'nice', 'to', 'see', 'you', '[SEP]']


In [15]:
# Set the maximum sequence length. 
MAX_LEN = 128
# Pad our input tokens
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")


In [16]:
# Create attention masks
attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

In [17]:
# Use train_test_split to split our data into train and validation sets for training
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, emotion, 
                                                            random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)
                                             
# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

# Select a batch size for training. 
batch_size = 32

# Create an iterator of our data with torch DataLoader 
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [18]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=8)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
   

In [19]:
# BERT fine-tuning parameters
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [20]:
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=3e-5,
                     warmup=.05)

t_total value of -1 results in schedule not being applied


In [21]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [22]:

#training 
  
# Store our loss and accuracy for plotting
train_loss_set = []
# Number of training epochs 
epochs = 4

# BERT training loop
for _ in trange(epochs, desc="Epoch"):  
  
  ## TRAINING
  
  # Set our model to training mode
  model.train()  
  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Clear out the gradients (by default they accumulate)
    b_input_ids = b_input_ids.type(torch.LongTensor)
    b_input_mask = b_input_mask.type(torch.LongTensor)
    #convert int to long datatype
    b_labels = b_labels.type(torch.LongTensor)

    b_input_ids = b_input_ids.to(device)
    b_input_mask = b_input_mask.to(device)
    b_labels = b_labels.to(device)
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    train_loss_set.append(loss.item())    
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1
  print("Train loss: {}".format(tr_loss/nb_tr_steps))
       

    

 
       
  ## VALIDATION

  # Put model in evaluation mode
  model.eval()
  # Tracking variables 
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0
  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)    
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)    
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1
  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))



	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  next_m.mul_(beta1).add_(1 - beta1, grad)


Train loss: 0.8945661889294446


Epoch:  25%|██▌       | 1/4 [03:29<10:27, 209.31s/it]

Validation Accuracy: 0.78125
Train loss: 0.46914547561841496


Epoch:  50%|█████     | 2/4 [07:04<07:02, 211.14s/it]

Validation Accuracy: 0.8090277777777778
Train loss: 0.2479218224584425


Epoch:  75%|███████▌  | 3/4 [10:39<03:32, 212.35s/it]

Validation Accuracy: 0.8125
Train loss: 0.12937303596191993


Epoch: 100%|██████████| 4/4 [14:14<00:00, 213.74s/it]

Validation Accuracy: 0.8148148148148148





In [23]:
review=test_sample.phrase.values
sentiment=test_sample.emotion.values

In [24]:

sentences = ["[CLS] " + query + " [SEP]" for query in review]
labels = sentiment

# tokenize test data
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
MAX_LEN = 128
# Pad our input tokens
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Create attention masks
attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask) 

# create test tensors
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)
batch_size = 16
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

## Prediction on test set
# Put model in evaluation mode
model.eval()
# Tracking variables 
predictions , true_labels = [], []
# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  #convert to long from int dtype
  b_input_ids = b_input_ids.type(torch.LongTensor)
  b_input_mask = b_input_mask.type(torch.LongTensor)
  b_labels = b_labels.type(torch.LongTensor)

  b_input_ids = b_input_ids.to(device)
  b_input_mask = b_input_mask.to(device)
  b_labels = b_labels.to(device)
  # Telling the model not to compute or store gradients, saving memory and speeding up prediction
  with torch.no_grad():
    # Forward pass, calculate logit predictions
    logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)
  
# Import and evaluate each test batch using Matthew's correlation coefficient
from sklearn.metrics import matthews_corrcoef
matthews_set = []
for i in range(len(true_labels)):
  matthews = matthews_corrcoef(true_labels[i],
                 np.argmax(predictions[i], axis=1).flatten())
  matthews_set.append(matthews)
  
# Flatten the predictions and true values for aggregate Matthew's evaluation on the whole dataset
flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels = [item for sublist in true_labels for item in sublist]

print('Classification accuracy using BERT Fine Tuning: {0:0.2%}'.format(matthews_corrcoef(flat_true_labels, flat_predictions)))

Classification accuracy using BERT Fine Tuning: 87.47%


prediction on user input


In [25]:

import pandas as pd
# Load the dataset into a pandas dataframe.
input_text=input('Enter the text: ')
new_sentence=pd.Series(input_text)

# Report the number of sentences.
#print('Number of test sentences: {:,}\n'.format(df.shape[0]))
# Create sentence and label lists
sentences = new_sentence.values
# tokenize test data
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
MAX_LEN = 128
# Pad our input tokens
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Create attention masks
attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask) 

# create test tensors
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)

batch_size = 16
prediction_data = TensorDataset(prediction_inputs, prediction_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

# load the model from disk



# Put model in evaluation mode
model.eval()
# Tracking variables 
predictions = []
# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask = batch
  b_input_ids = b_input_ids.type(torch.LongTensor)
  b_input_mask = b_input_mask.type(torch.LongTensor)
        

  b_input_ids = b_input_ids.to(device)
  b_input_mask = b_input_mask.to(device)
        
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)
  logits = outputs[0]
  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  #label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  #true_labels.append(label_ids)
print('emotions.')
import torch
import torch.nn.functional as F 
#caluclate the softmax for logits from bert model
#becasue we have 7 different calsses otherwise we use sigmoid for binary classification
softmax_val=F.softmax(torch.tensor(predictions[0])).tolist()
#print(softmax_val)
#sorting the list of softmax values to get max 2 probabilities
sorted_integers = sorted(softmax_val, reverse=True)  

largest_prob = sorted_integers[0]  
second_largest_prob = sorted_integers[1]  

#Here we are extracting the max probability value index from the list and get the key(label) from label dictionary

#max_value = max((softmax_val))
max_index1 = softmax_val.index(largest_prob)
max_index2 = softmax_val.index(second_largest_prob)
key_list = list(label_dict.keys()) 
val_list = list(label_dict.values()) 
  
print(key_list[val_list.index(max_index1)]) 


Enter the text: helping poor giving me lots of satisfaction than partying with rich friends
emotions.
shame




In [26]:
stoping

NameError: ignored

In [None]:
import pickle
#bert_tokenizer='bert_tokenizer.pkl'
#padding='pad_seqience.pkl'
#tensor_data='tensor_data.pkl'
#sampler='sampler.pkl'
#loader='loader.pkl'
#bert_model = 'bert_model.pkl'

#pickle.dump(tokenizer, open(bert_tokenizer, 'wb'))
#pickle.dump(pad_sequences, open(padding, 'wb'))
#pickle.dump(TensorDataset, open(tensor_data, 'wb'))
#pickle.dump(SequentialSampler, open(sampler, 'wb'))
#pickle.dump(DataLoader, open(loader, 'wb'))
#pickle.dump(model, open(bert_model, 'wb'))

In [None]:
import torch
model = model
torch.save(model.state_dict(), "bert_model_90.pt")