In [None]:
import pandas as pd
data = pd.read_csv('C:\\Users\\parvi\\Documents\\toturials\\NLP\\NLP Projects\\NLP-Projects\\data\\Enron\\preprocessed_emails.csv')


In [None]:
data = data["body"]

In [None]:
#the Enron emails datase is originally 880 MB and my RAM didn't support this much data, so I had to reduce the size of 
#data
reduced_data = data.sample(frac=0.001)
reduced_data.shape

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
train, test = train_test_split(reduced_data, test_size=0.2,shuffle=True)

In [None]:
train.to_csv('C:\\Users\\parvi\\Documents\\toturials\\NLP\\NLP Projects\\NLP-Projects\\data\\Enron\\train.csv',index=False)
test.to_csv('C:\\Users\\parvi\\Documents\\toturials\\NLP\\NLP Projects\\NLP-Projects\\data\\Enron\\test.csv',index=False)

In [None]:
import pandas as pd
train = pd.read_csv('C:\\Users\\parvi\\Documents\\toturials\\NLP\\NLP Projects\\NLP-Projects\\data\\Enron\\train.csv')
test = pd.read_csv('C:\\Users\\parvi\\Documents\\toturials\\NLP\\NLP Projects\\NLP-Projects\\data\\Enron\\test.csv')

In [None]:
train =train.squeeze()
test = test.squeeze()
train.head()

In [None]:
train = train.to_numpy()
test = test.to_numpy()

In [33]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim

In [34]:
#the main vocabulary class job : creates a token to index and a index to token dictionary. we can access to one of each, given the other
class Vocabulary(object):
  def __init__(self,data=None,unk_token = "<UNK>",mask_token="<MASK>", begin_seq_token="<BEGIN>",end_seq_token="<END>"):
    self.token_to_index = {}
    self.index_to_token = {}

    self.unk_token = unk_token
    self.add_token(unk_token)
    self.unk_index = self.token_to_index[unk_token]

    self.mask_token = mask_token
    self.add_token(mask_token)
    self.mask_index = self.token_to_index[mask_token]

    self.begin_seq_token = begin_seq_token
    self.add_token(begin_seq_token)
    self.begin_seq_index = self.token_to_index[begin_seq_token]

    self.end_seq_token = end_seq_token
    self.add_token(end_seq_token)
    self.end_seq_index = self.token_to_index[end_seq_token]

    if data is not None :
      for row in data :
        for token in word_tokenize(row):
          self.add_token(token)

  #this function gets one token and add it to the dictionary by updating both token_to_index and index_to_token      
  def add_token(self,token):
    if token not in self.token_to_index :
      next_index_in_vocab = len(self.token_to_index)
      self.token_to_index[token] = next_index_in_vocab
      self.index_to_token[next_index_in_vocab] = token

  # this function search for a token and returns its corresponding token, if the token is not in the vocabulary and vocabulary supports 
  # unk tokens it returns the index of unk token, otherwise it raise an error 
  def lookup_token(self,token):

    return self.token_to_index.get(token,self.unk_index)

  # this function search for a index and returns its corresponding token
  def lookup_index(self,index):

    if index not in self.index_to_token:
        raise KeyError("the index (%d) is not in the Vocabulary" % index)
    return self.index_to_token[index]

  # returns the legth of the vocabulary
  def __len__(self):

    return len(self.token_to_index)

  def use_previous_token_to_index(self,token_to_index):
    self.token_to_index = token_to_index
    self.index_to_token = {idx: token for token, idx in self.token_to_index.items()}

In [35]:
#the main job of Vectorizer class: it is responsible for converting a text (sequence of tokens) to a vectorized version of it (sequence of indexes)
#so it can be used by neural network layers

class Vectorizer(object):
  def __init__(self,text):
    self.text_vocab = text
 
  #vector_length is usually the lentgh of the maximum text
  #although we have textes with different lengthes but we neet to put them in a fixed-size vector and fill the remaining of the vector with mask
  #tokens. 
  def vectorize(self,text,vector_length):

    indices = [self.text_vocab.begin_seq_index] 
    indices.extend(self.text_vocab.lookup_token(token) for token in word_tokenize(text))
    indices.append(self.text_vocab.end_seq_index)

    from_vector = np.empty(vector_length, dtype=np.int64)         
    from_indices = indices[:-1]
    from_vector[:len(from_indices)] = from_indices
    from_vector[len(from_indices):] = self.text_vocab.mask_index

    to_vector = np.empty(vector_length, dtype=np.int64)
    to_indices = indices[1:]
    to_vector[:len(to_indices)] = to_indices
    to_vector[len(to_indices):] = self.text_vocab.mask_index
    
    return from_vector, to_vector


In [36]:
#The main job of Dataset class : Dataset class inherits from Dataset class in pytorch and implements two essential funtions of it, __getitem()__
#and __len__(), this class in being used for getting the dataset rows (vectorized version) during training and testing 
class Dataset(Dataset):
  def __init__(self,body,vectorizer):
    self.body = body
    self.vectorizer = vectorizer
    measure_len = lambda text: len(word_tokenize(text))
    self._max_text_length = max(map(measure_len,body)) + 2



  def __getitem__(self,index) :
      text = self.body[index]
      from_vector, to_vector = self.vectorizer.vectorize(text, self._max_text_length)

      #we return the vectoized version of text as x, and the index of emotion as y 
      return {'x_data': from_vector,
              'y_target': to_vector
              }
  def __len__(self):
    return len(self.body)

In [37]:
class WordGenerationModel(nn.Module):
  def __init__(self, body_vocab_size,embedding_size,rnn_hidden_size,padding_idx=0,dropout_p=0.5):
    super(WordGenerationModel, self).__init__()
    # I created an embedding layer to convert each token to a an ambedded vector. Embedded vecores are being created and tuned during the
    # training process. embedding_dim is an arbitary size that we want to have for each embedded token.
    self.word_emb = nn.Embedding(num_embeddings=body_vocab_size,
                                     embedding_dim=embedding_size,
                                     padding_idx=padding_idx)
    #GRU is a sequential nerural network layer which generates outputs by using a sequense of inputes and its hidden layer. at each time step 
    #(seeing one token), it also updates its hidden layer. hidden_size is an arbitary output size for GRU layer
    self.rnn = nn.GRU(input_size=embedding_size,
                          hidden_size=rnn_hidden_size,
                          batch_first=True)

    self.fc1 = nn.Linear(in_features=rnn_hidden_size,
                      out_features=body_vocab_size)
    self.fc2 = nn.Linear(in_features=rnn_hidden_size,
                      out_features=body_vocab_size)
  
  def forward(self,x_in,apply_softmax=False):
        # we tranfer indexes to embedded vectores
        x_embedded = self.word_emb(x_in)
        # we use our rnn layer and get a sequence of outputs, each one corresponding to one time step
        y_out, _ = self.rnn(x_embedded)

        batch_size, seq_size, feat_size = y_out.shape
        y_out = y_out.contiguous().view(batch_size * seq_size, feat_size)

        y_out = self.fc1(F.dropout(y_out, 0.5))
                         
        if apply_softmax:
            y_out = F.softmax(y_out, dim=1)
            
        new_feat_size = y_out.shape[-1]
        y_out = y_out.view(batch_size, seq_size, new_feat_size)
            
        return y_out

In [39]:
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

batch_size=32
learning_rate=0.001
num_epochs=10
seed=1337
embedding_size=100
rnn_hidden_size=32

In [40]:
if not torch.cuda.is_available():
    cuda = False
    device = torch.device("cpu")
else :
    cuda = True
    device = torch.device("cuda")
print("Using CUDA: {}, device : {}".format(cuda,device))
set_seed_everywhere(seed,cuda)

Using CUDA: False, device : cpu


In [None]:
from nltk.tokenize import word_tokenize

In [None]:
text_vacob = Vocabulary(train)

In [None]:
vectorizer = Vectorizer(text_vacob)

In [None]:
train_dataset = Dataset(train,vectorizer)

In [None]:
test_dataset = Dataset(test,vectorizer)

In [None]:
model = WordGenerationModel(body_vocab_size=len(text_vacob),embedding_size=embedding_size,rnn_hidden_size=rnn_hidden_size)

In [None]:
#y_pred's shape is (Batch size,Sequence size,feature size). feature size is a one hot vector with the size of vocabulary and determines
#which world in the vocabulary has been selected.
#y_true's shape is (Batch size,Sequence size) and in each cell of this matrix we have an integer indicating the index of the target
#word in the vocabulary.
#cross entropy loss function's input should be with shape : y_pred => matrix , y_true => 1d array. so we need to to convert 3d to 2d and 2d
#to 1d to be able to use this loss function

def normalize_sizes(y_pred, y_true):

    if len(y_pred.size()) == 3:
        y_pred = y_pred.contiguous().view(-1, y_pred.size(2))
    if len(y_true.size()) == 2:
        y_true = y_true.contiguous().view(-1)
    return y_pred, y_true

def compute_accuracy(y_pred, y_true, mask_index):
    y_pred, y_true = normalize_sizes(y_pred, y_true)

    _, y_pred_indices = y_pred.max(dim=1)
    
    correct_indices = torch.eq(y_pred_indices, y_true).float()
    valid_indices = torch.ne(y_true, mask_index).float()
    
    n_correct = (correct_indices * valid_indices).sum().item()
    n_valid = valid_indices.sum().item()

    return n_correct / n_valid * 100
#this is the loss which use normalize_size function before cross_entropy
def sequence_loss(y_pred, y_true, mask_index):
    y_pred, y_true = normalize_sizes(y_pred, y_true)
    return F.cross_entropy(y_pred, y_true, ignore_index=mask_index)


In [None]:
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,drop_last=True)


In [None]:
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True,drop_last=True)

In [None]:
# I ran this code for  5 hours with num_epochs = 10
model = model.to(device)
mask_index = vectorizer.text_vocab.mask_index
train_loss= []
train_acc = []
val_loss= []
val_acc = []
smalles_loss = float('inf')
for epoch in range(num_epochs):
  running_loss = 0.0
  running_acc = 0.0

  model.train()
  for batch_index, batch_dict in enumerate(train_dataloader):
    #zero the gradients
    optimizer.zero_grad()

    #compute the output
    y_pred = model(x_in=batch_dict['x_data'])

    #compute the loss
    loss = sequence_loss(y_pred, batch_dict['y_target'],mask_index)

    #running_loss += (loss.item() - running_loss) / (batch_index + 1)

    #use loss to produce gradients
    loss.backward()

    #use optimizer to take gradient step
    optimizer.step()
    
    # compute the accuracy
    running_loss += (loss.item() - running_loss) / (batch_index + 1)
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
    running_acc += (acc_t - running_acc) / (batch_index + 1)

    train_loss.append(running_loss)
    train_acc.append(running_acc)

    
      

In [None]:
running_loss = 0.
running_acc = 0.
model.eval()
mask_index = vectorizer.text_vocab.mask_index
for batch_index, batch_dict in enumerate(test_dataloader):
    
    with torch.no_grad():
        #compute the output
        y_pred = model(x_in=batch_dict['x_data'])

        #compute the loss
        loss = sequence_loss(y_pred, batch_dict['y_target'],mask_index)

        running_loss += (loss.item() - running_loss) / (batch_index + 1)
        acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
        running_acc += (acc_t - running_acc) / (batch_index + 1)



## Saving the data for later

In [None]:
#saving the vocabulary into a json file
import json
vocabs = {'text_vocab': text_vacob.token_to_index}
with open("C:\\Users\\parvi\\Documents\\toturials\\NLP\\NLP Projects\\NLP-Projects\\data\\Enron\\vocabs.json", "w") as fp:
  json.dump(vocabs, fp)

In [None]:
#saving the torch model
torch.save(model,"C:\\Users\\parvi\\Documents\\toturials\\NLP\\NLP Projects\\NLP-Projects\\data\\Enron\\model2")

## Loading the model and vocabulary

In [29]:
#loading the vicanulary
import json
with open("C:\\Users\\parvi\\Documents\\toturials\\NLP\\NLP Projects\\NLP-Projects\\data\\Enron\\vocabs.json","r") as fp:
    vocabs = json.load(fp)

In [31]:
token_to_index = vocabs["text_vocab"]


In [41]:
text_vacob_loaded = Vocabulary()
text_vacob_loaded.use_previous_token_to_index(token_to_index)

In [43]:
vectorizer2 = Vectorizer(text_vacob_loaded)

In [44]:
model = torch.load("C:\\Users\\parvi\\Documents\\toturials\\NLP\\NLP Projects\\NLP-Projects\\data\\Enron\\model2")
model.eval()

WordGenerationModel(
  (word_emb): Embedding(15754, 100, padding_idx=0)
  (rnn): GRU(100, 32, batch_first=True)
  (fc1): Linear(in_features=32, out_features=15754, bias=True)
  (fc2): Linear(in_features=32, out_features=15754, bias=True)
)

## Make inferences

In [168]:
def guess_the_next_n_words(encoded_text ,model, n=5, temperature=1.0):
    indices = []
    for index in encoded_text:
        indices.append(torch.tensor( [index] , dtype=torch.int64).unsqueeze(dim=1))

    h_t = None
    for index in indices:
        x_emb_t = model.word_emb(index)
        rnn_out_t, h_t = model.rnn(x_emb_t, h_t)
        
    last_word_index = len(indices)- 1    
    for time_step in range(n):
        #print(n)
        x_t = indices[last_word_index + time_step]
        x_emb_t = model.word_emb(x_t)
        rnn_out_t, h_t = model.rnn(x_emb_t, h_t)
        prediction_vector = model.fc1(rnn_out_t.squeeze(dim=1))
        probability_vector = F.softmax(prediction_vector / temperature, dim=1)
        #_, predicted = torch.max(probability_vector, 1)
        #indices.append(torch.tensor( [predicted] , dtype=torch.int64).unsqueeze(dim=1))
        indices.append(torch.multinomial(probability_vector, num_samples=1))
    #print(indices)
    indices = torch.stack(indices).squeeze()
    
    return indices

def decode_samples(sampled_indices, vectorizer):


    vocab = vectorizer.text_vocab

    sentence = []
    for time_step in range(sampled_indices.shape[0]):
        sample_item = sampled_indices[time_step].item()
        if sample_item == vocab.begin_seq_index:
            continue
        elif sample_item == vocab.end_seq_index:
            break
        else:
            sentence.append(vocab.lookup_index(sample_item))

    return sentence
    

In [48]:
def encode_samples(text,vectorizer):
    indices = [vectorizer.text_vocab.begin_seq_index] 
    indices.extend(vectorizer.text_vocab.lookup_token(token) for token in word_tokenize(text))
    return indices

In [175]:
text = """Please see if Scott will be able to meet with some Howard finance professor's
on January 25, 2001.  The latest appointment in the evening will work if """
nunmber_guess_words = 2
encoded_text = encode_samples(text,vectorizer2)
encoded_text_extended = guess_the_next_n_words(encoded_text=encoded_text,model = model,n=nunmber_guess_words)
extended_text = " ".join(decode_samples(encoded_text_extended,vectorizer2))
extended_text

'Please see if Scott will be able to meet with some Howard finance <UNK> on January 25 , 2001 . The latest <UNK> in the evening will work if Alternatively with'

In [210]:
text = """Congratulations on your promotion """
nunmber_guess_words = 2
encoded_text = encode_samples(text,vectorizer2)
encoded_text_extended = guess_the_next_n_words(encoded_text=encoded_text,model = model,n=nunmber_guess_words)
extended_text = " ".join(decode_samples(encoded_text_extended,vectorizer2))
extended_text

'Congratulations on your promotion CEO rebuild'

In [196]:
text = """Could you please furnish me an electronic version of your resume?  I will
need this to pass to the HR Dept.The people in the Research Group that will be """
nunmber_guess_words = 2
encoded_text = encode_samples(text,vectorizer2)
encoded_text_extended = guess_the_next_n_words(encoded_text=encoded_text,model = model,n=nunmber_guess_words)
extended_text = " ".join(decode_samples(encoded_text_extended,vectorizer2))
extended_text

'Could you please <UNK> me an electronic version of your resume ? I will need this to pass to the HR <UNK> people in the Research Group that will be There .'