In [83]:
'''
    /*----------------------------- AUTHOR_DETAILS -------------
    |
    |   __Project Title__   = Developing a Gender Prediction System using Bidirectional LSTM-based Deep Neural Networks
    |
    |   __author__          = Anam Arif
    |
    *------------------------------------------------------------
'''
print()




In [23]:
!pip install torchtext==0.4.0
!pip install torchtext
!pip install spacy




In [24]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [84]:
'''
    /*----------------------------- PROJECT_PURPOSE -------------
    | - The main purpose of this program is to demonstrate how Bidirectioanl LSTM-based Deep Neural Network can be used for the development
    |   and evaluation of Gender Prediction from Text (i.e. a Binary Classification Problem). For this purpose, Insha Allah, I will execute the Machine Learning Cycle
    *------------------------------------------------------------
'''
print()




# **Machine Learning Cycle**
## **Four phases of a Machine Learning Cycle are**
### **Training Phase**
  * **Build the Model using Training Data**

### **Testing Phase**
  * **Evaluate the performance of Model using Testing Data**

### **Application Phase**
  * **Deploy the Model in Real-world , to make prediction on Real-time unseen Data**
  
### **Feedback Phase**
  * **Take Feedback form the Users and Domain Experts to improve the Model**


# **Steps – Executing Machine Learning Cycle Using Separate Files**
* **Step 1: Import Libraries**
* **Step 2: Load Training Data, Testing Data and Validation Data**
* **Step 3: Understand and Pre-process Training Data, Testing Data and  Validation Data**
* **Step 4: Represent Training Data, Testing Data and Validation Data in Machine Understandable Format**
* **Step 5: Execute the Training Phase**
* **Step 6: Execute the Testing Phase**
* **Step 7: Execute the Application Phase**
* **Step 8: Execute the Feedback Phase**
* **Step 9: Improve Model Based on Feedback**

# **Step 1: Import Libraries**


In [10]:
'''
    /*----------------------------- IMPORT_LIBRARIES -------------
'''
import os
import re
import time
import spacy
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torchtext import data
import torch.optim as optim
from torchtext import vocab
import torch.nn.functional as F
from torchtext.data import Field
from torch.autograd import Variable
from torchtext.vocab import Vectors
from torchtext.data import TabularDataset
device = torch.device("cuda:0")

Mount Google Drive

In [11]:
'''
    /*----------------------------- MOUNT_GOOGLE_DRIVE -------------
      - To connect your colab notebook with google drive
'''
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Step 2: Load Training Data, Testing Data and Validation Data**


In [18]:
'''
    /*----------------------------- LOAD_DATASET -------------
    | Function  : load_dataset()
    | Purpose   : Reads dataset(s) in CSV file format
    | Arguments :
    |       drive_path : Path to dataset file
    |       dataset    : Dataset file name
    | Return    :
    |       dataset    : Dataset in dataframe format
    *---------------------------------------------------------*/
'''
import pandas as pd

def load_dataset(drive_path, dataset):
    loaded_dataset = pd.read_csv(drive_path + dataset)  # Read CSV file
    print("=" * 40)
    print("Dataset Loaded Successfully!")
    print(loaded_dataset.head())  # Print the first few rows of the dataset
    return loaded_dataset  # Return the loaded dataset instead of the filename

# Define your paths
drive_path = '/content/drive/MyDrive/'
dataset = 'gender-sample-data.csv'

# Call the function
data = load_dataset(drive_path, dataset)



Dataset Loaded Successfully!
                                               Quote  Gender
0  Your task is not to seek for love, but merely ...    Male
1  You have to keep breaking your heart until it ...  Female
2  Stop acting so small. You are the universe in ...    Male
3  I’ve learned that people will forget what you ...  Female
4                      What you seek is seeking you.    Male


### **Step 3.1: Pre-process Text**
* Remove Non-alphanumeric Characters
* Lower Case
* Remove Leading and Trailing Whitespaces

In [19]:
'''
    /*----------------------------- DATA_PRE-PROCESSING -------------
    | Function  : data_pre-processing()
    | Purpose   : Performs following pre-processing:
    |              •	Remove non-alphanumeric characters
    |              •	Lower case
    |              •	Remove leading and trailing whitespaces
    | Arguments :
    |       text: Text to be pre-processed
    | Return    :
    |       text: Pre-processed text
    *------------------------------------------------------------------------------------------------*/
'''
def data_pre_processing(text):
      text = re.sub(r'[^A-Za-z0-9]+', ' ', text) # Remove non alphanumeric character
      text = text.lower()                        # Lowercase all text
      return text.strip()                        # Remove leading and trailing whitespaces

### **Step 3.2: Tokenize Text**

In [53]:
'''
    /*----------------------------- TOKENIZE_TEXT -------------
    | Function  : data_tokenization()
    | Purpose   : Tokenizes a Text
    | Arguments :
    |       text: Text to be tokenized
    | Return    :
    |       text: Tokenized Text
    *------------------------------------------------------------------------------------------------*/
'''
import spacy

# Load the spaCy model once, outside the function for efficiency
nlp = spacy.load("en_core_web_sm")  # Use the full model name

def data_tokenization(s):
    return [w.text.lower() for w in nlp(data_pre_processing(s))]  # Use the loaded nlp model


### **Step 3.3: Build Training Data, Testing Data and Validation Data Objects**

In [47]:
def data_objects(drive_path):
    # Declared a Field object
    TEXT = Field(sequential=True, tokenize=data_tokenization, lower=True, include_lengths=True, batch_first=False, init_token='<sos>',
                 eos_token='<eos>')
    LABEL = LabelField(dtype=torch.float)  # Use LabelField directly

    # Load datasets using the correct paths
    training_data, validation_data, testing_data = TabularDataset.splits(
        path=drive_path,
        train='train-data.csv',
        validation='validation_data.csv',
        test='test-data.csv',
        format='csv',
        fields=[('Text', TEXT), ('Gender', LABEL)],
        skip_header=True
    )

    print("\nPre-processed and Tokenized Training Data:")
    print("\n=========================================")
    for i in range(len(training_data)):
        print(training_data[i].Text)

    print("\nPre-processed and Tokenized Validation Data:")
    print("\n=========================================")
    for i in range(len(validation_data)):
        print(validation_data[i].Text)

    print("\nPre-processed and Tokenized Testing Data:")
    print("\n=========================================")
    for i in range(len(testing_data)):
        print(testing_data[i].Text)

    return training_data, validation_data, testing_data, LABEL, TEXT


### **Step 3.4: Load Pre-Trained Word Embedding Vectors**

In [48]:
'''
    /*----------------------------- LOAD_WORD_EMBEDDING_VECTORS -------------
    | Function  : load_word_embedding_vectors()
    | Purpose   : Load pre-trained word embedding vectors from memory
    | Arguments :
    |       drive_path : Path to word embedding vectors file
    | Return    :
    |       vectors     : Loaded word embedding vectors
    *------------------------------------------------------------------------------------------------*/
'''
def load_word_embedding_vectors(drive_path):
  # Load word embedding vectors from memory
  # I have downloaded the Glove word embedding vectors 100d from internet and saved in my drive
  # To use that, I simply give the path of that file and read file in my program using vocab.Vectors function
  vectors = vocab.Vectors('/content/drive/MyDrive/glove.6B.100d.txt', drive_path)
  return vectors

### **Step 3.5: Build Vocabulary**

In [49]:
'''
    /*----------------------------- BUILD_VOCABULARY -------------
    | Function  : build_vocabulary()
    | Purpose   : Build vocabulary from input data
    | Arguments :
    |       pre_processed_training_data   : Pre-processed training data
    |
    |       vectors                       : Word embedding vectors
    |       LABEL                         : LABEL object (Pre-processing applied on output)
    |       TEXT                          : TEXT object (Pre-processing applied on input)
    | Return    :
    |       word_embeddings               : Word embedding vectors mapped on data
    |       vocabulary_size               : Size of vocabulary
    *------------------------------------------------------------------------------------------------*/
'''

def build_vocabulary(training_data, vectors, LABEL, TEXT):
  TEXT.build_vocab(training_data, vectors=vectors, unk_init=torch.Tensor.normal_)   # Build vocabulary from training text
  LABEL.build_vocab(training_data)                   # Build vocabulary from output / labels (Encode all labels)

  print("\n=========================================")
  print("Output/Label word to index dictionary: ", LABEL.vocab.stoi)
  print("\n=========================================")
  print("Input Text word to index dictionary:\n ", TEXT.vocab.stoi,"\n")

  word_embeddings = TEXT.vocab.vectors   # Load vectors
  vocabulary_size = len(TEXT.vocab)      # Size of vocabulary
  return word_embeddings, vocabulary_size

# **Step 4: Represent Training Data, Testing Data and Validation Data in Machine Understandable Format**

In [60]:
'''
    /*----------------------------- REPRESENT_DATA_IN_MACHINE_UNDERSTANDABLE_FORMAT -------------
    | Function  : data_iterators()
    | Purpose   : To build input data (Training, validation and testing data) iterators
    |             (It will convert data into machine understandable format and make data objects which we can iterate over during model training and testing)
    | Arguments :
    |       pre_processed_training_data   : Pre-processed training data
    |       pre_processed_validation_data : Pre-processed validation data
    |       pre_processed_testing_data    : Pre-processed testing data
    | Return    :
    |       training_iterator   : Training data iterator object
    |       validation_iterator : Validation data iterator object
    |       testing_iterator    : Testing data iterator object
    *------------------------------------------------------------------------------------------------*/
'''
from torchtext.data import BucketIterator

def data_iterators(training_data, validation_data, testing_data, batch_size=2):
    # Create BucketIterators for training, validation, and testing data
    training_iterator, validation_iterator, testing_iterator = BucketIterator.splits(
        (training_data, validation_data, testing_data),
        batch_size=batch_size,
        sort_key=lambda x: len(x.Text),
        repeat=False,
        shuffle=True,
        sort_within_batch=True
    )

    print("\nTraining Data Tensors Form\n")
    print("="*30, "\n")
    for batch in training_iterator:
        print(batch.Text)

    print("\nValidation Data Tensors Form\n")
    print("="*30, "\n")
    for batch in validation_iterator:
        print(batch.Text)

    print("\nTesting Data Tensors Form\n")
    print("="*30, "\n")
    for batch in testing_iterator:
        print(batch.Text)

    return training_iterator, validation_iterator, testing_iterator


In [61]:
print("+============================Data Preparation============================+\n\n")
drive_path = '/content/drive/MyDrive'

print("---Step 2: Load Training Data, Testing Data and Validation Data---")
print("\nTraining data before pre_processing")
original_training_data = load_dataset(drive_path, "/train-data.csv")

print("\n\nValidation data before pre_processing")
original_validation_data = load_dataset(drive_path, "/validation_data.csv")

print("\n\nTesting data before pre_processing")
original_testing_data = load_dataset(drive_path, "/test-data.csv")

print("\n---Step 3: Understand and Pre-process Training Data, Testing Data and Validation Data---")
print("\n---Step 4: Represent Training Data, Testing Data and Validation Data in Machine Understandable Format---")
preprocessed_training_data, preprocessed_validation_data, preprocessed_testing_data, LABEL, TEXT = data_objects(drive_path)

# Load word embedding vectors from memory
vectors = load_word_embedding_vectors(drive_path)

# Build vocabulary
word_embeddings, vocabulary_size = build_vocabulary(preprocessed_training_data, vectors, LABEL, TEXT)

# Create iterator objects
training_iterator, validation_iterator, testing_iterator = data_iterators(
    preprocessed_training_data,
    preprocessed_validation_data,
    preprocessed_testing_data
)




---Step 2: Load Training Data, Testing Data and Validation Data---

Training data before pre_processing
Dataset Loaded Successfully!
                                                Text  Gender
0  Your task is not to seek for love, but merely ...    Male
1  You have to keep breaking your heart until it ...  Female
2  Stop acting so small. You are the universe in ...    Male
3  I’ve learned that people will forget what you ...  Female
4                      What you seek is seeking you.    Male


Validation data before pre_processing
Dataset Loaded Successfully!
                                                Text  Gender
0  And so it is, that both the devil and the ange...    Male
1  The soul should always stand ajar, ready to we...  Female
2  A man is insensible to the relish of prosperit...    Male
3     Truth is so rare, it is delightful to tell it.  Female


Testing data before pre_processing
Dataset Loaded Successfully!
                                               Quote  Gende

  self.itos, self.stoi, self.vectors, self.dim = torch.load(path_pt)



Output/Label word to index dictionary:  defaultdict(None, {'Female': 0, 'Male': 1})

Input Text word to index dictionary:
  defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7b5b2ba57640>>, {'<unk>': 0, '<pad>': 1, '<sos>': 2, '<eos>': 3, 'you': 4, 'to': 5, 'the': 6, 'is': 7, 'i': 8, 'it': 9, 'a': 10, 'that': 11, 'in': 12, 'not': 13, 'your': 14, 'and': 15, 'of': 16, 'be': 17, 'but': 18, 'for': 19, 'are': 20, 'can': 21, 'one': 22, 'so': 23, 't': 24, 'who': 25, 'with': 26, 'change': 27, 'don': 28, 'has': 29, 'heart': 30, 'if': 31, 'life': 32, 'love': 33, 'seek': 34, 'them': 35, 'there': 36, 'those': 37, 'we': 38, 'what': 39, 'will': 40, 'world': 41, 'all': 42, 'am': 43, 'as': 44, 'forget': 45, 'have': 46, 'how': 47, 'made': 48, 'never': 49, 'people': 50, 'words': 51, 'another': 52, 'anything': 53, 'beauty': 54, 'breaking': 55, 'come': 56, 'else': 57, 'facts': 58, 'feel': 59, 'first': 60, 'gone': 61, 'good': 62, 'know': 63, 'little': 64, 'live': 65,

# **Step 5: Execute the Training Phase**

### **Step 5.1:Model Architecture**

In [62]:
'''
    /*----------------------------- MODEL_ARCHITECTURE -------------
    | Class     : BLSTM()
    | Purpose   : To build the architecture of model to be trained
    *---------------------------------------------------------
    | nn.Module : Base class for all neural network modules. Your models should also subclass this class.
    |
    | Arguments:
    |      output_dim    : 1 (female or male). For output layer number of nodes in output layer will be same as
    |                      number of outputs required in your problem
    |	     hidden_dim    : Size of the hidden layer. Here size of hidden_state of the lstm
    | 		 input_dim     : Size of the vocabulary containing unique words. Total number of unique words in sample data
    |		   embedding_dim : Size of each embedding vector. Here embeddding dimension of GloVe word embedding
    |                      vectors is 100 so embedding_dim = 100
    |		   weights       : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table
    *------------------------------------------------------------------------------------
    | Function  : forward()
    | Purpose   : This function will automatically start foward propogation when model object is called
    | Arguments :
    |     text  : Input text of shape = (num_sequences, batch_size)
	  | Return:
	  |     hidden_state : Final model state learned from input text
    ------------------------------------------------------------------------------
'''

class BLSTM(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, word_embeddings, n_layers, n_directions):

        super().__init__()

        self.n_layers     = n_layers
        self.n_directions = n_directions
        self.hidden_dim   = hidden_dim
        self.embedding_layer = nn.Embedding(input_dim, embedding_dim)          # Embedding layer shape
        # Assign pre-trained weights and update the weights during backpropagation
        self.embedding_layer.weight = nn.Parameter(word_embeddings, requires_grad = True)
        self.blstm_layer       = nn.LSTM(embedding_dim, hidden_dim, num_layers = n_layers, bidirectional = True) # We can implement multiple layers of lstm simply by changing num_layers value
        self.linear_layer      = nn.Linear(hidden_dim * 2, output_dim)               # Shape of linear layer

    def forward(self, text, seq_length):

        batch_size = text.shape[1]
        h_0, c_0 = self.init_hidden(batch_size)   # Initialize first hidden state to all zeros

        # Here we will map all the indexes present in the input sequence to the corresponding
		    # word vector using our trained word_embedddings.
	      # embedded input of shape = (num_sequences, batch_size, embedding_dimension)
        embedded_vectors = self.embedding_layer(text)
        print(embedded_vectors)

        packed_embedded_vectors = nn.utils.rnn.pack_padded_sequence(embedded_vectors, seq_length)    # pack input sequence


        packed_output_state, (hidden_state, cell_state) = self.blstm_layer(packed_embedded_vectors, (h_0, c_0))  # Apply blstm layer and start learning sequence of words

        output_state, output_seq_length = nn.utils.rnn.pad_packed_sequence(packed_output_state)  # unpack sequence

        hidden_state = torch.cat((hidden_state[-2,:,:], hidden_state[-1,:,:]), dim = 1)
        hidden_state = self.linear_layer(hidden_state)      # Apply the linear layer on hidden_state / context vector
        return torch.sigmoid(hidden_state)
    def init_hidden(self,batch_size):
        h_0 = torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_dim)
        c_0 = torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_dim)
        return h_0, c_0

### **Step 5.2: Hyperparameters Settings**

In [63]:
'''
/*---------------- INITIALIZE_PARAMETERS ------------------
'''
input_dimension      = len(TEXT.vocab)
embedding_dimension  = 100
hidden_dimension     = 10
output_dimension     = 1
number_of_layers     = 1
number_of_directions = 2
number_of_epochs     = 10

### **Step 5.3: Create Model Object**

In [64]:
"""
/* ----------------------- MODEL_OBJECT -----------------
| Create the object of model class and pass parameters required: BLSTM()
|           Arguments :
|               input_dimension     : (integer) dimension of input layer(vocabulary size)
|               output_dimension    : (integer) number of output layer nodes
|               hidden_dimension    : (integer) number of nodes/units in hidden layer
|               embedding_dimension : (integer) dimension of embedded vector
*-------------------------------------------------------*/
"""
model = BLSTM(input_dimension, embedding_dimension, hidden_dimension, output_dimension, word_embeddings, number_of_layers, number_of_directions)
model

BLSTM(
  (embedding_layer): Embedding(267, 100)
  (blstm_layer): LSTM(100, 10, bidirectional=True)
  (linear_layer): Linear(in_features=20, out_features=1, bias=True)
)

In [65]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model.embedding_layer.weight.data[UNK_IDX] = torch.zeros(embedding_dimension)
model.embedding_layer.weight.data[PAD_IDX] = torch.zeros(embedding_dimension)

### **Step 5.4: Initialize Optimizer and Loss Function**

In [66]:
optimizer = optim.SGD(model.parameters(), lr = 1e-3)   # Initialize the optimizer
criterion = nn.BCEWithLogitsLoss()                     # Intialize loss function

### **Step 5.5: Evaluation Measure**

In [67]:
'''
    /*----------------------------- CALCULATE_ACCURACY -------------
    | Function  : calculate_accuracy()
    | Purpose   : Calculate accuracy score
    | Arguments :
    |       prediction : Predicted values
    |       label      : Actual values
    | Return    :
    |       accuracy   : Accuracy score
    *---------------------------------------------------------*/
'''

def calculate_accuracy(prediction, label):

    rounded_preds = torch.round(prediction)                     # Round predictions to the closest integer
    correct       = (rounded_preds == label).float()            # Convert into float for division
    accuracy      = correct.sum() / len(correct)                # Average accuracy
    return accuracy

### **Step 5.6: Calculate Epoch Elapsed Time**

In [68]:
'''
    /*----------------------------- EPOCH_TIME_CALCULATION -------------
    | Function  : epoch_time()
    | Purpose   : Calculate time elapsed in each epoch
    | Arguments :
    |        start_time   : Time when an epoch's execution starts
    |        end_time     : Time when an epoch's execution end
    | Return    :
    |        elapsed_mins : Time consumed by one epoch in minutes
    |        elapsed_secs : Time consumed by one epoch in seconds
    *---------------------------------------------------------*/
'''
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time                   # Time elapsed by one epoch
    elapsed_mins = int(elapsed_time / 60)                  # Convert time in minutes
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60)) # Convert time in seconds
    return elapsed_mins, elapsed_secs

### **Step 5.7: Train Model**

In [69]:
'''
    /*----------------------------- TRAIN_MODEL -------------
    | Function  : train()
    | Purpose   : Train Model
    | Arguments :
    |        model                 : Model object
    |        training_data_iterator: Training data iterator object
    |        optimizer             : Optimization algorithm
    |        criterion             : Loss funtion
    | Return    :
    |        epoch_loss            : Train data loss at each epoch
    |        epoch_accuracy        : Train data accuracy at each epoch
    *---------------------------------------------------------*/
'''
def train(model, iterator, optimizer, criterion):

    epoch_loss      = 0                                                 # Initialize epoch loss to 0
    epoch_accuracy  = 0                                                 # Initialize epoch accuracy to 0

    model.train()                                                       # Start model training mode

    for batch in iterator:

        optimizer.zero_grad()                                           # Clear all optimized gradients
        text, seq_length = batch.Text
        predictions = model(text, seq_length).squeeze(1)                # Make model predictions on training data

        loss     = criterion(predictions, batch.Gender)                 # Calculate loss for each batch in epoch
        accuracy = calculate_accuracy(predictions, batch.Gender)        # Calculate accuracy for each batch in epoch

        loss.backward()                                                  # Start backward propogation
        optimizer.step()                                                 # Optimization of parameters

        epoch_loss      += loss.item()                                   # Add loss for all batches in one epoch
        epoch_accuracy  += accuracy.item()                               # Add accuracy for all batches in one epoch

    return epoch_loss / len(iterator), epoch_accuracy / len(iterator)    # Average loss and accuracy for one epoch and return

### **Step 5.8: Save Model**

In [71]:
'''
    /*----------------------------- SAVE_MODEL -------------
    | Function  : save_model()
    | Purpose   : Save a trained model on your hard disk
    | Arguments :
    |        drive_path: Path to the directory where the trained model will be saved
    | Return    :
    |        Trained model will be saved on hard disk
    *---------------------------------------------------------*/

'''
def save_model(drive_path):
  torch.save(model.state_dict(), drive_path + '/Trained Model/best-model.pt')

### **Evaluate Model**


*   **Function to be used in Validation and Test Phase**

In [72]:
'''
    /*----------------------------- Evaluate_MODEL -------------
    | Function  : evaluate()
    | Purpose   : Function to be used in Validation and Test Phase
    | Arguments :
    |        model                : Model object
    |        data_iterator:  Data iterator object
    | Return    :
    |        epoch_loss           : Data loss at each epoch
    |        epoch_accuracy       : Data accuracy at each epoch
    *---------------------------------------------------------*/
'''

def evaluate(model, iterator, criterion):

    epoch_loss     = 0      # Initialize epoch loss to 0
    epoch_accuracy = 0      # Initialize epoch accuracy to 0
    model.eval()            # Start model evaluation mode

    with torch.no_grad():

        for batch in iterator:


            text, seq_length = batch.Text
            predictions = model(text, seq_length).squeeze(1)                # Make model predictions on data
            loss = criterion(predictions, batch.Gender)               # Calculate loss for each batch in epoch

            accuracy = calculate_accuracy(predictions, batch.Gender)  # Calculate accuracy for each batch in epoch
            epoch_loss += loss.item()                                 # Add loss for all batches, in one epoch
            epoch_accuracy += accuracy.item()                         # Add accuracy for all batches in one epoch

    return epoch_loss / len(iterator), epoch_accuracy / len(iterator)  # Average loss and accuracy for one epoch and return

# **Step 6: Execute the Validation Phase**

In [73]:
'''
    /*----------------------------- VALIDATE_MODEL -------------
    | Function  : validation()
    | Purpose   : Evalaute the performance of a trained  model
    | Arguments :
    |        model                   : Model object
    |        validation_data_iterator: Validation data iterator object
    |        criterion               : Loss function
    | Return    :
    |        epoch_loss           : Validation data loss at each epoch
    |        epoch_accuracy       : Validation data accuracy at each epoch
    *---------------------------------------------------------*/
'''

def validation(model, validation_iterator, criterion):
      best_validation_loss = float('inf')                                                        # Declare best validation loss variable
      validation_loss, validation_accuracy = evaluate(model, validation_iterator, criterion)     # Start model validation phase

      if validation_loss < best_validation_loss:
        best_validation_loss = validation_loss
        save_model(drive_path)                                   # Save model on epoch where the validation loss is lowest
      return validation_loss, validation_accuracy

# **Step 7: Execute the Testing Phase**

### **Step 7.1: Load Saved Model**

In [74]:
"""
/*---------------------- LOAD_SAVED_MODEL ----------
|  Function  : load_model()
|  Purpose   : Method to load previously saved model
|  Arguments :
|       drive_path : Path of directory where model is saved
|  Return    :
|              Saved model will be loaded in memory
*---------------------------------------------------------*/
"""
def load_model(drive_path):
  return model.load_state_dict(torch.load(drive_path + '/Trained Model/best-model.pt'))  # Load pre-trained model

### **Step 7.2: Test Model**

In [75]:
'''
    /*----------------------------- TEST_MODEL -------------
    | Function  : test()
    | Purpose   : Evalaute the performance of a trained  model
    | Arguments :
    |        model                : Model object
    |        testing_data_iterator: Test data iterator object
    |        criterion            : Loss function
    | Return    :
    |        epoch_loss           : Test data loss at each epoch
    |        epoch_accuracy       : Test data accuracy at each epoch
    *---------------------------------------------------------*/
'''

def test(model, testing_iterator, criterion):
  load_model(drive_path)
  testing_loss, testing_accuracy = evaluate(model, testing_iterator, criterion)   # Start model testing
  return testing_loss, testing_accuracy

# **Step 8: Execute the Application Phase**

### **Step 8.1: Take Input from User and Convert it into Feature Vector Same as Training Data**

In [76]:
'''
    /*----------------------------- USER_INPUT -------------
    | Function  : take_user_input()
    | Purpose   : Take unseen input from user
    | Arguments :
    |        TEXT : Field object to apply pre-processing on input text (same as sample data)
    | Return    :
    |        user_comment_tensor : User input in machine understandable format
    |----------------------------------------------------------
    | - Let us now predict the gender on a single comment for the real time evaluation purpose
    | 1 : Take input from user
    | 2 : Preprocess the user input
    | 3 : Fit vocabulary previously made for sample data on user input. The indexes assigned for words in
    |     sample data will be assigned to user input. Words in user input that does not appear in
    |     sample data will have zero value
    | 4 : Convert user input to an array
    | 5 : Make tensor from array. As pytorch only work with tensors
    *---------------------------------------------------------*/

'''

def take_user_input(TEXT):
  user_comment = input("Enter comment: ")

  #Preprocess user input
  preprocessed_user_comment = TEXT.preprocess(user_comment)
  preprocessed_user_comment = [TEXT.init_token] + preprocessed_user_comment + [TEXT.eos_token]
  user_comment_vocabulary = [TEXT.vocab.stoi[x] for x in preprocessed_user_comment]
  user_comment_array = np.asarray(user_comment_vocabulary)
  user_comment_tensor = torch.LongTensor(user_comment_array).unsqueeze(1)
  user_comment_tensor = user_comment_tensor
  seq_length          = [len(user_comment_tensor)]
  seq_length          = torch.LongTensor(seq_length)

  print("\nPreprocessed User_input:\n==========================")
  print(preprocessed_user_comment)
  print("\nIdx stored in vocab, corresponding to each word in user_input:\n==========================")
  print(user_comment_vocabulary)
  print("\nUser_input as a tensor:\n==========================" )
  print(user_comment_tensor)

  return user_comment_tensor, seq_length

### **Step 8.2: Load Saved Model**

In [77]:
"""
/*---------------------- LOAD_SAVED_MODEL ----------
|  Function  : load_model()
|  Purpose   : Method to load previously saved model
|  Arguments :
|       drive_path : Path of directory where model is saved
|  Return    :
|              Saved model will be loaded in memory
*---------------------------------------------------------*/
"""
def load_model(drive_path):
  return model.load_state_dict(torch.load(drive_path + '/Trained Model/best-model.pt'))  # Load pre-trained model

### **Step 8.3: Model Prediction**

In [78]:
"""
/*----------------------- MODEL_PREDICTION --------
|  Function  : model_prediction()
|  Purpose   : Use trained model to predict the output of unseen instances
|  Arguments :
|       user_input : Input taken from user
|       drive_path : Path of the directory where trained model is saved
|  Return    :
|       Gender     : Prediction
*--------------------------------------------------
|   1. Set the model to evaluation mode
|   2. Set all the gradients to zero
|   3. Apply trained model on user input
|
|   4. torch.round() :
|         Return the value rounded to the closest integer (0 or 1)
|   5. If returned output is 1 Print "Male" else "Female"
*-------------------------------------------------*/
"""

def model_predictions(user_input, seq_length, drive_path):
  # Evaluate model
  load_model(drive_path)  # Load model from memory to test its performance
  model.eval()
  with torch.no_grad():
    #print(Evaluate_text_tensor)
    # Model Prediction

    out = model(user_input, seq_length)

  if (torch.round(out) == 1):
    Gender = "Male"
  else:
    Gender = "Female"
  return Gender

# **Main Function**

In [82]:
import os
import torch

# Ensure the model directory exists
model_directory = '/content/drive/MyDrive/Trained Model'
if not os.path.exists(model_directory):
    os.makedirs(model_directory)

print("\n+=====================Execute the Training and Validation Phase=====================+\n\n")
# Step 5: Execute the Training Phase

for epoch in range(number_of_epochs):
    start_time = time.time()                                    # Start time for the epoch

    training_loss, training_accuracy = train(model, training_iterator, optimizer, criterion)   # Start training

    # Step 7: Execute the Validation Phase
    validation_loss, validation_accuracy = validation(model, validation_iterator, criterion)

    end_time = time.time()                                       # End time for the epoch
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)    # Calculate time consumed by the epoch

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTraining Loss: {training_loss:.3f}   | Training Accuracy: {training_accuracy*100:.2f}%')
    print(f'\tValidation Loss: {validation_loss:.3f} |  Validation Accuracy: {validation_accuracy*100:.2f}%')

    # Save the model after each epoch
    torch.save(model.state_dict(), f'{model_directory}/model_epoch_{epoch+1}.pth')

print("\n+=====================Execute the Testing Phase=====================+\n\n")
# Step 8: Execute the Testing Phase
testing_loss, testing_accuracy = test(model, testing_iterator, criterion)
print(f'Testing Loss: {testing_loss:.3f} | Testing Accuracy: {testing_accuracy*100:.2f}%')

print("\n+===================Execute the Application Phase===================+\n\n")
# Step 7: Execute the Application Phase

user_input, seq_length = take_user_input(TEXT)   # Take unseen input from the user
Gender = model_predictions(user_input, seq_length, drive_path)  # Make predictions on user input
print('\033[1m',"\n\nTrained Model Prediction")
print('\033[1m',"+","="*30,"+")
print('\033[1m',"|"," "*30,"|\n           Gender : ", Gender,"        \n","|                                |")
print('\033[1m',"+","="*30,"+")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        ...,

        [[ 7.1349e-02,  1.3951e-02,  4.2260e-01,  ..., -1.2532e-01,
           6.4393e-01,  9.4033e-02],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        [[ 6.3649e-01,  7.5783e-01,  5.4989e-01,  ..., -2.0935e-01,
           8.6702e-01, -3.6343e-01],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        [[ 7.3001e-05,  1.1287e-04, -6.5370e-05,  ..., -8.5521e-05,
          -4.1903e-05,  4.9610e-05],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]]], grad_fn=<EmbeddingBackward0>)
tensor([[[ 5.9949e-06,  1.4958e-05, -2.8043e-06,  ...,  1.0925e-04,
          -5.5876e-05, -1.1781e-04],
         [ 5.9949e-06,  1.4958e-05, -2.8043e-06,  ...,  1.0925e-04,
          -5.5876e-05, -1.1781e-04]],

        [[ 4.7812e-02,  4.4339e-01, -3.3114

  return model.load_state_dict(torch.load(drive_path + '/Trained Model/best-model.pt'))  # Load pre-trained model


tensor([[[ 7.5032e-06,  2.7047e-05, -1.8124e-06,  ...,  1.8482e-04,
          -1.0229e-04, -2.0401e-04],
         [ 7.5032e-06,  2.7047e-05, -1.8124e-06,  ...,  1.8482e-04,
          -1.0229e-04, -2.0401e-04]],

        [[ 1.3020e-02,  3.3335e-01,  6.2812e-01,  ...,  1.0572e-01,
           4.1713e-01,  8.0296e-01],
         [ 1.7660e-01,  5.0064e-01,  8.8375e-03,  ...,  1.7023e-01,
           3.4999e-01,  7.3564e-02]],

        [[-9.3324e-02,  1.9044e-01,  6.8457e-01,  ..., -6.8174e-01,
           2.8803e-01,  5.4892e-01],
         [-3.0663e-01,  1.6820e-01,  9.8512e-01,  ..., -3.8774e-01,
           3.6914e-01,  5.4520e-01]],

        ...,

        [[ 2.6164e-01,  4.4719e-01, -9.6839e-02,  ..., -4.5029e-01,
           4.9525e-01, -2.0299e-01],
         [-1.1939e-01,  5.4189e-01,  6.2174e-01,  ..., -3.8046e-01,
          -4.2245e-02,  3.5156e-01]],

        [[ 8.0187e-02,  4.4799e-01,  7.3870e-01,  ..., -1.3486e-01,
           1.7663e-01,  3.5743e-01],
         [ 1.2642e-04,  1.9553e-0