#### Development of multitask sequence labeling model for two tasks, where 
 - Task 1 is word-level sequence labeling.
 - Task 2 is sentence-level labeling. Prepare a model in Keras that performs both word and sentence-level  labeling simultaneously.

### Creating a Datasets

In [103]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Toy dataset for word-level NER task
sentences = [
    "John lives in New York.",
    "Apple is a company based in California.",
    "I love my dog.",
    "The Eiffel Tower is in Paris."
]

ner_labels = [
    ["B-PER", "O", "O", "B-LOC", "B-LOC"],
    ["B-ORG", "O", "O", "O", "O", "O", "B-LOC"],
    ["O", "O", "O", "O"],
    ["O", "B-LOC", "O", "O", "O", "B-LOC"]
]

# Sentiment labels (0: Negative, 1: Neutral, 2: Positive)
sentiment_labels = [2, 1, 2, 2]

In [4]:
!pip install fastcore

Collecting fastcore
  Using cached fastcore-1.5.29-py3-none-any.whl (67 kB)
Installing collected packages: fastcore
Successfully installed fastcore-1.5.29


In [6]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
     ---------------------------------------- 7.5/7.5 MB 21.8 MB/s eta 0:00:00
Collecting safetensors>=0.3.1
  Downloading safetensors-0.3.3-cp39-cp39-win_amd64.whl (266 kB)
     ------------------------------------- 266.4/266.4 kB 16.0 MB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-win_amd64.whl (3.5 MB)
     ---------------------------------------- 3.5/3.5 MB 13.8 MB/s eta 0:00:00
Installing collected packages: tokenizers, safetensors, transformers
Successfully installed safetensors-0.3.3 tokenizers-0.13.3 transformers-4.32.1


In [9]:
from fastcore.utils import store_attr
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AdamW
import pandas as pd
from transformers import get_linear_schedule_with_warmup
from hinglishutils import (
    add_padding,
    check_for_gpu,
    create_attention_masks,
    evaulate_and_save_prediction_results,
    load_lm_model,
    load_masks_and_inputs,
    load_sentences_and_labels,
    make_dataloaders,
    modify_transformer_config,
    save_model,
    set_seed,
    tokenize_the_sentences,
    train_model,
)
from datetime import datetime
import wandb


class HinglishTrainer:
    def __init__(
        self,
        model_name: str,
        batch_size: int = 8,
        attention_probs_dropout_prob: float = 0.4,
        learning_rate: float = 5e-7,
        adam_epsilon: float = 1e-8,
        hidden_dropout_prob: float = 0.3,
        epochs: int = 3,
        lm_model_dir: str = None,
        wname=None,
        drivepath="../drive/My\ Drive/HinglishNLP/repro",
    ):
        store_attr()
        self.timestamp = str(datetime.now().strftime("%d.%m.%y"))
        if not self.wname:
            self.wname = self.model_name
        wandb.init(
            project="hinglish",
            config={
                "model_name": self.model_name,
                "batch_size": self.batch_size,
                "attention_probs_dropout_prob": self.attention_probs_dropout_prob,
                "learning_rate": self.learning_rate,
                "adam_epsilon": self.adam_epsilon,
                "hidden_dropout_prob": self.hidden_dropout_prob,
                "epochs": self.epochs,
            },
            name=f"{self.wname} {self.timestamp}",
        )
        print({"Model Info": f"Setup self.model training for {model_name}"})
        self.device = check_for_gpu(self.model_name)
        if not lm_model_dir:
            if self.model_name == "bert":
                self.lm_model_dir = "model_save"
            elif self.model_name == "distilbert":
                self.lm_model_dir = "distilBert6"
            elif self.model_name == "roberta":
                self.lm_model_dir = "roberta3"

    def setup(self):
        sentences, labels, self.le = load_sentences_and_labels()
        self.tokenizer, input_ids = tokenize_the_sentences(
            sentences, self.model_name, self.lm_model_dir
        )
        input_ids, self.MAX_LEN = add_padding(
            self.tokenizer, input_ids, self.model_name
        )
        attention_masks = create_attention_masks(input_ids)
        (
            train_inputs,
            train_masks,
            train_labels,
            validation_inputs,
            validation_masks,
            validation_labels,
        ) = load_masks_and_inputs(input_ids, labels, attention_masks)
        self.config = modify_transformer_config(
            "bert",
            self.batch_size,
            self.attention_probs_dropout_prob,
            self.learning_rate,
            self.adam_epsilon,
            self.hidden_dropout_prob,
            self.lm_model_dir,
        )
        self.train_dataloader, self.validation_dataloader = make_dataloaders(
            train_inputs,
            train_masks,
            train_labels,
            self.batch_size,
            validation_inputs,
            validation_masks,
            validation_labels,
        )

    def train(self):
        self.setup()
        self.model = load_lm_model(self.config, self.model_name, self.lm_model_dir)
        optimizer = AdamW(
            self.model.parameters(),
            lr=self.learning_rate,
            eps=self.adam_epsilon,
        )
        total_steps = len(self.train_dataloader) * self.epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=100,
            num_training_steps=total_steps,
        )
        set_seed()
        loss_values = []
        train_model(
            self.epochs,
            self.model,
            self.train_dataloader,
            self.device,
            optimizer,
            scheduler,
            loss_values,
            self.model_name,
            self.validation_dataloader,
        )

    def evaluate(
        self,
        dev_json="test.json",
        test_json="final_test.json",
        test_labels="test_labels_hinglish.txt",
    ):
        output = evaulate_and_save_prediction_results(
            self.tokenizer,
            self.MAX_LEN,
            self.model,
            self.device,
            self.le,
            final_name=dev_json,
            name=self.model_name,
        )

        full_output = evaulate_and_save_prediction_results(
            self.tokenizer,
            self.MAX_LEN,
            self.model,
            self.device,
            self.le,
            final_name=test_json,
            name=self.model_name,
        )
        l = pd.read_csv(test_labels)
        prf = precision_recall_fscore_support(
            full_output["Sentiment"], l["Sentiment"], average="macro"
        )
        wandb.log({"Precision": prf[0], "Recall": prf[1], "F1": prf[2]})
        wandb.log(
            {"Accuracy": str(accuracy_score(full_output["Sentiment"], l["Sentiment"]))}
        )
        save_model(full_output, self.model, self.tokenizer, self.model_name)


ModuleNotFoundError: No module named 'gdown'

### Process Data

In [104]:
# Create vocabulary and label sets
vocab = set(word for sentence in sentences for word in sentence.split())
ner_tags = set(tag for tags in ner_labels for tag in tags)
num_classes_ner = len(ner_tags)
num_classes_sentiment = 3  # Negative, Neutral, Positive

# Create word and label dictionaries
word2idx = {word: idx + 1 for idx, word in enumerate(vocab)}
ner2idx = {tag: idx for idx, tag in enumerate(ner_tags)}

# Convert sentences to numerical sequences and labels to numerical sequences
X_word = [[word2idx[word] for word in sentence.split()] for sentence in sentences]
y_ner = [[ner2idx[tag] for tag in tags] for tags in ner_labels]

# Pad sequences
max_sequence_length = max(len(seq) for seq in X_word)
X_word = pad_sequences(X_word, maxlen=max_sequence_length, padding='post')
y_ner = pad_sequences(y_ner, maxlen=max_sequence_length, padding='post')

# Convert NER labels to one-hot encoded format
y_ner = np.array([to_categorical(seq, num_classes=num_classes_ner) for seq in y_ner])
y_sentiment = to_categorical(sentiment_labels, num_classes=num_classes_sentiment)

### Building Model

In [105]:
from keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed, GlobalMaxPooling1D
from keras.models import Model

# Input layer
word_input = Input(shape=(max_sequence_length,))

# Shared embedding layer
embedding_dim = 50
embedding_layer = Embedding(input_dim=len(vocab) + 1, output_dim=embedding_dim, input_length=max_sequence_length)
word_embed = embedding_layer(word_input)

# Shared LSTM layer
lstm_units = 64
lstm_layer = LSTM(lstm_units, return_sequences=True)
word_lstm = lstm_layer(word_embed)

# Word-level NER output
ner_output = TimeDistributed(Dense(num_classes_ner, activation='softmax'))(word_lstm)

# Sentence-level Sentiment output
sentence_lstm = LSTM(lstm_units)(word_lstm)
sentiment_output = Dense(num_classes_sentiment, activation='softmax')(sentence_lstm)

# Create the multitask model
model = Model(inputs=word_input, outputs=[ner_output, sentiment_output])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

print("Model summary:")
model.summary()


Model summary:
Model: "model_11"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_21 (InputLayer)          [(None, 7)]          0           []                               
                                                                                                  
 embedding_15 (Embedding)       (None, 7, 50)        1000        ['input_21[0][0]']               
                                                                                                  
 lstm_23 (LSTM)                 (None, 7, 64)        29440       ['embedding_15[0][0]']           
                                                                                                  
 lstm_24 (LSTM)                 (None, 64)           33024       ['lstm_23[0][0]']                
                                                                            

In [119]:
# Train the model
model.fit(X_word, [y_ner, y_sentiment], epochs=90, batch_size=2, verbose=1)

Epoch 1/90
Epoch 2/90
Epoch 3/90
Epoch 4/90
Epoch 5/90
Epoch 6/90
Epoch 7/90
Epoch 8/90
Epoch 9/90
Epoch 10/90
Epoch 11/90
Epoch 12/90
Epoch 13/90
Epoch 14/90
Epoch 15/90
Epoch 16/90
Epoch 17/90
Epoch 18/90
Epoch 19/90
Epoch 20/90
Epoch 21/90
Epoch 22/90
Epoch 23/90
Epoch 24/90
Epoch 25/90
Epoch 26/90
Epoch 27/90
Epoch 28/90
Epoch 29/90
Epoch 30/90
Epoch 31/90
Epoch 32/90
Epoch 33/90
Epoch 34/90
Epoch 35/90
Epoch 36/90
Epoch 37/90
Epoch 38/90
Epoch 39/90
Epoch 40/90
Epoch 41/90
Epoch 42/90
Epoch 43/90
Epoch 44/90
Epoch 45/90
Epoch 46/90
Epoch 47/90
Epoch 48/90
Epoch 49/90
Epoch 50/90
Epoch 51/90
Epoch 52/90
Epoch 53/90
Epoch 54/90
Epoch 55/90
Epoch 56/90
Epoch 57/90
Epoch 58/90
Epoch 59/90
Epoch 60/90
Epoch 61/90
Epoch 62/90
Epoch 63/90
Epoch 64/90
Epoch 65/90
Epoch 66/90
Epoch 67/90
Epoch 68/90
Epoch 69/90
Epoch 70/90
Epoch 71/90
Epoch 72/90
Epoch 73/90
Epoch 74/90
Epoch 75/90
Epoch 76/90
Epoch 77/90
Epoch 78/90
Epoch 79/90


Epoch 80/90
Epoch 81/90
Epoch 82/90
Epoch 83/90
Epoch 84/90
Epoch 85/90
Epoch 86/90
Epoch 87/90
Epoch 88/90
Epoch 89/90
Epoch 90/90


<keras.callbacks.History at 0x2d0ac83ac40>

### Make Prediction

In [120]:
# Make predictions
ner_predictions, sentiment_predictions = model.predict(X_word)

# Print sample predictions
sample_idx = 1
sample_sentence = sentences[sample_idx]
sample_ner_pred = np.argmax(ner_predictions[sample_idx], axis=-1)
sample_sentiment_pred = np.argmax(sentiment_predictions[sample_idx])

print("Sample Sentence:", sample_sentence)
print("NER Predictions:", [list(ner2idx.keys())[idx] for idx in sample_ner_pred])
print("Sentiment Prediction:", sample_sentiment_pred)


Sample Sentence: Apple is a company based in California.
NER Predictions: ['B-ORG', 'O', 'O', 'O', 'O', 'O', 'B-LOC']
Sentiment Prediction: 1


In [123]:
# Make predictions
ner_predictions, sentiment_predictions = model.predict(X_word)

# Print sample predictions
sample_idx = 2
sample_sentence = sentences[sample_idx]
sample_ner_pred = np.argmax(ner_predictions[sample_idx], axis=-1)
sample_sentiment_pred = np.argmax(sentiment_predictions[sample_idx])

print("Sample Sentence:", sample_sentence)
print("NER Predictions:", [list(ner2idx.keys())[idx] for idx in sample_ner_pred])
print("Sentiment Prediction:", sample_sentiment_pred)


Sample Sentence: I love my dog.
NER Predictions: ['O', 'O', 'O', 'O', 'B-LOC', 'B-LOC', 'B-LOC']
Sentiment Prediction: 2
