<a href="https://colab.research.google.com/github/FaizaAli-DS/AI-DS-Projects/blob/main/NLP_module_Project_(Chat_Bot).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Project Title: Design and Development of a Topical Chatbot Using Hugging Face Transformers**

In [1]:
# Clone the Alexa Topical-Chat repository into ./Topical-Chat.
# NOTE(review): re-running this cell in the same session will presumably fail
# because the target directory already exists — confirm before "Run All".
!git clone https://github.com/alexa/Topical-Chat.git

Cloning into 'Topical-Chat'...
remote: Enumerating objects: 262, done.[K
remote: Counting objects: 100% (65/65), done.[K
remote: Compressing objects: 100% (58/58), done.[K
remote: Total 262 (delta 26), reused 15 (delta 3), pack-reused 197[K
Receiving objects: 100% (262/262), 96.25 MiB | 26.63 MiB/s, done.
Resolving deltas: 100% (134/134), done.
Updating files: 100% (119/119), done.


In [2]:
# Install Hugging Face Transformers (unpinned). The recorded run below resolved
# to transformers 4.33.3; pin that version if the run must be reproduced exactly.
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.3-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m59.2 MB/s[0m eta [36m0:00:0

In [3]:
import json
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [4]:
# Sentiment label mapping: annotation string -> integer class id
def map_sentiment(sentiment):
    """Return the integer class id for a sentiment annotation string.

    Args:
        sentiment: The sentiment annotation of a turn (e.g. "Happy").

    Returns:
        An int in 1..7 for a recognised sentiment, or 0 for anything else
        (including "Neutral", unknown strings and None).
    """
    sentiment_mapping = {
        "Angry": 1,
        "Curious to dive deeper": 2,
        "Disgusted": 3,
        # Legacy misspelling of "Disgusted"; kept as an alias so any caller
        # that relied on the old key still gets the same class id.
        "Disguised": 3,
        "Fearful": 4,
        "Happy": 5,
        "Sad": 6,
        "Surprised": 7
    }
    # Unrecognised values share the default class 0.
    return sentiment_mapping.get(sentiment, 0)

In [5]:
# Load data from a JSON file and preprocess
def load_data(data_path, tokenizer, max_length):
    """Read conversations from a JSON file and turn each into one example.

    For every conversation the messages of all turns are joined with single
    spaces and tokenized; the label is derived from the "sentiment" field of
    the last turn.

    Args:
        data_path: Path to a JSON file whose top level is a mapping of
            conversation id -> conversation dict.
        tokenizer: Callable invoked as ``tokenizer(text, padding="max_length",
            truncation=True, return_tensors="tf", max_length=max_length)``.
            Its result must support ``["input_ids"][0]`` and
            ``["attention_mask"][0]``.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        A tuple ``(input_data, labels)``: ``input_data`` is a list of dicts
        with keys "input_ids" and "attention_mask" (one per conversation) and
        ``labels`` is the parallel list of integer class ids.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(data_path, "r", encoding="utf-8") as json_file:
        dataset = json.load(json_file)

    input_data = []
    labels = []

    for conversation in dataset.values():
        # Read the turns once, tolerating a missing or empty "content" entry.
        # Previously "content" was indexed before it was checked, so the
        # default-label fallback could never be reached (KeyError instead).
        turns = conversation.get("content") or []

        # A turn's message may be a list; in that case only its first element is used.
        conversation_history = [
            turn["message"][0] if isinstance(turn["message"], list) else turn["message"]
            for turn in turns
        ]
        conversation_history_combined = " ".join(conversation_history)

        # Tokenize and pad the conversation history
        tokenized_data = tokenizer(
            conversation_history_combined,
            padding="max_length",
            truncation=True,
            return_tensors="tf",
            max_length=max_length
        )

        # The conversation label is the sentiment of its final turn; fall back
        # to class 0 when there are no turns or the last turn has no sentiment.
        if turns and "sentiment" in turns[-1]:
            label = map_sentiment(turns[-1]["sentiment"])
        else:
            label = 0

        # The tokenizer output carries a leading batch axis of size 1; drop it.
        input_data.append({
            "input_ids": tokenized_data["input_ids"][0],
            "attention_mask": tokenized_data["attention_mask"][0]
        })
        labels.append(label)

    return input_data, labels

In [6]:
# Train the model with early stopping on the validation loss
def train_model(model, train_dataset, val_dataset, epochs, batch_size):
    """Fit ``model`` and stop early once the validation loss stops improving.

    Args:
        model: A compiled Keras model exposing ``fit``.
        train_dataset: Either a ``tf.data.Dataset`` or an ``(inputs, labels)``
            tuple. ``inputs`` may be a list of per-example dicts of equally
            shaped tensors, or a single dict of already stacked tensors.
        val_dataset: Validation data in the same forms as ``train_dataset``.
        epochs: Maximum number of epochs.
        batch_size: Batch size used when a tuple has to be batched here. It is
            not applied to a ``tf.data.Dataset`` argument.

    Returns:
        ``(model, history)`` where ``history`` is the object returned by
        ``model.fit``.
    """
    def _as_batched_dataset(data, shuffle):
        # A ready-made dataset is passed through untouched.
        # NOTE(review): assumes such a dataset is already batched and yields
        # (features, labels) pairs — confirm against the caller.
        if isinstance(data, tf.data.Dataset):
            return data

        examples, labels = data
        if hasattr(examples, "keys"):
            # Already collated: one tensor per feature name.
            features = {key: examples[key] for key in examples.keys()}
        else:
            # List of per-example dicts: stack each feature along a new
            # leading axis so Keras sees one tensor per model input.
            features = {
                key: tf.stack([example[key] for example in examples])
                for key in examples[0]
            }

        dataset = tf.data.Dataset.from_tensor_slices(
            (features, tf.convert_to_tensor(labels))
        )
        if shuffle:
            # Reshuffle every epoch, mirroring what fit() does for array input.
            dataset = dataset.shuffle(buffer_size=len(labels))
        return dataset.batch(batch_size)

    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True
    )

    # batch_size is deliberately not forwarded: the data is a batched
    # dataset at this point and Keras rejects batch_size for dataset input.
    history = model.fit(
        _as_batched_dataset(train_dataset, shuffle=True),
        validation_data=_as_batched_dataset(val_dataset, shuffle=False),
        epochs=epochs,
        callbacks=[early_stopping]
    )

    return model, history

In [8]:
# Evaluate the model on the test dataset
def evaluate_model(model, test_dataset):
    """Predict on the test data and print a per-class classification report.

    Args:
        model: A trained model exposing ``predict``.
        test_dataset: Either an ``(inputs, labels)`` tuple or a
            ``tf.data.Dataset``. For a tuple, ``inputs`` may be a list of
            per-example dicts of equally shaped tensors, or a single dict of
            already stacked tensors.

    Returns:
        None. The report is printed to stdout.
    """
    if isinstance(test_dataset, tf.data.Dataset):
        # NOTE(review): assumes the dataset yields (features, labels) batches
        # in a stable order (no reshuffling between iterations) — otherwise
        # labels and predictions would not line up. Confirm against the caller.
        features = test_dataset
        true_labels = tf.concat(
            [tf.reshape(batch_labels, [-1]) for _, batch_labels in test_dataset],
            axis=0
        )
    else:
        examples, true_labels = test_dataset
        if hasattr(examples, "keys"):
            # Already collated: one tensor per feature name.
            features = {key: examples[key] for key in examples.keys()}
        else:
            # List of per-example dicts: stack each feature into one tensor.
            features = {
                key: tf.stack([example[key] for example in examples])
                for key in examples[0]
            }

    predictions = model.predict(features)

    # predict() may return a mapping/structured output carrying a "logits"
    # entry rather than a bare array; argmax must be taken over the logits.
    if isinstance(predictions, dict):
        logits = predictions["logits"]
    else:
        logits = getattr(predictions, "logits", predictions)
    predicted_labels = tf.argmax(logits, axis=1).numpy()
    true_labels = tf.convert_to_tensor(true_labels).numpy()

    # Print a classification report with precision, recall, F1-score, etc.
    print(classification_report(true_labels, predicted_labels))

In [12]:
def main():
    """Run the full pipeline: load data, split, fine-tune, evaluate, save.

    Reads the conversations from ``data_path``, tokenizes them with the
    "bert-base-uncased" tokenizer, splits them into train/validation/test
    sets, fine-tunes a BERT sequence classifier with 8 output classes,
    prints a classification report for the test split and saves the model
    to "Chatbot_trained_model".
    """
    data_path = "/content/drive/MyDrive/NLPChatbotData/train.json"
    max_length = 100
    batch_size = 64
    epochs = 50  # Upper bound only; train_model is expected to stop early
    seed = 42  # Fixed so the splits are identical across runs

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    num_labels = 8  # class ids 0..7 produced by map_sentiment
    input_data, labels = load_data(data_path, tokenizer, max_length)

    # Split the data into train, validation, and test sets:
    # 20% held out for test, then 10% of the remainder for validation.
    train_inputs, test_inputs, train_labels, test_labels = train_test_split(
        input_data, labels, test_size=0.2, random_state=seed
    )
    train_inputs, val_inputs, train_labels, val_labels = train_test_split(
        train_inputs, train_labels, test_size=0.1, random_state=seed
    )

    # Create and compile the model
    model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
    optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
    # from_logits=True: the loss is fed the classifier's raw, unnormalised scores.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    # Train the model; each split is handed over as an (inputs, labels) tuple.
    model, history = train_model(model, (train_inputs, train_labels), (val_inputs, val_labels), epochs, batch_size)

    # Evaluate the model on the test dataset
    evaluate_model(model, (test_inputs, test_labels))

    # Save the trained model and weights for future use
    model.save("Chatbot_trained_model")


# The guard lives at module level: nested inside main() it never triggered
# the pipeline, and a manual main() call would have re-entered itself forever.
if __name__ == "__main__":
    main()