In [1]:
# Step 1: Setup and Install Required Libraries
# Install necessary libraries for the project
!pip install datasets transformers nltk spacy mlflow flask

# Download necessary NLTK data and SpaCy model
import nltk
nltk.download('punkt')
!python -m spacy download en_core_web_sm


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting mlflow
  Downloading mlflow-2.17.0-py3-none-any.whl.metadata (29 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting mlflow-skinny==2.17.0 (from mlflow)
  Downloading mlflow_skinny-2.17.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.3-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gu

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


**Step 2: Data Collection and Preprocessing
**

In [2]:
# Load the dataset from Hugging Face
from datasets import load_dataset

# Load the 'daily_dialog' dataset
dataset = load_dataset("daily_dialog")
data = dataset['train']

# Import necessary libraries for preprocessing
import re
import spacy
from nltk.tokenize import word_tokenize

# Load the SpaCy English model
nlp = spacy.load("en_core_web_sm")

# Function to clean text
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = text.lower().strip()  # Convert to lowercase and strip whitespaces
    return text

# Function for tokenization and lemmatization
def preprocess_text(text):
    tokens = word_tokenize(text)  # Tokenize using NLTK
    doc = nlp(' '.join(tokens))  # Use SpaCy for lemmatization
    return ' '.join([token.lemma_ for token in doc])

# Clean and preprocess the dataset
data = data.map(lambda x: {"text": preprocess_text(clean_text(' '.join(x["dialog"][:100])))})
# Save preprocessed data to a CSV file for later use
data.to_csv("cleaned_conversations.csv", index=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


daily_dialog.py:   0%|          | 0.00/4.85k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.27k [00:00<?, ?B/s]

The repository for daily_dialog contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/daily_dialog.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/4.48M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11118 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/11118 [00:00<?, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

10974487

**Step 3: Model Training with MLflow**

In [None]:
# Import necessary libraries for model training
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
import mlflow
import mlflow.pytorch

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')


# Assign the eos_token as the pad_token for the tokenizer
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

# Tokenize the data with the updated tokenizer
tokenized_data = data.map(tokenize_function, batched=True)


# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-chatbot",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    save_total_limit=2,
)

# Start an MLflow run for tracking
mlflow.start_run(run_name="GPT-2 Chatbot Training")
mlflow.log_param("model_type", "GPT-2")
mlflow.log_param("learning_rate", training_args.learning_rate)
mlflow.log_param("epochs", training_args.num_train_epochs)

# Create the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Log the model with MLflow
mlflow.pytorch.log_model(model, "model")
mlflow.end_run()


Map:   0%|          | 0/11118 [00:00<?, ? examples/s]

2024/10/16 22:10:51 ERROR mlflow.utils.async_logging.async_logging_queue: Run Id 5c0b168de6f24116876b5d6dca51a3ae: Failed to log run data: Exception: Changing param values is not allowed. Param with key='model_type' was already logged with value='GPT-2' for run ID='5c0b168de6f24116876b5d6dca51a3ae'. Attempted logging new value 'gpt2'.


Epoch,Training Loss,Validation Loss


**Step 4: Evaluate the Model
**

In [None]:
# Function to generate responses
def generate_response(prompt):
    inputs = tokenizer.encode(prompt, return_tensors='pt')
    outputs = model.generate(inputs, max_length=150, num_return_sequences=1)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Test the chatbot with sample prompts
test_prompts = ["Hello, how are you?", "Tell me a joke.", "What is AI?"]
responses = [generate_response(prompt) for prompt in test_prompts]

# Print responses for evaluation
for prompt, response in zip(test_prompts, responses):
    print(f"Prompt: {prompt}\nResponse: {response}\n")

# Log evaluation results with MLflow
mlflow.start_run(run_name="GPT-2 Chatbot Evaluation")
for prompt, response in zip(test_prompts, responses):
    mlflow.log_text(f"Prompt: {prompt}\nResponse: {response}", f"{prompt}.txt")
mlflow.end_run()


**Step 5: Deploy the Model using Flask**

In [None]:
# Write the Flask app in a .py file for deployment
%%writefile app.py
from flask import Flask, request, jsonify
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./gpt2-chatbot")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

app = Flask(__name__)

@app.route('/chat', methods=['POST'])
def chat():
    prompt = request.json.get('prompt')
    inputs = tokenizer.encode(prompt, return_tensors='pt')
    outputs = model.generate(inputs, max_length=150)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return jsonify({"response": response})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)


In [None]:
#Run the Flask App:
!python app.py