Installing necessary libraries

In [None]:
!pip install transformers torch tensorflow mlflow flask fastapi kafka-python prometheus_client opencensus

Collecting mlflow
  Downloading mlflow-2.19.0-py3-none-any.whl.metadata (30 kB)
Collecting fastapi
  Downloading fastapi-0.115.7-py3-none-any.whl.metadata (27 kB)
Collecting kafka-python
  Downloading kafka_python-2.0.2-py2.py3-none-any.whl.metadata (7.8 kB)
Collecting opencensus
  Downloading opencensus-0.11.4-py2.py3-none-any.whl.metadata (12 kB)
Collecting mlflow-skinny==2.19.0 (from mlflow)
  Downloading mlflow_skinny-2.19.0-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.19.0->mlflow)
  Downloading databricks_sdk-0.41.0-py3-n

Importing libraries

In [None]:
import socket
import threading
import time

import transformers
import torch
import tensorflow as tf
import mlflow
from flask import Flask, request, jsonify
from fastapi import FastAPI
import kafka
from prometheus_client import start_http_server, Summary
import opencensus


Loading a pre-trained LLM, i.e. GPT-2

In [None]:
#import GPT-2 model for language modelling
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Tokenizer and model are both fetched from the same Hugging Face checkpoint id;
# the first call downloads the files, later calls reuse the local cache.
GPT2_CHECKPOINT = "gpt2"

# text <-> token-id conversion
tokenizer = GPT2Tokenizer.from_pretrained(GPT2_CHECKPOINT)
# causal language model with the LM head used for generation
model = GPT2LMHeadModel.from_pretrained(GPT2_CHECKPOINT)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Sanity check 1: tokenizing a sample sentence

In [None]:
input_text = "Hello, how are you?"
tokens = tokenizer.encode(input_text)  # text -> list of GPT-2 token IDs

# Round-trip check: decoding the IDs must give back the exact input, otherwise
# the tokenizer files do not match what the rest of the notebook expects.
# Clean-up is disabled so decode() does not rewrite spacing around punctuation.
assert tokenizer.decode(tokens, clean_up_tokenization_spaces=False) == input_text

# Bare last expression: shown as the cell result instead of going through print().
tokens


[15496, 11, 703, 389, 345, 30]


Sanity check 2: generating text from a short prompt

In [None]:
input_text = "The fat cat sat"

# Calling the tokenizer directly (instead of .encode) returns the attention mask
# together with the token IDs, so generate() no longer has to guess it.
encoded = tokenizer(input_text, return_tensors="pt")
input_ids = encoded["input_ids"]

output = model.generate(
    input_ids,
    attention_mask=encoded["attention_mask"],
    # GPT-2 ships without a pad token; naming EOS explicitly is what generate()
    # would otherwise fall back to, but with a warning on every call.
    pad_token_id=tokenizer.eos_token_id,
    max_length=50,  # prompt + continuation, counted in tokens
)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)  # token IDs -> text
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The fat cat sat in the back seat of the car, and the fat cat sat in the front seat.

"I'm not going to lie, I'm not going to lie," he said. "I'm not going to lie.


Defining input and output - middleware


In [None]:
# initiate Flask application
app = Flask(__name__)


@app.route('/generate', methods=['POST'])
def generate_text():
    """Generate a GPT-2 continuation for the posted prompt.

    Request body (JSON): ``{"input_text": "<prompt>"}``.

    Returns:
        200 with ``{"generated_text": "<prompt plus continuation>"}`` on success.
        400 with ``{"error": "..."}`` when the body is not a JSON object or
        ``input_text`` is missing, not a string, or blank.
    """
    # silent=True turns a missing or malformed JSON body into None instead of an
    # exception, so every kind of bad request gets the same 400 answer below.
    payload = request.get_json(silent=True)
    input_text = payload.get('input_text') if isinstance(payload, dict) else None
    if not isinstance(input_text, str) or not input_text.strip():
        return jsonify({'error': "JSON body must contain a non-empty string 'input_text'"}), 400

    # The tokenizer call yields input_ids and attention_mask; passing both, plus
    # an explicit pad token, avoids the "attention mask ... not set" warning.
    encoded = tokenizer(input_text, return_tensors='pt')
    outputs = model.generate(
        **encoded,
        max_length=20,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return jsonify({'generated_text': generated_text})


if __name__ == '__main__':
    # app.run() never returns while the server is up. Called directly in a
    # notebook it stalls every later cell, and interrupting it leaves nothing
    # listening on port 5000. Serving from a daemon thread keeps the kernel free.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        already_serving = probe.connect_ex(('127.0.0.1', 5000)) == 0

    if already_serving:
        # A server from an earlier run of this cell still owns the port and is
        # bound to the previous `app` object.
        print('Port 5000 is already in use; restart the kernel to serve a redefined app.')
    else:
        threading.Thread(
            target=app.run,
            kwargs={'host': '127.0.0.1', 'port': 5000, 'use_reloader': False},
            daemon=True,  # do not keep the kernel process alive on shutdown
            name='flask-generate',
        ).start()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


Creating a feedback loop - operational layer

In [None]:
import mlflow

def track_performance_and_log_feedback(input_text, generated_text, response_time=1.2):
    """Record one prompt/response exchange as an MLflow run.

    Logs the model name, framework, prompt and generated text as run parameters
    and the latency as the ``response_time`` metric.

    Args:
        input_text: Prompt that was sent to the model.
        generated_text: Text the model produced for that prompt.
        response_time: Value logged as the ``response_time`` metric (presumably
            seconds — confirm with the caller). Defaults to 1.2, the placeholder
            that used to be hard-coded, so two-argument calls log the same value.
    """
    # The context manager closes the run even if a log call raises. With bare
    # start_run()/end_run() a failure would leave the run active and the next
    # start_run() would refuse to open a new one.
    with mlflow.start_run():
        mlflow.log_params({
            "model": "gpt2",
            "framework": "transformers",
            "input_text": input_text,
            "generated_text": generated_text,
        })
        mlflow.log_metric("response_time", response_time)


Setting up monitoring with Prometheus (ongoing)

In [None]:
from prometheus_client import Summary, start_http_server

# Create the Summary only once per kernel. Re-running this cell would otherwise
# register a second 'request_processing_seconds' collector and fail with
# "ValueError: Duplicated timeseries in CollectorRegistry".
if 'REQUEST_TIME' not in globals():
    REQUEST_TIME = Summary('request_processing_seconds', 'Time spent processing request')


def generate_text():
    """Handle POST /generate with timing and MLflow logging.

    Request body (JSON): ``{"input_text": "<prompt>"}``.

    The whole handler runs inside ``REQUEST_TIME.time()``, so rejected requests
    are timed as well. Successful exchanges are passed to
    ``track_performance_and_log_feedback``.

    Returns:
        200 with ``{"generated_text": "<prompt plus continuation>"}`` on success.
        400 with ``{"error": "..."}`` when the body is not a JSON object or
        ``input_text`` is missing, not a string, or blank.
    """
    with REQUEST_TIME.time():
        # silent=True: a missing or malformed JSON body becomes None, not an exception.
        payload = request.get_json(silent=True)
        input_text = payload.get('input_text') if isinstance(payload, dict) else None
        if not isinstance(input_text, str) or not input_text.strip():
            return jsonify({'error': "JSON body must contain a non-empty string 'input_text'"}), 400

        # Pass the attention mask and an explicit pad token so generate() does
        # not warn about having to guess them.
        encoded = tokenizer(input_text, return_tensors='pt')
        outputs = model.generate(
            **encoded,
            max_length=50,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
        )
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        track_performance_and_log_feedback(input_text, generated_text)
        return jsonify({'generated_text': generated_text})


# '/generate' may already be routed to an endpoint named 'generate_text' on this
# app. Decorating a second function with @app.route would make Flask raise
# "View function mapping is overwriting an existing endpoint function", so swap
# the handler behind the existing endpoint and only add the rule if it is absent.
if 'generate_text' in app.view_functions:
    app.view_functions['generate_text'] = generate_text
else:
    app.add_url_rule('/generate', view_func=generate_text, methods=['POST'])

if __name__ == '__main__':
    # Prometheus scrape endpoint on :8000. Binding fails with OSError when an
    # earlier run of this cell already started it.
    try:
        start_http_server(8000)
    except OSError:
        print('Port 8000 is already in use; assuming the metrics endpoint is already running.')

    # app.run() blocks for as long as the server lives, so run it on a daemon
    # thread, and only if nothing is accepting connections on port 5000 yet.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        flask_is_up = probe.connect_ex(('127.0.0.1', 5000)) == 0

    if not flask_is_up:
        threading.Thread(
            target=app.run,
            kwargs={'host': '127.0.0.1', 'port': 5000, 'use_reloader': False},
            daemon=True,  # do not keep the kernel process alive on shutdown
            name='flask-generate',
        ).start()


ValueError: Duplicated timeseries in CollectorRegistry: {'request_processing_seconds', 'request_processing_seconds_sum', 'request_processing_seconds_created', 'request_processing_seconds_count'}

Deployment and testing (ongoing)

In [None]:
import requests

# Smoke test against the local Flask server.
GENERATE_URL = 'http://localhost:5000/generate'
MAX_ATTEMPTS = 5  # the server may still be binding its port right after start-up

for attempt in range(1, MAX_ATTEMPTS + 1):
    try:
        # timeout: without one, a hung server would block this cell indefinitely
        response = requests.post(
            GENERATE_URL,
            json={'input_text': 'Once upon a time'},
            timeout=60,
        )
        break
    except requests.exceptions.ConnectionError:
        if attempt == MAX_ATTEMPTS:
            raise  # nothing is listening; surface the original error
        time.sleep(1)

# Fail loudly on 4xx/5xx instead of printing an error payload as if it were a result.
response.raise_for_status()
print(response.json())


ConnectionError: HTTPConnectionPool(host='localhost', port=5000): Max retries exceeded with url: /generate (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7879fbf43510>: Failed to establish a new connection: [Errno 111] Connection refused'))