# Colab as Machine Learning Backend

## Check NVIDIA graphics card info

In [None]:
# Print the GPU model, driver/CUDA versions and current memory usage of this runtime.
!nvidia-smi

Sat Jun 19 07:37:00 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    25W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Install packages

In [None]:
# Install the exact library versions this notebook was written against.
!pip install transformers==4.5.0
!pip install accelerate==0.2.0
# NOTE(review): flask-cors is not version-pinned, unlike the two packages above;
# consider pinning it so a re-run installs the same release.
!pip install flask-cors

Collecting transformers==4.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/81/91/61d69d58a1af1bd81d9ca9d62c90a6de3ab80d77f27c5df65d9a2c1f5626/transformers-4.5.0-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.2MB 15.3MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 58.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 45.8MB/s 
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.3 transformers-4.5

## Import packages

In [None]:
import time

import torch
from accelerate import Accelerator
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    BertForQuestionAnswering,
    BertTokenizerFast,
    pipeline,
    BertTokenizer, 
    GPT2LMHeadModel, 
    TextGenerationPipeline
)

# Create the Accelerator with fp16 requested (presumably enabling mixed precision — confirm
# against the pinned accelerate version) and remember the device it selected; the model
# inputs are moved to this device at prediction time.
accelerator = Accelerator(fp16=True)
device = accelerator.device


## Download Pretrained Model

In [None]:
# Download the fine-tuned model archive from Google Drive (by file id) as saved_model.zip.
!gdown --id "1mc-1dpHli8G1JTcAHskewosPloQrrQjF" --output saved_model.zip

# Unpack the archive, overwriting existing files (-o); it provides the saved_model/
# directory (config.json and pytorch_model.bin) loaded in a later cell.
!unzip -o saved_model.zip

Downloading...
From: https://drive.google.com/uc?id=1mc-1dpHli8G1JTcAHskewosPloQrrQjF
To: /content/saved_model.zip
1.20GB [00:04, 249MB/s]
Archive:  saved_model.zip
  inflating: saved_model/config.json  
  inflating: saved_model/pytorch_model.bin  


## Automatic mixed precision training (fp16)	

## Load Model and Tokenizer

In [None]:
# Load the fine-tuned question-answering weights from the unzipped saved_model/ directory.
model = BertForQuestionAnswering.from_pretrained("saved_model")
# The tokenizer is loaded from the "bert-base-chinese" checkpoint rather than from saved_model/.
# NOTE(review): presumably the saved model was fine-tuned from that checkpoint — confirm.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-chinese")
# Let the accelerator wrap the model / place it on its device.
model = accelerator.prepare(model) 

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=109540.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=268943.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




In [None]:
# Second ("advanced") extractive-QA model and its matching tokenizer, both loaded from the
# same Hugging Face checkpoint name. They are used through the `QA` pipeline defined later.
advanced_model = AutoModelForQuestionAnswering.from_pretrained('uer/roberta-base-chinese-extractive-qa')
advanced_tokenizer = AutoTokenizer.from_pretrained('uer/roberta-base-chinese-extractive-qa')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=452.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=406799727.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=109540.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=216.0, style=ProgressStyle(description_…




## Dataloader

In [None]:
class QA_Dataset(Dataset):
    """Turn tokenized questions/paragraphs into fixed-length BERT QA inputs.

    Every model input has the layout ``[CLS] question [SEP] paragraph-window [SEP] padding``
    and a fixed length of ``max_question_len + max_paragraph_len + 3``.

    Args:
        split: ``"train"`` builds one window around the answer and also returns the
            answer positions; any other value builds sliding windows for inference.
        questions: sequence of dicts; ``"paragraph_id"`` is always read, and
            ``"answer_start"`` / ``"answer_end"`` are read for the train split.
        tokenized_questions: indexable by question index; each item must expose ``.ids``.
        tokenized_paragraphs: indexable by paragraph id; each item must expose ``.ids``,
            ``len()`` and (train split only) ``char_to_token``.
            (Presumably the per-sample encodings of a fast tokenizer — verify against caller.)
        max_question_len: maximum number of question tokens kept (default 40).
        max_paragraph_len: maximum number of paragraph tokens per window (default 150).
        doc_stride: distance between the start positions of consecutive inference
            windows (default 4).

    Raises:
        ValueError: if ``doc_stride`` is not a positive integer.
    """

    # Vocabulary ids of the special tokens used to assemble an input.
    CLS_TOKEN_ID = 101
    SEP_TOKEN_ID = 102
    PAD_TOKEN_ID = 0

    def __init__(self, split, questions, tokenized_questions, tokenized_paragraphs,
                 max_question_len=40, max_paragraph_len=150, doc_stride=4):
        if doc_stride <= 0:
            raise ValueError("doc_stride must be a positive integer")

        self.split = split
        self.questions = questions
        self.tokenized_questions = tokenized_questions
        self.tokenized_paragraphs = tokenized_paragraphs
        self.max_question_len = max_question_len
        self.max_paragraph_len = max_paragraph_len

        self.doc_stride = doc_stride

        # [CLS] + question + [SEP] + paragraph + [SEP]
        self.max_seq_len = 1 + self.max_question_len + 1 + self.max_paragraph_len + 1

    def __len__(self):
        """Number of questions (one item per question)."""
        return len(self.questions)

    def __getitem__(self, idx):
        """Build the model inputs for question ``idx``.

        Returns:
            Train split: ``(input_ids, token_type_ids, attention_mask, answer_start, answer_end)``
            where the three tensors have shape ``(max_seq_len,)`` and the answer positions
            are token indices inside the assembled input.
            Other splits: ``(input_ids, token_type_ids, attention_mask)``, each of shape
            ``(num_windows, max_seq_len)``.
        """
        question = self.questions[idx]
        tokenized_question = self.tokenized_questions[idx]
        tokenized_paragraph = self.tokenized_paragraphs[question["paragraph_id"]]

        # The question part is the same for every window: [CLS] question [SEP]
        input_ids_question = (
            [self.CLS_TOKEN_ID]
            + tokenized_question.ids[:self.max_question_len]
            + [self.SEP_TOKEN_ID]
        )

        if self.split == "train":
            # Convert answer's start/end positions in paragraph_text to start/end positions in tokenized_paragraph
            # NOTE(review): if char_to_token returns None for a position, the arithmetic
            # below fails — confirm the answer offsets always land on a token.
            answer_start_token = tokenized_paragraph.char_to_token(question["answer_start"])
            answer_end_token = tokenized_paragraph.char_to_token(question["answer_end"])

            # A single window is obtained by slicing the portion of paragraph containing the answer:
            # centre it on the answer, then clamp it to the paragraph boundaries.
            mid = (answer_start_token + answer_end_token) // 2
            paragraph_start = max(0, min(mid - self.max_paragraph_len // 2, len(tokenized_paragraph) - self.max_paragraph_len))
            paragraph_end = paragraph_start + self.max_paragraph_len

            input_ids_paragraph = tokenized_paragraph.ids[paragraph_start : paragraph_end] + [self.SEP_TOKEN_ID]

            # Convert answer's start/end positions in tokenized_paragraph to start/end positions in the window
            answer_start_token += len(input_ids_question) - paragraph_start
            answer_end_token += len(input_ids_question) - paragraph_start

            # Pad sequence and obtain inputs to model
            input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)
            return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), answer_start_token, answer_end_token

        # Validation/Testing
        input_ids_list, token_type_ids_list, attention_mask_list = [], [], []

        # Paragraph is split into several windows, each with start positions separated by step "doc_stride".
        # An empty paragraph still gets one (paragraph-less) window so the returned tensors are
        # never empty and always have shape (num_windows, max_seq_len).
        window_starts = list(range(0, len(tokenized_paragraph), self.doc_stride)) or [0]

        for i in window_starts:
            input_ids_paragraph = tokenized_paragraph.ids[i : i + self.max_paragraph_len] + [self.SEP_TOKEN_ID]

            # Pad sequence and obtain inputs to model
            input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)

            input_ids_list.append(input_ids)
            token_type_ids_list.append(token_type_ids)
            attention_mask_list.append(attention_mask)

        return torch.tensor(input_ids_list), torch.tensor(token_type_ids_list), torch.tensor(attention_mask_list)

    def padding(self, input_ids_question, input_ids_paragraph):
        """Concatenate question and paragraph ids and pad them to ``max_seq_len``.

        Returns:
            ``(input_ids, token_type_ids, attention_mask)`` as plain Python lists.
        """
        # Pad zeros if sequence length is shorter than max_seq_len
        padding_len = self.max_seq_len - len(input_ids_question) - len(input_ids_paragraph)
        # Indices of input sequence tokens in the vocabulary
        input_ids = input_ids_question + input_ids_paragraph + [self.PAD_TOKEN_ID] * padding_len
        # Segment token indices to indicate first and second portions of the inputs. Indices are selected in [0, 1]
        token_type_ids = [0] * len(input_ids_question) + [1] * len(input_ids_paragraph) + [0] * padding_len
        # Mask to avoid performing attention on padding token indices. Mask values selected in [0, 1]
        attention_mask = [1] * (len(input_ids_question) + len(input_ids_paragraph)) + [0] * padding_len

        return input_ids, token_type_ids, attention_mask


## Function for Evaluation

In [None]:
def evaluate(data, output):
    """Pick the best answer span over all windows of one question.

    Args:
        data: batch from the test DataLoader; ``data[0]`` holds the input ids with
            shape ``(1, num_windows, seq_len)`` (indexed below as ``data[0][0][k]``).
        output: model output exposing ``start_logits`` and ``end_logits``, indexed
            per window.

    Returns:
        The decoded answer text with spaces removed, or ``""`` when no window
        yields a valid span.
    """
    answer = ""
    max_prob = float("-inf")
    num_of_windows = data[0].shape[1]

    for k in range(num_of_windows):
        # Obtain answer by choosing the most probable start position / end position
        start_prob, start_index = torch.max(output.start_logits[k], dim=0)
        end_prob, end_index = torch.max(output.end_logits[k], dim=0)

        # A span that ends before it starts decodes to an empty string; skipping it keeps
        # such a window from overwriting a valid answer found in another window.
        if end_index < start_index:
            continue

        # Score of the answer is the sum of the best start and end logits
        prob = start_prob + end_prob

        # Replace answer if calculated score is larger than previous windows
        if prob > max_prob:
            max_prob = prob
            # Convert tokens to chars (e.g. [1920, 7032] --> "大 金")
            answer = tokenizer.decode(data[0][0][k][start_index : end_index + 1])

    # Remove spaces in answer (e.g. "大 金" --> "大金")
    return answer.replace(' ', '')

In [None]:
# Question-answering pipeline built from the "advanced" model and tokenizer;
# advenced_predict_answer calls it with a {"question", "context"} dict.
QA = pipeline('question-answering', model=advanced_model, tokenizer=advanced_tokenizer)

## Prediction function

In [None]:
def predict_answer(QUESTION_TEXT: str, PARAGRAPH_TEXT: str):
    """Answer one question about one paragraph with the fine-tuned BERT model.

    The question and paragraph are wrapped in a single-item test dataset, run through
    `model` window by window, and the best span is chosen by `evaluate`.

    Returns:
        ``{"answer": <text>, "execution": <elapsed seconds>}``; when the answer contains
        ``"[UNK]"`` the result of `advenced_predict_answer` is returned instead.
    """
    started_at = time.time()

    # One question referring to the one and only paragraph (index 0).
    question_records = [{
        "id": 0,
        "paragraph_id": 0,
        "question_text": QUESTION_TEXT,
        "answer_text": None,
        "answer_start": None,
        "answer_end": None,
    }]
    paragraph_texts = [PARAGRAPH_TEXT]

    # Special tokens are added later by QA_Dataset, so they are disabled here.
    question_batch = [record["question_text"] for record in question_records]
    questions_encoded = tokenizer(question_batch, add_special_tokens=False)
    paragraphs_encoded = tokenizer(paragraph_texts, add_special_tokens=False)

    dataset = QA_Dataset("test", question_records, questions_encoded, paragraphs_encoded)
    loader = DataLoader(dataset, batch_size=1, shuffle=False, pin_memory=True)

    # Start Predict
    model.eval()
    with torch.no_grad():
        for batch in tqdm(loader):
            # Drop the batch dimension so the windows of the question form the model batch.
            window_inputs = {
                "input_ids": batch[0].squeeze(dim=0).to(device),
                "token_type_ids": batch[1].squeeze(dim=0).to(device),
                "attention_mask": batch[2].squeeze(dim=0).to(device),
            }
            output = model(**window_inputs)
            answer = evaluate(batch, output)

        # Unknown character in answer: hand the request over to the second model.
        if "[UNK]" not in answer:
            return {"answer": answer, "execution": time.time() - started_at}
        return advenced_predict_answer(QUESTION_TEXT=QUESTION_TEXT, PARAGRAPH_TEXT=PARAGRAPH_TEXT)

In [None]:
def advenced_predict_answer(QUESTION_TEXT: str, PARAGRAPH_TEXT: str):
    """Answer the question with the `QA` pipeline.

    Returns:
        The pipeline's result with an extra ``"execution"`` entry holding the
        elapsed time in seconds.
    """
    began = time.time()
    prediction = QA({"question": QUESTION_TEXT, "context": PARAGRAPH_TEXT})
    elapsed = time.time() - began
    prediction["execution"] = elapsed
    return prediction

## Run ngrok with Flask

In [None]:
import atexit
import json
import os
import platform
import shutil
import subprocess
import tempfile
import time
import zipfile
from pathlib import Path
from threading import Timer

import requests


def _get_command():
    system = platform.system()
    if system == "Darwin":
        command = "ngrok"
    elif system == "Windows":
        command = "ngrok.exe"
    elif system == "Linux":
        command = "ngrok"
    else:
        raise Exception("{system} is not supported".format(system=system))
    return command


def _run_ngrok(port):
    """Start an ngrok HTTP tunnel to ``port`` and return its public URL.

    The ngrok binary is downloaded to the temp directory on first use, started as a
    child process (terminated at interpreter exit), and the local ngrok API is polled
    until it reports a tunnel.

    Args:
        port: local port to expose.

    Returns:
        The public URL of the https tunnel when one is reported, otherwise the
        public URL of the first reported tunnel.

    Raises:
        RuntimeError: if the local ngrok API reports no tunnel within the polling window.
    """
    command = _get_command()
    ngrok_path = str(Path(tempfile.gettempdir(), "ngrok"))
    _download_ngrok(ngrok_path)
    executable = str(Path(ngrok_path, command))
    os.chmod(executable, 0o777)
    ngrok = subprocess.Popen([executable, 'http', str(port)])
    atexit.register(ngrok.terminate)
    localhost_url = "http://localhost:4040/api/tunnels"  # Url with tunnel details

    # ngrok needs a moment before its local API answers and before the tunnels are
    # registered, so poll instead of relying on a single fixed sleep.
    max_attempts = 30
    public_urls = []
    for _ in range(max_attempts):
        time.sleep(1)
        try:
            tunnel_info = json.loads(requests.get(localhost_url, timeout=5).text)
        except (requests.exceptions.RequestException, ValueError):
            # API not reachable yet, or it returned something that is not JSON.
            continue
        public_urls = [tunnel['public_url'] for tunnel in tunnel_info.get('tunnels', [])]
        # Prefer the https tunnel; keep polling while only other tunnels are listed.
        for public_url in public_urls:
            if public_url.startswith("https"):
                return public_url

    if not public_urls:
        raise RuntimeError(
            "ngrok did not report any tunnel on {url} after {n} attempts".format(
                url=localhost_url, n=max_attempts))
    return public_urls[0]


def _download_ngrok(ngrok_path):
    if Path(ngrok_path).exists():
        return
    system = platform.system()
    if system == "Darwin":
        url = "https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-darwin-amd64.zip"
    elif system == "Windows":
        url = "https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-windows-amd64.zip"
    elif system == "Linux":
        url = "https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip"
    else:
        raise Exception(f"{system} is not supported")
    download_path = _download_file(url)
    with zipfile.ZipFile(download_path, "r") as zip_ref:
        zip_ref.extractall(ngrok_path)


def _download_file(url):
    """Download ``url`` into the temp directory and return the local file path.

    The file is named after the last path segment of the URL.

    Raises:
        requests.exceptions.HTTPError: if the server answers with an error status,
            so an error page is never saved in place of the requested file.
    """
    local_filename = url.split('/')[-1]
    download_path = str(Path(tempfile.gettempdir(), local_filename))
    # The context manager releases the streamed connection even if writing fails.
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        # Make the raw stream undo any Content-Encoding (gzip/deflate) applied in transit.
        r.raw.decode_content = True
        with open(download_path, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    return download_path


def start_ngrok(port, callback_url="https://regtech-rm.herokuapp.com/api/set_server"):
    """Open an https ngrok tunnel to ``port``, report it to the callback server and print it.

    Args:
        port: local port to expose.
        callback_url: endpoint that receives ``{"server_url": <tunnel url>}`` as JSON;
            pass a falsy value to skip the callback.
    """
    ngrok_address = _run_ngrok(port)
    # NOTE(review): every retry starts another ngrok process via _run_ngrok.
    while "https" not in ngrok_address:
        ngrok_address = _run_ngrok(port)

    # Callback Heroku server
    if callback_url:
        data = {"server_url": ngrok_address}
        try:
            response = requests.post(callback_url, json=data, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as exc:
            # A failed callback must not hide the tunnel address from the user.
            print(f" * Could not register {ngrok_address} with {callback_url}: {exc}")

    print(f" * Running on {ngrok_address}")


def run_with_ngrok(app):
    """
    The provided Flask app will be securely exposed to the public internet via ngrok when run,
    and its ngrok address will be printed to stdout
    :param app: a Flask application object
    :return: None
    """
    old_run = app.run

    def new_run(*args, **kwargs):
        # Flask's run() takes (host, port, ...): honour a port given by keyword or
        # positionally, and fall back to 5000 when none (or None) is given.
        port = kwargs.get('port')
        if port is None and len(args) > 1:
            port = args[1]
        if port is None:
            port = 5000

        # Start ngrok one second later from a background thread, because
        # old_run() blocks while the server is running.
        thread = Timer(1, start_ngrok, args=(port,))
        thread.daemon = True  # replaces the deprecated setDaemon(True)
        thread.start()
        old_run(*args, **kwargs)
    app.run = new_run

## Start Flask Server

In [None]:
from flask import Flask, request, jsonify
from flask_cors import CORS

# Create the Flask application that serves the prediction API.
app = Flask(__name__)
# Emit non-ASCII characters as-is in JSON responses instead of \u escapes,
# and keep the key order of returned dicts.
app.config["JSON_AS_ASCII"] = False
app.config["JSON_SORT_KEYS"] = False
# Enable cross-origin requests for all routes.
CORS(app)
run_with_ngrok(app)   # Starts ngrok when the app is running

@app.route("/")
def home():
    """Landing page: a static heading identifying this server."""
    banner = "<h1>This is the model predict server on Google Colab!</h1>"
    return banner

@app.route("/api/predict", methods=["POST"])
def api_predict():
    """Answer a question about a paragraph.

    Expects a JSON object with:
        QUESTION_TEXT (str): the question.
        PARAGRAPH_TEXT (str): the text to search for the answer.
        advanced (optional): truthy to use `advenced_predict_answer`
            instead of `predict_answer`.

    Returns:
        The prediction result as JSON, or a JSON ``{"error": ...}`` body with
        status 400 when the request body is not an object or a text field is
        missing, empty or not a string.
    """
    # Read the body once; force=True parses it as JSON regardless of Content-Type.
    payload = request.get_json(force=True)

    error = None
    if not isinstance(payload, dict):
        error = "Request body must be a JSON object"
    else:
        QUESTION_TEXT = payload.get("QUESTION_TEXT")
        PARAGRAPH_TEXT = payload.get("PARAGRAPH_TEXT")
        texts_ok = all(
            isinstance(text, str) and text.strip()
            for text in (QUESTION_TEXT, PARAGRAPH_TEXT)
        )
        if not texts_ok:
            error = "QUESTION_TEXT and PARAGRAPH_TEXT must be non-empty strings"

    if error is not None:
        # Reject bad input here instead of failing inside the tokenizer/model code.
        response = jsonify({"error": error})
        response.status_code = 400
    else:
        if not payload.get("advanced"):
            result = predict_answer(QUESTION_TEXT=QUESTION_TEXT, PARAGRAPH_TEXT=PARAGRAPH_TEXT)
        else:
            result = advenced_predict_answer(QUESTION_TEXT=QUESTION_TEXT, PARAGRAPH_TEXT=PARAGRAPH_TEXT)
        response = jsonify(result)

    response.headers.add("Access-Control-Allow-Origin", "*")
    return response

# Start the server. app.run was wrapped by run_with_ngrok, so this also opens the
# ngrok tunnel; the call blocks while the server is running.
app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on https://cb4d2f605458.ngrok.io


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

127.0.0.1 - - [19/Jun/2021 07:39:37] "[37mPOST /api/predict HTTP/1.1[0m" 200 -





HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

127.0.0.1 - - [19/Jun/2021 07:39:41] "[37mPOST /api/predict HTTP/1.1[0m" 200 -



