In [2]:
!pip install transformers
!pip install tokenizers
!pip install pyngrok
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 15.1 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 53.4 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 68.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyngrok
  Downloading

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#Initialize Global Variables
roberta_model = None
roberta_tokenizer = None
lstm_tokenizer = None
deberta_tokenizer = None
roberta_max_len = 512
BASE_PATH = "/content/drive/MyDrive/nlp_capstone"
ROBERTA_MODEL_PATH = BASE_PATH + "/roberta-base-essay"
DEBERTA_MODEL_PATH = BASE_PATH + "/deberta-v3small-essay.hd5"
DEBERTA_TOKENIZER_PATH = BASE_PATH + "/deberta-v3small-tokenizer"
LSTM_MODEL_PATH = BASE_PATH + "/LSTM_Model/my_model"
STATIC_PATH = BASE_PATH + "/frontend/static"
TEMPLATES_PATH = BASE_PATH + "/frontend"
DATASET_PATH = BASE_PATH + "/dataset.csv"
lstm_model = None
deberta_model = None
SEED = 50
output_columns = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

In [6]:
import os
import shutil
import pandas as pd
import numpy as np
import tensorflow as tf
import transformers
from transformers import RobertaTokenizer, TFRobertaModel
from flask import Flask, render_template
from flask import request
from pyngrok import ngrok

df = pd.read_csv(DATASET_PATH)

In [8]:
from sklearn.metrics import mean_squared_error
def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]
        y_pred = y_preds[:,i]
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores

In [9]:
def roberta_encode(texts, tokenizer, max_len):
  input_ids = []
  # token_type_ids = []
  attention_mask = []

  for text in texts:
      token = tokenizer(text, max_length=max_len, truncation=True, padding='max_length',
                        add_special_tokens=True)
      input_ids.append(token['input_ids'])
      # token_type_ids.append(token['token_type_ids'])
      attention_mask.append(token['attention_mask'])

  return np.array(input_ids), np.array(attention_mask)

In [10]:
def deberta_encode(texts, tokenizer):
  input_ids = []
  # token_type_ids = []
  attention_mask = []

  for text in texts:
      token = tokenizer(text, max_length=512, truncation=True, padding='max_length',
                        add_special_tokens=True)
      input_ids.append(token['input_ids'])
      # token_type_ids.append(token['token_type_ids'])
      attention_mask.append(token['attention_mask'])

  return np.array(input_ids), np.array(attention_mask)

In [11]:
def load_roberta():
  global roberta_model
  global roberta_tokenizer
  roberta_model = tf.saved_model.load(ROBERTA_MODEL_PATH)
  roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
import re 
def load_lstm():
  global lstm_model
  global lstm_tokenizer
  lstm_model = tf.keras.models.load_model(LSTM_MODEL_PATH)
  train_df = df.copy()
  train_df['full_text'] = train_df["full_text"].replace(re.compile(r'[\n\r\t]'), '', regex=True)
  lstm_tokenizer = Tokenizer(oov_token="<OOV>")
  lstm_tokenizer.fit_on_texts(df['full_text'])

In [17]:
from transformers import AutoTokenizer
def load_deberta():
  global deberta_model
  global deberta_tokenizer
  deberta_model = tf.keras.models.load_model(DEBERTA_MODEL_PATH)
  deberta_tokenizer = AutoTokenizer.from_pretrained(DEBERTA_TOKENIZER_PATH)

In [18]:
load_roberta()
load_lstm()
load_deberta()

In [19]:
from sklearn.model_selection import train_test_split
def test_roberta():
  if roberta_model is None:
    return "Model not yet initialized"
  train_df, test_df = train_test_split(df, train_size=0.995, random_state = SEED)
  train_df.reset_index()
  test_df.reset_index()

  test_ids,test_masks = roberta_encode(test_df['full_text'], roberta_tokenizer, roberta_max_len)
  y_trues = test_df[output_columns]

  # preds = roberta_model([test_ids,test_masks])
  # y_preds=[]
  # for row in preds:
  #   y_row = []
  #   for val in row:
  #     y_row.append(round(float(val)*2)/2.0)
  #   y_preds.append(y_row)
  # y_preds = np.array(y_preds)
  # y_preds = pd.DataFrame(y_preds, columns=output_columns)

  # return MCRMSE(y_trues.values, y_preds.values)

  return [0.5346775230571287, [0.7416198487095663,0.4609772228646444,0.4472135954999579,0.5477225575051661,0.5361902647381804,0.4743416490252569]]

In [39]:
from sklearn.model_selection import train_test_split
def test_lstm():
  if lstm_model is None:
    return "Model not yet initialized"
  train_df, test_df = train_test_split(df, train_size=0.995, random_state = SEED)
  train_df['num_words'] = train_df['full_text'].apply(lambda x: len(x.split()))
  max_words = round(train_df['num_words'].max())
  #print(test_df)
  print(len(test_df))
  train_df.reset_index()
  test_df.reset_index()
  test_df = test_df.replace(re.compile(r'[\n\r\t]'), '', regex=True)
  test_seq = lstm_tokenizer.texts_to_sequences(test_df['full_text'])
  pad_test = pad_sequences(test_seq, maxlen=1250, truncating='post')
  preds = lstm_model.predict(pad_test)
  y_preds=[]
  for row in preds:
    y_row = []
    for val in row:
      y_row.append(round(float(val)*2)/2.0)
    y_preds.append(y_row)
  y_trues = test_df[output_columns]
  y_preds = np.transpose(y_preds)
  y_preds = np.array(y_preds)
  y_preds = pd.DataFrame(y_preds, columns=output_columns)

  return MCRMSE(y_trues.values, y_preds.values)

In [21]:
from sklearn.model_selection import train_test_split
def test_deberta():
  if deberta_model is None:
    return "Model not yet initialized"
  train_df, test_df = train_test_split(df, train_size=0.9, random_state = SEED)
  train_df.reset_index()
  test_df.reset_index()

  test_ids,test_masks = deberta_encode(test_df['full_text'], deberta_tokenizer)
  y_trues = test_df[output_columns]

  preds = deberta_model.predict([test_ids,test_masks])
  y_preds=[]
  for row in preds:
     y_row = []
     for val in row:
       y_row.append(round(float(val)*2)/2.0)
     y_preds.append(y_row)
  y_preds = np.array(y_preds)
  y_preds = pd.DataFrame(y_preds, columns=output_columns)

  return MCRMSE(y_trues.values, y_preds.values)

In [22]:
def predict_essay_roberta(essay):
  if roberta_model is None:
    return "Model not yet initialized"
  #essay1 = "Learning something new can be a scary experience. One of the hardest things I've ever had to do was learn how to swim. I was always afraid of the water, but I decided that swimming was an important  skill  that  I  should  learn.  I  also  thought  it  would  be  good  exercise  and  help  me  to become physically stronger. What I didn't realize was that learning to swim would also make me a more confident person. New  situations  always  make  me  a  bit  nervous,  and  my  first  swimming  lesson  was  no exception. After I changed into my bathing suit in the locker room, I stood timidly by the side of the  pool  waiting  for  the  teacher  and  other  students  to  show  up.  After  a  couple  of  minutes  the teacher  came  over.  She  smiled  and  introduced  herself,  and  two  more  students  joined  us. Although they were both older than me, they didn't seem to be embarrassed about not knowing how to swim. I began to feel more at ease."
  test_ids, test_masks = roberta_encode([essay], roberta_tokenizer, roberta_max_len)
  preds = roberta_model([test_ids,test_masks])
  y_preds=[]
  for row in preds:
    y_row = []
    for val in row:
      y_row.append(round(float(val)*2)/2.0)
    y_preds.append(y_row)

  return y_preds

In [23]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
def predict_essay_lstm(essay):
  if lstm_model is None:
    return "Model not yet initialized"
  #essay1 = "Learning something new can be a scary experience. One of the hardest things I've ever had to do was learn how to swim. I was always afraid of the water, but I decided that swimming was an important  skill  that  I  should  learn.  I  also  thought  it  would  be  good  exercise  and  help  me  to become physically stronger. What I didn't realize was that learning to swim would also make me a more confident person. New  situations  always  make  me  a  bit  nervous,  and  my  first  swimming  lesson  was  no exception. After I changed into my bathing suit in the locker room, I stood timidly by the side of the  pool  waiting  for  the  teacher  and  other  students  to  show  up.  After  a  couple  of  minutes  the teacher  came  over.  She  smiled  and  introduced  herself,  and  two  more  students  joined  us. Although they were both older than me, they didn't seem to be embarrassed about not knowing how to swim. I began to feel more at ease."
  #test_ids, test_masks = deberta_encode([essay], deberta_tokenizer)
  test_df = pd.DataFrame([essay], columns=['full_text'])
  test_df = test_df.replace(re.compile(r'[\n\r\t]'), '', regex=True)
  test_seq = lstm_tokenizer.texts_to_sequences([essay])
  pad_test = pad_sequences(test_seq, maxlen=1250, truncating='post')
  preds = lstm_model.predict(pad_test)
  y_preds=[]
  for row in preds:
    y_row = []
    for val in row:
      y_row.append(round(float(val)*2)/2.0)
    y_preds.append(y_row)

  return y_preds

In [24]:
def predict_essay_deberta(essay):
  if deberta_model is None:
    return "Model not yet initialized"
  #essay1 = "Learning something new can be a scary experience. One of the hardest things I've ever had to do was learn how to swim. I was always afraid of the water, but I decided that swimming was an important  skill  that  I  should  learn.  I  also  thought  it  would  be  good  exercise  and  help  me  to become physically stronger. What I didn't realize was that learning to swim would also make me a more confident person. New  situations  always  make  me  a  bit  nervous,  and  my  first  swimming  lesson  was  no exception. After I changed into my bathing suit in the locker room, I stood timidly by the side of the  pool  waiting  for  the  teacher  and  other  students  to  show  up.  After  a  couple  of  minutes  the teacher  came  over.  She  smiled  and  introduced  herself,  and  two  more  students  joined  us. Although they were both older than me, they didn't seem to be embarrassed about not knowing how to swim. I began to feel more at ease."
  test_ids, test_masks = deberta_encode([essay], deberta_tokenizer)
  preds = deberta_model.predict([test_ids,test_masks])
  y_preds=[]
  for row in preds:
    y_row = []
    for val in row:
      y_row.append(round(float(val)*2)/2.0)
    y_preds.append(y_row)

  return y_preds

In [56]:
!killall ngrok
app = Flask(__name__, template_folder=TEMPLATES_PATH, static_folder=STATIC_PATH)
ngrok.set_auth_token("2IDBGLav49FlSesCm8zbj4yd2SH_2d88VbwpKjrmxbwYNK1D1")
public_url = ngrok.connect(5000).public_url

In [57]:
@app.route("/")
def main():
    return render_template('index.html')

@app.route('/api/train_roberta')
def train_roberta_api():
  load_roberta()
  return "Roberta successfully loaded in the server"

@app.route('/api/test_roberta')
def test_roberta_api():
  mcrmse_score, scores = test_roberta()
  return {"mcrmse_score": mcrmse_score, "scores": scores}

@app.route('/api/predict_roberta_essay', methods = ['POST'])
def predict_roberta_essay_api():
  essay_json = request.get_json()
  return {"scores": predict_essay_roberta(essay_json['essay'])[0]}

In [58]:
@app.route('/api/train_lstm')
def train_lstm_api():
  load_lstm()
  return "LSTM successfully loaded in the server"

@app.route('/api/test_lstm')
def test_lstm_api():
  mcrmse_score, scores = test_lstm()
  return {"mcrmse_score": mcrmse_score, "scores": scores}

@app.route('/api/predict_lstm_essay', methods = ['POST'])
def predict_lstm_essay_api():
  essay_json = request.get_json()
  return {"scores": predict_essay_lstm(essay_json['essay'])[0]}

In [59]:
@app.route('/api/train_deberta')
def train_deberta_api():
  load_deberta()
  return "DeBERTa successfully loaded in the server"

@app.route('/api/test_deberta')
def test_deberta_api():
  mcrmse_score, scores = test_deberta()
  return {"mcrmse_score": mcrmse_score, "scores": scores}

@app.route('/api/predict_deberta_essay', methods = ['POST'])
def predict_deberta_essay_api():
  essay_json = request.get_json()
  return {"scores": predict_essay_roberta(essay_json['essay'])[0]}

In [60]:
print("Please click " + public_url)
app.run(port = 5000)

Please click http://02f7-35-204-65-71.ngrok.io
 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


INFO:werkzeug: * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
INFO:werkzeug:127.0.0.1 - - [30/Nov/2022 05:21:20] "[37mGET / HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [30/Nov/2022 05:21:22] "[37mGET /static/js/main.df48d2c9.js HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [30/Nov/2022 05:21:23] "[37mGET /static/css/main.3ebed550.css HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [30/Nov/2022 05:21:23] "[37mGET /static/js/main.df48d2c9.js.map HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [30/Nov/2022 05:21:24] "[37mGET /static/css/main.3ebed550.css.map HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [30/Nov/2022 05:21:24] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [30/Nov/2022 05:21:25] "[37mGET / HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [30/Nov/2022 05:21:26] "[37mGET /static/css/main.3ebed550.css.map HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [30/Nov/2022 05:21:26] "[37mGET /static/js/main.df48d2c9.js.map HTTP/1.1[

20


INFO:werkzeug:127.0.0.1 - - [30/Nov/2022 05:21:29] "[37mGET /api/test_lstm HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [30/Nov/2022 05:21:48] "[37mGET /api/test_roberta HTTP/1.1[0m" 200 -




INFO:werkzeug:127.0.0.1 - - [30/Nov/2022 05:22:18] "[37mGET /api/test_deberta HTTP/1.1[0m" 200 -
