In [1]:
!pip install transformers
!pip install py_vncorenlp

Collecting py_vncorenlp
  Downloading py_vncorenlp-0.1.4.tar.gz (3.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyjnius (from py_vncorenlp)
  Downloading pyjnius-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: py_vncorenlp
  Building wheel for py_vncorenlp (setup.py) ... [?25l[?25hdone
  Created wheel for py_vncorenlp: filename=py_vncorenlp-0.1.4-py3-none-any.whl size=4307 sha256=280923f4edca08a00dc9e3cfaadd6ba0fa365f6cce3da5374840d40ea78f6290
  Stored in directory: /root/.cache/pip/wheels/d5/d9/bf/62632cdb007c702a0664091e92a0bb1f18a2fcecbe962d9827
Successfully built py_vncorenlp
Installing collected packages: pyjnius, py_vncorenlp
Successfully installed py_vncorenlp-0.1.4 pyjnius-1.6.1


In [1]:
!pip install flask-ngrok
!pip install pyngrok==4.1.1
!ngrok authtoken 2Z7C0W3ySQ4MtLSvq5Y9OJ71yLT_2ebHsUsdRKkT8UMd8f35m

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [10]:
from google.colab import drive

# Mount at the absolute path that the later cells read from
# ('/content/drive/MyDrive/DS105/...'). The previous relative './drive' only
# resolved to the same place while the working directory was /content.
drive.mount('/content/drive')

Drive already mounted at ./drive; to attempt to forcibly remount, call drive.mount("./drive", force_remount=True).


In [11]:
import torch
from torch.utils.data import TensorDataset, DataLoader

In [12]:
import re
import unicodedata

import py_vncorenlp

# Directory that holds the VnCoreNLP files (here: the current working directory).
VNCORENLP_DIR = './'

# Fetch the VnCoreNLP files into VNCORENLP_DIR, then load a pipeline with only
# the word-segmentation ("wseg") annotator, which is all this notebook uses.
py_vncorenlp.download_model(save_dir=VNCORENLP_DIR)
segment_model = py_vncorenlp.VnCoreNLP(save_dir=VNCORENLP_DIR, annotators=["wseg"])

VnCoreNLP model folder . already exists! Please load VnCoreNLP from this folder!


In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Both the tokenizer and the sequence-classification model are loaded from the
# same saved directory on the mounted Drive.
MODEL_DIR = '/content/drive/MyDrive/DS105/model_save'

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)

In [14]:
# Inclusive code-point ranges stripped by remove_emoji.
_EMOJI_RANGES = (
    ("\U0001F600", "\U0001F64F"),  # emoticons
    ("\U0001F300", "\U0001F5FF"),  # symbols & pictographs
    ("\U0001F680", "\U0001F6FF"),  # transport & map symbols
    ("\U0001F1E0", "\U0001F1FF"),  # regional indicators (flags)
    ("\U00002500", "\U00002BEF"),  # box drawing through misc symbols and arrows
    ("\U00002702", "\U000027B0"),  # dingbats
    # NOTE: very wide range; it also covers CJK ideographs, so those are
    # stripped together with the emoji.
    ("\U000024C2", "\U0001F251"),
    ("\U0001f926", "\U0001f937"),
    ("\U00010000", "\U0010ffff"),  # every code point above the BMP
    ("\u2640", "\u2642"),
    ("\u2600", "\u2B55"),
)

# Individual code points stripped by remove_emoji.
_EMOJI_SINGLES = (
    "\u200d",  # zero-width joiner
    "\u23cf",
    "\u23e9",
    "\u231a",
    "\ufe0f",  # variation selector-16
    "\u3030",
)

# Compiled once: a single character class matching runs of the above.
_EMOJI_PATTERN = re.compile(
    "["
    + "".join(low + "-" + high for low, high in _EMOJI_RANGES)
    + "".join(_EMOJI_SINGLES)
    + "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return `text` with every run of emoji/pictographic characters deleted.

    Matching characters are removed outright (not replaced by a space), so the
    whitespace surrounding them is left exactly as it was.
    """
    return _EMOJI_PATTERN.sub("", text)

def remove_special_characters(text):
    """Drop every character that is not a word character, whitespace or one of . , ; : ?

    The text is first normalized to NFC. Without that step, decomposed (NFD)
    Vietnamese input loses its diacritics: combining marks are not matched by
    `\\w`, so "e" + U+0301 would be reduced to a bare "e".

    Args:
        text: The string to clean.

    Returns:
        The NFC-normalized string with all other characters removed.
    """
    composed = unicodedata.normalize("NFC", text)
    pattern = r"[^\w\s.,;:?]"
    return re.sub(pattern, "", composed)

def from_logit_to_label(logit):
  """Translate a predicted class index into its sentiment name.

  Despite the parameter name, the value is compared with 0 directly:
  anything equal to 0 is 'Positive', every other value is 'Negative'.
  """
  if logit == 0:
    return 'Positive'
  return 'Negative'

In [15]:
def classification_review(text: list, tokenizer=tokenizer, model=model):
  """Classify the sentiment of one or more reviews.

  Each review is cleaned (newlines -> '. ', emoji and special characters
  removed), word-segmented with `segment_model`, tokenized to a fixed length
  of 100 tokens and scored by `model` in batches of 32.

  Args:
    text: List of review strings. A single string is also accepted and is
      treated as a one-element list. The caller's list is NOT modified.
    tokenizer: Tokenizer used to encode the segmented reviews.
    model: Sequence-classification model whose output exposes `.logits`.

  Returns:
    A `(labels_out, probs)` tuple:
      labels_out: 1-D tensor with the argmax class index of every review.
      probs: 2-D tensor (one row per review) holding the softmax of the
        model logits over dim 1.
    Both tensors have length 0 when `text` is empty.
  """
  if isinstance(text, str):
    text = [text]

  # Build a fresh list so the caller's list is left untouched.
  cleaned = []
  for raw in text:
    sentence = raw.replace('\n', '. ')
    sentence = remove_emoji(sentence)
    sentence = remove_special_characters(sentence)
    # word_segment yields one string per detected sentence; join them all so
    # multi-sentence reviews are classified in full (indexing [0] kept only
    # the first sentence and raised IndexError on empty text).
    # NOTE(review): assumes the model was fine-tuned on fully segmented
    # reviews rather than first sentences only - confirm against training.
    cleaned.append(' '.join(segment_model.word_segment(sentence)))

  if not cleaned:
    # Nothing to score: return empty tensors instead of failing later on.
    num_labels = getattr(getattr(model, 'config', None), 'num_labels', 0)
    return torch.empty(0, dtype=torch.long), torch.empty((0, num_labels))

  encoded = tokenizer(
      cleaned,
      max_length=100,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt'
  )

  dataset = TensorDataset(encoded['input_ids'], encoded['attention_mask'])
  dataloader = DataLoader(dataset, shuffle=False, batch_size=32)

  model.eval()

  # Collect the logits of EVERY batch; keeping only the last `result` would
  # return predictions for the final batch alone.
  batch_logits = []
  with torch.no_grad():
    for b_input_ids, b_attention_mask in dataloader:
      result = model(b_input_ids,
                     token_type_ids=None,
                     attention_mask=b_attention_mask)
      batch_logits.append(result.logits)

  probs = torch.cat(batch_logits, dim=0).softmax(dim=1)
  labels_out = probs.argmax(dim=1)

  return labels_out, probs

In [18]:
# Smoke test: classify a single sample review and keep the result for inspection.
sample_reviews = ['Tai nghe kém']
res = classification_review(sample_reviews)

In [24]:
# Value stored for the predicted class of the first (only) sample review.
pred_labels, pred_scores = res
pred_scores[0][pred_labels]

tensor([0.8998])

In [29]:
from flask import Flask, request, render_template
from flask_ngrok import run_with_ngrok

# Folder on the mounted Drive that contains template.html.
TEMPLATE_DIR = '/content/drive/MyDrive/DS105'

app = Flask(__name__, template_folder=TEMPLATE_DIR)
# Hook the app up to ngrok so the server is reachable from outside Colab.
run_with_ngrok(app)

@app.get('/')
def home():
  """Handle GET / by rendering template.html with no context variables."""
  return render_template('template.html')

@app.post('/')
def get_res():
  """Handle POST /: classify the submitted review and render the result.

  Reads the 'text-input' form field, classifies it with
  `classification_review`, and renders template.html with:
    review: the submitted text, unchanged
    label:  'Positive' / 'Negative' from `from_logit_to_label`
    conf:   score of the predicted class as a percentage, 2 decimals

  When the field is missing or blank, the template is rendered without any
  result instead of crashing the request with a 500.
  """
  text = request.form.get('text-input')
  if text is None or not text.strip():
    return render_template('template.html')

  labels, probs = classification_review([text])
  predicted = labels[0].item()
  label = from_logit_to_label(predicted)
  conf = round(probs[0][predicted].item() * 100, 2)

  return render_template('template.html',
                         review=text,
                         label=label,
                         conf=conf)

# Start the server. Per the recorded output below, this serves on
# http://127.0.0.1:5000 and is also exposed through an ngrok URL; the cell
# keeps running until the server is interrupted.
app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


 * Running on http://2ef1-35-243-209-83.ngrok-free.app
 * Traffic stats available on http://127.0.0.1:4040


INFO:werkzeug:127.0.0.1 - - [05/Dec/2023 09:39:24] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [05/Dec/2023 09:39:25] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [05/Dec/2023 09:39:39] "POST / HTTP/1.1" 200 -
