In [1]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3.0->stanza)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3.0->stanza)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata 

In [8]:
import stanza

# Fetch Stanza's default Hindi model package into the local resources folder.
stanza.download(lang="hi")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: hi (Hindi) ...
INFO:stanza:File exists: /root/stanza_resources/hi/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources


In [11]:
from tensorflow.keras.models import load_model
from objs import max_len, pred_map, basic_pos_to_vector, pos_to_basic_pos
import numpy as np
import string
import stanza
import spacy
import logging


def is_hindi(token):
    """Return True when ``token`` is non-empty and entirely Devanagari.

    Every character must lie in the Unicode Devanagari block
    (U+0900 through U+097F). An empty string returns False: ``all()`` over
    zero characters is vacuously True, which would otherwise classify an
    empty token (e.g. one that consisted only of stripped punctuation) as
    Hindi.

    Args:
        token: The string to test.

    Returns:
        bool: True for a non-empty, all-Devanagari string, otherwise False.
    """
    return bool(token) and all("\u0900" <= char <= "\u097F" for char in token)

# Loaded NLP pipelines keyed by language code. Filled on first use so that
# repeated get_pos_tags() calls do not rebuild the pipelines every time.
# Re-running this cell resets the cache.
_PIPELINE_CACHE = {}


def _load_pipelines():
    """Return ``(nlp_en, nlp_hi)``, constructing both on the first call only."""
    if "en" not in _PIPELINE_CACHE or "hi" not in _PIPELINE_CACHE:
        # Build both before storing either, so a failure in the second load
        # never leaves a half-populated cache behind.
        nlp_en = spacy.load("en_core_web_sm")
        # download_method=None: rely on models already present on disk
        # (see the Stanza Pipeline documentation).
        nlp_hi = stanza.Pipeline("hi", download_method=None)
        _PIPELINE_CACHE["en"] = nlp_en
        _PIPELINE_CACHE["hi"] = nlp_hi
    return _PIPELINE_CACHE["en"], _PIPELINE_CACHE["hi"]


def get_pos_tags(sentence):
    """Convert a mixed Hindi/English sentence into a list of basic POS labels.

    The sentence is split on whitespace, and ASCII punctuation plus the Hindi
    full stop are stripped from both ends of each piece. Pieces that become
    empty after stripping are skipped. Tokens for which ``is_hindi`` is true
    are tagged with the Stanza Hindi pipeline (only the first word of the
    first sentence is used, giving one label per token); every other token is
    tagged with spaCy, giving one label per spaCy token. Tags are mapped
    through ``pos_to_basic_pos``; unmapped tags become ``"other"``.

    Args:
        sentence: The input text as a string.

    Returns:
        list: The basic POS labels in token order.
    """
    # Silence Stanza's INFO-level logging before the pipeline is built or used.
    st_logger = logging.getLogger("stanza")
    st_logger.setLevel(logging.ERROR)

    nlp_en, nlp_hi = _load_pipelines()

    strip_chars = string.punctuation + chr(2404)  # chr(2404) = Hindi full stop (।)
    tokens = [piece.strip(strip_chars) for piece in sentence.split()]

    pos_sequence = []
    for token in tokens:
        if not token:
            # Pure-punctuation pieces strip down to "" and carry no POS.
            continue

        if is_hindi(token):
            doc = nlp_hi(token)
            # Guard against a result with no sentences/words instead of
            # raising IndexError; such a token is labelled "other".
            words = doc.sentences[0].words if doc.sentences else []
            if words:
                pos_sequence.append(pos_to_basic_pos.get(words[0].upos, "other"))
            else:
                pos_sequence.append("other")
        else:
            pos_sequence.extend(
                pos_to_basic_pos.get(word.pos_, "other") for word in nlp_en(token)
            )

    return pos_sequence

# Location of the trained Keras classifier, plus a cache so the model file is
# read from disk once rather than on every predict() call. Re-running this
# cell resets the cache.
_MODEL_PATH = "dominant_language_model_2.keras"
_MODEL_CACHE = {}


def _get_model():
    """Return the Keras model, loading it from ``_MODEL_PATH`` on first use."""
    if _MODEL_PATH not in _MODEL_CACHE:
        _MODEL_CACHE[_MODEL_PATH] = load_model(_MODEL_PATH)
    return _MODEL_CACHE[_MODEL_PATH]


def predict(input):
    """Predict the label for a sentence from its POS-tag sequence.

    The sentence is tagged with ``get_pos_tags``, each label is looked up in
    ``basic_pos_to_vector``, and the resulting sequence is truncated or
    zero-padded to exactly ``max_len`` rows. The model scores a batch of one,
    and the ``pred_map`` entry at the highest-scoring index is returned.

    Args:
        input: The sentence to classify, as a string. (The name shadows the
            builtin ``input``; it is kept for backward compatibility.)

    Returns:
        The ``pred_map`` entry for the highest-scoring output index.

    Raises:
        KeyError: If a POS label has no entry in ``basic_pos_to_vector``.
    """
    pos_sequence = get_pos_tags(input)

    # Slicing is a no-op for sequences shorter than max_len.
    vectorized_pos_seq = [basic_pos_to_vector[tag] for tag in pos_sequence][:max_len]

    # Padding rows must match the width of the real rows. Fall back to 9 when
    # there are no rows to measure — presumably the vector length used in
    # objs.basic_pos_to_vector; TODO confirm.
    width = len(vectorized_pos_seq[0]) if vectorized_pos_seq else 9
    missing = max_len - len(vectorized_pos_seq)
    if missing > 0:
        # Separate list per row, rather than one row object repeated.
        vectorized_pos_seq.extend([0] * width for _ in range(missing))

    inp = np.array([vectorized_pos_seq])  # leading axis = batch of one

    scores = _get_model().predict(inp, verbose=0)[0]

    # argmax returns the first index of the maximum, matching
    # list.index(max(...)); int() gives pred_map a plain Python index.
    return pred_map[int(np.argmax(scores))]


# predict("यह जगह बहुत essential है।")

In [12]:
# Smoke-test the classifier on a Hindi sentence containing one English word
# ("essential"); the recorded run of this cell returned 'Hindi'.
predict("यह जगह बहुत essential है।")



'Hindi'