In [43]:
import json
from src.bengali_analyzer.bengalianalyzer import *

bengali_analyzer = BengaliAnalyzer()

In [44]:
ENCODING = "utf-8"
LOG_FILE_NAME = "info.json"
LOG_FILE_LOCATION = "./" + LOG_FILE_NAME
SIMPLE_RESPONSE_LOCATION = "./" + "simpleResponse.json"

def serialize_sets(obj):
    if isinstance(obj, set):
        return list(obj)
    return obj

def updateLog(text):
    with open(LOG_FILE_LOCATION, "w", encoding=ENCODING) as f:
        json.dump(text, f, ensure_ascii=False, default=serialize_sets, indent=4)

def simplifyJson():
    with open(LOG_FILE_LOCATION, "r", encoding=ENCODING) as f:
        data = json.load(f)
    for x in data.copy():
        # Numeric
        dataNumeric = data[x]["Numeric"]
        for y in dataNumeric.copy():
            if(not dataNumeric[y]):
                del(data[x]["Numeric"][y])
                ok = 1
        if(not dataNumeric):
            del(data[x]["Numeric"])
        
        # Punctuation
        dataPunction = data[x]["Punctuation_Flag"]
        if(not dataPunction):
            del(data[x]["Punctuation_Flag"])
        # Verb

        dataVerb = data[x]["Verb"]
        for y in dataVerb.copy():
            if(not dataVerb[y] or (y=='Emphasizer' and not dataVerb[y][0])):
                del(data[x]["Verb"][y])
                ok = 1
        if(not dataVerb):
            del(data[x]["Verb"])

        # Pronoun
        dataPronoun = data[x]["Pronoun"]
        for y in dataPronoun.copy():
            if(not dataPronoun[y]):
                del(data[x]["Pronoun"][y])
        if(not dataPronoun):
            del(data[x]["Pronoun"])

        # Pos
        if(not data[x]["PoS"]):
            del(data[x]["PoS"])

        # Composite Word
        dataComposite = data[x]["Composite_Word"]
        for y in dataComposite.copy():
            if(not dataComposite[y]):
                del(data[x]["Composite_Word"][y])
        if(not dataComposite):
            del(data[x]["Composite_Word"])

        # Special Entity
        dataSpecial = data[x]["Special_Entity"]
        for y in dataSpecial.copy():
            if(not dataSpecial[y]):
                del(data[x]["Special_Entity"][y])
        if(not dataSpecial):
            del(data[x]["Special_Entity"])
    with open(SIMPLE_RESPONSE_LOCATION, "w", encoding=ENCODING) as f:
        json.dump(data, f, ensure_ascii=False, default=serialize_sets, indent=4)
    return data


In [45]:
# sentence = 'থাকবে আউলাই শুনতে থাকবে আউলানো'
# sentence = 'থাকবে আউলাই শুনতেই থাকবে আউলানো'
# sentence = 'আমার কথা শুনতে শুনতে তুমি ঢাকায় থাকবে'
# sentence = 'আমার কথা শুনতেই শুনতেই তুমি ঢাকায় থাকবেও দেও'
# sentence = "আমি ভাত খাবই"
# sentence = "কুত্তা কামড়াইলে মানুষ মারা যায়"
#sentence = "ঘুড়ি ওড়াই"
# sentence = "ভাতের আউলাইয়া শোবেনই"
# sentence = "বড় দাদা ঘুড়ি উড়াই"

# sentence = "জামালকে জামাল গঘরজামাই"

# sentence = "ভাত মাখাইয়া দিয়ে টাকা হাতিয়েই নিলে"
# sentence = "যেতে পথে যেতে সে ভাত মাখাইয়াই দিলেই খেতে থাকব"
sentence = "যেতে পথে যেতে সে ভাত মাখাইয়াই দিলেই খেতে থাকব, যেতে "
#sentence = "যেতে পথে যেতে"
#sentence = "অকঠোর"


# sentence = "বঙ্গবন্ধু শেখ মুজিব মেডিকেল কলেজ"

# sentence = "বায়ান্নহাজার"

res = bengali_analyzer.analyze_sentence(sentence)
updateLog(res)
res = simplifyJson()
res

{'যেতে': {'Global_Index': [[0, 3], [9, 12], [47, 50]],
  'Verb': {'Parent_Verb': 'যাওয়া',
   'Emphasizer': ['ই'],
   'TP': [{'tense': 'no', 'person': 'tm'}],
   'Non_Finite': True,
   'Related_Indices': [[9, 12]],
   'Language_Form': 'standard'},
  'Composite_Word': {'Stand_Alone_Words': ['যেতে']}},
 'পথে': {'Global_Index': [[5, 7]],
  'Composite_Word': {'Suffix': 'ে', 'Stand_Alone_Words': ['পথ']}},
 'সে': {'Global_Index': [[14, 15]],
  'Pronoun': {'Pronoun Tag': 'Pro.Pers3.CoRel',
   'Number Tag': 'Sing',
   'Honorificity': 'informal',
   'Case': 'genitive',
   'Proximity': 'distal'}},
 'ভাত': {'Global_Index': [[17, 19]], 'PoS': ['বিশেষণ', 'বিশেষ্য']},
 'মাখাইয়াই': {'Global_Index': [[21, 28]],
  'Verb': {'Parent_Verb': 'মাখানো',
   'Emphasizer': ['ই'],
   'Non_Finite': True,
   'Language_Form': 'standard'},
  'Composite_Word': {'Suffix': 'ই',
   'Stand_Alone_Words': ['ইয়া', 'মাখা', 'মাখাইয়া']}},
 'দিলেই': {'Global_Index': [[30, 34]],
  'Verb': {'Parent_Verb': 'দেওয়া',
   'Emphasizer':

In [46]:
def sortFunc(word):
  return word["index"]


In [53]:
def getPos():
    bangla_pos_to_english_pos = {
        'বিশেষণ': 'Adjective',
        'বিশেষ্য': 'Noun'
    }

    word_objects = []
    pos_list = []
    for word in res:
        body = res[word]
        pos = ['undefined']

        if 'Verb' in body:
            pos = ['Verb']

            if "TP" in body["Verb"]:
                pos.append('Finite')

            if "Non_Finite" in body["Verb"] and body["Verb"]["Non_Finite"] == True:
                pos.append('Non-Finite')

        elif 'Pronoun' in body:
            pos = ['Pronoun']

        elif 'Punctuation_Flag' in body and body['Punctuation_Flag'] == True:
            pos = ['Punctuation']
            print('hit')

        elif 'PoS' in body:
            t = []
            for p in body["PoS"]:
                t.append(bangla_pos_to_english_pos[p])
            pos = t

        indexes = body['Global_Index']

        for index in indexes:
            if type(index) is list:
                word_objects.append({'pos': pos, 'index': index[0]})
            else:
                print(pos)
                word_objects.append({'pos': pos, 'index': index})

    word_objects.sort(key=sortFunc)

    for entry in word_objects:
        pos_list.append(entry['pos'])

    return pos_list


In [54]:
l = getPos()
l
# যেতে পথে যেতে সে ভাত মাখাইয়াই দিলেই খেতে থাকব, যেতে


hit
['Punctuation']


[['Verb', 'Finite', 'Non-Finite'],
 ['undefined'],
 ['Verb', 'Finite', 'Non-Finite'],
 ['Pronoun'],
 ['Adjective', 'Noun'],
 ['Verb', 'Non-Finite'],
 ['Verb', 'Finite', 'Non-Finite'],
 ['Verb', 'Finite', 'Non-Finite'],
 ['Verb', 'Finite'],
 ['Punctuation'],
 ['Verb', 'Finite', 'Non-Finite']]