In [1]:
import os 
import pandas as pd
from utils.data import *
from tqdm.auto import tqdm
tqdm.pandas()
# ------------------------------------------------------------
# Shared paths: everything is resolved relative to the current
# working directory, under tests/ILMT_TAGSET_TEST
# ------------------------------------------------------------
src_dir = os.path.join(os.getcwd(), "tests", "ILMT_TAGSET_TEST")
# Keras model file, tagged corpus and untagged corpus, in that order
model_path, tagged_data_path, untaggeed_data_path = (
    os.path.join(src_dir, file_name)
    for file_name in (
        "keras_mlp_bangla.h5",
        "nlp_data_bn_tagged_mod.txt",
        "untagged.bn.txt",
    )
)


# Exploratory Data Analysis (EDA)

In [2]:
# Set EDA to False to skip the word/tag/count summary below.
EDA=True
if EDA:
    #----------------
    # dataset EDA
    #---------------
    # frame with (at least) a `word` and a `tag` column
    # presumably one row per tagged token -- confirm in utils.data
    df=textfile_to_dataset(tagged_data_path,eda=True)
    # word / tags / count summary in a single pass:
    #   tags  -> array of the distinct tags seen for the word
    #   count -> number of rows in which the word occurs
    # sort=False keeps words in order of first appearance, as the
    # previous per-word filtering loop did, without rescanning the whole
    # frame once per unique word. Rows with a missing word are dropped
    # by groupby.
    per_word=df.groupby("word",sort=False)["tag"].agg(tags="unique",count="size")
    dfs=per_word.reset_index()
    # most frequent words first, then save next to the source data
    dfs = dfs.sort_values(by=['count'], ascending=False)
    dfs.to_csv(os.path.join(src_dir,"tagged_data_wtc.csv"),index=False)



  0%|          | 0/2927 [00:00<?, ?it/s]

  0%|          | 0/12514 [00:00<?, ?it/s]

In [3]:
# show the word / tags / count table built in the EDA cell (sorted by descending count)
dfs

Unnamed: 0,word,tags,count
13,৷,[PU],2344
7,",","[PU, PP]",1427
10263,।,[PU],400
2,-,"[PU, RDX]",370
30,না,"[CX, CCD]",359
...,...,...,...
5911,অসাধ্য,[NC],1
5909,পলায়ন,[NC],1
5905,হিমানী,[NP],1
5902,লাইনে,[NC],1


# Vectorizing Data and Encoding Labels

In [4]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
# --------------------------------------------------
# Fit the feature vectorizer and the label encoder
# --------------------------------------------------
# X: features, y: classes (presumably per-token feature dicts and their
# tags -- confirm in utils.data)
X,y=textfile_to_dataset(tagged_data_path,eda=False)
# dense (non-sparse) feature vectorizer, fitted on X; fit() returns the estimator
dict_vectorizer = DictVectorizer(sparse=False).fit(X)
# label <-> integer mapping, fitted on y
label_encoder = LabelEncoder().fit(y)
print("Done!")

  0%|          | 0/2927 [00:00<?, ?it/s]

  0%|          | 0/2927 [00:00<?, ?it/s]

Done!


# Model Analysis

In [5]:
import keras
# load the saved Keras model from model_path and print its layer-by-layer summary
model=keras.models.load_model(model_path)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               24136192  
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
_________________________________________________________________
activation_2 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 36)               

# Sample Test

In [6]:
from utils.tagger import BanglaPOSTagger
# build the tagger from the loaded model plus the fitted label encoder and dict vectorizer
tagger=BanglaPOSTagger(model,label_encoder,dict_vectorizer)


In [7]:
# smoke test: tag a single Bangla sentence; the cell output lists (token, tag) pairs
sentence = 'দুশ্চিন্তার কোন কারণই নাই'
tagger.tag(sentence)

[('দুশ্চিন্তার', 'NC'), ('কোন', 'JQ'), ('কারণই', 'NC'), ('নাই', 'CX')]

# Untagged Data

In [8]:
# load the untagged corpus as a list of lines (trailing newlines are kept)
with open(untaggeed_data_path, encoding='utf8') as f:
    # iterating a text file yields one line at a time
    texts=list(f)


In [24]:
import string
# NOTE: `dfs` is only defined when the EDA cell ran with EDA=True
existing_words=dfs.word.tolist()
# sentences that were tagged successfully, and their predictions (same order)
sentences=[]
results=[]
# sentences for which tagger.tag raised
non_found_sentences=[]
# (sentence, repr(exception)) pairs, so the reason for each failure is not lost
tag_errors=[]

for line in tqdm(texts):
    line=line.replace("\n","")
    try:
        tagged=tagger.tag(line)
    except Exception as e:
        # best effort: keep processing the remaining lines, but record
        # what went wrong instead of silently discarding the error
        non_found_sentences.append(line)
        tag_errors.append((line,repr(e)))
    else:
        results.append(tagged)
        sentences.append(line)
            

  0%|          | 0/1000 [00:00<?, ?it/s]

In [25]:
# number of input lines for which tagger.tag raised an exception
len(non_found_sentences)

623

In [26]:
# pair every successfully tagged sentence with its list of (token, tag) predictions
res=pd.DataFrame({"sentence":sentences,"predictions":results})
res

Unnamed: 0,sentence,predictions
0,তুরুপ,"[(তুরুপ, NC)]"
1,পদদলিত করা,"[(পদদলিত, JJ), (করা, NV)]"
2,পোর্ট কোম্পানি লিমিটেড - কেপিসিএল,"[(পোর্ট, NC), (কোম্পানি, NP), (লিমিটেড, NP), (..."
3,তোলপাড়,"[(তোলপাড়, PPR)]"
4,ক্লাব.,"[(ক্লাব, NC), (., PWH)]"
...,...,...
372,গ্রাম,"[(গ্রাম, NC)]"
373,রাজনীতিবিদ,"[(রাজনীতিবিদ, NC)]"
374,সুযোগের স্বব্যবহার করা,"[(সুযোগের, NC), (স্বব্যবহার, NC), (করা, NV)]"
375,গৃহধূম,"[(গৃহধূম, NP)]"


In [32]:
import html

# tag labels are rendered in green italics after each token
col_fmt='<span style="color:green"><em>{}</em></span>'
# explicit UTF-8 so the Bangla text is written correctly regardless of the
# platform's default encoding (the input file is read as utf8 as well).
# Mode 'a' appends: re-running this cell adds the results to the file again.
with open('ilmt_test_results.md', 'a', encoding='utf8') as f:
    for pred in tqdm(res.predictions.tolist()):
        # one paragraph per sentence: "token <tag> token <tag> ..."
        # tokens and tags are escaped so that '<', '>' and '&' in the
        # text cannot break the generated markup
        body=''.join(
            html.escape(str(tup[0]),quote=False)
            +' '
            +col_fmt.format(html.escape(str(tup[1]),quote=False))
            +' '
            for tup in pred
        )
        # close the paragraph with </p> (it was previously left unclosed)
        f.write('<p>'+body+'</p>\n')

  0%|          | 0/377 [00:00<?, ?it/s]