In [85]:
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import indian
from nltk.tag import tnt
from nltk.tag import DefaultTagger
from gensim.models import FastText

In [86]:
import tensorflow.keras as keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam

In [93]:
from conllu import parse

# Load the CoNLL-U file
with open('hi_hdtb-ud-train.conllu', 'r', encoding='utf-8') as f:
    data = f.read()

# Parse the CoNLL-U data
sentences = parse(data)

# Iterate through sentences and tokens
a = []
for sentence in sentences:
    b = []
    for token in sentence:
        word = token['form']  # Get the word
        lemma = token['lemma']  # Get the lemma
        pos = token['upos']  # Get the universal part-of-speech tag
        head = token['head']  # Get the head index
        dep = token['deprel']  # Get the dependency label
        b.append(tuple((word,pos)))
        # Print or process the extracted information as needed
        #print(f"Word: {word}, Lemma: {lemma}, POS: {pos}, Head: {head}, Dep: {dep}")
    a.append(b)
        


[('यह', 'DET'), ('एशिया', 'PROPN'), ('की', 'ADP'), ('सबसे', 'ADV'), ('बड़ी', 'ADJ'), ('मस्जिदों', 'NOUN'), ('में', 'ADP'), ('से', 'ADP'), ('एक', 'NUM'), ('है', 'AUX'), ('।', 'PUNCT')]


In [98]:
print(a[17])

[('यहाँ', 'PRON'), ('आदिवासी', 'NOUN'), (',', 'PUNCT'), ('समुद्र', 'NOUN'), ('किनारे', 'NOUN'), (',', 'PUNCT'), ('रेगिस्तान', 'NOUN'), ('और', 'CCONJ'), ('हिमालय', 'PROPN'), ('के', 'ADP'), ('आवासों', 'NOUN'), ('के', 'ADP'), ('नमूने', 'NOUN'), ('भी', 'PART'), ('बनाए', 'VERB'), ('गए', 'AUX'), ('हैं', 'AUX'), ('।', 'PUNCT')]


In [99]:
nltk.download("indian")
nltk.download('brown')
nltk.download('universal_tagset')

[nltk_data] Downloading package indian to
[nltk_data]     C:\Users\Praveen\AppData\Roaming\nltk_data...
[nltk_data]   Package indian is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Praveen\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Praveen\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [100]:
tagged_sentences = indian.tagged_sents('hindi.pos')

brown_corpus_sent = list(nltk.corpus.brown.tagged_sents(tagset='universal'))
brown_corpus_sent = tagged_sentences
brown_corpus_sent = a
print(len(brown_corpus_sent))
print(brown_corpus_sent[17])

13306
[('यहाँ', 'PRON'), ('आदिवासी', 'NOUN'), (',', 'PUNCT'), ('समुद्र', 'NOUN'), ('किनारे', 'NOUN'), (',', 'PUNCT'), ('रेगिस्तान', 'NOUN'), ('और', 'CCONJ'), ('हिमालय', 'PROPN'), ('के', 'ADP'), ('आवासों', 'NOUN'), ('के', 'ADP'), ('नमूने', 'NOUN'), ('भी', 'PART'), ('बनाए', 'VERB'), ('गए', 'AUX'), ('हैं', 'AUX'), ('।', 'PUNCT')]


In [101]:
print(brown_corpus_sent[0])
print("Total sentences with tags: ", len(brown_corpus_sent))
print("total tagged words:", len(nltk.corpus.brown.tagged_words()))
brown_corpus_sent[0][1]

[('यह', 'DET'), ('एशिया', 'PROPN'), ('की', 'ADP'), ('सबसे', 'ADV'), ('बड़ी', 'ADJ'), ('मस्जिदों', 'NOUN'), ('में', 'ADP'), ('से', 'ADP'), ('एक', 'NUM'), ('है', 'AUX'), ('।', 'PUNCT')]
Total sentences with tags:  13306
total tagged words: 1161192


('एशिया', 'PROPN')

#### Pre processing

In [102]:
sentences, sentence_tags =[], [] 
for tagged_sentence in brown_corpus_sent:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(np.array(sentence))
    sentence_tags.append(np.array(tags))


In [103]:
num_sents = len(sentences)
k = 5
foldsize = int(num_sents/5)


k_folds = {}
for i in range(5):
    # Locate the test set in the fold.
    k_folds["test_sent{0}".format(i)] = sentences[i*foldsize:i*foldsize+foldsize]
    k_folds["train_sent{0}".format(i)] = sentences[:i*foldsize] + sentences[i*foldsize+foldsize:]
    k_folds["test_tags{0}".format(i)] = sentence_tags[i*foldsize:i*foldsize+foldsize]
    k_folds["train_tags{0}".format(i)] = sentence_tags[:i*foldsize] + sentence_tags[i*foldsize+foldsize:]

In [104]:
training_sentences, test_sentences, training_tags, test_tags = k_folds['train_sent4'],k_folds['test_sent4'],k_folds['train_tags4'],k_folds['test_tags4']

In [105]:
vocab = set([])
unique_tags = set([])
for sent in training_sentences:
    for word in sent:
        vocab.add(word.lower())
for sent_tag in training_tags:
    for tag in sent_tag:
        unique_tags.add(tag)

In [106]:
print(len(unique_tags),len(vocab))

16 15078


In [107]:
#Converting words to integer and adding pad and unkonwn words as integer as 0 and 1
word2int = {word: i + 2 for i, word in enumerate(list(vocab))}
word2int['-PAD-'] = 0
word2int['-UNK-'] = 1 
#print(word2int)
tag2int = {tag: i + 1 for i, tag in enumerate(list(unique_tags))}
tag2int['-PAD-'] = 0  

In [108]:
#word2int['were'], tag2int["ADJ"]

In [109]:
#Convert test and trainig dataset to integer
X_train, X_test, Y_train, Y_test = [],[],[],[]
for sent in training_sentences:
    sent_to_int = []
    for word in sent:
        try:
            sent_to_int.append(word2int[word.lower()])
        except KeyError:
            sent_to_int.append(word2int['-UNK-'])
    X_train.append(sent_to_int)

for sent in test_sentences:
    sent_to_int = []
    for word in sent:
        try:
            sent_to_int.append(word2int[word.lower()])
        except KeyError:
            sent_to_int.append(word2int['-UNK-'])
    X_test.append(sent_to_int)
    
    
for sent_tag in training_tags:
    Y_train.append([tag2int[tag] for tag in sent_tag])
    
for sent_tag in test_tags:
    Y_test.append([tag2int[tag] for tag in sent_tag])

In [110]:
print(X_train[0],"\n",Y_train[0])
print(X_test[0],"\n",Y_test[0])

[1440, 9764, 2106, 8158, 9778, 4348, 5467, 14915, 2093, 5355, 12528] 
 [11, 13, 14, 9, 10, 6, 14, 14, 7, 4, 2]
[11984, 2088, 14859, 449, 2011, 14972, 8513, 2974, 11609, 1532, 12437, 14915, 8316, 3649, 70, 5228, 2106, 14341, 984, 3769, 9362, 5028, 7572, 9889, 12528] 
 [13, 11, 6, 14, 12, 6, 14, 6, 14, 10, 6, 14, 12, 6, 8, 15, 14, 6, 15, 14, 6, 15, 4, 4, 2]


In [111]:
#Now we add padding to every sentence for equalling the length to max length of sentences
MAX_LENGTH = len(max(X_train, key=len))
print(MAX_LENGTH) 


116


In [112]:
from keras.preprocessing.sequence import pad_sequences

X_train = pad_sequences(X_train, maxlen=MAX_LENGTH, padding='post')
X_test = pad_sequences(X_test, maxlen=MAX_LENGTH, padding='post')
Y_train = pad_sequences(Y_train, maxlen=MAX_LENGTH, padding='post')
Y_test = pad_sequences(Y_test, maxlen=MAX_LENGTH, padding='post')
 
#print(X_train[0])
#print(X_test[0])
#print(Y_train[0])
#print(Y_test[0])

### Model Building using keras

In [113]:
#we will define the accuracy without consideration of padding beacause if not then accuracy will be high due to 
#prediction of padding tags

from keras import backend as K
 
def no_pad_accuracy(to_ignore=0):
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)
 
        ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy

In [114]:
#Now we define the network architecture
#we will use the word embeddins for word to vector
#we have used Bidirectional LSTM, and model return a sequence

#first import library

from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam

In [115]:
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
model.add(Embedding(len(word2int), 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2int))))
model.add(Activation('softmax'))
 
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy',no_pad_accuracy(0)])
 
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 116, 128)          1930240   
                                                                 
 bidirectional_2 (Bidirecti  (None, 116, 512)          788480    
 onal)                                                           
                                                                 
 time_distributed_2 (TimeDi  (None, 116, 17)           8721      
 stributed)                                                      
                                                                 
 activation_2 (Activation)   (None, 116, 17)           0         
                                                                 
Total params: 2727441 (10.40 MB)
Trainable params: 2727441 (10.40 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [116]:
#Convert to one hot vector

def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)
 

In [117]:
cat_train_tags_y = to_categorical(Y_train, len(tag2int))


In [118]:
cat_train_tags_y[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

### Model Training

In [119]:
model.fit(X_train, to_categorical(Y_train, len(tag2int)), batch_size=128, epochs=10, validation_split=0.2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1824b21a0e0>

In [120]:
scores = model.evaluate(X_test, to_categorical(Y_test, len(tag2int)))
print(f"{model.metrics_names[2]}: {scores[2] * 100}")   

ignore_accuracy: 94.12344098091125


### Sample Example try

In [121]:
test_samples = [
    "we will be having a quiz on monday .".split(),
    "I am very big fan of lionel messi's playing style .".split()
]
print(test_samples)

[['we', 'will', 'be', 'having', 'a', 'quiz', 'on', 'monday', '.'], ['I', 'am', 'very', 'big', 'fan', 'of', 'lionel', "messi's", 'playing', 'style', '.']]


In [125]:
test_samples_X = []
for s in test_samples:
    s_int = []
    for w in s:
        try:
            s_int.append(word2int[w.lower()])
        except KeyError:
            s_int.append(word2int['-UNK-'])
    test_samples_X.append(s_int)
 
test_samples_X = pad_sequences(test_samples_X, maxlen=MAX_LENGTH, padding='post')
print(test_samples_X)

[[   1    1    1    1    1    1    1    1 9536    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0]
 [   1    1    1    1    1    1    1    1    1    1 9536    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0

In [126]:
def logits_to_tokens(sequences, index):
    token_sequences = []
    for categorical_sequence in sequences:
        token_sequence = []
        no_pad_tokens=[]
        for categorical in categorical_sequence:
            token_sequence.append(index[np.argmax(categorical)])

        token_sequences.append(token_sequence)
    return token_sequences

In [127]:
predictions = model.predict(test_samples_X)
tag_prediction = (logits_to_tokens(predictions, {i: t for t, i in tag2int.items()}))



In [128]:
#for converting output tag sequence to without "-PAD" tag, if no labels given
def pred_no_pad_without_labels(predictions):
  final_pred = []
  for tag_sent in predictions:
    tag_sent_no_pad = []
    for tags in tag_sent:
      if tags == "-PAD-":
        break
      else:
        tag_sent_no_pad.append(tags)
    final_pred.append(tag_sent_no_pad)
  return final_pred

In [129]:
final_pred = pred_no_pad_without_labels(tag_prediction)
print(final_pred)

[['PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PUNCT'], ['PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PUNCT']]


In [130]:
#for converting output tag sequence to without "-PAD" tag, if labels given
def pred_no_pad_with_labels(predictions,test_tags):
  final_pred = []
  for i in range(len(predictions)):
    k = min(len(test_tags[i]),len(predictions[i]))
    tag_sent_no_pad = []
    for j in range(k):
      tag_sent_no_pad.append(predictions[i][j])
    final_pred.append(tag_sent_no_pad)
  return final_pred

### Prediction on test data

In [131]:
test_pred = model.predict(X_test)
test_tag_pred = logits_to_tokens(test_pred, {i: t for t, i in tag2int.items()})
test_tag_pred_no_pad = pred_no_pad_with_labels(test_tag_pred,test_tags)



In [132]:
#for counting if the length of test tag sequneces and predicted tag sequences are equall or not
t=0
for i in range(len(test_tags)):
  for j in range(len(test_tag_pred_no_pad)):
    if i==j:
      if len(test_tags[i])!=len(test_tag_pred_no_pad[i]):
        print(i)
        t+=1
print(t)

0


In [133]:
d = {"true_labels": test_tags, "predicted_labels":test_tag_pred_no_pad}
pd_pred = pd.DataFrame(data=d)

In [134]:
#for confusion matrix we make a one list for true labels and predicted labels each, 
test_words = []
y_true=[]
y_pred=[]
for sent in test_tags:
  for tag in sent:
    y_true.append(tag)

for sent in test_tag_pred_no_pad:
  for tag in sent:
    y_pred.append(tag)

for sent in test_sentences:
  for word in sent:
    test_words.append(word)

In [135]:
d5th = {"words":test_words,"true_label":y_true,"predicted_lable":y_pred}
dataframe_5th = pd.DataFrame(data=d5th)

In [136]:
dataframe_5th.head()

Unnamed: 0,words,true_label,predicted_lable
0,प्रधानमंत्री,PROPN,PROPN
1,इस,DET,DET
2,मुद्दे,NOUN,NOUN
3,पर,ADP,ADP
4,अपने,PRON,PRON


In [138]:
from sklearn.metrics import classification_report
print('\nClassification Report\n')
print(classification_report(y_true, y_pred, target_names=list(unique_tags)+['-PAD-']))



Classification Report



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           X       0.00      0.00      0.00         0
       PUNCT       0.87      0.90      0.89      3792
        INTJ       0.99      0.99      0.99     12135
         AUX       0.83      0.72      0.77       555
       CCONJ       0.97      0.99      0.98      3776
        NOUN       0.96      0.97      0.97      1121
         NUM       0.95      0.92      0.94      1277
        PART       0.00      0.00      0.00         2
         ADV       0.88      0.95      0.91     12770
         ADJ       0.97      0.88      0.92      1125
         DET       0.98      0.95      0.96      1150
        PRON       0.96      0.96      0.96      2338
       PROPN       0.91      0.82      0.86      7061
         ADP       1.00      1.00      1.00      3875
        VERB       0.97      0.98      0.98      1123
       SCONJ       0.98      0.95      0.96      5893
       -PAD-       0.50      0.03      0.05        35

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [139]:
from sklearn.metrics import confusion_matrix
confusion = confusion_matrix(y_true, y_pred,labels=list(unique_tags)+['-PAD-'])
print('Confusion Matrix\n')
print(confusion)

Confusion Matrix

[[    1     0     0     0     0    20     2     0     0     3     0     0
      1     7     0     0     1]
 [    0  3873     0     0     0     1     0     0     0     0     0     0
      1     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      2     0     0     0     0]
 [    0     0     0  3720     0     0     0     0     0     0     0     0
      0     1    55     0     0]
 [    0     0     0     0  1090     0     0     0     0     0     1     0
      4    26     0     0     0]
 [    0     0     0     1     3 12074     9     5     6   197     3    12
    419    15    26     0     0]
 [    0     0     0     1     0    41   994     0     0    53     0     1
     32     1     1     0     1]
 [    0     0     0     0     0     4     0  1091    10     2     0     5
      1    22     1    14     0]
 [    0     0     0     0     2    19     1     8   399    28     8     7
      8    67     6     1     1]
 [    1     0     0

In [140]:
pd_confusion = pd.DataFrame((confusion), columns=list(unique_tags)+['-PAD-'],index=list(unique_tags)+['-PAD-'])

In [141]:
pd_confusion

Unnamed: 0,X,PUNCT,INTJ,AUX,CCONJ,NOUN,NUM,PART,ADV,ADJ,DET,PRON,PROPN,ADP,VERB,SCONJ,-PAD-
X,1,0,0,0,0,20,2,0,0,3,0,0,1,7,0,0,1
PUNCT,0,3873,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
INTJ,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0
AUX,0,0,0,3720,0,0,0,0,0,0,0,0,0,1,55,0,0
CCONJ,0,0,0,0,1090,0,0,0,0,0,1,0,4,26,0,0,0
NOUN,0,0,0,1,3,12074,9,5,6,197,3,12,419,15,26,0,0
NUM,0,0,0,1,0,41,994,0,0,53,0,1,32,1,1,0,1
PART,0,0,0,0,0,4,0,1091,10,2,0,5,1,22,1,14,0
ADV,0,0,0,0,2,19,1,8,399,28,8,7,8,67,6,1,1
ADJ,1,0,0,0,5,241,2,1,18,3427,6,0,63,7,21,0,0
