<a href="https://colab.research.google.com/github/Angy-face/Algorithms/blob/main/POS_TAGGING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import nltk,re,pprint
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import pprint,time
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from collections import Counter

In [3]:
import nltk
nltk.download('treebank')
nltk.download('universal_tagset')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [4]:
tagged_sentences = nltk.corpus.treebank.tagged_sents(tagset='universal')

In [5]:
print("Number of Tagged Sentences ",len(tagged_sentences))
tagged_words=[tup for sent in tagged_sentences for tup in sent]
print("Total Number of Tagged words", len(tagged_words))
vocab=set([word for word, tag in tagged_words])
print("Vocabulary of the Corpus",len(vocab))
tags=set([tag for word,tag in tagged_words])
print("Number of Tags in the Corpus ",len(tags))

Number of Tagged Sentences  3914
Total Number of Tagged words 100676
Vocabulary of the Corpus 12408
Number of Tags in the Corpus  12


In [6]:
sentences,tags = [],[]
for tagged_sentence in tagged_sentences:
  sentence,tag = zip(*tagged_sentence)
  sentences.append(np.array(sentence))
  tags.append(np.array(tag))
print(sentences[0],tags[0])

['Pierre' 'Vinken' ',' '61' 'years' 'old' ',' 'will' 'join' 'the' 'board'
 'as' 'a' 'nonexecutive' 'director' 'Nov.' '29' '.'] ['NOUN' 'NOUN' '.' 'NUM' 'NOUN' 'ADJ' '.' 'VERB' 'VERB' 'DET' 'NOUN' 'ADP'
 'DET' 'ADJ' 'NOUN' 'NOUN' 'NUM' '.']


In [7]:
sentence_train, sentence_test, tag_train, tag_test = train_test_split(sentences, tags, test_size=0.2, random_state=42)

In [8]:
words, tags = set([]), set([])

for s in sentence_train:
  for w in s:
    words.add(w.lower())

for ts in tag_train:
  for t in ts:
    tags.add(t)

word2index = {w: i + 2 for i, w in enumerate(list(words))}
word2index['-PAD-'] = 0 # The special value used for padding
word2index['-OOV-'] = 1 # The special value used for OOVs

tag2index = {t: i + 1 for i, t in enumerate(list(tags))}
tag2index['-PAD-' ] = 0 # The special value used to padding

In [10]:
word2index,tag2index

({'12-year': 2,
  'automatic': 3,
  'raises': 4,
  'businesses': 5,
  'high-grade': 6,
  'frozen': 7,
  'pockets': 8,
  'heavy': 9,
  'refused': 10,
  'charged': 11,
  'funny': 12,
  'realestate': 13,
  'role': 14,
  'revised': 15,
  'lilly': 16,
  'orlando': 17,
  'collection': 18,
  'fundamentalist': 19,
  'motor-home': 20,
  '*t*-14': 21,
  'remorse': 22,
  'semiliterate': 23,
  '39': 24,
  'paint': 25,
  'syracuse': 26,
  'pattern': 27,
  'outcry': 28,
  '102': 29,
  'choice': 30,
  'rebel': 31,
  'labels': 32,
  'feared': 33,
  'salary': 34,
  'skills': 35,
  'grand': 36,
  'visited': 37,
  '1985': 38,
  'avon': 39,
  'code': 40,
  'n.v.': 41,
  'requirement': 42,
  'statute': 43,
  'candela': 44,
  'nagymaros': 45,
  'bancorp': 46,
  'republicans': 47,
  'retorts': 48,
  'had': 49,
  'appears': 50,
  'leming': 51,
  'package': 52,
  'spokesman': 53,
  'chatter': 54,
  '*t*-126': 55,
  'receipt': 56,
  'precedent': 57,
  'exuded': 58,
  'ask': 59,
  'capital': 60,
  'fla.': 61,
  

In [11]:
X_train, X_test, y_train, y_test = [], [], [], []

for sen in sentence_train:
  temp = []
  for w in sen:
    try:
      temp.append(word2index[w.lower()])
    except KeyError:
      temp.append(word2index['-OOV-'])
  X_train.append(temp)

for sen in sentence_test:
  temp = []
  for w in sen:
    try:
      temp.append(word2index[w.lower()])
    except KeyError:
      temp.append(word2index['-OOV-'])
  X_test.append(temp)

for s in tag_train:
    temp = []
    for t in s:
      temp.append(tag2index[t])
    y_train.append(temp)

for s in tag_test:
    temp = []
    for t in s:
      temp.append(tag2index[t])
    y_train.append(temp)

In [12]:
print(X_train[0])
print(y_train[0])

[4114, 4311, 4179, 560, 6279, 7934, 4179, 10065, 4743, 137, 1370, 5051, 383, 10010, 160, 342, 8552, 2514]
[8, 8, 2, 10, 8, 6, 2, 3, 3, 4, 8, 1, 4, 6, 8, 8, 10, 2]


In [13]:
MAX_LENGTH = len(max(X_train, key=len))
print(MAX_LENGTH)

271


In [24]:
import tensorflow as tf
import keras

In [25]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam

In [28]:
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
model.add(Embedding(len(word2index), 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add( (Dense(len(tag2index))))
model.add(Activation('softmax'))
model.summary()



In [29]:
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy'])

In [34]:
# def  to_categorical(sequences, categories):
#   cat_sequences = []
#   for s in sequences:
#     cats = []
#     for item in s:
#       cats.append(np.zeros(categories))
#       cats[-1][item] = 1.0
#     cat_sequences.append(cats)
#   return np.array(cat_sequences)

In [None]:
def to_categorical(sequences, categories, max_len=None):
    if max_len is None:
        max_len = max(len(seq) for seq in sequences)  # Find the maximum sequence length

    cat_sequences = []
    for seq in sequences:
        # Initialize a zero matrix of shape (max_len, categories)
        cats = np.zeros((max_len, categories))
        for idx, item in enumerate(seq):
            if idx < max_len:  # Ensure we don't exceed max_len
                cats[idx, item] = 1.0
        cat_sequences.append(cats)

    return np.array(cat_sequences)


In [38]:
cat_train_y = to_categorical(y_train, len(tag2index))
print(cat_train_y[0])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [43]:
model.fit(np.array(X_train),cat_train_y, batch_size=128, epochs=50, validation_split=0.2)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (3131,) + inhomogeneous part.