## Import libraries

In [0]:
import torch
import pickle
import unicodedata
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

## Mount Driver

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
cd drive/My\ Drive

/content/drive/My Drive


In [4]:
!ls

'10 gettogether'
 16719
'7. content based recommendation_system.ipynb'
'About leapfrog.gdoc'
 bq-results-20190425-142055-kzmrasjrjqny
 cern.gdoc
 cern.jpg
 cmeriwork
'Colab Notebooks'
 Cover_letter_Anish_Pandey.pdf
 dataset
 Digital_Image_Processing_2ndEd.pdf
 Documents
 Hell
'HR profiling'
 import_docx
 in.gov.uidai-ADHAR-895156698533.pdf
 Journal_reccomendation
 Kawita.gdoc
'list for user_classification'
 Nepali_News_Classification.csv
'Resume_Anish (1).pdf'
 Resume_Anish.pdf
'Resume_Coverletter_Anish_Pandey - Anish Pandey.pdf'
 Resume_Coverletter_Anish_Pandey.pdf
 Statement_Excel_2019-08-03.xls
'top 10 list.gdoc'


## import dataset

In [0]:
dbfile = open('dataset/processed_tag', 'rb')

In [0]:
data_df = pickle.load(dbfile)

In [7]:
data_df.head()

Unnamed: 0,text,tag
0,"[६१, वर्षीय, पियरे, भिन्केन, नोभेम्बर, २९...","[<CD>, <JJ>, <NNP>, <NNP>, <NNP>, <CD>, <POP>,..."
1,"[श्री, भिन्केन, डच, प्रकाशन, समूह, एल्सेभ...","[<NN>, <NNP>, <NNP>, <NN>, <NN>, <NNP>, <FB>, ..."
2,"[कन्सोलिडेटिड, गोल्ड, फिल्ड्स, पीएलसी, का, ...","[<NNP>, <NN>, <NN>, <NNP>, <PKO>, <JJ>, <NN>, ..."
3,"[एकताका, केन्ट, चुरोट, को, फिल्टर, बनाउन, ...","[<RBO>, <NNP>, <NN>, <PKO>, <NN>, <VBI>, <NN>,..."
4,"[यस, सँग, को, छोटो, सम्पर्क, बाट, मात्र, प...","[<DUM>, <POP>, <PKO>, <JJM>, <NN>, <POP>, <RP>..."


In [0]:
data_df['len_txt'] =data_df['text'].map(len)

In [0]:
data_df['len_tag'] =data_df['tag'].map(len)

In [10]:
data_df[data_df['len_txt']!=data_df['len_tag']]

Unnamed: 0,text,tag,len_txt,len_tag


## hence from the operation, we figure out that there is not mismatch between tags and text tokens 

## Dataset Preparation

In [0]:
data_df = data_df[data_df['len_txt']<100]

In [12]:
data_df.head()

Unnamed: 0,text,tag,len_txt,len_tag
0,"[६१, वर्षीय, पियरे, भिन्केन, नोभेम्बर, २९...","[<CD>, <JJ>, <NNP>, <NNP>, <NNP>, <CD>, <POP>,...",16,16
1,"[श्री, भिन्केन, डच, प्रकाशन, समूह, एल्सेभ...","[<NN>, <NNP>, <NNP>, <NN>, <NN>, <NNP>, <FB>, ...",11,11
2,"[कन्सोलिडेटिड, गोल्ड, फिल्ड्स, पीएलसी, का, ...","[<NNP>, <NN>, <NN>, <NNP>, <PKO>, <JJ>, <NN>, ...",25,25
3,"[एकताका, केन्ट, चुरोट, को, फिल्टर, बनाउन, ...","[<RBO>, <NNP>, <NN>, <PKO>, <NN>, <VBI>, <NN>,...",43,43
4,"[यस, सँग, को, छोटो, सम्पर्क, बाट, मात्र, प...","[<DUM>, <POP>, <PKO>, <JJM>, <NN>, <POP>, <RP>...",38,38


In [0]:
MAX_LEN = max(data_df['len_txt'])

In [14]:
data_df[data_df['len_txt']==MAX_LEN]

Unnamed: 0,text,tag,len_txt,len_tag
1845,"[लङवुड, ,, फ्लोरिडा, का, भिक्टर, स्टान्, ल...","[<NNP>, <YM>, <NNP>, <PKO>, <NNP>, <NNP>, <PLE...",91,91


In [0]:
data_df['text'] = data_df['text'].apply(lambda x : ["".join(item.split(" ")) for item in x])

## Converting Unicode value into ASCII

In [0]:
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

In [0]:
data_df['ascii_text'] = data_df['text'].apply(lambda x : [unicode_to_ascii(item) for item in x])

In [0]:
data_df.drop(columns=['text','len_txt','len_tag'],inplace=True)

In [0]:
data_df = data_df[['ascii_text','tag']]

In [20]:
data_df.head()

Unnamed: 0,ascii_text,tag
0,"[६१, वरषीय, पियर, भिनकन, नोभमबर, २९, बाट, सलला...","[<CD>, <JJ>, <NNP>, <NNP>, <NNP>, <CD>, <POP>,..."
1,"[शरी, भिनकन, डच, परकाशन, समह, एलसभियर, एन.भी.,...","[<NN>, <NNP>, <NNP>, <NN>, <NN>, <NNP>, <FB>, ..."
2,"[कनसोलिडटिड, गोलड, फिलडस, पीएलसी, का, परव, सभा...","[<NNP>, <NN>, <NN>, <NNP>, <PKO>, <JJ>, <NN>, ..."
3,"[एकताका, कनट, चरोट, को, फिलटर, बनाउन, परयोग, भ...","[<RBO>, <NNP>, <NN>, <PKO>, <NN>, <VBI>, <NN>,..."
4,"[यस, सग, को, छोटो, समपरक, बाट, मातर, पनि, दशकौ...","[<DUM>, <POP>, <PKO>, <JJM>, <NN>, <POP>, <RP>..."


## Padding

In [0]:
def padding_sequence(sequence, max_length = MAX_LEN):
  true_length = len(sequence)
  temp_sequence = sequence
  for idx in range(true_length,max_length):
    temp_sequence.append('<PAD>')
  return temp_sequence

In [0]:
data_df['ascii_text'] = data_df['ascii_text'].map(padding_sequence)

In [0]:
data_df['tag'] = data_df['tag'].map(padding_sequence)

## word2vec and vec2word conversion

In [0]:
word_dictionary = list(set(sum(data_df['ascii_text'].tolist(),[])))

In [25]:
word_dictionary[:8]

['', 'समझना', 'मायनस', 'मनदपरहार', 'खरिदआदश', 'सउदी', 'बताइन', 'तीन-चौथाइ']

In [0]:
word_dict_len = len(word_dictionary)

In [27]:
word_dict_len

11915

In [0]:
tag_dictionary = list(set(sum(data_df['tag'].tolist(),[])))

In [29]:
tag_dictionary[:8]

['<VBX>', '<PAD>', '<CC>', '<YQ>', '<NNP>', '<PP>', '<CL>', '<YF>']

In [0]:
tag_dict_len = len(tag_dictionary)

In [0]:
int2word = dict(enumerate(word_dictionary))

In [0]:
word2int = {int2word[idx]:idx for idx in int2word.keys() }

In [0]:
int2tag = dict(enumerate(tag_dictionary))

In [0]:
tag2int = {int2tag[idx]:idx for idx in int2tag.keys()}

In [0]:
total_length = data_df.shape[0]

# Convert Dataset into numerical vector

In [36]:
data_df.head()

Unnamed: 0,ascii_text,tag
0,"[६१, वरषीय, पियर, भिनकन, नोभमबर, २९, बाट, सलला...","[<CD>, <JJ>, <NNP>, <NNP>, <NNP>, <CD>, <POP>,..."
1,"[शरी, भिनकन, डच, परकाशन, समह, एलसभियर, एन.भी.,...","[<NN>, <NNP>, <NNP>, <NN>, <NN>, <NNP>, <FB>, ..."
2,"[कनसोलिडटिड, गोलड, फिलडस, पीएलसी, का, परव, सभा...","[<NNP>, <NN>, <NN>, <NNP>, <PKO>, <JJ>, <NN>, ..."
3,"[एकताका, कनट, चरोट, को, फिलटर, बनाउन, परयोग, भ...","[<RBO>, <NNP>, <NN>, <PKO>, <NN>, <VBI>, <NN>,..."
4,"[यस, सग, को, छोटो, समपरक, बाट, मातर, पनि, दशकौ...","[<DUM>, <POP>, <PKO>, <JJM>, <NN>, <POP>, <RP>..."


In [0]:
feature = data_df['ascii_text'].tolist()

In [0]:
label = data_df['tag'].tolist()

In [0]:
x=[]
y=[]
for idx in range(data_df.shape[0]):
  x.append([word2int[word] for word in feature[idx]])
  y.append([tag2int[tag] for tag in label[idx]])  

In [0]:
from torch.utils.data import DataLoader, TensorDataset

In [0]:
import numpy as np

In [0]:
x = torch.from_numpy(np.array(x))

In [0]:
y = torch.from_numpy(np.array(y))

In [0]:
dataset = TensorDataset(x,y)

In [0]:
BATCH_SIZE = 64

In [0]:
dataset_dl = DataLoader(dataset,batch_size=BATCH_SIZE, shuffle=True)

In [0]:
class POSTag(nn.Module):

  def __init__(self,vocab_size, embed_dim, hidden_dim, num_layer, output_dim, dropout, bidirectional):
    super().__init__()
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
    self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim, num_layers=num_layer, batch_first=True, bidirectional=bidirectional)
    if bidirectional:
      self.fc = nn.Linear(hidden_dim * 2, output_dim)
    else:
      self.fc = nn.Linear(hidden_dim, output_dim)
    self.dropout = nn.Dropout(dropout)

  
  def forward_propagation(self, text):
    embedded = self.embedding(text)
    output, (hidden,cell) = self.lstm(embedded)
    output_with_dropout = self.dropout(output)
    result = self.fc(output_with_dropout)
    return result

In [0]:
EMBED_DIM = 150
HIDDEN_DIM = 100
NUM_LAYER = 1
DROP_OUT = 0.2

In [0]:
model = POSTag(vocab_size=word_dict_len,
               embed_dim=EMBED_DIM,
               hidden_dim=HIDDEN_DIM,
               num_layer=NUM_LAYER,
               output_dim =tag_dict_len,
               dropout=DROP_OUT,
               bidirectional=True)

In [50]:
model

POSTag(
  (embedding): Embedding(11915, 150)
  (lstm): LSTM(150, 100, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=200, out_features=40, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [0]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.1)

In [52]:
model.apply(init_weights)

POSTag(
  (embedding): Embedding(11915, 150)
  (lstm): LSTM(150, 100, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=200, out_features=40, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [0]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [54]:
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,996,890 trainable parameters


# Loss Function and Optimization

In [0]:
optimizer = optim.Adam(model.parameters())

In [0]:
criterion = nn.CrossEntropyLoss()

In [0]:
is_cuda = torch.cuda.is_available()

In [58]:
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


In [0]:
model = model.to(device)
criterion = criterion.to(device)

In [0]:
def train_model(ml_model,dataset_iterator,optimizer,criterion, device):
  epoch_loss = 0
  ml_model.train()
  for data_batch in dataset_dl:
    feature = data_batch[0]
    feature = feature.to(device)
    label = data_batch[1]
    label = label.to(device)
    optimizer.zero_grad()
    prediction = ml_model.forward_propagation(feature)
    prediction = prediction.view(-1, prediction.shape[-1])
    label = label.view(-1)
    loss = criterion(prediction,label)
    loss.backward()
    optimizer.step()
    epoch_loss+=loss.item()
  dataset_len = len(dataset_dl)
  return epoch_loss/dataset_len

In [0]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [0]:
N_EPOCHS = 5

In [0]:
import time

In [64]:
for epoch in range(N_EPOCHS):

    start_time = time.time()
    loss = train_model(ml_model=model, dataset_iterator= dataset,optimizer=optimizer,criterion=criterion, device=device)
    end_time = time.time()
    minute,second = epoch_time(start_time, end_time)
    print("Time Tatken to run one epoch {} minute and {}second".format(minute, second))
    print("Loss Calucated in Epoch {} : {}".format(epoch+1,round(loss,2)))

Time Tatken to run one epoch 0 minute and 1second
Loss Calucated in Epoch 1 : 1.17
Time Tatken to run one epoch 0 minute and 1second
Loss Calucated in Epoch 2 : 0.61
Time Tatken to run one epoch 0 minute and 1second
Loss Calucated in Epoch 3 : 0.3
Time Tatken to run one epoch 0 minute and 1second
Loss Calucated in Epoch 4 : 0.13
Time Tatken to run one epoch 0 minute and 1second
Loss Calucated in Epoch 5 : 0.07


In [65]:
model.eval()

POSTag(
  (embedding): Embedding(11915, 150)
  (lstm): LSTM(150, 100, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=200, out_features=40, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [0]:
test_feature = data_df['ascii_text'][100]

In [0]:
test_label = data_df['tag'][100]

In [0]:
test_feature_num = [word2int[item] for item in test_feature]

In [0]:
test_feature_num = torch.tensor(test_feature_num).view(1,-1)

In [70]:
model.eval()

POSTag(
  (embedding): Embedding(11915, 150)
  (lstm): LSTM(150, 100, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=200, out_features=40, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [0]:
with torch.no_grad():
  prediction = model.forward_propagation(test_feature_num.to(device))

In [0]:
pred_numeric_label=[]
for vector in prediction[0]:
  pred_numeric_label.append(torch.argmax(vector).item())

In [0]:
predicted_result = [int2tag[item] for item in pred_numeric_label]

In [74]:
for idx in range(len(predicted_result)):
  print(test_label[idx],predicted_result[idx])

<NN> <NN>
<HRU> <HRU>
<PLAI> <PLAI>
<NN> <NN>
<NN> <NN>
<JJ> <JJ>
<VBI> <VBNE>
<CC> <CC>
<NN> <NN>
<VBI> <VBNE>
<PKO> <PKO>
<POP> <POP>
<NN> <NN>
<VBNE> <VBNE>
<NN> <NN>
<JJ> <JJ>
<NN> <NN>
<HRU> <HRU>
<POP> <POP>
<JJ> <JJ>
<NN> <NN>
<VBKO> <VBKO>
<VBX> <VBX>
<CC> <CC>
<DUM> <DUM>
<PLE> <PLE>
<YM> <JJ>
<NNP> <NNP>
<NNP> <NNP>
<NN> <NN>
<PKO> <PKO>
<NNP> <NN>
<NNP> <NN>
<CC> <CC>
<NNP> <NNP>
<FB> <NNP>
<NNP> <NNP>
<PKO> <PKO>
<FB> <RBO>
<NN> <NN>
<CC> <CC>
<NN> <NNP>
<NN> <NN>
<POP> <POP>
<PKO> <PKO>
<JJ> <JJ>
<NN> <NN>
<PLAI> <PLAI>
<JJ> <JJ>
<VBKO> <VBKO>
<VBF> <VBF>
<YF> <YF>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <PAD>
<PAD> <

In [0]:
test_data = 'मेरो नाम रडोलफ हो ।'

In [0]:
test_data = unicode_to_ascii(test_data)

In [77]:
test_data

'मरो नाम रडोलफ हो ।'

In [0]:
test_data = test_data.split(" ")

In [79]:
test_data

['मरो', 'नाम', 'रडोलफ', 'हो', '।']

In [0]:
test_data = padding_sequence(test_data)

In [0]:
test_data_num = [word2int[item] for item in test_data]

In [0]:
test_data_num = torch.tensor(test_data_num).view(1,-1)

In [83]:
model.eval()

POSTag(
  (embedding): Embedding(11915, 150)
  (lstm): LSTM(150, 100, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=200, out_features=40, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [0]:
with torch.no_grad():
  prediction1 = model.forward_propagation(test_data_num.to(device))

In [0]:
pred_numeric_label=[]
for vector in prediction1[0]:
  pred_numeric_label.append(torch.argmax(vector).item())

In [0]:
predicted_result = [int2tag[item] for item in pred_numeric_label]

In [87]:
for item in test_data:
  if item!= '<PAD>':
    print(item)

मरो
नाम
रडोलफ
हो
।


In [88]:
for item in predicted_result:
  if item!= '<PAD>':
    print(item)

<NNP>
<NN>
<NNP>
<VBX>
<YF>
