In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

In [2]:
data = pd.read_csv("ner_dataset.txt", encoding= 'unicode_escape', sep =',')

In [3]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [4]:
data.shape

(1048575, 4)

In [5]:
data.head(20)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


In [6]:
from itertools import chain
def get_dict_map(data, token_or_tag):
    tok2idx = {}
    idx2tok = {}
    
    if token_or_tag == 'token':
        vocab = list(set(data['Word'].to_list()))
    else:
        vocab = list(set(data['Tag'].to_list()))
    
    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok
token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')

In [7]:
x = data['Word'].map(token2idx)

In [8]:
token2idx

{'petrochemicals': 0,
 'rain-fed': 1,
 '1004': 2,
 'dinosaurs': 3,
 'Tyre': 4,
 'traditional': 5,
 'sow': 6,
 'humor': 7,
 'brings': 8,
 'Weyn': 9,
 '1935': 10,
 'prefer': 11,
 'Otto': 12,
 'rutile': 13,
 'quotations': 14,
 'IMF-World': 15,
 'shared': 16,
 'Jimenez': 17,
 'three-story': 18,
 'Sabeel': 19,
 'enclaves': 20,
 'leveraged': 21,
 'cursed': 22,
 'al-Qaim': 23,
 'Unocal': 24,
 'Juvenile': 25,
 'Alastair': 26,
 'SANA': 27,
 'appropriately': 28,
 '70th': 29,
 'spacewalk': 30,
 'IEDs': 31,
 're-structuring': 32,
 'chains': 33,
 'sponsors': 34,
 'primaries': 35,
 'Chookiat': 36,
 'southwestern': 37,
 'coexist': 38,
 'Delegations': 39,
 'Think': 40,
 'brutalized': 41,
 'industry-backed': 42,
 'Ivanov': 43,
 'Memoir': 44,
 'robust': 45,
 'Yong-Chun': 46,
 'rocketed': 47,
 'step': 48,
 'fuss': 49,
 'Mohammadi': 50,
 'Meishan': 51,
 'Lund': 52,
 'kicks': 53,
 'burning': 54,
 'Ion': 55,
 'effected': 56,
 'inter-religious': 57,
 'Keesler': 58,
 'Qiang': 59,
 'Christa': 60,
 'Refugees': 

In [9]:
data.Word[:25]

0         Thousands
1                of
2     demonstrators
3              have
4           marched
5           through
6            London
7                to
8           protest
9               the
10              war
11               in
12             Iraq
13              and
14           demand
15              the
16       withdrawal
17               of
18          British
19           troops
20             from
21             that
22          country
23                .
24         Families
Name: Word, dtype: object

In [10]:
x

0          29922
1           7340
2           2338
3           8951
4           5941
           ...  
1048570     7156
1048571    28089
1048572    24026
1048573    31351
1048574     7546
Name: Word, Length: 1048575, dtype: int64

In [11]:
x = [x for x in data.Word if x == 'of']

In [12]:
len(x)

26354

In [13]:
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)
data_fillna = data.fillna(method='ffill', axis=0)
# Groupby and collect columns
data_group = data_fillna.groupby(
['Sentence #'],as_index=False
)['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))

  data_group = data_fillna.groupby(


In [14]:
from sklearn.model_selection import train_test_split
# from keras.preprocessing.sequence import pad_sequences
from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

def get_pad_train_test_val(data_group, data):

    #get max token and tag length
    n_token = len(list(set(data['Word'].to_list())))
    n_tag = len(list(set(data['Tag'].to_list())))

    #Pad tokens (X var)    
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= n_token - 1)

    #Pad Tags (y var) and convert it into one hot encoding
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value= tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]
    
    #Split train, test and validation set
    tokens_, test_tokens, tags_, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens_,tags_,test_size = 0.25,train_size =0.75, random_state=2020)

    print(
        'train_tokens length:', len(train_tokens),
        '\ntrain_tokens length:', len(train_tokens),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(val_tokens),
        '\nval_tags:', len(val_tags),
    )
    
    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags

train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags = get_pad_train_test_val(data_group, data)

2022-09-01 14:49:03.609110: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-01 14:49:03.609155: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


train_tokens length: 32372 
train_tokens length: 32372 
test_tokens length: 4796 
test_tags: 4796 
val_tokens: 10791 
val_tags: 10791


In [15]:
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from numpy.random import seed
seed(1)
tensorflow.random.set_seed(2)

In [16]:
print('hello')

hello


In [17]:
input_dim = len(list(set(data['Word'].to_list())))+1
output_dim = 64
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])
n_tags = len(tag2idx)

In [18]:
def get_bilstm_lstm_model():
    model = Sequential()

    # Add Embedding layer
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

    # Add bidirectional LSTM
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))

    # Add LSTM
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

    # Add timeDistributed Layer
    model.add(TimeDistributed(Dense(n_tags, activation="relu")))

    #Optimiser 
    # adam = k.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999)

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    
    return model

In [19]:
def train_model(X, y, model):
    loss = list()
    for i in range(25):
        # fit model for one epoch on this sequence
        hist = model.fit(X, y, batch_size=1000, verbose=1, epochs=1, validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss


In [20]:
results = pd.DataFrame()
model_bilstm_lstm = get_bilstm_lstm_model()
plot_model(model_bilstm_lstm)
results['with_add_lstm'] = train_model(train_tokens, np.array(train_tags), model_bilstm_lstm)

2022-09-01 14:51:46.013232: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-09-01 14:51:46.013271: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-09-01 14:51:46.013305: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (c4leb-PC): /proc/driver/nvidia/version does not exist
2022-09-01 14:51:46.014291: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 104, 64)           2251456   
                                                                 
 bidirectional (Bidirectiona  (None, 104, 128)         66048     
 l)                                                              
                                                                 
 lstm_1 (LSTM)               (None, 104, 64)           49408     
                                                                 
 time_distributed (TimeDistr  (None, 104, 17)          1105      
 ibuted)                                                         
                                                                 
Total params: 2,368,017
Trainable params: 2,368,017
Non-trainable params: 0
_________________________________________________________________
You must install pydot (`pip install pydot`) a

2022-09-01 14:51:46.879212: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 183143584 exceeds 10% of free system memory.




2022-09-01 14:54:16.828610: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 183143584 exceeds 10% of free system memory.




2022-09-01 14:55:48.086782: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 183143584 exceeds 10% of free system memory.




2022-09-01 14:57:16.546116: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 183143584 exceeds 10% of free system memory.




2022-09-01 14:59:38.642763: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 183143584 exceeds 10% of free system memory.




In [28]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')
text = nlp('Hi, My name is Aman Kharwal \n I am from India \n I want to work with Google \n Steve Jobs is My Inspiration')
displacy.render(text, style = 'ent', jupyter=True)