In [209]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics import classification_report
import os
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [210]:
def load_conll_file(file_path):
    data=[]
    with open(file_path, 'r') as f:
        sentence=[]
        for line in f:
            if line=='\n':
                data.append(sentence)
                sentence=[]
            else:
                sentence.append(line.strip().split('\t'))
    return data
    

In [212]:
def making_df(name='train.conll'):
    data = load_conll_file(name)

    #make data into a pandas dataframe with one column for a list of all the words in the sentence
    df = pd.DataFrame(data)

    all_sentences=[]
    all_tokens=[]
    for sentence in data:
        tokens=[]
        sent=[]
        for line in sentence:
            line = (line[0], line[1], line[2])
            sent.append(line)
            tokens.append(line[0])
        all_sentences.append(sent)
        all_tokens.append(tokens)


    df=pd.DataFrame()
    df['sentence_num']=df.index
    df['sentence_tags']=all_sentences

    #make a column with the index
    numbers=range(1,len(all_sentences)+1)

    df['sentence_num']=numbers
    df['tokens']=all_tokens



    dic={'words':[], 'inner_label':[], 'outer_label':[]}

    for sentence in all_sentences:
        words=''
        inner_label=''
        outer_label=''
        for x in sentence:
            for i in range(len(x)):
                if i==0:
                    words+=x[i]
                    words+=' '
                elif i==1:
                    inner_label+=x[i]
                    inner_label+=' '
                elif i==2:
                    outer_label+=x[i]
                    outer_label+=' '
        dic['words'].append(words)
        dic['inner_label'].append(inner_label)
        dic['outer_label'].append(outer_label)

    df['sentence']=dic['words']
    df['inner_label']=dic['inner_label']
    df['outer_label']=dic['outer_label']

    return df

df=making_df(name='train.conll')
test_df = making_df(name='test.conll')

In [192]:
data = load_conll_file('train.conll')

#make data into a pandas dataframe with one column for a list of all the words in the sentence
df = pd.DataFrame(data)

all_sentences=[]
all_tokens=[]
for sentence in data:
    tokens=[]
    sent=[]
    for line in sentence:
        line = (line[0], line[1], line[2])
        sent.append(line)
        tokens.append(line[0])
    all_sentences.append(sent)
    all_tokens.append(tokens)




df=pd.DataFrame()
df['sentence_num']=df.index
df['sentence_tags']=all_sentences

#make a column with the index
numbers=range(1,len(all_sentences)+1)

df['sentence_num']=numbers
df['tokens']=all_tokens

In [193]:
dic={'words':[], 'inner_label':[], 'outer_label':[]}

for sentence in all_sentences:
    words=''
    inner_label=''
    outer_label=''
    for x in sentence:
        for i in range(len(x)):
            if i==0:
                words+=x[i]
                words+=' '
            elif i==1:
                inner_label+=x[i]
                inner_label+=' '
            elif i==2:
                outer_label+=x[i]
                outer_label+=' '
    dic['words'].append(words)
    dic['inner_label'].append(inner_label)
    dic['outer_label'].append(outer_label)

df['sentence']=dic['words']
df['inner_label']=dic['inner_label']
df['outer_label']=dic['outer_label']



In [213]:
train_sentences_list=df['sentence'].tolist()

tokeniser= tf.keras.preprocessing.text.Tokenizer(lower=False,filters='')

tokeniser.fit_on_texts(train_sentences_list)

print("Vocab size of Tokeniser ",len(tokeniser.word_index)+1)

tokeniser.index_word[273]

Vocab size of Tokeniser  19673


'away'

In [214]:
train_encoded_sentence=tokeniser.texts_to_sequences(train_sentences_list)
print("First Original Sentence ",train_sentences_list[0])
print("First Encoded Sentence ",train_encoded_sentence[0])
print("Is Length of Original Sentence Same as Encoded Sentence ",len(train_sentences_list[0].split())==len(train_encoded_sentence[0]))
print("Length of First Sentence ",len(train_encoded_sentence[0]))

First Original Sentence  Al - Zaman : American forces killed Shaikh Abdullah al - Ani , the preacher at the mosque in the town of Qaim , near the Syrian border . 
First Encoded Sentence  [261, 15, 5149, 44, 287, 688, 1125, 4147, 9874, 582, 15, 9875, 3, 2, 6732, 34, 2, 6733, 9, 2, 485, 7, 6734, 3, 741, 2, 2179, 1571, 1]
Is Length of Original Sentence Same as Encoded Sentence  True
Length of First Sentence  29


In [215]:
def get_inner_tags(df):
    all_inner_tags=set()
    for row in df['inner_label']:
        row=row.split(' ')
        for i in row:
            if i!='':
                all_inner_tags.add(i)
    
    num_inner_tags=len(all_inner_tags)

    inner_tags_map={tag:i for i,tag in enumerate(all_inner_tags)}

    reverse_inner_tag_map={v: k for k, v in inner_tags_map.items()}

    inner_tags_list=df['inner_label'].apply(lambda x:x.split())

    inner_encoded_tags=[[inner_tags_map[w] for w in tag] for tag in inner_tags_list]

    return all_inner_tags, inner_encoded_tags

def get_outer_tags(df):
    all_outer_tags=set()
    for row in df['outer_label']:
        row=row.split(' ')
        for i in row:
            if i!='':
                all_outer_tags.add(i)

    outer_tags_map={tag:i for i,tag in enumerate(all_outer_tags)}

    reverse_outer_tag_map={v: k for k, v in outer_tags_map.items()}

    outer_tags_list=df['outer_label'].apply(lambda x:x.split())

    outer_encoded_tags=[[outer_tags_map[w] for w in tag] for tag in outer_tags_list]
    
    return all_outer_tags, outer_encoded_tags

In [216]:

all_inner_tags, inner_encoded_tags=get_inner_tags(df)


num_inner_tags=len(all_inner_tags)

inner_tags_map={tag:i for i,tag in enumerate(all_inner_tags)}

reverse_inner_tag_map={v: k for k, v in inner_tags_map.items()}

inner_tags_list=df['inner_label'].apply(lambda x:x.split())

inner_encoded_tags=[[inner_tags_map[w] for w in tag] for tag in inner_tags_list]
print("First Sentence ",train_sentences_list[0])
print('First Sentence Original Tags ',inner_tags_list[0])
print("First Sentence Encoded Tags ",inner_encoded_tags[0])
print("Is length of Original Tags and Encoded Tags same ",len(inner_tags_list[0])==len(inner_encoded_tags[0]))
print("Length of Tags for First Sentence ",len(inner_encoded_tags[0]))



max_sentence_length=max([len(s.split()) for s in train_sentences_list])
print(max_sentence_length)

First Sentence  Al - Zaman : American forces killed Shaikh Abdullah al - Ani , the preacher at the mosque in the town of Qaim , near the Syrian border . 
First Sentence Original Tags  ['B-ORG', 'I-ORG', 'I-ORG', 'O', 'B-LOCderiv', 'O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'B-LOCderiv', 'O', 'O']
First Sentence Encoded Tags  [9, 19, 19, 4, 11, 4, 4, 4, 21, 16, 16, 16, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 20, 4, 4, 4, 11, 4, 4]
Is length of Original Tags and Encoded Tags same  True
Length of Tags for First Sentence  29
159


In [217]:
all_outer_tags, outer_encoded_tags=get_outer_tags(df)

num_outer_tags=len(all_outer_tags)

outer_tags_map={tag:i for i,tag in enumerate(all_outer_tags)}

reverse_outer_tag_map={v: k for k, v in outer_tags_map.items()}

outer_tags_list=df['outer_label'].apply(lambda x:x.split())

outer_encoded_tags=[[outer_tags_map[w] for w in tag] for tag in outer_tags_list]
print("First Sentence ",train_sentences_list[0])
print('First Sentence Original Tags ',outer_tags_list[0])
print("First Sentence Encoded Tags ",outer_encoded_tags[0])
print("Is length of Original Tags and Encoded Tags same ",len(outer_tags_list[0])==len(outer_encoded_tags[0]))
print("Length of Tags for First Sentence ",len(outer_encoded_tags[0]))



max_sentence_length=max([len(s.split()) for s in train_sentences_list])
print(max_sentence_length)

First Sentence  Al - Zaman : American forces killed Shaikh Abdullah al - Ani , the preacher at the mosque in the town of Qaim , near the Syrian border . 
First Sentence Original Tags  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
First Sentence Encoded Tags  [13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13]
Is length of Original Tags and Encoded Tags same  True
Length of Tags for First Sentence  29
159


In [218]:
max_len=128
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

train_padded_encoded_sentences=pad_sequences(maxlen=max_len,sequences=train_encoded_sentence,padding="post",value=0)
train_padded_encoded_inner_tags=pad_sequences(maxlen=max_len,sequences=inner_encoded_tags,padding="post",value=inner_tags_map['O'])
train_padded_encoded_outer_tags=pad_sequences(maxlen=max_len,sequences=outer_encoded_tags,padding="post",value=outer_tags_map['O'])

print("Shape of Encoded Sentence ",padded_encoded_sentences.shape)
print("Shape of Encoded Inner Labels ",padded_encoded_inner_tags.shape)

print("First Encoded Sentence Without Padding ",encoded_sentence[0])
print("First Encoded Sentence with padding ",padded_encoded_sentences[0])
print("First Sentence Encoded Label without Padding ",inner_encoded_tags[0])
print("First Sentence Encoded Label with Padding ",padded_encoded_inner_tags[0])

Shape of Encoded Sentence  (12543, 128)
Shape of Encoded Inner Labels  (12543, 128)
First Encoded Sentence Without Padding  [261, 15, 5149, 44, 287, 688, 1125, 4147, 9874, 582, 15, 9875, 3, 2, 6732, 34, 2, 6733, 9, 2, 485, 7, 6734, 3, 741, 2, 2179, 1571, 1]
First Encoded Sentence with padding  [ 261   15 5149   44  287  688 1125 4147 9874  582   15 9875    3    2
 6732   34    2 6733    9    2  485    7 6734    3  741    2 2179 1571
    1    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
First Sentence Encoded Label without Padding  [9, 19, 

<h1>TRANSFORMING TEST DATA</h1>

In [219]:
target_inner= [to_categorical(i,num_classes = num_inner_tags) for i in  train_padded_encoded_inner_tags]
print("Shape of Labels  after converting to Categorical for first sentence ",target_inner[0].shape)

target_outer= [to_categorical(i,num_classes = num_outer_tags) for i in  train_padded_encoded_outer_tags]
print("Shape of Labels  after converting to Categorical for first sentence ",target_outer[0].shape)


Shape of Labels  after converting to Categorical for first sentence  (128, 23)
Shape of Labels  after converting to Categorical for first sentence  (128, 17)


In [201]:
len(train_padded_encoded_inner_tags)

12543

In [220]:
from tensorflow.keras import Model,Input
from tensorflow.keras.layers import LSTM,Embedding,Dense
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D,Bidirectional

In [221]:
X_train=train_padded_encoded_sentences
Y_inner_train = target_inner
Y_outer_train = target_outer

In [222]:
embedding_dim=128
vocab_size=len(tokeniser.word_index)+1
lstm_units=128
max_len=128
HIDDEN_SIZE=128

input_word = Input(shape = (max_len,))
model = Embedding(input_dim = vocab_size+1,output_dim = embedding_dim,input_length = max_len)(input_word)

model = Bidirectional(LSTM(units=embedding_dim,return_sequences=True))(model)
inner_layer = TimeDistributed(Dense(num_inner_tags,activation = 'softmax'))(model)
outer_layer=  TimeDistributed(Dense(num_outer_tags,activation = 'softmax'))(model)
model = Model(input_word,[inner_layer,outer_layer])
model.summary()

# Compile the model with a suitable loss function for multi-label classification
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model on the training data
model.fit(X_train, [np.array(Y_inner_train), np.array(Y_outer_train)], epochs=15, batch_size=32)

2023-04-24 14:33:17.365195: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-04-24 14:33:17.367777: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-04-24 14:33:17.372319: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_6 (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 embedding_5 (Embedding)        (None, 128, 128)     2518272     ['input_6[0][0]']                
                                                                                                  
 bidirectional_4 (Bidirectional  (None, 128, 256)    263168      ['embedding_5[0][0]']            
 )                                                                                                
                                                                                                  
 time_distributed_10 (TimeDistr  (None, 128, 23)     5911        ['bidirectional_4[0][0]']  

2023-04-24 14:33:19.412146: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-04-24 14:33:19.414170: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-04-24 14:33:19.416083: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f940ad61f10>

In [223]:
test_df=making_df('test.conll')
test_sentences_list = test_df['sentence'].to_list()
test_encoded_sentence=tokeniser.texts_to_sequences(test_sentences_list)

test_encoded_sentence



[[3037, 10, 6, 369, 36, 6, 314, 313, 32],
 [3037, 10, 495, 6, 313, 350],
 [3037, 10, 9, 2612, 592],
 [299, 134, 10, 6, 369, 18233, 9, 39, 313, 32],
 [92, 464, 2, 293],
 [5745],
 [],
 [210, 2167, 35, 12, 498, 9, 32],
 [2167, 7, 614, 3, 1],
 [1561, 48],
 [19110],
 [12, 576, 16, 87, 19110, 57],
 [704, 12, 50, 80, 55, 738],
 [99,
  9139,
  5,
  8,
  94,
  2605,
  7,
  14,
  99,
  119,
  33,
  1964,
  16,
  291,
  3694,
  136,
  11,
  375,
  99],
 [402, 12, 252, 80, 1773, 9, 2610, 2611, 32],
 [8, 31, 21, 57],
 [602, 1],
 [171, 22, 430, 66, 179, 62, 1032, 2995, 1],
 [6315, 660, 33, 418, 293],
 [157, 1964, 25, 5314, 9, 82, 107, 180, 298, 1419],
 [89, 9179, 21, 120, 4, 921, 5, 89, 16, 3455, 32],
 [],
 [274, 5060, 1],
 [210, 18626, 11890, 25, 1416, 7, 9015, 32],
 [89,
  172,
  599,
  11,
  4191,
  11,
  157,
  23,
  155,
  45,
  34,
  52,
  14,
  332,
  73,
  18626,
  23,
  73],
 [],
 [210, 35, 12, 498, 9, 32],
 [2169,
  61,
  2,
  1026,
  30,
  498,
  9,
  2735,
  596,
  61,
  9824,
  3,
  190

In [206]:
all_tags=set()
for row in df['inner_label']:
    row=row.split(' ')
    for i in row:
        if i!='':
            all_tags.add(i)

num_tags=len(all_tags)

tags_map={tag:i for i,tag in enumerate(all_tags)}

reverse_tag_map={v: k for k, v in tags_map.items()}


tags_list=df['inner_label'].apply(lambda x:x.split())

encoded_tags=[[tags_map[w] for w in tag] for tag in tags_list]
print("First Sentence ",sentences_list[0])
print('First Sentence Original Tags ',tags_list[0])
print("First Sentence Encoded Tags ",encoded_tags[0])
print("Is length of Original Tags and Encoded Tags same ",len(tags_list[0])==len(encoded_tags[0]))
print("Length of Tags for First Sentence ",len(encoded_tags[0]))


max_sentence_length=max([len(s.split()) for s in sentences_list])
print(max_sentence_length)

First Sentence  Al - Zaman : American forces killed Shaikh Abdullah al - Ani , the preacher at the mosque in the town of Qaim , near the Syrian border . 
First Sentence Original Tags  ['B-ORG', 'I-ORG', 'I-ORG', 'O', 'B-LOCderiv', 'O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'B-LOCderiv', 'O', 'O']
First Sentence Encoded Tags  [9, 19, 19, 4, 11, 4, 4, 4, 21, 16, 16, 16, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 20, 4, 4, 4, 11, 4, 4]
Is length of Original Tags and Encoded Tags same  True
Length of Tags for First Sentence  29
159


In [224]:
from sklearn.metrics import f1_score

test_sentences_list=test_df['sentence'].tolist()

test_encoded_sentence=tokeniser.texts_to_sequences(test_sentences_list)
test_padded_encoded_sentences=pad_sequences(maxlen=max_len,sequences=test_encoded_sentence,padding="post",value=0)

inner_preds, outer_preds=model.predict(test_padded_encoded_sentences)

inner_pred_labels = np.argmax(inner_preds, axis=-1).flatten()
outer_pred_labels=np.argmax(outer_preds,axis=-1).flatten()

y_actual_inner=test_df['inner_label'].apply(lambda x:x.split())
y_actual_outer=test_df['outer_label'].apply(lambda x:x.split())

test_inner_encoded_tags=[[inner_tags_map[w] for w in tag] for tag in y_actual_inner]
test_outer_encoded_tags=[[outer_tags_map[w] for w in tag] for tag in y_actual_outer]



inner_actuals=pad_sequences(maxlen=max_len,sequences=test_inner_encoded_tags,padding="post",value=tags_map['O'])
inner_actuals=inner_actuals.flatten()

outer_actuals=pad_sequences(maxlen=max_len,sequences=test_outer_encoded_tags,padding="post",value=tags_map['O'])
outer_actuals=outer_actuals.flatten()





inner_report = f1_score(inner_actuals, inner_pred_labels, average='macro')
print('inner: ', inner_report)

outer_report=f1_score(outer_actuals, outer_pred_labels, average='macro')
print('outer: ', outer_report)

2023-04-24 14:55:56.438610: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-04-24 14:55:56.454291: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-04-24 14:55:56.467740: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

inner:  0.33708640715613475
outer:  0.10000193357773757


In [208]:
def evaluatePredictions(test_data,preds,actual_preds):
    print("Shape of Test Data Array",test_data.shape)
    y_actual=np.argmax(np.array(actual_preds),axis=2)
    y_pred=np.argmax(preds,axis=2)
    num_test_data=test_data.shape[0]
    print("Number of Test Data Points ",num_test_data)
    data=pd.DataFrame()
    df_list=[]
    for i in range(num_test_data):
        test_str=list(test_data[i])
        df=pd.DataFrame()
        df['test_tokens']=test_str
        df['tokens']=df['test_tokens'].apply(lambda x:tokeniser.index_word[x] if x!=0 else '<PAD>')
        df['actual_target_index']=list(y_actual[i])
        df['pred_target_index']=list(y_pred[i])
        df['actual_target_tag']=df['actual_target_index'].apply(lambda x:reverse_tag_map[x])
        df['pred_target_tag']=df['pred_target_index'].apply(lambda x:reverse_tag_map[x])
        df['id']=i+1
        df_list.append(df)
    data=pd.concat(df_list)
    pred_data=data[data['tokens']!='<PAD>']
    accuracy=pred_data[pred_data['actual_target_tag']==pred_data['pred_target_tag']].shape[0]/pred_data.shape[0]
    
    
    return pred_data,accuracy

