In [1]:
import json
import os

# Directory that holds the dataset files; '' means the current working directory.
path = ''

# JSON is UTF-8 by specification, so decode explicitly instead of relying on the
# platform's default encoding (which is not UTF-8 on every OS).
with open(os.path.join(path, 'train.json'), 'r', encoding='utf-8') as f:
    train = json.load(f)

with open(os.path.join(path, 'test.json'), 'r', encoding='utf-8') as f:
    test = json.load(f)

In [2]:
# Show the first entry of each of the four training fields, one per line.
for field in ('text', 'index', 'POS', 'NER'):
    print(train[field][0])

['Romania', 'state', 'budget', 'soars', 'in', 'June', '.', 'BUCHAREST', '1996-08-28', 'Romania', "'s", 'state', 'budget', 'deficit', 'jumped', 'sharply', 'in', 'June', 'to', '1,242.9', 'billion', 'lei', 'for', 'the', 'January-June', 'period', 'from', '596.5', 'billion', 'lei', 'in', 'January-May', ',', 'official', 'data', 'showed', 'on', 'Wednesday', '.', 'Six-month', 'expenditures', 'stood', 'at', '9.50', 'trillion', 'lei', ',', 'up', 'from', '7.56', 'trillion', 'lei', 'at', 'end-May', ',', 'with', 'education', 'and', 'health', 'spending', 'accounting', 'for', '31.6', 'percent', 'of', 'state', 'expenses', 'and', 'economic', 'subsidies', 'and', 'support', 'taking', 'some', '26', 'percent', '.', 'January-June', 'revenues', 'went', 'up', 'to', '8.26', 'trillion', 'lei', 'from', '6.96', 'trillion', 'lei', 'in', 'the', 'first', 'five', 'months', 'this', 'year', '.', 'Romania', "'s", 'government', 'is', 'expected', 'to', 'revise', 'the', '1996', 'budget', 'on', 'Wednesday', 'to', 'bring', '

In [3]:
def format_output_labels(token_labels, token_indices):
    """Collapse per-token labels into entity spans grouped by label.

    A span is a maximal run of consecutive tokens that share the same
    non-"O" label. It is recorded as ``(start, end)`` where both values are
    taken from ``token_indices`` and both ends are inclusive.

    Args:
        token_labels: Sequence of labels, one per token. "O" marks a token
            that belongs to no entity; every other label must be one of
            "LOC", "MISC", "ORG", "PER".
        token_indices: Sequence aligned with ``token_labels`` giving the
            index to report for each token.

    Returns:
        dict mapping each of "LOC", "MISC", "ORG", "PER" to a list of
        ``(start, end)`` tuples in order of appearance. All lists are empty
        when the input is empty.

    Raises:
        KeyError: If a non-"O" label is not one of the four known labels.
    """
    label_dict = {"LOC": [], "MISC": [], "ORG": [], "PER": []}

    # Empty input has no spans; without this guard the [0] lookups below
    # would raise IndexError.
    if len(token_labels) == 0:
        return label_dict

    prev_label = token_labels[0]
    start = token_indices[0]
    last = len(token_labels) - 1
    for idx, label in enumerate(token_labels):
        if label != prev_label:
            # The run that ended on the previous token is complete.
            if prev_label != "O":
                label_dict[prev_label].append((start, token_indices[idx - 1]))
            start = token_indices[idx]
        prev_label = label
        # Flush the run that is still open when the sequence ends.
        if idx == last and label != "O":
            label_dict[label].append((start, token_indices[idx]))
    return label_dict

In [4]:
# Code for mean F1

import numpy as np

def mean_f1(y_pred_dict, y_true_dict):
    """Entity-level F1, averaged over the entity types that have true entities.

    A predicted span counts as correct only if the identical span appears in
    the ground truth for the same entity type.

    Args:
        y_pred_dict: dict mapping entity type to a list of predicted spans.
            A type missing from this dict is treated as having no predictions.
        y_true_dict: dict mapping entity type to a list of ground-truth spans.

    Returns:
        The mean of the per-type F1 scores. Types with no ground-truth spans
        are skipped. If every type is skipped, 0.0 is returned (rather than
        the NaN that averaging an empty list would produce).
    """
    F1_lst = []
    for key in y_true_dict:
        trues = y_true_dict[key]
        num_true = len(trues)
        if num_true == 0:
            # Nothing to find for this type: it does not take part in the mean.
            continue

        preds = y_pred_dict.get(key, [])
        num_pred = len(preds)

        # Set lookup keeps the matching linear in the number of spans.
        try:
            pred_lookup = set(preds)
        except TypeError:
            # Unhashable span objects (e.g. lists): fall back to a linear search.
            pred_lookup = preds
        num_correct = sum(1 for true in trues if true in pred_lookup)

        if num_pred != 0 and num_correct != 0:
            R = num_correct / num_true
            P = num_correct / num_pred
            F1 = 2*P*R / (P + R)
        else:
            F1 = 0
        F1_lst.append(F1)

    if not F1_lst:
        return 0.0
    return np.mean(F1_lst)

In [5]:


# Toy example: the prediction matches the truth everywhere except token 24,
# where the truth says "MISC" and the prediction says "O".
token_indices = list(range(13, 29))
true_token_labels = ["ORG", "O", "PER", "PER", "O", "LOC", "O", "O", "O", "O", "MISC", "MISC", "O", "O", "O", "LOC"]
pred_token_labels = ["ORG", "O", "PER", "PER", "O", "LOC", "O", "O", "O", "O", "MISC", "O", "O", "O", "O", "LOC"]

y_pred_dict = format_output_labels(pred_token_labels, token_indices)
print("y_pred_dict is : ", y_pred_dict, sep="")
y_true_dict = format_output_labels(true_token_labels, token_indices)
print("y_true_dict is : ", y_true_dict, sep="")

print("Entity Level Mean F1 score is : ", mean_f1(y_pred_dict, y_true_dict), sep="")

y_pred_dict is : {'LOC': [(18, 18), (28, 28)], 'MISC': [(23, 23)], 'ORG': [(13, 13)], 'PER': [(15, 16)]}
y_true_dict is : {'LOC': [(18, 18), (28, 28)], 'MISC': [(23, 24)], 'ORG': [(13, 13)], 'PER': [(15, 16)]}
Entity Level Mean F1 score is : 0.75


In [6]:

import tensorflow as tf
import spacy
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


# PREPROCESSING our data for our ML model ------------------------------#
# spaCy pipeline whose vocabulary supplies the word vectors used as features.
nlp = spacy.load('en_core_web_md')

# Reload the data as DataFrames, honouring the same `path` setting the other
# cells use. NOTE: this rebinds `train`/`test`, which earlier held the plain
# objects returned by the json module.
train = pd.read_json(os.path.join(path, 'train.json'))
test = pd.read_json(os.path.join(path, 'test.json'))

def word2vec(word):
    """Return the ``vector`` attribute of the ``nlp.vocab`` entry for ``word``."""
    entry = nlp.vocab[word]
    return entry.vector


def _one_hot_rows(values, vocabulary):
    """Return one one-hot float vector per value, positioned by its index in ``vocabulary``."""
    vocab = vocabulary.tolist() if hasattr(vocabulary, 'tolist') else list(vocabulary)
    # Build the lookup once instead of scanning the vocabulary for every row.
    position = {}
    for i, item in enumerate(vocab):
        position.setdefault(item, i)  # first occurrence wins, like list.index

    size = len(vocab)
    encoded = []
    for value in values:
        if value not in position:
            raise ValueError(f"{value!r} is not in the known categories")
        one_hot = np.zeros(size)
        one_hot[position[value]] = 1
        encoded.append(one_hot)
    return encoded


def process_data(data, categories, pos_categories, train = True):
    """Flatten per-document token lists into one row per token and build features.

    Args:
        data: DataFrame with columns 'text', 'POS' and 'index', each holding
            one sequence per row; when ``train`` is true it must also have
            an 'NER' column.
        categories: The known NER labels; the position of a label here is
            its index in the one-hot 'y_encoded' vector.
        pos_categories: The known POS tags; the position of a tag here is
            its index in the one-hot 'pos_encoded' vector.
        train: When true, the 'NER' labels are read and the 'y' and
            'y_encoded' columns are produced.

    Returns:
        DataFrame with one row per token and the columns 'word', 'pos',
        'vec' (from ``word2vec``), 'ind', optionally 'y' and 'y_encoded',
        then 'pos_encoded' and 'x' ('vec' concatenated with 'pos_encoded').

    Raises:
        ValueError: If a label or POS tag is not in the supplied categories.
    """
    records = []
    for i in range(len(data)):
        words = data['text'].iloc[i]
        pos = data['POS'].iloc[i]
        ind = data['index'].iloc[i]
        ner = data['NER'].iloc[i] if train else []

        for j in range(len(words)):
            row = {
                'word': words[j],
                'pos': pos[j],
                'vec': word2vec(words[j]),
                'ind': ind[j]
            }
            if train:
                row['y'] = ner[j]
            records.append(row)

    # Naming the columns keeps the frame well-formed even when there are no rows.
    columns = ['word', 'pos', 'vec', 'ind'] + (['y'] if train else [])
    processed = pd.DataFrame(records, columns=columns)

    # One hot encoding (labels only exist for labelled data).
    if train:
        processed['y_encoded'] = _one_hot_rows(processed['y'], categories)
    processed['pos_encoded'] = _one_hot_rows(processed['pos'], pos_categories)

    # Model input: word vector followed by the one-hot POS vector.
    processed['x'] = [
        np.concatenate((vec, pos_vec))
        for vec, pos_vec in zip(processed['vec'], processed['pos_encoded'])
    ]

    return processed

# Gather every NER label and POS tag across all training rows (before the
# train/validation split below). Extending in place avoids re-copying the
# growing list on every iteration.
concat = []
concat2 = []
for i in range(len(train)):
    concat.extend(train['NER'].iloc[i])
    concat2.extend(train['POS'].iloc[i])

# Sorted unique values; their positions define the one-hot encodings.
categories = np.unique(concat)
pos_categories = np.unique(concat2)

# Hold out 20% of the rows for validation. NOTE: this rebinds `train` to the
# 80% training portion.
train, validation = train_test_split(train, test_size=0.2, random_state=42)
train_processed = process_data(train, categories, pos_categories)
validation_processed = process_data(validation, categories, pos_categories)
test_processed = process_data(test, categories, pos_categories, train=False)

# Report the validation size both in input rows and in flattened token rows.
print("Val length ", len(validation), "documents /", len(validation_processed), "tokens")


# Set to True to (re)train; when False this whole block is skipped.
TRAIN_NOW = False
if(TRAIN_NOW):
    # TRAINING our model ---------------------------------------------------#
    from tensorflow.keras.callbacks import ModelCheckpoint

    train_x = np.array(train_processed['x'].tolist())
    train_y = np.array(train_processed['y_encoded'].tolist())
    val_x = np.array(validation_processed['x'].tolist())
    val_y = np.array(validation_processed['y_encoded'].tolist())

    # Take the layer sizes from the data instead of hard-coding them, so the
    # network stays in step with the feature vector and the label set.
    input_dim = train_x.shape[1]
    num_classes = len(categories)

    # Neural network: three tanh layers (345, 200, 20 units), each followed by
    # 20% dropout, then a softmax output with one unit per NER category.
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(345, input_shape=(input_dim,), activation='tanh'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(200, activation='tanh'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(20, activation='tanh'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])

    model.compile(optimizer='adam',
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])

    # Train the model, writing a checkpoint each time the validation loss improves.
    checkpoint_name = "modelCheckpoints/Weights-matchmodel2-{epoch:03d}--{val_loss:.5f}.hdf5"
    checkpoint = ModelCheckpoint(checkpoint_name, monitor="val_loss", verbose=1, save_best_only=True, mode="auto")
    callbacks_list = [checkpoint]

    model.fit(
        train_x,
        train_y,
        epochs=60,
        batch_size=32,
        callbacks=callbacks_list,
        validation_data=(val_x, val_y),
    )
    # ------------------------------------------------------------------#


# TESTING our model --------------------------------------------------#
# Always score the saved checkpoint below; this replaces any `model` built by
# the training block above.
model = tf.keras.models.load_model('modelCheckpoints/Weights-matchmodel2-059--0.15338.hdf5')
# Make predictions
predictions = model.predict(np.array(validation_processed['x'].tolist()), verbose=0)

# Get the index of the highest value in each row
predictions = np.argmax(predictions, axis=1)

true_token_labels = validation_processed['y'].tolist()
# Map each predicted class index back to its label.
pred_token_labels = [categories[pred] for pred in predictions]
token_indices = validation_processed['ind'].tolist()

y_pred_dict = format_output_labels(pred_token_labels, token_indices)
y_true_dict = format_output_labels(true_token_labels, token_indices)

# Calculate the F1 score. mean_f1 takes (predictions, truth): it skips entity
# types that have no spans in its second argument, so passing the arguments
# the other way round would skip types the model never predicted instead of
# scoring them as 0.
f1_score = mean_f1(y_pred_dict, y_true_dict)

print("The F1 score is: ", f1_score)
# ------------------------------------------------------------------#

The F1 score is:  0.674774266620459


In [27]:
import csv

def create_submission(output_filepath, token_labels, token_inds):
    """Write the predicted entity spans to a two-column CSV file.

    The token labels are grouped into spans with ``format_output_labels``.
    The file gets a header row ('Id', 'Predicted') and then one row per
    entity type, whose 'Predicted' cell lists that type's spans as
    space-separated ``start-end`` pairs.

    Args:
        output_filepath: Path of the CSV file to create or overwrite.
        token_labels: Predicted label for each token.
        token_inds: Index for each token, aligned with ``token_labels``.
    """
    label_dict = format_output_labels(token_labels, token_inds)
    # newline='' lets the csv module control line endings itself; without it
    # an extra '\r' is written on platforms that use '\r\n'.
    with open(output_filepath, mode='w', newline='') as csv_file:
        fieldnames = ['Id', 'Predicted']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for key in label_dict:
            p_string = " ".join([str(start)+"-"+str(end) for start,end in label_dict[key]])
            writer.writerow({'Id': key, 'Predicted': p_string})

In [29]:

import random
# Seed kept from the original cell; nothing in this cell draws from `random`.
random.seed(43)

# Predict a class for every test token and map each class index back to its label.
predictions = model.predict(np.array(test_processed['x'].tolist()), verbose=0)
predictions = np.argmax(predictions, axis=1)
predictions = [categories[pred] for pred in predictions]

test_pred_labels = predictions
test_pred_inds = test_processed['ind'].tolist()

# Generate the submission file with the model's predictions. os.path.join
# builds the path the same way the data-loading cell does, so a `path`
# without a trailing separator still works.
create_submission(os.path.join(path, "predicted.csv"), test_pred_labels, test_pred_inds)