<a href="https://colab.research.google.com/github/Bollash/Entity-recognition-hw/blob/main/Entity_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
 !pip install transformers

In [None]:
import pandas as pd
import urllib.request
import gzip
import shutil
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

In [None]:
# Download the gzipped hunnerwiki NER corpus (TSV) to a local file.
# NOTE(review): the URL is plain HTTP and the file is fetched again on every run.
url = "http://hlt.sztaki.hu/resources/hunnerwiki/huwiki.1.ner.tsv.gz"
local_file = "data.tsv.gz"
urllib.request.urlretrieve(url, local_file)

In [None]:
# Unpack the gzip archive into a plain TSV file by streaming the bytes across.
with gzip.open(local_file, 'rb') as compressed, open('file.tsv', 'wb') as plain:
    shutil.copyfileobj(compressed, plain)

In [None]:
# Load the TSV into data_set. Some lines cannot be parsed; they are dropped
# (with a warning) instead of aborting the load.
# `on_bad_lines='warn'` (pandas >= 1.3) replaces `error_bad_lines=False`, which
# was deprecated in pandas 1.3 and removed in pandas 2.0.
data_set = pd.read_csv('file.tsv', sep='\t', on_bad_lines='warn')

In [None]:
# Preview the first rows. `head` has to be called; the bare attribute only
# evaluates to the bound method object.
data_set.head()

In [None]:
# Remove every row that has a missing value in any column.
data_set = data_set.dropna()

# Keep only the word column ('A') and the tag column ('O'), and work on the
# first 200,000 rows since the full data has about 2.3 million elements.
smaller = data_set.loc[:, ['A', 'O']].iloc[:200000]

In [None]:
# Discard any row whose tag column ('O') is still missing.
smaller = smaller.dropna(subset=['O'])

In [None]:
# Map every tag to an integer class id. Besides the entity tags the dictionary
# also holds ids for the helper symbols 'PAD', 'BOS' and 'EOS'.
d = {
    'B-LOC' : 0,
    'B-MISC' : 1,
    'B-ORG' : 2,
    'B-PER' : 3,
    'I-LOC' : 4,
    'I-MISC' : 5,
    'I-ORG' : 6,
    'I-PER' : 7,
    'O' : 8,
    'PAD' : 9,
    'BOS' : 10,
    'EOS' : 11
}
# Convert the whole tag column in one step. Writing to the rows yielded by
# DataFrame.iterrows() is not guaranteed to change the frame (the rows may be
# copies), so a new column is built and assigned instead. Indexing `d`
# directly keeps the KeyError for a tag that is not in the dictionary.
smaller = smaller.assign(O=[d[tag] for tag in smaller['O']])

In [None]:
# Reconstructing the sentences.

# Tokens that are treated as punctuation and left out of the sentences.
garbage = {'.', ',', '\"', "\'", '/', '\\', '(', ')', '\'', ':', '?', '!', '’', '-', ';'}
# These characters mark the end of a sentence.
sentence_end = {'.', '?', '!', ':'}
sentences = []
y_sentences = []
sent = []
y_sent = []
# Walk the two columns directly instead of using iterrows(): it is much faster
# and avoids positional indexing (row[0], row[1]) on a label-indexed Series,
# which recent pandas versions deprecate.
for word, tag in zip(smaller['A'], smaller['O']):
  if word in sentence_end:
    # Close the current sentence (it may be empty) and start a new one.
    sentences.append(sent)
    y_sentences.append(y_sent)
    sent, y_sent = [], []
  elif word not in garbage:
    sent.append(word)
    y_sent.append(tag)
# NOTE: words after the last sentence-ending character stay in sent / y_sent
# and are not appended to `sentences`.
  

In [None]:
# We're using the pretrained SZTAKI-HLT/hubert-base-cc tokenizer.
tokenizer = AutoTokenizer.from_pretrained("SZTAKI-HLT/hubert-base-cc")

def tokenize_sentence(sentence, tokenizer):
  """Tokenize every word of `sentence` on its own.

  Returns a list with one entry per word, each entry being the result of
  `tokenizer.tokenize(word)` for that word.
  """
  return [tokenizer.tokenize(word) for word in sentence]

In [None]:
# Tokenize the input sentences word by word.
# We can't use the lemmas because Budapest is an entity but budapesti isn't.
# As a consequence most of our words consist of a single token.

tokenized_sentences = [tokenize_sentence(sent, tokenizer) for sent in sentences]
# Largest number of tokens any single word was split into (0 if there are no words).
max_word_len = max(
  (len(tok_word) for tokenized_sent in tokenized_sentences for tok_word in tokenized_sent),
  default=0,
)

In [None]:
# Wrap every sentence in a BOS (beginning of sentence) and an EOS (end of
# sentence) marker. The n-grams need them.
for sentence in tokenized_sentences:
  sentence[:0] = [['BOS']]
  sentence += [['EOS']]
# The label lists get the matching ids: 10 for BOS and 11 for EOS.
for y in y_sentences:
  y[:0] = [10]
  y += [11]

# Right-pad every word with 'PAD' tokens until it holds max_word_len tokens.
for sentence in tokenized_sentences:
  for word in sentence:
    word.extend(['PAD'] * (max_word_len - len(word)))

In [None]:
# We work with n-grams instead of whole sentences: less padding is needed and
# an n-gram based accuracy can be used.

def make_ngrams(sentence, n):
  """Return all runs of `n` consecutive elements of `sentence`, in order.

  Each run is a new list; the result is empty when `sentence` has fewer
  than `n` elements.
  """
  window_count = len(sentence) - n + 1
  return [
    [sentence[start + offset] for offset in range(n)]
    for start in range(window_count)
  ]

In [None]:
# Currently using bigrams, but the size can be raised if needed.
ngram_size = 2
ngrams = [make_ngrams(sentence, ngram_size) for sentence in tokenized_sentences]

# A sentence yields fewer n-grams than it has words, so every label list is
# trimmed to the number of n-grams built from its sentence (n-gram i keeps the
# label of its first word). For bigrams this drops exactly the last label;
# unlike popping ngram_size - 1 items it cannot raise IndexError when a
# sentence is shorter than that.
for sentence_ngrams, y_sentence in zip(ngrams, y_sentences):
  del y_sentence[len(sentence_ngrams):]

In [None]:
# One-hot encode the label ids of every sentence over 12 classes
# (the tag dictionary has 12 entries).
for position, labels in enumerate(y_sentences):
  y_sentences[position] = to_categorical(labels, 12)

In [None]:
# Flatten our data so it's easier to work with.

# One flat token list per n-gram: the tokens of all its words, concatenated.
x_ngrams = [
  [token for word in ngram for token in word]
  for sentence in ngrams
  for ngram in sentence
]

# One label row per n-gram, taken sentence by sentence in the same order.
y_ngrams = [label for sentence in y_sentences for label in sentence]

In [None]:
# Every token is replaced by an integer id taken from a vocabulary built here.

# Filling our vocab with the tokens.
# Currently even the test data goes into it. It would be better not to include
# it and to map tokens that are missing from the vocab to one shared id.
vocab = {'PAD': 0}  # id 0 is reserved for the padding token
for word in x_ngrams:
  for token in word:
    # A token seen for the first time gets the next free id (ids start at 1).
    vocab.setdefault(token, len(vocab))
index = len(vocab)
# 'Out of vocabulary' id for unknown words
vocab['OOV'] = index

vocab_size = len(vocab)
print(vocab)

In [None]:
# Replace every token of every n-gram by its integer id from the vocab.

embedded_x_ngrams = [[vocab[token] for token in ngram] for ngram in x_ngrams]

In [None]:
# Turn the nested lists into NumPy arrays. From here on `sentences` holds the
# integer-encoded n-grams and `y_sentences` their one-hot labels.
sentences, y_sentences = np.array(embedded_x_ngrams), np.array(y_ngrams)

In [None]:
# Split the data into train (0.6), validation (0.2) and test (0.2) parts.
# train_test_split produces two parts per call, so it is called twice.
split_seed = 123
x_train, x_test, y_train, y_test = train_test_split(
  sentences, y_sentences, test_size=0.2, random_state=split_seed,
)
# The validation part is 0.25 of the remaining data, since 0.2 / 0.8 == 0.25.
x_train, x_val, y_train, y_val = train_test_split(
  x_train, y_train, test_size=0.25, random_state=split_seed,
)

In [None]:
# Show the shapes of the training inputs and labels, one per line.
print(x_train.shape, y_train.shape, sep="\n")

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Model hyper-parameters.
# The input width and the number of output classes are read from the training
# arrays instead of being hard-coded (28 and 12), so the model stays valid when
# the n-gram size, the padded word length or the label set changes.
input_shape = (x_train.shape[1],)
num_classes = y_train.shape[1]
max_features = vocab_size
embedding_size = 64
lstm1_size = 256
lstm2_size = 256
optim = "adam"
batch_size = 256
inputs = keras.Input(shape=input_shape, dtype="int32")

# Token ids -> dense vectors.
x = layers.Embedding(max_features, embedding_size)(inputs)

# Two stacked bidirectional LSTMs; only the first one returns the full sequence.
x = layers.Bidirectional(layers.LSTM(lstm1_size, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(lstm2_size))(x)

# One softmax probability per tag class.
outputs = layers.Dense(num_classes, activation="softmax")(x)
model = keras.Model(inputs, outputs)
model.summary()

In [None]:
# EarlyStopping comes from the same `tensorflow.keras` package the model is
# built with, rather than from the standalone `keras` package.
from tensorflow.keras.callbacks import EarlyStopping
# Stop once the validation accuracy has not improved for 5 epochs and restore
# the weights of the best epoch, so the evaluated model is not the one from the
# last (worse) epochs.
callbacks = [EarlyStopping(monitor='val_accuracy', patience=5, verbose=0, restore_best_weights=True)]

In [None]:
# Configure training: the optimizer named in `optim`, categorical
# cross-entropy as loss and accuracy as the reported metric.
model.compile(optimizer=optim, loss="categorical_crossentropy", metrics=["accuracy"])

In [None]:

# Train for at most 100 epochs, validating on the validation split after each
# epoch and passing the configured callbacks along.
model.fit(
  x=x_train,
  y=y_train,
  validation_data=(x_val, y_val),
  epochs=100,
  batch_size=batch_size,
  callbacks=callbacks,
)

In [None]:
# Run the trained model on the test inputs; the model ends in a softmax layer,
# so each output row holds one score per class.
prediction = model.predict(x_test)

In [None]:
def compare(pred, origi):
  """Tell whether `pred` and `origi` have their largest value at the same index."""
  predicted_class = np.argmax(pred)
  true_class = np.argmax(origi)
  return predicted_class == true_class

In [None]:
# Checking the accuracy on the test data: count the predictions whose most
# probable class equals the true class.
correct_predictions = sum(
  1 for pred, truth in zip(prediction, y_test) if compare(pred, truth)
)
# Avoid a division by zero on an empty test set.
accuracy = correct_predictions / len(prediction) if len(prediction) else 0.0

# The `%` format spec scales the fraction by 100; printing the raw fraction
# followed by a percent sign showed e.g. 0.95 as "0.9500%".
print(f"Accuracy on the test data is: {accuracy:.2%}")

In [None]:
# Look at how often each class is predicted, to spot anomalies.
# values[k] is the number of test predictions whose most probable class is k.
values = np.bincount(np.argmax(prediction, axis=1), minlength=12).tolist()

# EOS (11) and PAD (9) are expected to be missing: the EOS label is trimmed
# from the bigram targets and PAD is never assigned as a label.
values

In [None]:
# Demonstrating the results with sentences that are not in the original dataset.
sample_sentences = [
  ['A', 'tudomány', 'szerint', 'is', 'működik', 'az', 'alvásmódszer', 'amit', 'Salvador', 'Dalí', 'is', 'használt'],
  ['Ronaldo', 'megint', 'meccset', 'nyert', 'a', 'Manchester', 'Unitednek'],
  ['Öt', 'ok', 'amiért', 'Macron', 'Budapestre', 'látogat'],
]
# Gold tag ids, one per word (0 = B-LOC, 2 = B-ORG, 3 = B-PER, 6 = I-ORG,
# 7 = I-PER, 8 = O).
# 'Manchester Unitednek' is an organisation, so it is tagged B-ORG / I-ORG
# (2, 6) rather than B-PER / I-PER (3, 7).
sample_y_sentences = [
  [8, 8, 8, 8, 8, 8, 8, 8, 3, 7, 8, 8],
  [3, 8, 8, 8, 8, 2, 6],
  [8, 8, 8, 3, 0, 8],
]

In [None]:
# The sample sentences go through the same steps as the original dataset,
# starting with word-by-word tokenization.
sample_tokenized_sentences = [
  tokenize_sentence(sent, tokenizer) for sent in sample_sentences
]

In [None]:
# Wrap every sample sentence in the BOS / EOS markers.
for sentence in sample_tokenized_sentences:
  sentence.insert(0, ['BOS'])
  sentence.append(['EOS'])

# The labels get the matching ids: 10 for BOS and 11 for EOS.
for y in sample_y_sentences:
  y.insert(0, 10)
  y.append(11)

# Bring every word to exactly max_word_len tokens. A sample word can be split
# into more tokens than any training word was; without cutting it the rows
# would have different lengths and could not be stacked into one input array.
for sentence in sample_tokenized_sentences:
  for word in sentence:
    del word[max_word_len:]
    word.extend(['PAD'] * (max_word_len - len(word)))

In [None]:
# Build the n-grams of every sample sentence.
sample_ngrams = [
  make_ngrams(sentence, ngram_size) for sentence in sample_tokenized_sentences
]

# Trim every label list to the number of n-grams of its sentence. For bigrams
# this drops the last label, and it cannot raise IndexError when a sentence is
# shorter than ngram_size - 1.
for sentence_ngrams, y_sentence in zip(sample_ngrams, sample_y_sentences):
  del y_sentence[len(sentence_ngrams):]

In [None]:
# One-hot encode the sample label ids over 12 classes.
for position, labels in enumerate(sample_y_sentences):
  sample_y_sentences[position] = to_categorical(labels, 12)

In [None]:
# One flat token list per sample n-gram: the tokens of all its words, concatenated.
sample_x_ngrams = [
  [token for word in ngram for token in word]
  for sentence in sample_ngrams
  for ngram in sentence
]

# One label row per sample n-gram, taken sentence by sentence in the same order.
sample_y_ngrams = [label for sentence in sample_y_sentences for label in sentence]

In [None]:
# Replace every token by its vocab id; tokens missing from the vocab get the
# 'OOV' id.
sample_embedded_x_ngrams = [
  [vocab[token] if token in vocab else vocab['OOV'] for token in ngram]
  for ngram in sample_x_ngrams
]

In [None]:
# Turn the sample inputs and labels into NumPy arrays. `sample_sentences` now
# holds the integer-encoded n-grams and `sample_y_sentences` their one-hot labels.
sample_sentences, sample_y_sentences = (
  np.array(sample_embedded_x_ngrams),
  np.array(sample_y_ngrams),
)

In [None]:
# Predict the tags of the sample n-grams with the trained model.
sample_prediction=model.predict(sample_sentences)

In [None]:
# Accuracy on the sample sentences: count the predictions whose most probable
# class equals the true class.
sample_correct_predictions = sum(
  1
  for pred, truth in zip(sample_prediction, sample_y_sentences)
  if compare(pred, truth)
)
# Avoid a division by zero when there are no sample predictions.
sample_accuracy = (
  sample_correct_predictions / len(sample_prediction)
  if len(sample_prediction)
  else 0.0
)

# The `%` format spec scales the fraction by 100; printing the raw fraction
# followed by a percent sign showed e.g. 0.95 as "0.9500%".
print(f"Accuracy on the sample sentences is: {sample_accuracy:.2%}")

In [None]:
# Inverse of the 'd' dictionary: maps a class id back to its tag name
# (for example 0 --> 'B-LOC').
inverse_d = {value: key for key, value in d.items()}

In [None]:
# The predicted tags: the name of the most probable class of every sample n-gram.
[inverse_d[np.argmax(scores)] for scores in sample_prediction]

In [None]:
# The true tags, decoded from the one-hot sample labels instead of being typed
# in a second time, so this list always matches the labels the accuracy was
# computed on.
[inverse_d[np.argmax(label)] for label in sample_y_sentences]