<a href="https://colab.research.google.com/github/Bollash/Entity-recognition-hw/blob/main/Entity_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
 !pip install transformers



In [None]:
import pandas as pd
import urllib.request
import gzip
import shutil
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

In [None]:
#Downloading the dataset.
url = "http://hlt.sztaki.hu/resources/hunnerwiki/huwiki.1.ner.tsv.gz"
local_file = "data.tsv.gz"
urllib.request.urlretrieve(url, local_file)

('data.tsv.gz', <http.client.HTTPMessage at 0x7f40fda92b50>)

In [None]:
#Decompressing the data
with gzip.open(local_file, 'rb') as f_in:
    with open('file.tsv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [None]:
#Loading the data into data_set. There are lines that produce errors. We filter them out by using the third parameter
data_set = pd.read_csv('file.tsv', sep='\t', error_bad_lines=False)

b'Skipping line 121529: expected 6 fields, saw 16498\nSkipping line 121533: expected 6 fields, saw 10\nSkipping line 121537: expected 6 fields, saw 8198\n'


In [None]:
data_set.head

<bound method NDFrame.head of                      A  text  0                           ART             a  O
0               céljuk  text  0              NOUN<POSS<PLUR>>           cél  O
1                    ,  text  0                         PUNCT             ,  O
2                 hogy  text  0                          CONJ          hogy  O
3          biztosítsák  text  0  VERB<SUBJUNC-IMP><PLUR><DEF>      biztosít  O
4                    ,  text  0                         PUNCT             ,  O
...                ...   ... ..                           ...           ... ..
2237028            280  text  0                           NUM           280  O
2237029           km/h  text  0                          NOUN          km/h  O
2237030              a  text  0                           ART             a  O
2237031  végsebbessége  text  0                    NOUN<POSS>  végsebbesség  O
2237032              .  text  0                         PUNCT             .  O

[2237033 rows x 6 col

In [None]:
#Drop the empty lines
data_set = data_set.dropna()

smaller = data_set[['A','O']]
#Using a fraction of the original data since there are 2.3 million elements.
smaller = smaller[0:200000]

In [None]:
#Some lines are NaN. We filter them out this way.
smaller = smaller[smaller['O'].notnull()]

In [None]:
#To convert the tags into numbers we'll use a dictionary. Currently there is no 'PAD', but we shall add them later.
d = {
    'B-LOC' : 0,
    'B-MISC' : 1,
    'B-ORG' : 2,
    'B-PER' : 3,
    'I-LOC' : 4,
    'I-MISC' : 5,
    'I-ORG' : 6,
    'I-PER' : 7,
    'O' : 8,
    'PAD' : 9,
    'BOS' : 10,
    'EOS' : 11
}
#Converting the tags
for idx, row in smaller.iterrows():
  row[1] = d[row[1]]

In [None]:
#Reconstructing the sentences.

#Used to filter out punctuation.
garbage = {'.', ',', '\"', "\'", '/', '\\', '(', ')', '\'', ':', '?', '!', '’', '-', ';'}
sentences = []
y_sentences = []
sent = []
y_sent = []
for idx, row in smaller.iterrows():
  #Theese characters mark the end of a sentence.
  if row[0] in {'.', '?', '!', ':'}:
    sentences.append(sent)
    sent = []
    y_sentences.append(y_sent)
    y_sent = []
    continue
  if row[0] not in garbage:
    sent.append(row[0])
    y_sent.append(row[1])
  

In [None]:
#We're usning hubert base cc tokenizer
tokenizer = AutoTokenizer.from_pretrained("SZTAKI-HLT/hubert-base-cc")

def tokenize_sentence(sentence, tokenizer):
  tokenized = []
  for word in sentence:
    tokenized.append(tokenizer.tokenize(word))
  return tokenized

In [None]:
#Tokenize the input sentences.
#We can't use the lemmas because Budapest is an entity but budapesti isn't.
#This makes it, that most of our words consist of 1 tokens.

max_word_len = 0
tokenized_sentences = []
for sent in sentences:
  tokenized_sent = tokenize_sentence(sent, tokenizer)
  for tok_word in tokenized_sent:
    max_word_len = max(max_word_len, len(tok_word))
  tokenized_sentences.append(tokenized_sent)

In [None]:
#Inserting a BOS(beginning of sentence) and EOS(end of sentence) token. It's needed for the ngrams.
for sentence in tokenized_sentences:
  sentence.insert(0, ['BOS'])
  sentence.append(['EOS'])
#Inserting BOS and EOS values to the output values aswell
for y in y_sentences:
  y.insert(0, 10)
  y.append(11)

#Padding the sentences
for sentence in tokenized_sentences:
  for word in sentence:
    for _ in range(max_word_len - len(word)):
      word.append('PAD')

In [None]:
#Instead of the sentences we use ngrams. It makes it so theres less padding needed, and also we can use ngram based accuracy.

def make_ngrams(sentence, n):
  ngrams = []
  for i in range(len(sentence) - n + 1):
    ngram = []
    for j in range(n):
      ngram.append(sentence[i + j])
    ngrams.append(ngram)
  return ngrams

In [None]:
#Currently using bigramms, but it can be changed upwards if needed.
ngram_size = 2
ngrams = []
for sentence in tokenized_sentences:
  ngrams.append(make_ngrams(sentence, ngram_size))

#When we convert to ngrams, theres one less ngram than the input size. So we need to cut the last output element.
for y_sentence in y_sentences:
  for _ in range(ngram_size - 1):
    y_sentence.pop()

In [None]:
#One-Hot encode our tags
for i in range(len(y_sentences)):
  y_sentences[i] = to_categorical(y_sentences[i], 12)

In [None]:
#Flatten our data, so it's easier to work with.

x_ngrams = []
for sentence in ngrams:
  for ngram in sentence:
    x = []
    for word in ngram:
      for token in word:
        x.append(token)
    x_ngrams.append(x)

y_ngrams = []
for sentence in y_sentences:
  for ngram in sentence:
    y_ngrams.append(ngram)

In [None]:
#We use a BoW(bag of words) method for the word to number conversion. An embedding layer could be used instead if we need a better accuracy.

#Filling our vocab with the tokens.
#Currently even the test data is put into it. It would be better no not include them and make an universal number for the tokens that are not inculded in the vocab.
vocab, index = {}, 1  # start indexing from 1
vocab['PAD'] = 0  # add a padding token
for word in x_ngrams:
  for token in word:
    if token not in vocab:
      vocab[token] = index
      index += 1


vocab_size = len(vocab)
print(vocab)

{'PAD': 0, 'BOS': 1, 'céljuk': 2, 'hogy': 3, 'biztosít': 4, '##sák': 5, 'a': 6, 'korábbi': 7, 'szerzők': 8, 'kil': 9, '##ét': 10, 'hozzájárulás': 11, '##ának': 12, 'mértékét': 13, 'művel': 14, 'kapcsolatos': 15, 'üzleti': 16, 'jogi': 17, 'politikai': 18, 'erkölcsi': 19, 'vagy': 20, 'filozófiai': 21, 'álláspontját': 22, 'ne': 23, 'lehessen': 24, 'elt': 25, '##itk': 26, '##olni': 27, 'megh': 28, '##amis': 29, '##ítani': 30, 'későbbi': 31, 'változtatás': 32, '##ok': 33, 'során': 34, 'EOS': 35, 'Figyelem': 36, 'Mivel': 37, 'nem': 38, 'jogász': 39, 'által': 40, 'készített': 41, 'hiteles': 42, 'fordítás': 43, 'szempontból': 44, 'csak': 45, 'az': 46, 'eredeti': 47, 'angol': 48, 'nyelvű': 49, 'licenc': 50, 'mér': 51, '##vad': 52, '##ó': 53, 'Az': 54, '1': 55, '.': 56, '2': 57, 'változat': 58, 'különbség': 59, 'pedig': 60, 'címen': 61, 'található': 62, 'Jelen': 63, 'célja': 64, 'egy': 65, 'olyan': 66, 'kézikönyv': 67, 'tankönyv': 68, 'eff': 69, '##ajt': 70, '##a': 71, 'írott': 72, 'dokumentum':

In [None]:
#Converting our input into integers, with our BoW vocab.

embedded_x_ngrams = []
for ngram in x_ngrams:
  embedded_x_ngram = []
  for token in ngram:
    embedded_x_ngram.append(vocab[token])
  embedded_x_ngrams.append(embedded_x_ngram)

In [None]:
#Converting the lists into Numpy arrays
sentences = np.array(embedded_x_ngrams)
y_sentences = np.array(y_ngrams)

In [None]:
#Split the data into train, test, and val data
#train : 0.6
#test : 0.2
#val: 0.2
#Since the train test split splits the data into 2 parts we have to call it twice
x_train, x_test, y_train, y_test = train_test_split(sentences, y_sentences, test_size=0.2, random_state=123)
#We need test_size=0.25 since 0.2 / 0.8 == 0.25
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=123)

In [None]:
print(x_train.shape)
print(y_train.shape)

(109188, 28)
(109188, 12)


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
input_shape = (28,)
max_features = vocab_size
inputs = keras.Input(shape=input_shape, dtype="int32")

x = layers.Embedding(max_features, 128)(inputs)

x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)

outputs = layers.Dense(12, activation="softmax")(x)
model = keras.Model(inputs, outputs)
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 28)]              0         
                                                                 
 embedding_1 (Embedding)     (None, 28, 128)           2560000   
                                                                 
 bidirectional_2 (Bidirectio  (None, 28, 128)          98816     
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense_1 (Dense)             (None, 12)                1548      
                                                                 
Total params: 2,759,180
Trainable params: 2,759,180
Non-tra

In [None]:
model.compile("adam", "categorical_crossentropy", metrics=["accuracy"])

In [None]:
model.fit(x_train, y_train, batch_size=32, epochs=20, validation_data=(x_val, y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f40dc305e10>