<a href="https://colab.research.google.com/github/Bollash/Entity-recognition-hw/blob/main/preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import urllib.request
import gzip
import shutil
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [None]:
# Fetch the gzipped dataset from the URL below and save it under local_file.
local_file = "data.tsv.gz"
url = "http://hlt.sztaki.hu/resources/hunnerwiki/huwiki.1.ner.tsv.gz"
# urlretrieve returns a (filename, headers) tuple; as the cell's last expression it is echoed as output.
urllib.request.urlretrieve(url, filename=local_file)

('data.tsv.gz', <http.client.HTTPMessage at 0x7f6d8d665250>)

In [None]:
# Unpack the downloaded gzip archive into a plain 'file.tsv'.
# copyfileobj streams the content chunk by chunk instead of reading it all at once.
with gzip.open(local_file, 'rb') as archive, open('file.tsv', 'wb') as extracted:
    shutil.copyfileobj(archive, extracted)

In [None]:
# Load the tab-separated file into data_set.
# A few lines have the wrong number of fields. on_bad_lines='warn' skips them and reports
# each skipped line, which is what the former error_bad_lines=False option did; that option
# was deprecated in pandas 1.3 and removed in pandas 2.0, so this call needs pandas >= 1.3.
# NOTE(review): no header= argument is given, so the first line of the file becomes the column
# labels. The displayed frame shows labels 'A', 'text', '0', 'ART', 'a', 'O', i.e. what looks
# like the first token row; later cells rely on the labels 'a' and 'O'. Confirm this is intended.
data_set = pd.read_csv('file.tsv', sep='\t', on_bad_lines='warn')

b'Skipping line 121529: expected 6 fields, saw 16498\nSkipping line 121533: expected 6 fields, saw 10\nSkipping line 121537: expected 6 fields, saw 8198\n'


In [None]:
# Display the loaded frame to inspect its columns and size.
data_set

Unnamed: 0,A,text,0,ART,a,O
0,céljuk,text,0,NOUN<POSS<PLUR>>,cél,O
1,",",text,0,PUNCT,",",O
2,hogy,text,0,CONJ,hogy,O
3,biztosítsák,text,0,VERB<SUBJUNC-IMP><PLUR><DEF>,biztosít,O
4,",",text,0,PUNCT,",",O
...,...,...,...,...,...,...
2237028,280,text,0,NUM,280,O
2237029,km/h,text,0,NOUN,km/h,O
2237030,a,text,0,ART,a,O
2237031,végsebbessége,text,0,NOUN<POSS>,végsebbesség,O


In [None]:
# Only two columns are needed: the lemmas (labelled 'a') and the NER tags (labelled 'O').
# The idea is that entities can be recognised from the lemmas alone, which keeps the number
# of distinct input tokens lower than working with the surface forms would.
# The full table has more than 2.2 million rows, so only the first 500,000 are kept.
smaller = data_set[['a', 'O']].iloc[:500000]

In [None]:
# Display the reduced frame (lemma and tag columns only).
smaller

Unnamed: 0,a,O
0,cél,O
1,",",O
2,hogy,O
3,biztosít,O
4,",",O
...,...,...
499995,vesz,O
499996,fel,O
499997,egy,O
499998,dal,O


In [None]:
# Rows whose tag column ('O') is NaN carry no label, so they are dropped.
smaller = smaller.dropna(subset=['O'])

In [None]:
# Lookup table from NER tag string to integer class id.
# 'PAD' is not assigned by this cell; its id is reserved for the padding added in a later cell.
d = {
    'B-LOC' : 0,
    'B-MISC' : 1,
    'B-ORG' : 2,
    'B-PER' : 3,
    'I-LOC' : 4,
    'I-MISC' : 5,
    'I-ORG' : 6,
    'I-PER' : 7,
    'O' : 8,
    'PAD' : 9
}
# Convert the tag column to integer ids in one vectorised step.
# Writing into the rows yielded by iterrows() is not guaranteed to change the frame
# (pandas may hand out copies), so the column is rebuilt explicitly instead.
tag_ids = smaller['O'].map(d)
# map() yields NaN for values missing from d; fail loudly, as a plain d[tag] lookup would.
unknown_tags = smaller.loc[tag_ids.isna(), 'O'].unique()
if len(unknown_tags):
    raise KeyError(f"tags without an id in d: {list(unknown_tags)}")
# assign() returns a new frame, so no write goes through a possibly shared slice.
smaller = smaller.assign(O=tag_ids.astype(int))

In [None]:
# Rebuild sentences from the flat token stream: a "." lemma closes the current sentence.
# Known limitation: "..." would be split into several one-token sentences; revisit if that
# turns out to matter.
# Tokens after the last "." (an unfinished trailing sentence) are intentionally not emitted.
sentences = []     # list of sentences, each a list of lemmas
y_sentences = []   # parallel list holding the tag of every lemma
sent = []
y_sent = []
# Iterate over the two columns directly instead of iterrows(): no per-row Series is built
# and no deprecated positional row[0] / row[1] lookup is needed.
for token, tag in zip(smaller['a'], smaller['O']):
  sent.append(token)
  y_sent.append(tag)
  # Compare by value. `is` checks object identity and only matched "." because CPython
  # happens to cache one-character strings.
  if token == '.':
    sentences.append(sent)
    y_sentences.append(y_sent)
    sent = []
    y_sent = []

In [None]:
# Display the reconstructed sentences (a list of token lists) to check the splitting.
sentences

[['cél',
  ',',
  'hogy',
  'biztosít',
  ',',
  'hogy',
  'a',
  'korai',
  'szerző',
  'kilét',
  ',',
  'hozzájárulás',
  'mérték',
  ',',
  'a',
  'mű',
  'kapcsolatos',
  'üzleti',
  ',',
  'jogi',
  ',',
  'politikai',
  ',',
  'erkölcsi',
  'vagy',
  'filozófiai',
  'álláspont',
  'ne',
  'lehet',
  'eltitkol',
  'vagy',
  'meghamisít',
  'a',
  'későbbi',
  'változtatás',
  'során',
  '.'],
 ['figyelem',
  '!',
  'mivel',
  'nem',
  'jogász',
  'által',
  'készít',
  'hiteles',
  'fordítás',
  ',',
  'jogi',
  'szempont',
  'csak',
  'a',
  'eredeti',
  ',',
  'angol',
  'nyelvű',
  'licenc',
  'a',
  'mérvadó',
  '.'],
 ['a',
  'eredeti',
  '1',
  'változat',
  'a',
  'a',
  'különbség',
  'pedig',
  'a',
  'cím',
  'található',
  '.'],
 ['jelen',
  'licenc',
  'cél',
  'egy',
  'olyan',
  'kézikönyv',
  ',',
  'tankönyv',
  ',',
  'vagy',
  'effajta',
  'írott',
  'dokumentum',
  'megalkotás',
  ',',
  'mely',
  'a',
  'szó',
  'szoros',
  'értelem',
  '„',
  'szabad',
  '”',

In [None]:
# Every input must have the same length, so each sentence is right-padded up to the
# length of the longest one.
max_len = max(map(len, sentences))
# Token sequences are filled with the 'PAD' token (in place).
for sentence in sentences:
  sentence.extend(['PAD'] * (max_len - len(sentence)))
# Tag sequences are filled with 9, the id of the pad tag (in place).
for y in y_sentences:
  y.extend([9] * (max_len - len(y)))

In [None]:
# Replace each tag-id sequence with its one-hot encoding over the 10 tag classes.
# The list is updated slot by slot, so y_sentences keeps its identity.
for position, tags in enumerate(y_sentences):
  y_sentences[position] = to_categorical(tags, 10)

In [None]:
# Split the data 60 / 20 / 20 into train, validation and test sets.
# train_test_split only produces two parts, so it is applied twice.
# Step 1: hold out 20 % of all samples as the test set.
x_rest, x_test, y_rest, y_test = train_test_split(
    sentences, y_sentences, test_size=0.2, random_state=123
)
# Step 2: take 25 % of the remaining 80 % as validation data (0.2 / 0.8 == 0.25),
# which leaves 60 % of the original data for training.
x_train, x_val, y_train, y_val = train_test_split(
    x_rest, y_rest, test_size=0.25, random_state=123
)

In [None]:
# Each input sample is a fixed-length list of tokens: the lemmas that make up the sentence come first.
# They are followed by as many "PAD" tokens as are needed to give every input the same length
# (the length is max_len, computed from the longest sentence; the original note reports 128 for this run).
# This input could be used with a Bag of Words style model (where every word is first replaced by the number assigned to it), but an encoder layer will be more suitable.
print(x_train[0])

# Each target sample holds one one-hot vector of length 10 per token position (y_train itself is still a plain list).
print(y_train[0])

['a', 'viszonylag', 'vékony', 'profilú', '(', 'nagysebesség', ')', 'hordfelület', '(', 'szárny', ')', 'belépőéle', 'a', 'teljes', 'fesztáv', 'mentén', 'lehajt', ',', 'a', 'kilépőél', 'kétharmada', 'egymás', 'csúszik', 'kétrészes', 'fékszárny', 'van', 'ellát', '.', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
[[0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0.