## Importation of libraries

In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# from keras.preprocessing.text import one_hot
# from keras.preprocessing.text import text_to_word_sequence

## Data Visualization And Encoding

In [36]:
# Load the datasets

fname_train = "data/finnish-task1-train"
fname_test = "data/finnish-task1-test"
fname_dev = "data/finnish-task1-dev"

train = np.loadtxt(fname_train,dtype = str)
test = np.loadtxt(fname_test,dtype = str)
dev = np.loadtxt(fname_dev,dtype = str)

In [37]:
print(train)
print(train.shape)

[['Ã¤Ã¤kkÃ¶stÃ¤Ã¤' 'pos=V,polar=POS,mood=IMP,tense=PRS,per=3,num=SG'
  'Ã¤Ã¤kkÃ¶stÃ¤kÃ¶Ã¶n']
 ['Ã¤Ã¤kkÃ¶stÃ¤Ã¤' 'pos=V,voice=ACT,aspect=PROSP'
  'Ã¤Ã¤kkÃ¶stÃ¤mÃ¤isillÃ¤Ã¤n']
 ['aalloittaisuus' 'pos=N,case=ON+ESS,num=PL' 'aalloittaisuuksilla']
 ...
 ['zoonoosi' 'pos=N,case=PRIV,num=SG' 'zoonoositta']
 ['zsaari' 'pos=N,case=IN+LAT,num=PL' 'zsaareihin']
 ['zumbata' 'pos=V,polar=POS,mood=POT,tense=PRS,per=2,num=PL'
  'zumbannette']]
(12693, 3)


In [38]:
print(test)
print(test.shape)

[['alkeiskoppi' 'pos=N,case=NOM,num=SG' 'alkeiskoppi']
 ['lenkkitossut' 'pos=N,case=ON+ESS,num=PL' 'lenkkitossuilla']
 ['baritonitorvi' 'pos=N,case=PRIV,num=SG' 'baritonitorvetta']
 ...
 ['katkeroida' 'pos=V,polar=POS,mood=IND,tense=PRS,per=3,num=SG'
  'katkeroi']
 ['paarmalintu' 'pos=N,case=TRANS,num=PL' 'paarmalinnuiksi']
 ['malisiÃ¶Ã¶si' 'pos=ADJ,case=IN+ABL,num=SG' 'malisiÃ¶Ã¶sistÃ¤']]
(23633, 3)


In [39]:
print(dev)
print(dev.shape)

[['aakkosto' 'pos=N,case=NOM,num=PL' 'aakkostot']
 ['aallottaa' 'pos=V,mood=PURP,voice=ACT' 'aallottaakseen']
 ['aaltoluku' 'pos=N,case=FRML,num=SG' 'aaltolukuna']
 ...
 ['ystÃ¤vÃ¤piiri' 'pos=N,case=ON+ABL,num=SG' 'ystÃ¤vÃ¤piiriltÃ¤']
 ['ytimekÃ¤s' 'pos=ADJ,case=ACC,num=SG' 'ytimekkÃ¤Ã¤n']
 ['zombi' 'pos=N,case=IN+ABL,num=PL' 'zombeista']]
(1598, 3)


In [40]:
## Definition of encoding functions

# Return the dictionary for a given list
def list_to_dict(data):  
    dic = {}
    for x in data:
        dic[x] = dic.get(x, len(dic))
    return dic

# Return the encoded array
def encode(data):
    dics = []
    for i in range(data.shape[1]):
        dic = list_to_dict(data[:,i])
        dics.append(dic)
        for j in range(len(data[:,i])):
            data[:,i][j] = dic[data[:,i][j]]
            
    return data,dics

In [41]:
train,dics_train = encode(train)
test,dics_test = encode(test)
dev,dics_dev = encode(dev)

In [42]:
x_train = train[:,0:2]
x_test = test[:,0:2]
x_dev = dev[:,0:2]

y_train = train[:,2]
y_test = test[:,2]
y_dev = dev[:,2]

In [43]:
print(x_train)
print(x_train.shape)

[['0' '0']
 ['0' '1']
 ['1' '2']
 ...
 ['9853' '21']
 ['9854' '24']
 ['9855' '43']]
(12693, 2)


In [44]:
print(y_train)
print(y_train.shape)

['0' '1' '2' ... '12675' '12676' '12677']
(12693,)


In [46]:
print(dics_train[0]) # Display the encoding dictionary used for first column, we store it to do the decoding operation

{'Ã¤Ã¤kkÃ¶stÃ¤Ã¤': 0, 'aalloittaisuus': 1, 'aallokas': 2, 'aalloppi': 3, 'aaltoilla': 4, 'aaltomuoto': 5, 'aaltopelti': 6, 'aaltopituus': 7, 'aalto': 8, 'aaltosulkumerkki': 9, 'aamiaishetki': 10, 'aamiaistaa': 11, 'aamuhetki': 12, 'aamukahvi': 13, 'aamuntorkku': 14, 'aamupuku': 15, 'aamurusko': 16, 'aamutorkku': 17, 'aamutossu': 18, 'Ã¤Ã¤nekÃ¤s': 19, 'Ã¤Ã¤nennopeus': 20, 'Ã¤Ã¤nestyskoppi': 21, 'Ã¤Ã¤nettÃ¶myys': 22, 'Ã¤Ã¤nilevystÃ¶': 23, 'Ã¤Ã¤nioikeus': 24, 'Ã¤Ã¤nittÃ¤Ã¤': 25, 'Ã¤Ã¤nivalli': 26, 'Ã¤Ã¤nnÃ¤htÃ¤Ã¤': 27, 'Ã¤Ã¤nnellÃ¤': 28, 'Ã¤Ã¤ntÃ¶': 29, 'aaprotti': 30, 'Ã¤Ã¤rellisyys': 31, 'Ã¤Ã¤rivasemmisto': 32, 'aarni': 33, 'aarporata': 34, 'aarrelÃ¶ytÃ¶': 35, 'aasianjokipÃ¤Ã¤sky': 36, 'aasiankilpikierto': 37, 'aasiankiuru': 38, 'aasianmarmorimurri': 39, 'aasiansieppokerttu': 40, 'aasiantÃ¶rmÃ¤pÃ¤Ã¤sky': 41, 'aasianuuttukyyhky': 42, 'aasianvuoripeippo': 43, 'aasinmaksaruoho': 44, 'aasi': 45, 'aataminpuku': 46, 'aatelisarvo': 47, 'aatelisneito': 48, 'aatelissuku': 49, 'aatella': 50, 'aat