## Importation of libraries

In [597]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keras.models import Model,Sequential
from keras.layers import Input, LSTM, Dense, SimpleRNN, Embedding
from tensorflow.keras.utils import to_categorical
import copy as c

# Input() is used to instantiate a Keras tensor.
# Dense implements the operation: output = activation(dot(input, kernel) + bias) 
# LSTM : Seq to Seq model 
# Model groups layers into an object with training and inference features 

# from keras.preprocessing.text import one_hot
# from keras.preprocessing.text import text_to_word_sequence

## Data Visualization And Encoding

In [598]:
# Load the datasets

fname_train = "data/finnish-task1-train"
fname_test = "data/finnish-task1-test"
fname_dev = "data/finnish-task1-dev"

dataset_train = np.loadtxt(fname_train,dtype = str)
dataset_test = np.loadtxt(fname_test,dtype = str)
dataset_dev = np.loadtxt(fname_dev,dtype = str)

In [599]:
print(dataset_train)
print(dataset_train.shape)

# print(np.where(dataset_ttrain == 'Ã¤Ã¤kkÃ¶stÃ¤Ã¤' )[0])

[['Ã¤Ã¤kkÃ¶stÃ¤Ã¤' 'pos=V,polar=POS,mood=IMP,tense=PRS,per=3,num=SG'
  'Ã¤Ã¤kkÃ¶stÃ¤kÃ¶Ã¶n']
 ['Ã¤Ã¤kkÃ¶stÃ¤Ã¤' 'pos=V,voice=ACT,aspect=PROSP'
  'Ã¤Ã¤kkÃ¶stÃ¤mÃ¤isillÃ¤Ã¤n']
 ['aalloittaisuus' 'pos=N,case=ON+ESS,num=PL' 'aalloittaisuuksilla']
 ...
 ['zoonoosi' 'pos=N,case=PRIV,num=SG' 'zoonoositta']
 ['zsaari' 'pos=N,case=IN+LAT,num=PL' 'zsaareihin']
 ['zumbata' 'pos=V,polar=POS,mood=POT,tense=PRS,per=2,num=PL'
  'zumbannette']]
(12693, 3)


In [600]:
print(dataset_test)
print(dataset_test.shape)
# print(np.where(test == 'Ã¤Ã¤kkÃ¶stÃ¤Ã¤' )[0])
# print(test[21533])

[['alkeiskoppi' 'pos=N,case=NOM,num=SG' 'alkeiskoppi']
 ['lenkkitossut' 'pos=N,case=ON+ESS,num=PL' 'lenkkitossuilla']
 ['baritonitorvi' 'pos=N,case=PRIV,num=SG' 'baritonitorvetta']
 ...
 ['katkeroida' 'pos=V,polar=POS,mood=IND,tense=PRS,per=3,num=SG'
  'katkeroi']
 ['paarmalintu' 'pos=N,case=TRANS,num=PL' 'paarmalinnuiksi']
 ['malisiÃ¶Ã¶si' 'pos=ADJ,case=IN+ABL,num=SG' 'malisiÃ¶Ã¶sistÃ¤']]
(23633, 3)


In [601]:
print(dataset_dev)
print(dataset_dev.shape)

[['aakkosto' 'pos=N,case=NOM,num=PL' 'aakkostot']
 ['aallottaa' 'pos=V,mood=PURP,voice=ACT' 'aallottaakseen']
 ['aaltoluku' 'pos=N,case=FRML,num=SG' 'aaltolukuna']
 ...
 ['ystÃ¤vÃ¤piiri' 'pos=N,case=ON+ABL,num=SG' 'ystÃ¤vÃ¤piiriltÃ¤']
 ['ytimekÃ¤s' 'pos=ADJ,case=ACC,num=SG' 'ytimekkÃ¤Ã¤n']
 ['zombi' 'pos=N,case=IN+ABL,num=PL' 'zombeista']]
(1598, 3)


In [602]:
## Definition of encoding functions

# Return the dictionary for a given list
def list_to_dict(data):  
    dic = {}
    for x in data:
        dic[x] = dic.get(x, len(dic))
    return dic

# Return the encoded array
def encode(data):
    dics = []
    for i in range(data.shape[1]):
        dic = list_to_dict(data[:,i])
        dics.append(dic)
        for j in range(len(data[:,i])):
            data[:,i][j] = dic[data[:,i][j]]
            
    return data.astype(np.int),dics

# def concatenate(data):
#     return np.transpose(np.asarray([np.core.defchararray.add(data[:,0], data[:,1]),data[:,0]]))

In [603]:
# Encoding 
train,dics_train = encode(dataset_train) 
# test,dics_test = encode(dataset_ttest)
# dev,dics_dev = encode(dataset_tdev)

In [604]:
print(train)
print(train.shape)

[[    0     0     0]
 [    0     1     1]
 [    1     2     2]
 ...
 [ 9853    21 12675]
 [ 9854    24 12676]
 [ 9855    43 12677]]
(12693, 3)


In [605]:
x_train = train[:,0:2]
y_train = train[:,2]

In [606]:
print(x_train)
print(x_train.shape)

[[   0    0]
 [   0    1]
 [   1    2]
 ...
 [9853   21]
 [9854   24]
 [9855   43]]
(12693, 2)


In [607]:
print(y_train)
print(y_train.shape)

[    0     1     2 ... 12675 12676 12677]
(12693,)


**Whenever we’re working with categorical data, we don’t want to leave it as integers because the model will interpreted the samples with a higher number as having more significance. to_categorical is quick and dirty way of encoding the data.**

In [608]:
# x_train = to_categorical(train)
# x_test = to_categorical(test)
# x_dev = to_categorical(dev)

# y_train = to_categorical(train)
# y_test = to_categorical(test)
# y_dev = to_categorical(dev)

# print(x_train)

In [609]:
# Define meta-parameter

epochs = 30
batch_size = 500

In [610]:
# Update the encoding dictionaries (word-context-target)

def update_dics(data,dics):
    new_dics = []
    for i in range((data.shape[1])):
        dic = dics[i]
        for el in data[:,i]:
            if el not in list(dic.keys()): 
                dic[el] = max(list(dic.values())) + 1 # add word to dictionary
                
        new_dics.append(dic)
        
    return new_dics

# Encode data with given dictionarries

def encode_with_dict(data,dics):
    copy = c.deepcopy(data)
    for i in range((data.shape[1])):
        dic = dics[i]
        copy[:,i] = np.asarray([dic[el] for el in data[:,i]]) 
    return copy.astype(np.int)

In [611]:
# Update dictionarry
dics = update_dics(dataset_test,dics_train)
dics = update_dics(dataset_dev,dics)

In [612]:
# Encoding test and dev
test = encode_with_dict(dataset_test,dics)
dev = encode_with_dict(dataset_dev,dics)

In [613]:
print(test)

[[  289    19 12678]
 [ 9856     2 12679]
 [ 9857    21 12680]
 ...
 [ 3365    35 36272]
 [18939    41 36273]
 [14794    88 36274]]


In [614]:
print(dev)

[[17980     9 36275]
 [11854    59 36276]
 [10927    37 36277]
 ...
 [19296    14 37848]
 [ 9841    48 37849]
 [19297    18 37850]]


In [615]:
x_test = test[:,0:2]
x_dev = dev[:,0:2]
y_test = test[:,2]
y_dev = dev[:,2]


In [616]:
max_words = len(list(dics[0].keys()))    # size of vocabulary
max_len =  len(list(dics[-1].keys()))    # lenght of input sequence

In [617]:
print(x_test)

[[  289    19]
 [ 9856     2]
 [ 9857    21]
 ...
 [ 3365    35]
 [18939    41]
 [14794    88]]


In [618]:
model_lstm = Sequential()

In [619]:
model_lstm.add(Embedding(input_dim = max_words, output_dim = 100,input_length = 2))
# model_lstm.add(SpatialDropout1D(0.3))
model_lstm.add(LSTM(100)) #dropout = 0.3,recurrent_dropout = 0.3
model_lstm.add(Dense(100, activation = 'relu'))
# model_lstm.add(Dropout(0.3))
model_lstm.add(Dense(max_len, activation = 'softmax'))

In [620]:
model_lstm.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy']
)

In [621]:
history = model_lstm.fit(
    x_train,
    y_train,
    epochs = epochs,
    batch_size = batch_size
)



Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [622]:
def prediction(data,size = 20):
    return np.argmax(model_lstm.predict(data[:size]),axis = 1)


# function to return key for any value
def get_key(val,my_dict):
    for key, value in my_dict.items():
         if val == value:
             return key
    return "key doesn't exist"

# Return words by specifing the index and the dictionary -> decoding
def return_pred(y_pred_index,dic):
    return np.asarray([get_key(index,dic) for index in y_pred_index])

def acc(y_pred,y):
    return np.sum(y == y_pred)/len(y)



In [623]:
# Decoding and Compute train accuracy 

size = 20
dic_x = dics[0]
dic_y = dics[-1]
y_pred_index = prediction(x_train,size = size)
y_pred = return_pred(y_pred_index,dic_x)
y = return_pred(y_train,dic_x)

print("Training acuracy: ",acc(y_pred,y[:size]))

Training acuracy:  0.95


In [624]:
# plt.clf()
# loss = history.history['loss']
# val_loss = history.history['val_loss']
# epochs = range(1, len(loss) + 1)
# plt.plot(epochs, loss, 'g', label='Training loss')
# plt.plot(epochs, val_loss, 'y', label='Validation loss')
# plt.title('Training and validation loss')
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.legend()
# plt.show()

In [625]:
print(x_train)
print(x_test)

[[   0    0]
 [   0    1]
 [   1    2]
 ...
 [9853   21]
 [9854   24]
 [9855   43]]
[[  289    19]
 [ 9856     2]
 [ 9857    21]
 ...
 [ 3365    35]
 [18939    41]
 [14794    88]]


In [626]:
# Decoding and Compute test accuracy 

size = 20
dic_x = dics[0]
dic_y = dics[-1]
y_pred_index = prediction(x_test,size)
y_pred = return_pred(y_pred_index,dic_x)
y = return_pred(y_test,dic_x)

print("Test acuracy: ",acc(y_pred,y[:size]))

Test acuracy:  0.0


In [632]:
print(y.tolist().count("key doesn't exist"))

0


In [631]:
print(len(y))

23633
