## Importation of libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keras.models import Model,Sequential
from keras.layers import Input, LSTM, Dense, SimpleRNN, Embedding

# Input() is used to instantiate a Keras tensor.
# Dense implements the operation: output = activation(dot(input, kernel) + bias) 
# LSTM : Seq to Seq model 
# Model groups layers into an object with training and inference features 

# from keras.preprocessing.text import one_hot
# from keras.preprocessing.text import text_to_word_sequence

Using TensorFlow backend.


## Data Visualization And Encoding

In [3]:
# Load the datasets

fname_train = "data/finnish-task1-train"
fname_test = "data/finnish-task1-test"
fname_dev = "data/finnish-task1-dev"

train = np.loadtxt(fname_train,dtype = str)
test = np.loadtxt(fname_test,dtype = str)
dev = np.loadtxt(fname_dev,dtype = str)

In [4]:
print(train)
print(train.shape)

[['Ã¤Ã¤kkÃ¶stÃ¤Ã¤' 'pos=V,polar=POS,mood=IMP,tense=PRS,per=3,num=SG'
  'Ã¤Ã¤kkÃ¶stÃ¤kÃ¶Ã¶n']
 ['Ã¤Ã¤kkÃ¶stÃ¤Ã¤' 'pos=V,voice=ACT,aspect=PROSP'
  'Ã¤Ã¤kkÃ¶stÃ¤mÃ¤isillÃ¤Ã¤n']
 ['aalloittaisuus' 'pos=N,case=ON+ESS,num=PL' 'aalloittaisuuksilla']
 ...
 ['zoonoosi' 'pos=N,case=PRIV,num=SG' 'zoonoositta']
 ['zsaari' 'pos=N,case=IN+LAT,num=PL' 'zsaareihin']
 ['zumbata' 'pos=V,polar=POS,mood=POT,tense=PRS,per=2,num=PL'
  'zumbannette']]
(12693, 3)


In [5]:
print(test)
print(test.shape)

[['alkeiskoppi' 'pos=N,case=NOM,num=SG' 'alkeiskoppi']
 ['lenkkitossut' 'pos=N,case=ON+ESS,num=PL' 'lenkkitossuilla']
 ['baritonitorvi' 'pos=N,case=PRIV,num=SG' 'baritonitorvetta']
 ...
 ['katkeroida' 'pos=V,polar=POS,mood=IND,tense=PRS,per=3,num=SG'
  'katkeroi']
 ['paarmalintu' 'pos=N,case=TRANS,num=PL' 'paarmalinnuiksi']
 ['malisiÃ¶Ã¶si' 'pos=ADJ,case=IN+ABL,num=SG' 'malisiÃ¶Ã¶sistÃ¤']]
(23633, 3)


In [6]:
print(dev)
print(dev.shape)

[['aakkosto' 'pos=N,case=NOM,num=PL' 'aakkostot']
 ['aallottaa' 'pos=V,mood=PURP,voice=ACT' 'aallottaakseen']
 ['aaltoluku' 'pos=N,case=FRML,num=SG' 'aaltolukuna']
 ...
 ['ystÃ¤vÃ¤piiri' 'pos=N,case=ON+ABL,num=SG' 'ystÃ¤vÃ¤piiriltÃ¤']
 ['ytimekÃ¤s' 'pos=ADJ,case=ACC,num=SG' 'ytimekkÃ¤Ã¤n']
 ['zombi' 'pos=N,case=IN+ABL,num=PL' 'zombeista']]
(1598, 3)


In [7]:
## Definition of encoding functions

# Return the dictionary for a given list
def list_to_dict(data):  
    dic = {}
    for x in data:
        dic[x] = dic.get(x, len(dic))
    return dic

# Return the encoded array
def encode(data):
    dics = []
    for i in range(data.shape[1]):
        dic = list_to_dict(data[:,i])
        dics.append(dic)
        for j in range(len(data[:,i])):
            data[:,i][j] = dic[data[:,i][j]]
            
    return data,dics

In [8]:
# We also store the dictionary to do the decoding operation at the end
train,dics_train = encode(train) 
test,dics_test = encode(test)
dev,dics_dev = encode(dev)

In [9]:
x_train = train[:,0:2]
x_test = test[:,0:2]
x_dev = dev[:,0:2]

y_train = train[:,2]
y_test = test[:,2]
y_dev = dev[:,2]

In [10]:
print(x_train)
print(x_train.shape)

[['0' '0']
 ['0' '1']
 ['1' '2']
 ...
 ['9853' '21']
 ['9854' '24']
 ['9855' '43']]
(12693, 2)


In [44]:
print(y_train)
print(y_train.shape)

['0' '1' '2' ... '12675' '12676' '12677']
(12693,)


In [17]:
num_encoder_tokens = len(x_train[:,0])
latent_dim = 50

In [18]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [19]:
num_decoder_tokens = latent_dim

In [20]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the 
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [21]:
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [23]:
# Define meta-parameter

epochs = 200
batch_size = 100

In [26]:
# Run training

model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 2 array(s), but instead got the following list of 1 arrays: [array([['0', '0'],
       ['0', '1'],
       ['1', '2'],
       ...,
       ['9853', '21'],
       ['9854', '24'],
       ['9855', '43']], dtype='<U48')]...