In [None]:
import os
import numpy as np
import pandas as pd
import sys
import csv
import random
import tensorflow as tf
from keras import optimizers
from keras.callbacks import ModelCheckpoint, Callback
from keras.layers import Input, Embedding, LSTM, Dense, concatenate, dot, multiply, Lambda
from keras.models import Model, Sequential, load_model
from keras.layers.wrappers import TimeDistributed
from keras.utils import to_categorical
from keras import backend as K
from random import shuffle
from sklearn.utils import class_weight
from datetime import datetime
# Fix the seed of Python's built-in `random` module so runs are repeatable.
# NOTE(review): this is the only seeding call in the notebook itself; NumPy and
# TensorFlow are not seeded here, so model initialisation and training may
# still differ between runs - confirm whether that is intended.
random.seed(999)

In [None]:
from data_generator_generalization import *

## Generate data

In [None]:
# Build the train/test inputs and targets with the generator that is currently
# selected (presumably the one-hot encoding, going by the function name).
input_train, output_train, input_test, output_test = gen_hier_onehot(
    train_size=100,
    gen_type='ext',
    random_training=False,
)
# Alternative generators; uncomment exactly one of these instead of the call above.
#input_train, output_train, input_test, output_test = gen_hier_embful(train_size = 500, gen_type = 'ext')
#input_train, output_train, input_test, output_test = gen_hier_embred(train_size = 500, gen_type = 'ext')

## Run model

In [None]:
# Balanced class weights: target positions that are rare in the training set
# get a proportionally larger weight in the loss.
y_int = np.array([y.argmax() for y in output_train])
present_classes = np.unique(y_int)
# Keyword arguments are required by newer scikit-learn releases and are also
# accepted by older ones.
balanced_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                     classes=present_classes,
                                                     y=y_int)

# Keras expects one weight per output unit. Positions that never occur in the
# training targets get weight 0; every other position gets the weight computed
# for exactly that position. This covers both the regular setting (positions
# 0-1 unused) and the pseudorandom setting (positions 0-2 unused) without
# hard-coding which positions are present.
n_outputs = 9
class_weights = {position: 0.0 for position in range(n_outputs)}
for position, weight in zip(present_classes, balanced_weights):
    class_weights[int(position)] = float(weight)

# Read the (timesteps, features) input shape from the data so the network
# always fits the selected generator: (4, 74) for one-hot and reduced
# embeddings, (4, 364) for full embeddings.
vec_in = Input(shape=tuple(np.asarray(input_train).shape[1:]), dtype='float32', name='vec_in')
lstm = LSTM(100, return_sequences=False, name='lstm1')(vec_in)
target_pos = Dense(n_outputs, name='target_pos', activation='softmax')(lstm)
model = Model(inputs=vec_in,outputs=target_pos)
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(input_train,output_train, steps_per_epoch=100, epochs=50,verbose=1,class_weight=class_weights)

## Save predictions on test set (one-hot)

In [None]:
# Decode every test example (phrase + picture) from its one-hot encoding and
# collect it, together with the target vector and the model's output
# distribution, in one table `phrase_and_pic` (one row per test example).
predictions = model.predict(input_test)
# NOTE(review): written verbatim into the `train_size` column; the active
# data-generation call in this notebook uses train_size = 100 - confirm which
# value should be recorded.
train_size = 500

# Column order of the result table (and of any CSV written from it).
pred_columns = (['ordinal', 'color', 'shape']
                + [str(slot) for slot in range(1, 9)]
                + ['o%d' % unit for unit in range(1, 10)]
                + ['p%d' % unit for unit in range(1, 10)]
                + ['index', 'max', 'correctness', 'answer_item', 'train_size'])

# First 6 entries of the ordinal word -> (word, position under a linear reading).
# NOTE(review): the first pattern is labelled 'seventh' but paired with
# position 1; kept as found - confirm against the generator's vocabulary order.
onehot_ordinals = {
    (1.0, 0.0, 0.0, 0.0, 0.0, 0.0): ('seventh', 1),
    (0.0, 1.0, 0.0, 0.0, 0.0, 0.0): ('second', 2),
    (0.0, 0.0, 1.0, 0.0, 0.0, 0.0): ('third', 3),
    (0.0, 0.0, 0.0, 1.0, 0.0, 0.0): ('fourth', 4),
    (0.0, 0.0, 0.0, 0.0, 1.0, 0.0): ('fifth', 5),
    (0.0, 0.0, 0.0, 0.0, 0.0, 1.0): ('sixth', 6),
}
# First 9 entries of the colour word -> colour name.
onehot_colors = {
    (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0): 'blue',
    (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0): 'green',
    (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0): 'red',
}
# First 10 entries of the shape word when the shape is 'ball'.
onehot_ball = (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0)
# Each picture item is 4 colour entries followed by 4 shape entries.
pic_colors = {
    (0.25, 0.0, 0.0, 0.0): 'blue',
    (0.0, 0.25, 0.0, 0.0): 'green',
    (0.0, 0.0, 0.25, 0.0): 'red',
}
pic_ball = (0.0, 0.0, 0.0, 0.25)

# Rows are collected as plain dicts and turned into a DataFrame once at the
# end; this avoids chained assignment and the removed DataFrame.append.
pred_rows = []
for runs in range(len(input_test)):
    probs = predictions[runs]
    target = output_test[runs]
    words = input_test[runs]

    # 1-based position with the highest predicted probability, and that probability
    i_answer = int(np.argmax(probs)) + 1
    p_answer = probs[i_answer - 1]

    ## the phrase
    ordinal, lin_ans = onehot_ordinals.get(tuple(map(float, words[0][:6])), ('ordinal?', None))
    # an unrecognised colour gets an explicit placeholder instead of silently
    # reusing the value left over from the previous example
    color = onehot_colors.get(tuple(map(float, words[1][:9])), 'color?')
    shape = 'ball' if tuple(map(float, words[2][:10])) == onehot_ball else 'shape?'

    # check what kind of answer the model produced
    if i_answer != int(np.argmax(target)) + 1:
        synt_ans = 'error'
    elif target[8] == 1:
        # ninth output unit set: the target is absent from the picture
        synt_ans = 'absence correct'
    else:
        synt_ans = 'hierarchical'

    ## the picture: the entries after the first 10 of the fourth input row,
    ## 8 items of 8 entries each
    picnum = words[3][10:]
    items = []
    for start in range(0, 64, 8):
        color_pic = pic_colors.get(tuple(map(float, picnum[start:start + 4])), 'color?')
        shape_pic = 'ball' if tuple(map(float, picnum[start + 4:start + 8])) == pic_ball else 'shape?'
        items.append([color_pic, shape_pic])

    row = {'ordinal': ordinal, 'color': color, 'shape': shape}
    for slot, item in enumerate(items, start=1):
        row[str(slot)] = item
    for unit in range(9):
        row['o%d' % (unit + 1)] = target[unit]
        row['p%d' % (unit + 1)] = probs[unit]
    row['index'] = i_answer
    row['max'] = p_answer
    row['correctness'] = synt_ans
    # positions 1-8 name a picture item; position 9 means "target absent"
    row['answer_item'] = items[i_answer - 1] if i_answer < 9 else 'target absent'
    row['train_size'] = train_size
    pred_rows.append(row)

phrase_and_pic = pd.DataFrame(pred_rows, columns=pred_columns)

print(phrase_and_pic)
#phrase_and_pic.to_csv('data/generalization/extrapolation/h_gen1_pred_500thirdred_onehot.csv', index=False)
#phrase_and_pic.to_csv('data/generalization/interpolation/h_gen2_pred_500thirdred_onehot.csv', index=False)

## Save predictions on test set (full embeddings)

In [None]:
# Decode every test example (phrase + picture) by matching its word vectors
# against the full word embeddings and collect it, together with the target
# vector and the model's output distribution, in one table `phrase_and_pic`
# (one row per test example).
predictions = model.predict(input_test)
# NOTE(review): written verbatim into the `train_size` column; the active
# data-generation call in this notebook uses train_size = 100 - confirm which
# value should be recorded.
train_size = 500
# number of leading entries of each input row that hold the word embedding
emb_dim = 300

# words in word2vec representation (full embeddings)
myembeddings = pd.read_csv('word2vec/embeddings.csv', header = 0)['embedding']
# Row order of the embedding file as this notebook reads it.
embedding_words = ['second', 'third', 'fourth', 'fifth', 'sixth', 'seventh',
                   'blue', 'green', 'red', 'ball']
# Parse each stored embedding (presumably a string of numbers) into a flat float array.
word_vectors = {word: np.array(np.matrix(myembeddings[row])).ravel()
                for row, word in enumerate(embedding_words)}
# ordinal word -> position under a linear reading
ordinal_positions = [('second', 2), ('third', 3), ('fourth', 4),
                     ('fifth', 5), ('sixth', 6), ('seventh', 7)]

# Column order of the result table (and of any CSV written from it).
# 'p9' is included because the network has nine output units.
pred_columns = (['ordinal', 'color', 'shape']
                + [str(slot) for slot in range(1, 9)]
                + ['o%d' % unit for unit in range(1, 10)]
                + ['p%d' % unit for unit in range(1, 10)]
                + ['index', 'max', 'correctness', 'answer_item', 'train_size'])

# Each picture item is 4 colour entries followed by 4 shape entries.
pic_colors = {
    (0.25, 0.0, 0.0, 0.0): 'blue',
    (0.0, 0.25, 0.0, 0.0): 'green',
    (0.0, 0.0, 0.25, 0.0): 'red',
}
pic_ball = (0.0, 0.0, 0.0, 0.25)

# Rows are collected as plain dicts and turned into a DataFrame once at the
# end; this avoids chained assignment and the removed DataFrame.append.
pred_rows = []
for runs in range(len(input_test)):
    probs = predictions[runs]
    target = output_test[runs]
    words = input_test[runs]

    # 1-based position with the highest predicted probability, and that probability
    i_answer = int(np.argmax(probs)) + 1
    p_answer = probs[i_answer - 1]

    ## the phrase
    ordnum = np.asarray(words[0])[:emb_dim]
    colnum = np.asarray(words[1])[:emb_dim]
    shapenum = np.asarray(words[2])[:emb_dim]

    # save what the ordinal was (exact match; a length mismatch counts as no match)
    ordinal, lin_ans = 'ordinal?', None
    for word, position in ordinal_positions:
        if np.array_equal(ordnum, word_vectors[word]):
            ordinal, lin_ans = word, position
            break

    # color of target phrase
    color = 'color?'
    for word in ('blue', 'green', 'red'):
        if np.array_equal(colnum, word_vectors[word]):
            color = word
            break

    # shape of target phrase
    shape = 'ball' if np.array_equal(shapenum, word_vectors['ball']) else 'shape?'

    # check what kind of answer the model produced
    if i_answer != int(np.argmax(target)) + 1:
        synt_ans = 'error'
    elif target[8] == 0:
        synt_ans = 'hierarchical'
    else:
        # ninth output unit set: the target is absent from the picture
        synt_ans = 'absence correct'

    ## the picture: the entries after the embedding part of the fourth input
    ## row, 8 items of 8 entries each
    picnum = words[3][emb_dim:]
    items = []
    for start in range(0, 64, 8):
        # an unrecognised colour gets an explicit placeholder instead of
        # silently reusing the colour of the previous item
        color_pic = pic_colors.get(tuple(map(float, picnum[start:start + 4])), 'color?')
        shape_pic = 'ball' if tuple(map(float, picnum[start + 4:start + 8])) == pic_ball else 'shape?'
        items.append([color_pic, shape_pic])

    row = {'ordinal': ordinal, 'color': color, 'shape': shape}
    for slot, item in enumerate(items, start=1):
        row[str(slot)] = item
    for unit in range(9):
        row['o%d' % (unit + 1)] = target[unit]
        row['p%d' % (unit + 1)] = probs[unit]
    row['index'] = i_answer
    row['max'] = p_answer
    row['correctness'] = synt_ans
    # positions 1-8 name a picture item; position 9 means "target absent"
    row['answer_item'] = items[i_answer - 1] if i_answer < 9 else 'target absent'
    row['train_size'] = train_size
    pred_rows.append(row)

phrase_and_pic = pd.DataFrame(pred_rows, columns=pred_columns)

print(phrase_and_pic)

#phrase_and_pic.to_csv('data/generalization/extrapolation/h_gen1_pred_500thirdred_embful.csv', index=False)
#phrase_and_pic.to_csv('data/generalization/interpolation/h_gen2_pred_500thirdred_embful.csv', index=False)

## Save predictions on test set (reduced embeddings)

In [None]:
# Decode every test example (phrase + picture) by matching its word vectors
# against the dimensionality-reduced embeddings and collect it, together with
# the target vector and the model's output distribution, in one table
# `phrase_and_pic` (one row per test example).
predictions = model.predict(input_test)
# NOTE(review): written verbatim into the `train_size` column; the active
# data-generation call in this notebook uses train_size = 100 - confirm which
# value should be recorded.
train_size = 500
# number of leading entries of each input row that hold the word embedding
emb_dim = 10

# load dimensionality-reduced embeddings
myembeddings = pd.read_csv('word2vec/reduced_embeddings.csv', header = 0)['embedding']
# Row order of the embedding file as this notebook reads it.
embedding_words = ['second', 'third', 'fourth', 'fifth', 'sixth', 'seventh',
                   'blue', 'green', 'red', 'ball']
# Parse each stored embedding (presumably a string of numbers) into a flat float array.
word_vectors = {word: np.array(np.matrix(myembeddings[row])).ravel()
                for row, word in enumerate(embedding_words)}
# ordinal word -> position under a linear reading
ordinal_positions = [('second', 2), ('third', 3), ('fourth', 4),
                     ('fifth', 5), ('sixth', 6), ('seventh', 7)]

# Column order of the result table (and of any CSV written from it).
# 'p9' is included because the network has nine output units.
pred_columns = (['ordinal', 'color', 'shape']
                + [str(slot) for slot in range(1, 9)]
                + ['o%d' % unit for unit in range(1, 10)]
                + ['p%d' % unit for unit in range(1, 10)]
                + ['index', 'max', 'correctness', 'answer_item', 'train_size'])

# Each picture item is 4 colour entries followed by 4 shape entries.
pic_colors = {
    (0.25, 0.0, 0.0, 0.0): 'blue',
    (0.0, 0.25, 0.0, 0.0): 'green',
    (0.0, 0.0, 0.25, 0.0): 'red',
}
pic_ball = (0.0, 0.0, 0.0, 0.25)

# Rows are collected as plain dicts and turned into a DataFrame once at the
# end; this avoids chained assignment and the removed DataFrame.append.
pred_rows = []
for runs in range(len(input_test)):
    probs = predictions[runs]
    target = output_test[runs]
    words = input_test[runs]

    # 1-based position with the highest predicted probability, and that probability
    i_answer = int(np.argmax(probs)) + 1
    p_answer = probs[i_answer - 1]

    ## the phrase
    ordnum = np.asarray(words[0])[:emb_dim]
    colnum = np.asarray(words[1])[:emb_dim]
    shapenum = np.asarray(words[2])[:emb_dim]

    # save what the ordinal was (exact match; a length mismatch counts as no match)
    ordinal, lin_ans = 'ordinal?', None
    for word, position in ordinal_positions:
        if np.array_equal(ordnum, word_vectors[word]):
            ordinal, lin_ans = word, position
            break

    # color of target phrase
    color = 'color?'
    for word in ('blue', 'green', 'red'):
        if np.array_equal(colnum, word_vectors[word]):
            color = word
            break

    # shape of target phrase
    shape = 'ball' if np.array_equal(shapenum, word_vectors['ball']) else 'shape?'

    # check what kind of answer the model produced
    if i_answer != int(np.argmax(target)) + 1:
        synt_ans = 'error'
    elif target[8] == 0:
        synt_ans = 'hierarchical'
    else:
        # ninth output unit set: the target is absent from the picture
        synt_ans = 'absence correct'

    ## the picture: the entries after the embedding part of the fourth input
    ## row, 8 items of 8 entries each
    picnum = words[3][emb_dim:]
    items = []
    for start in range(0, 64, 8):
        # an unrecognised colour gets an explicit placeholder instead of
        # silently reusing the colour of the previous item
        color_pic = pic_colors.get(tuple(map(float, picnum[start:start + 4])), 'color?')
        shape_pic = 'ball' if tuple(map(float, picnum[start + 4:start + 8])) == pic_ball else 'shape?'
        items.append([color_pic, shape_pic])

    row = {'ordinal': ordinal, 'color': color, 'shape': shape}
    for slot, item in enumerate(items, start=1):
        row[str(slot)] = item
    for unit in range(9):
        row['o%d' % (unit + 1)] = target[unit]
        row['p%d' % (unit + 1)] = probs[unit]
    row['index'] = i_answer
    row['max'] = p_answer
    row['correctness'] = synt_ans
    # positions 1-8 name a picture item; position 9 means "target absent"
    row['answer_item'] = items[i_answer - 1] if i_answer < 9 else 'target absent'
    row['train_size'] = train_size
    pred_rows.append(row)

phrase_and_pic = pd.DataFrame(pred_rows, columns=pred_columns)

print(phrase_and_pic)

#phrase_and_pic.to_csv('data/generalization/extrapolation/h_gen1_pred_500thirdred_embred.csv', index=False)
#phrase_and_pic.to_csv('data/generalization/interpolation/h_gen2_pred_500thirdred_embred.csv', index=False)