In [18]:
import argparse
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Dropout
from keras.utils import np_utils, generic_utils
np.random.seed(2018)  # for reproducibility and comparability, don't change!
import json
from sklearn.preprocessing import label_binarize

In [1]:
# Load noun-noun compound data
def load_data(embeddings_path='embeddings.json', train_path='training_data.tsv',
		test_path='test_data_clean.tsv', dev_size=3066):
	"""Load embeddings plus the training, development and test sets.

	Each non-blank line of the TSV files is split on tabs; the first two
	fields are looked up with get_embedding() and the two results are joined
	with ``+`` (list concatenation, assuming the JSON stores each embedding as
	a list -- TODO confirm against the embeddings file). In the training
	file the third field is the class label.

	Args:
		embeddings_path: JSON file mapping words to embedding vectors.
		train_path: TSV file with ``word1<TAB>word2<TAB>label`` per line.
		test_path: TSV file with at least ``word1<TAB>word2`` per line.
		dev_size: number of trailing training rows held out as the
			development set.

	Returns:
		Tuple ``(X_train, X_dev, X_test, Y_train, Y_dev, classes)`` where the
		X/Y values are numpy arrays, Y rows are binarized labels and
		``classes`` is the sorted list of distinct label strings.

	Raises:
		ValueError: if ``dev_size`` is negative.
	"""
	if dev_size < 0:
		raise ValueError('dev_size must be non-negative')
	print("Loading data...")
	# Embeddings (context manager so the file handle is closed promptly)
	with open(embeddings_path, 'r') as f:
		embeddings = json.load(f)

	def featurize(fields):
		# Feature vector = embedding of word 1 followed by embedding of word 2
		return get_embedding(fields[0], embeddings) + get_embedding(fields[1], embeddings)

	# Training and development data
	X_train = []
	Y_train = []
	with open(train_path, 'r') as f:
		for line in f:
			stripped = line.strip()
			if not stripped:
				# Tolerate blank lines (e.g. a trailing newline at end of file)
				continue
			fields = stripped.split('\t')
			X_train.append(featurize(fields))
			Y_train.append(fields[2])
	classes = sorted(set(Y_train))
	X_train = np.array(X_train)
	# Convert string labels to one-hot vectors; `classes` must be passed by
	# keyword because recent scikit-learn releases reject it positionally
	Y_train = np.array(label_binarize(Y_train, classes=classes))
	# Split off development set from the end of the training data. An explicit
	# split index (rather than negative slicing) keeps dev_size=0 meaningful.
	split_at = max(len(X_train) - dev_size, 0)
	X_dev = X_train[split_at:]
	Y_dev = Y_train[split_at:]
	X_train = X_train[:split_at]
	Y_train = Y_train[:split_at]
	print(len(X_train), 'training instances')
	print(len(X_dev), 'develoment instances')
	# Test data (labels are not read from this file)
	X_test = []
	with open(test_path, 'r') as f:
		for line in f:
			stripped = line.strip()
			if not stripped:
				continue
			X_test.append(featurize(stripped.split('\t')))
	X_test = np.array(X_test)
	print(len(X_test), 'test instances')

	return X_train, X_dev, X_test, Y_train, Y_dev, classes

In [2]:
def get_embedding(word, embeddings):
	"""Return the embedding stored for ``word``, or the 'UNK' entry if absent.

	The lookup key is the lower-cased word (the original author notes that the
	GloVe vocabulary is lower case only). A KeyError propagates if the word is
	missing and the mapping has no 'UNK' entry.
	"""
	key = word.lower()
	if key in embeddings:
		return embeddings[key]
	# Unknown word: fall back to the shared out-of-vocabulary vector
	return embeddings['UNK']

In [4]:
# Build confusion matrix with matplotlib	
def create_confusion_matrix(true, pred, labels=None):
	"""Plot a row-normalized confusion matrix with matplotlib.

	Args:
		true: sequence of gold label names.
		pred: sequence of predicted label names, aligned with ``true``.
		labels: ordered label names for the matrix axes. When omitted, the
			notebook-level ``classes`` variable is used, as before.

	Each row is divided by its total so that cells show the fraction of the
	true class assigned to each predicted class. Rows for labels that never
	occur in ``true`` are left as zeros instead of dividing by zero.
	"""
	import matplotlib.pyplot as plt
	from sklearn.metrics import confusion_matrix
	if labels is None:
		# Backward-compatible fallback to the global defined by the data-loading cell
		labels = classes
	labels = list(labels)
	# Build matrix
	cm = confusion_matrix(true, pred, labels = labels).astype('float')
	row_totals = cm.sum(axis=1, keepdims=True)
	# Guarded division: rows with no true instances stay 0 rather than NaN
	cm = np.divide(cm, row_totals, out=np.zeros_like(cm), where=row_totals != 0)
	# Make plot
	plt.imshow(cm, interpolation = 'nearest', cmap=plt.cm.Blues)
	tick_marks = np.arange(len(labels))
	plt.xticks(tick_marks, labels, rotation=90)
	plt.xlabel('Predicted label')
	plt.yticks(tick_marks, labels)
	plt.ylabel('True label')
	plt.show()

In [5]:
# Load the train/dev/test arrays and the sorted label inventory
X_train, X_dev, X_test, Y_train, Y_dev, classes = load_data()
# Input width and number of output classes, read off the second array axis
nb_features = X_train.shape[1]
nb_classes = Y_train.shape[1]
print(nb_features, 'features')
print(nb_classes, 'classes')

Loading data...
12261 training instances
3066 develoment instances
3831 test instances
600 features
37 classes


In [95]:
# Training hyperparameters passed to model.fit / model.predict
epochs, batch_size = 25, 64
# Run index, used to tag the saved test-set prediction file
run = 1

In [99]:
model = Sequential()
# Single 5000-unit hidden layer with ReLU activation
model.add(Dense(input_dim = nb_features, units = 5000, activation = 'relu'))
# Dropout with rate 0.5 applied to the hidden layer's output
model.add(Dropout(0.5))
# Output layer with softmax activation
model.add(Dense(units = nb_classes, activation = 'softmax'))
# Specify optimizer, loss and validation metric
model.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics=['accuracy'])
# Train the model, using the development set as validation data
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_dev, Y_dev), shuffle=True, verbose=1)

# Predict labels for test set
outputs = model.predict(X_test, batch_size=batch_size)
# Index of the highest-scoring output unit per instance (an index into `classes`, not a label string)
pred_classes = np.argmax(outputs, axis=1)

# Save predictions to file (np.save appends the '.npy' extension)
np.save(f'test_set_predictions_run{run}', pred_classes)

Train on 12261 samples, validate on 3066 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [None]:
# Score the development set, then map gold and predicted class indices
# back to label names before plotting the confusion matrix
pred_dev = model.predict(X_dev, batch_size = batch_size)
Y_dev_names = [classes[idx] for idx in Y_dev.argmax(axis=1)]
pred_class_names = [classes[idx] for idx in pred_dev.argmax(axis=1)]
create_confusion_matrix(Y_dev_names, pred_class_names)