In [1]:
import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant
from BenHamner.score import *

# Hyper-parameters shared by the cells below.
MAX_SEQUENCE_LENGTH = 1000  # length every token sequence is padded to (see pad_sequences)
MAX_NUM_WORDS = 100000      # vocabulary cap passed to the Keras Tokenizer
EMBEDDING_DIM = 100         # width of the word vectors; the GloVe file below is the 100d one
VALIDATION_SPLIT = 0.2      # fraction of the essays held out for validation


print('Indexing word vectors.')

# Maps each token in the GloVe file to its vector (float32 array).
embeddings_index = {}
counter = 0
# Stop after this many vectors. `counter` starts at 0 and only grows, so the
# default of -1 never matches and the whole file is read.
max_words = -1  #use if you want to stop early (use fewer word vectors)
# `with` guarantees the handle is closed even if a line fails to parse.
with open(r"C:\Users\Edvin\Projects\Data\glove.6B\glove.6B.100d.txt", encoding="utf8") as f:
    for line in f:
        # Each line is "<token> <v1> <v2> ...".
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        counter += 1
        if counter == max_words:
            break

print('Found %s word vectors.' % len(embeddings_index))

# Every token that has a pre-trained vector, in insertion (file) order.
embedded_words = list(embeddings_index)

    
    
def quadratic_weighted_kappa_for_cnn(x_val, d_val, model):
    """Score ``model`` on a validation set with quadratic weighted kappa.

    Args:
        x_val: validation inputs; one prediction is read back per entry.
        d_val: per-sample target vectors, indexed in step with ``x_val``.
            The arg-max of each vector is taken as the true class.
        model: object exposing ``predict``; it is called once with ``[x_val]``.

    Returns:
        The value returned by ``quadratic_weighted_kappa(true, predicted)``.
    """
    predictions = model.predict([x_val])
    sample_count = len(x_val)

    # Collapse score / one-hot vectors to integer class labels.
    predicted_classes = [np.argmax(predictions[k]) for k in range(sample_count)]
    true_classes = [np.argmax(d_val[k]) for k in range(sample_count)]

    return quadratic_weighted_kappa(true_classes, predicted_classes)

Using TensorFlow backend.


Indexing word vectors.
Found 400000 word vectors.


In [3]:
import csv

def read_dataset(*args, path=None):
    """Load rows from the essay training TSV.

    Two call forms are supported:

    * ``read_dataset(start, end)`` -- two integers. Returns data rows
      ``start`` through ``end`` (1-based, inclusive; the header line is not
      counted). A ``start`` below 1 is treated as 1. If ``end < start`` the
      result is an empty list.
    * ``read_dataset([p1, p2, ...])`` -- a single list of integers. Returns
      every data row whose second column, parsed as ``int``, is in the list.

    Args:
        *args: see the two call forms above.
        path: optional location of the TSV file. When omitted, the first
            existing entry of the built-in default locations is used (the
            first default is used if none exists, so ``open`` reports it).

    Returns:
        list[list[str]]: the selected rows as produced by ``csv.reader``.

    Raises:
        ValueError: if the arguments match neither call form.
    """
    # Validate before touching the filesystem so bad calls fail fast.
    range_mode = len(args) == 2 and all(isinstance(a, int) for a in args)
    list_mode = len(args) == 1 and isinstance(args[0], list)
    if not (range_mode or list_mode):
        raise ValueError("read_dataset: wrong input")

    if path is None:
        # The notebook has been run on two machines; use whichever copy exists.
        default_paths = (
            "C:/Users/Edvin/Projects/Data/asap-aes/training_set_rel3.tsv",
            "/home/william/m18_edvin/Projects/Data/asap-aes/training_set_rel3.tsv",
        )
        path = next((p for p in default_paths if os.path.exists(p)), default_paths[0])

    data = []
    with open(path, newline='', encoding='latin1') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)  # skip the header line

        if range_mode:
            start = max(args[0], 1)
            end = args[1]
            for position, row in enumerate(reader, start=1):
                if position > end:
                    break  # past the requested range; no need to read further
                if position >= start:
                    data.append(row)
        else:
            wanted_sets = args[0]
            for row in reader:
                if int(row[1]) in wanted_sets:
                    data.append(row)

    return data


In [6]:
import csv

# Number of target classes. Pinned explicitly so the one-hot width always
# matches the 11-way softmax output layer used by the models in this notebook,
# instead of depending on the largest grade that happens to be in the sample.
NUM_CLASSES = 11

# Raw TSV rows; kept under its own name so `data` only ever means the padded matrix.
essay_rows = read_dataset(0,1246)

print('Processing text dataset')

texts = []  # list of text samples (column 2)
essayset  = [] #list of which set each text belongs to (column 1)
essaynumber = []  # essay identifier (column 0)
targets = []  # class label derived from the grade in column 6

for row in essay_rows:
    texts.append(row[2])
    essayset.append(int(row[1]))
    essaynumber.append(int(row[0]))
    targets.append(int(row[6])-2) # shift grades so that grade 2 becomes class 0


print('Found %s texts. ' % len(texts))

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts) #list of all texts where words are numbers instead
word_index = tokenizer.word_index #dictionary mapping each word to the correct number

# All distinct words seen in the essays.
words_from_text = list(word_index)

print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH) #adds zeros to beginning of text if it is shorter than MAX_SEQUENCE_LENGTH
# One-hot targets, e.g. class 0 of 4 classes -> [1., 0., 0., 0.]
targets = to_categorical(np.asarray(targets), num_classes=NUM_CLASSES)
essayset = np.array(essayset)
essaynumber = np.array(essaynumber)

print('Shape of data tensor:', data.shape)
print('Shape of target tensor:', targets.shape)

Processing text dataset
Found 1246 texts. 
Found 13310 unique tokens.
Shape of data tensor: (1246, 1000)
Shape of target tensor: (1246, 11)


In [8]:
# split the data into a training set and a validation set
# `indices` is [0, 1, ..., n_texts-1]; with the shuffle below disabled the
# re-indexing keeps the original order, so the validation set is simply the
# tail of the dataset.
indices = np.arange(data.shape[0])
#np.random.shuffle(indices)
data = data[indices]
essayset = essayset[indices]
essaynumber = essaynumber[indices]
targets = targets[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Slice at an explicit index rather than with negative offsets: `[:-n]` is
# empty and `[-n:]` is everything when n == 0 (tiny dataset or split of 0).
split_at = data.shape[0] - num_validation_samples
x_train = data[:split_at]
y_train = targets[:split_at]
x_val = data[split_at:]
y_val = targets[split_at:]


print('Preparing embedding matrix.')

# prepare embedding matrix: row i holds the vector for the word whose
# tokenizer index is i. Rows the loop never writes stay all-zero.
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # word has a pre-trained vector: copy it in
        embedding_matrix[i] = embedding_vector
    else:
        # no pre-trained vector: start from random values in [0.1, 1.0)
        # (multiples of 0.001) instead of leaving the row at zero
        embedding_matrix[i] = np.random.randint(100,1000,EMBEDDING_DIM)/1000
# load the prepared matrix into an Embedding layer
# note that trainable=True, so the embeddings are updated during training
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)


# Count rows that look unused. Heuristic: a row is treated as all-zero when
# its first four components are zero.
count = int(np.count_nonzero(np.all(embedding_matrix[:, :4] == 0, axis=1)))

print("unused words and total words: ", count, "/", num_words)



print('Training model.')

# Build a 1D convnet with global max-pooling on top of the embedding layer:
# three Conv1D(128, 5) stages, the first two followed by MaxPooling1D(5), then
# a 128-unit dense layer and an 11-way softmax.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(11, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
model.summary()
# Kappa of the model before any call to fit (baseline).
kappa = quadratic_weighted_kappa_for_cnn(x_val, y_val, model)
print("kappa: ", kappa)

# NOTE(review): range(0) is empty, so this loop body never runs and the model
# is not trained in this cell. Raise the count to enable training; each
# iteration would fit for 2 epochs and then report validation kappa.
for i in range(0):
    model.fit(x_train, y_train, batch_size=128, epochs=2, verbose=2, validation_data=(x_val, y_val))
    kappa = quadratic_weighted_kappa_for_cnn(x_val, y_val, model)
    print("kappa: ", kappa)
    print(" ")

# The names val_loss / val_acc are reused here for the *training* set metrics;
# they are overwritten with the validation metrics two lines below.
val_loss, val_acc = model.evaluate(x_train, y_train, verbose=2)
print("training loss and acc: ", val_loss, val_acc)
val_loss, val_acc = model.evaluate(x_val, y_val, verbose=2)
print("validation loss and acc: ", val_loss, val_acc)
kappa = quadratic_weighted_kappa_for_cnn(x_val, y_val, model)
print("kappa: ", kappa)
# Saving the model is currently disabled:
# #model.save("kerasmodel")


# Second tally of rows in `embedding_matrix` whose first four components are
# all zero (used as a cheap "row is all-zero" heuristic).
# NOTE(review): this inspects the NumPy array built before the model, not the
# Embedding layer's weights -- presumably it reports the same number as the
# earlier tally; confirm whether the layer's current weights were intended.
zeros = np.zeros(100)
first_four_are_zero = (embedding_matrix[:num_words, :4] == 0).all(axis=1)
count = int(np.count_nonzero(first_four_are_zero))

print("unused words and total words: ", count,num_words)



Preparing embedding matrix.
unused words and total words:  1 / 13311
Training model.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 1000, 100)         1331100   
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________

In [None]:
#emptywords = []
#for word in words_from_text:
#    if word not in embedded_words:
#        emptywords.append(word)


In [5]:
def create_model(MAX_SEQUENCE_LENGTH, embedding_layer, layers = 1, kernels = 1, kernel_length = 1, num_classes = 11):
    """Build and compile (but do not train) a 1D-convnet text classifier.

    Architecture: ``embedding_layer`` -> Conv1D -> (``layers - 1``) x
    [MaxPooling1D(5) -> Conv1D] -> GlobalMaxPooling1D -> Dense(5, sigmoid)
    -> Dense(num_classes, softmax).

    Args:
        MAX_SEQUENCE_LENGTH: length of the integer input sequences.
        embedding_layer: layer applied to the integer input.
        layers: number of Conv1D layers. Values below 2 give a single
            convolution with no pooling stage in between.
        kernels: number of filters in every Conv1D layer.
        kernel_length: width of every Conv1D kernel.
        num_classes: size of the softmax output layer (default 11).

    Returns:
        A ``Model`` compiled with categorical cross-entropy loss, the
        ``rmsprop`` optimizer and the ``acc`` metric.
    """
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    x = Conv1D(kernels, kernel_length, activation='relu')(embedded_sequences)
    # One pool+conv pair per additional layer. The loop index is deliberately
    # unused so that the `layers` parameter is not rebound inside the loop.
    for _ in range(1, layers):
        x = MaxPooling1D(5)(x)
        x = Conv1D(kernels, kernel_length, activation='relu')(x)
    x = GlobalMaxPooling1D()(x)
    # Narrow 5-unit sigmoid layer in front of the classifier.
    x = Dense(5, activation='sigmoid')(x)
    preds = Dense(num_classes, activation='softmax')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])

    return model

In [None]:
# Single convolutional layer with 128 filters of width 5.
model = create_model(
    MAX_SEQUENCE_LENGTH,
    embedding_layer,
    layers=1,
    kernels=128,
    kernel_length=5,
)

# 20 rounds of 2 epochs each; validation kappa is reported after every round.
for _round in range(20):
    model.fit(
        x_train,
        y_train,
        batch_size=128,
        epochs=2,
        verbose=2,
        validation_data=(x_val, y_val),
    )
    kappa = quadratic_weighted_kappa_for_cnn(x_val, y_val, model)
    print("kappa: ", kappa)
    print(" ")

# Final metrics on both splits, followed by the final validation kappa.
train_loss, train_acc = model.evaluate(x_train, y_train, verbose=2)
print("training loss and acc: ", train_loss, train_acc)
val_loss, val_acc = model.evaluate(x_val, y_val, verbose=2)
print("validation loss and acc: ", val_loss, val_acc)
kappa = quadratic_weighted_kappa_for_cnn(x_val, y_val, model)
print("kappa: ", kappa)
# Saving the model is currently disabled:
# #model.save("kerasmodel")