# Reading general data of the problems


In [1]:
# coding=utf-8
import json
import os

from MyUtils import clean_folder, read_files
from Word2Dim import Word2Dim

# Locations of the training corpus and of the folder that receives the outputs.
dataset_path = os.path.join('.', 'pan19-cross-domain-authorship-attribution-training-dataset-2019-01-23')
outpath = os.path.join('.', 'dev_out')

# NOTE(review): clean_folder comes from MyUtils; presumably it empties or
# (re)creates the output folder -- confirm before pointing it at real data.
clean_folder(outpath)

# collection-info.json holds one entry per problem, giving its name and language.
infocollection = os.path.join(dataset_path, 'collection-info.json')
with open(infocollection, 'r') as f:
    collection_info = json.load(f)

# Parallel lists: language[i] is the language of problems[i].
problems = [entry['problem-name'] for entry in collection_info]
language = [entry['language'] for entry in collection_info]
print('Reading general data of the problems, done!')


Reading general data of the problems, done!


# Reading problem 1

In [8]:
# Select the problem to work on. `index` addresses both `problems` and the
# parallel `language` list, so the two can never get out of step.
index = 0
problem = problems[index]
problem_path = os.path.join(dataset_path, problem)

# problem-info.json names the folder of unknown texts and the candidate authors.
infoproblem = os.path.join(problem_path, 'problem-info.json')
with open(infoproblem, 'r') as f:
    fj = json.load(f)
unk_folder = fj['unknown-folder']
candidates = sorted(attrib['author-name'] for attrib in fj['candidate-authors'])

# Building training set: read_files yields (text, label) pairs per candidate.
train_docs = []
for candidate in candidates:
    train_docs.extend(read_files(problem_path, candidate))
train_texts = [text for text, _ in train_docs]
train_label_names = [label for _, label in train_docs]

# Build the label <-> index mappings from a *sorted* list of unique labels.
# Iterating a bare set of strings has no stable order across interpreter runs
# (hash randomization), which would silently change the class indices -- and
# therefore the meaning of every model output -- from one run to the next.
unique_labels = sorted(set(train_label_names))
index_2_label_dict = dict(enumerate(unique_labels))
label_2_index_dict = {label: i for i, label in index_2_label_dict.items()}
train_labels = [label_2_index_dict[name] for name in train_label_names]

w2d = Word2Dim()
train_tokenized_with_pos, train_tokenized_indexed = w2d.fit_transform_texts(
    train_texts, train_labels, language[index])

# Length of the longest tokenized training document; used later as the common
# sequence length for padding.
maxlen = max(len(tokens) for tokens in train_tokenized_indexed)
embedding_dim = w2d.word_embedding.shape[1]

# preparing test set: map each unknown text to its true author
ground_truth_file = os.path.join(problem_path, 'ground-truth.json')
with open(ground_truth_file, 'r') as f:
    gt = {attrib['unknown-text']: attrib['true-author']
          for attrib in json.load(f)['ground_truth']}

test_docs = read_files(problem_path, unk_folder, gt)

# Filter validation to known authors: keep only documents whose true author is
# one of the training labels, then encode those labels as class indices.
known_test_docs = [(text, label) for text, label in test_docs
                   if label in label_2_index_dict]
test_texts = [text for text, _ in known_test_docs]
test_labels = [label_2_index_dict[label] for _, label in known_test_docs]

test_tokenized_with_pos, test_tokenized_indexed = w2d.transform(test_texts)
print("Reading problem 1, done!")

doc count to process:  63
Processing doc # 7
Processing doc # 1
Processing doc # 13
Processing doc # 14
Processing doc # 8
Processing doc # 2
Processing doc # 9
Processing doc # 15
Processing doc # 3
Processing doc # 10
Processing doc # 11
Processing doc # 4
Processing doc # 16
Processing doc # 12
Processing doc # 5
Processing doc # 19
Processing doc # 6
Processing doc # 17
Processing doc # 20
Processing doc # 18
Processing doc # 25
Processing doc # 26
Processing doc # 31
Processing doc # 21
Processing doc # 27
Processing doc # 32
Processing doc # 22
Processing doc # 28
Processing doc # 33
Processing doc # 29
Processing doc # 23
Processing doc # 24
Processing doc # 30
Processing doc # 34
Processing doc # 37
Processing doc # 35
Processing doc # 43
Processing doc # 38
Processing doc # 36
Processing doc # 39
Processing doc # 44
Processing doc # 49
Processing doc # 40
Processing doc # 45
Processing doc # 50
Processing doc # 41
Processing doc # 46
Processing doc # 51
Processing doc # 42
Pro

# Training a CNN + GRU classifier with Keras


In [None]:
# Show the author label behind every encoded test label (sanity check).
[index_2_label_dict[class_index] for class_index in test_labels]


In [None]:
# Seed NumPy's and TensorFlow's global random number generators before Keras
# is imported further down in this cell.
# NOTE(review): presumably this is meant to make training runs repeatable --
# confirm it is sufficient for the backend/hardware in use.
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(1)


from keras import layers
from keras.models import Sequential
from keras import optimizers
from keras.utils import to_categorical
from keras_preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split



# Bring every tokenized document to the common length `maxlen`.
train_data = pad_sequences(train_tokenized_indexed, maxlen=maxlen)
test_data = pad_sequences(test_tokenized_indexed, maxlen=maxlen)

# One-hot encode with an explicit width. Without `num_classes`, to_categorical
# infers the width from the largest label present, so y_val would come out
# narrower than the softmax layer whenever the highest class index does not
# occur in the (filtered) test set.
num_classes = len(set(train_labels))
X_train, X_val = train_data, test_data
y_train = to_categorical(train_labels, num_classes=num_classes)
y_val = to_categorical(test_labels, num_classes=num_classes)
# Alternative kept for reference: hold out part of the training data with
# sklearn's train_test_split(train_data, train_labels, test_size=0.28,
# random_state=2019, stratify=train_labels) instead of validating on the test set.

# Embedding -> Conv1D -> MaxPooling1D -> Conv1D -> GRU -> softmax classifier.
model = Sequential()
model.add(layers.Embedding(w2d.word_embedding.shape[0], embedding_dim, input_length=maxlen))
model.add(layers.Conv1D(32, 3, activation='relu'))
model.add(layers.MaxPooling1D(5))
model.add(layers.Conv1D(32, 3, activation='relu'))
# (A GlobalMaxPooling1D layer was tried here in place of the GRU.)
model.add(layers.GRU(32, dropout=0.1, recurrent_dropout=0.5))
# `Dense` is reached through `layers`: only `layers` is imported in this cell,
# so a bare `Dense` raises NameError on a fresh kernel.
model.add(layers.Dense(num_classes, activation='softmax'))
model.summary()

# Initialise the embedding layer with the Word2Dim matrix. The layer stays
# trainable; set model.layers[0].trainable = False to freeze it instead.
model.layers[0].set_weights([w2d.word_embedding])

model.compile(optimizer=optimizers.Adam(lr=5e-4),
              loss='categorical_crossentropy',
              metrics=['acc'])
history = model.fit(X_train, y_train,
                    validation_data=(X_val, y_val),
                    epochs=120,
                    batch_size=1)

# Per-epoch curves recorded by Keras during fit().
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

# Accuracy: dots = training, line = validation.
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

# Loss: dots = training, line = validation.
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_39 (Embedding)     (None, 1019, 9)           72225     
_________________________________________________________________
conv1d_75 (Conv1D)           (None, 1017, 32)          896       
_________________________________________________________________
max_pooling1d_38 (MaxPooling (None, 203, 32)           0         
_________________________________________________________________
conv1d_76 (Conv1D)           (None, 201, 32)           3104      
_________________________________________________________________
gru_2 (GRU)                  (None, 32)                6240      
_________________________________________________________________
dense_45 (Dense)             (None, 9)                 297       
Total params: 82,762
Trainable params: 82,762
Non-trainable params: 0
_________________________________________________________________
Train 