In [2]:
# Directory holding the meta-training dataset files (note the trailing slash:
# later cells build file paths by concatenating a file name onto it).
TRAIN_DATASET_PATH = '../meta-learning-task/pan19-hyperpartisan-news-detection-by-article-meta-training-dataset-2019-02-04/'
# The ground-truth files live in a sub-directory of the dataset directory.
GROUND_TRUTH_PATH  = TRAIN_DATASET_PATH + 'ground-truth/'

In [3]:
import os
import csv

# Maps the first column of every row to the remaining columns parsed as
# booleans ('true' -> True, any other token -> False).
# NOTE(review): presumably the key is an article id and each remaining column
# is one base classifier's prediction -- confirm against the dataset README.
data = dict()

# Sorted so that the merge order (and therefore which file wins if the same
# key appears in several files) does not depend on the OS directory order.
for file in sorted(os.listdir(TRAIN_DATASET_PATH)):
    if file.endswith('.txt'):
        # newline='' is what the csv module asks for when handing it a file object.
        with open(os.path.join(TRAIN_DATASET_PATH, file), newline='') as in_file:
            reader = csv.reader(in_file, delimiter=' ') ## csv with space delimiters
            # Accumulate into `data` instead of rebinding it, otherwise every
            # file but the last one processed would be silently discarded.
            # `if row` skips blank lines, which csv yields as empty lists.
            data.update({row[0]: [(w == 'true') for w in row[1:]] for row in reader if row})

In [4]:
# Maps the first column of every ground-truth row to a boolean label
# (second column == 'true').
truth = dict()

# Sorted for a deterministic merge order across ground-truth files.
for file in sorted(os.listdir(GROUND_TRUTH_PATH)):
    if file.endswith('.txt'):
        # newline='' is what the csv module asks for when handing it a file object.
        with open(os.path.join(GROUND_TRUTH_PATH, file), newline='') as in_file:
            reader = csv.reader(in_file, delimiter=' ') ## csv with space delimiters
            # Accumulate instead of rebinding so earlier files are not dropped;
            # `if row` skips blank lines, which csv yields as empty lists.
            truth.update({row[0]: (row[1] == 'true') for row in reader if row})

# Every sample needs exactly one label. Comparing the key sets (not only the
# sizes) reports a mismatch here rather than as a KeyError when labels are
# looked up by sample key later on.
assert len(data) == len(truth)
assert set(data) == set(truth), 'sample ids and ground-truth ids differ'

In [5]:
import numpy as np

# Fix one key order and use it for both arrays so row i of X and entry i of y
# always refer to the same sample.
sorted_keys = sorted(data.keys())

# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
X = np.array([data[key] for key in sorted_keys], dtype=bool)
y = np.array([truth[key] for key in sorted_keys], dtype=bool)

from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 stratified split (and every number derived from it)
# is the same on each Restart & Run All.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SPLIT_SEED
)

In [6]:
## Metrics for Simple Majority Vote Classifier
from sklearn.metrics import classification_report, accuracy_score


def majority_vote(x):
    """Return whether the sum of ``x`` is strictly greater than half its length.

    For a row of boolean votes this is True only when more than half of the
    votes are True; an exact tie yields False.
    """
    return sum(x) > (len(x) / 2)


# Baseline predictions: apply the vote row by row to the full data set and to
# each side of the train/test split.
y_pred = list(map(majority_vote, X))
y_test_pred = list(map(majority_vote, X_test))
y_train_pred = list(map(majority_vote, X_train))

print('\n** Majority Vote Performance **\n')
print('Accuracy on whole dataset:\t', accuracy_score(y, y_pred))
print(classification_report(y, y_pred))

print('Accuracy on test dataset:\t', accuracy_score(y_test, y_test_pred))
#print(classification_report(y_test, y_test_pred))

print('Accuracy on train dataset:\t', accuracy_score(y_train, y_train_pred))
#print(classification_report(y_train, y_train_pred))


** Majority Vote Performance **

Accuracy on whole dataset:	 0.8261904761904761
              precision    recall  f1-score   support

       False       0.82      0.84      0.83       210
        True       0.83      0.81      0.82       210

   micro avg       0.83      0.83      0.83       420
   macro avg       0.83      0.83      0.83       420
weighted avg       0.83      0.83      0.83       420

Accuracy on test dataset:	 0.7976190476190477
Accuracy on train dataset:	 0.8333333333333334


In [7]:
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Lambda, Concatenate, Reshape
from keras import backend as K

def create_model(num_input_features):
    """Build and compile a small binary classifier over a fixed-width input.

    The network sees the raw input features plus one extra feature, the
    per-sample sum of those inputs, followed by a single hidden layer
    (8 ReLU units, 50% dropout) and one sigmoid output unit.

    Args:
        num_input_features: Width of the input vector for one sample.

    Returns:
        A Keras ``Model`` compiled with the Adam optimizer, binary
        cross-entropy loss and accuracy as the reported metric.
    """
    features = Input(shape=(num_input_features,), dtype='float32')

    # Sum over the feature axis gives one scalar per sample, shape (batch,);
    # reshape it to (batch, 1) so it can be concatenated as an extra column.
    vote_total = Lambda(lambda x: K.sum(x, axis=1), name='vote_sum')(features)
    vote_total = Reshape((1,))(vote_total)
    combined = Concatenate()([features, vote_total])

    hidden = Dense(8, activation='relu')(combined)
    hidden = Dropout(0.5)(hidden)
    # A second Dense(4, relu) + Dropout(0.5) stage was tried here and is
    # currently disabled.
    prediction = Dense(1, activation='sigmoid')(hidden)

    # Compile model
    model = Model(inputs=features, outputs=prediction)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [8]:
from sklearn.model_selection import StratifiedKFold

# Per-fold accuracies of the model under 5-fold stratified cross-validation.
train_acc, test_acc = [], []
skf = StratifiedKFold(n_splits=5)
for train_idx, test_idx in skf.split(X, y):
    X_fold_train, y_fold_train = X[train_idx], y[train_idx]
    X_fold_test, y_fold_test = X[test_idx], y[test_idx]

    # A brand-new model per fold, so no weights carry over between folds.
    model = create_model(X.shape[1])
    model.fit(X_fold_train, y_fold_train, epochs=250, verbose=0)

    # Index 1 of the evaluate() result is taken as the accuracy
    # (presumably [loss, accuracy], given how create_model compiles the model).
    train_score = model.evaluate(X_fold_train, y_fold_train, verbose=0)
    test_score = model.evaluate(X_fold_test, y_fold_test, verbose=0)

    train_acc.append(train_score[1])
    test_acc.append(test_score[1])

In [9]:
# For each split: the mean accuracy over the folds, then the per-fold values.
for label, scores in (('TRAIN accuracy:', train_acc), ('TEST accuracy:', test_acc)):
    print(label, sum(scores) / len(scores))
    print('\t', scores)

TRAIN accuracy: 0.8797619047619047
	 [0.8898809523809523, 0.8690476190476191, 0.8839285714285714, 0.8571428571428571, 0.8988095238095238]
TEST accuracy: 0.8404761859348842
	 [0.8333333304950169, 0.8690476133709862, 0.8333333276567005, 0.8809523752757481, 0.7857142828759693]
