In [1]:
"""
Notebook for the Meta-Learning sub-task of the SemEval2019 Task on "Hyperpartisan News Detection".
"""
TRAIN_DATASET_PATH = '../meta-learning-task/pan19-hyperpartisan-news-detection-by-article-meta-training-dataset-2019-02-04/'
GROUND_TRUTH_PATH  = '../meta-learning-task/pan19-hyperpartisan-news-detection-by-article-meta-training-dataset-2019-02-04/ground-truth/'

In [2]:
import os
import csv

data = dict()

## Collect the rows of every '.txt' file found directly in TRAIN_DATASET_PATH.
## For each row, row[0] becomes the dictionary key (presumably the article id -- TODO confirm)
## and every remaining field becomes a boolean that is True only for the literal string 'true'.
for file in sorted(os.listdir(TRAIN_DATASET_PATH)): ## sorted: deterministic merge order
    if not file.endswith('.txt'):
        continue
    ## newline='' is what the csv module asks for when it is handed a file object
    with open(os.path.join(TRAIN_DATASET_PATH, file), newline='') as in_file:
        reader = csv.reader(in_file, delimiter=' ') ## csv with space delimiters
        ## update() merges into `data`; rebinding `data` here would silently drop the
        ## rows of every file except the last one visited.
        ## `if row` skips blank lines, for which csv.reader yields an empty list.
        data.update({row[0]: [(w == 'true') for w in row[1:]] for row in reader if row})

In [3]:
truth = dict()

## Collect the labels of every '.txt' file found directly in GROUND_TRUTH_PATH.
## row[0] is the dictionary key and the label is True only when row[1] is the string 'true'.
for file in sorted(os.listdir(GROUND_TRUTH_PATH)): ## sorted: deterministic merge order
    if not file.endswith('.txt'):
        continue
    ## newline='' is what the csv module asks for when it is handed a file object
    with open(os.path.join(GROUND_TRUTH_PATH, file), newline='') as in_file:
        reader = csv.reader(in_file, delimiter=' ') ## csv with space delimiters
        ## update() merges into `truth`; rebinding `truth` here would silently drop the
        ## labels of every file except the last one visited.
        ## `if row` skips blank lines, for which csv.reader yields an empty list.
        truth.update({row[0]: (row[1] == 'true') for row in reader if row})

## Every prediction row needs exactly one label: same size AND same keys.
assert len(data) == len(truth), 'data and truth differ in size'
assert data.keys() == truth.keys(), 'data and truth are keyed by different ids'

In [4]:
import numpy as np

## Fix one key order so that row i of X and entry i of y belong to the same key.
keys = sorted(data.keys())

## Builtin `bool` instead of `np.bool`: the alias was deprecated in NumPy 1.20 and removed in 1.24.
X = np.array([data[key] for key in keys], dtype=bool)
y = np.array([truth[key] for key in keys], dtype=bool)

## Add a column holding the share of True votes in each row (the row mean of X).
## This is a fraction, not a thresholded majority decision.
X_vote = np.average(X, axis=1)
X_vote = np.reshape(X_vote, X_vote.shape + (1,))
X_vote = np.concatenate((X, X_vote), axis=1)

print('X.shape: {} ; X_vote.shape: {}'.format(X.shape, X_vote.shape))

## Train/Test Split
## random_state is fixed so the split (and every score computed from it) is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_vote, y, test_size=0.2, stratify=y, random_state=42
)

X.shape: (420, 42) ; X_vote.shape: (420, 43)


In [5]:
from sklearn.metrics import classification_report, accuracy_score

## Same function as baseline provided by SemEval
def majority_vote(x):
    """Return True when the sum of row `x` reaches half of `len(x) - 1`.

    `x` is expected to be one row of X_vote (or of a split of it): the individual
    votes followed by the appended vote-share column, so `len(x) - 1` is the
    number of individual votes.
    """
    return sum(x) >= ((len(x) - 1) / 2)

## Use X_vote (not X) for the whole-dataset prediction: X has no trailing vote-share
## column, so `len(x) - 1` would undercount the votes there. X_test and X_train are
## splits of X_vote and already have the expected layout.
y_pred = [majority_vote(x) for x in X_vote]
y_test_pred = [majority_vote(x) for x in X_test]
y_train_pred = [majority_vote(x) for x in X_train]

print('\n** Majority Vote Performance **\n')
print('Accuracy on whole dataset:\t', accuracy_score(y, y_pred))
print(classification_report(y, y_pred))

print('Accuracy on test dataset:\t', accuracy_score(y_test, y_test_pred))
#print(classification_report(y_test, y_test_pred))

print('Accuracy on train dataset:\t', accuracy_score(y_train, y_train_pred))
#print(classification_report(y_train, y_train_pred))


** Majority Vote Performance **

Accuracy on whole dataset:	 0.8214285714285714
              precision    recall  f1-score   support

       False       0.83      0.81      0.82       210
        True       0.82      0.83      0.82       210

   micro avg       0.82      0.82      0.82       420
   macro avg       0.82      0.82      0.82       420
weighted avg       0.82      0.82      0.82       420

Accuracy on test dataset:	 0.8571428571428571
Accuracy on train dataset:	 0.8125


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

## Random forest with a fixed seed
rf_clf = RandomForestClassifier(
    n_estimators=100,
    min_samples_leaf=3,  ## min_samples_leaf=3 ?
    random_state=42,
)

## Cross-validate with 10 folds on accuracy, keeping the per-fold train scores as well
cv = cross_validate(rf_clf, X_vote, y, cv=10, return_train_score=True, scoring='accuracy')
test_scores, train_scores = cv['test_score'], cv['train_score']

## Mean test accuracy over the folds
rf_cv_acc = sum(test_scores) / len(test_scores)

print('CV Test score:', rf_cv_acc)
print('CV Test scores:', test_scores)

print('CV Train score:', sum(train_scores) / len(train_scores))
print('CV Train scores:', train_scores)

CV Test score: 0.8547619047619047
CV Test scores: [0.92857143 0.73809524 0.9047619  0.83333333 0.80952381 0.9047619
 0.88095238 0.95238095 0.80952381 0.78571429]
CV Train score: 0.8973544973544975
CV Train scores: [0.88888889 0.9021164  0.8968254  0.8994709  0.9047619  0.89153439
 0.8968254  0.8968254  0.8994709  0.8968254 ]


In [7]:
## Fit the forest on the training split, then score it on the held-out split.
rf_clf.fit(X_train, y_train)
holdout_accuracy = rf_clf.score(X_test, y_test)
print('Accuracy score:', holdout_accuracy)

## Bare last expression so the notebook displays the importances: one value per
## input column, the last one belonging to the appended vote-share column.
rf_clf.feature_importances_

Accuracy score: 0.8690476190476191


array([0.00828001, 0.00925355, 0.00232323, 0.01154393, 0.03597662,
       0.03466775, 0.01042077, 0.00412563, 0.02134086, 0.01786668,
       0.15511596, 0.00945162, 0.00716676, 0.04166345, 0.0535333 ,
       0.01684613, 0.0150044 , 0.03384055, 0.00474503, 0.00779191,
       0.03363042, 0.05459089, 0.00638681, 0.01283064, 0.00359524,
       0.00308327, 0.04835916, 0.00888776, 0.038552  , 0.00544551,
       0.0145684 , 0.00599618, 0.00632205, 0.01504197, 0.01507528,
       0.00661625, 0.00734167, 0.00376311, 0.00394275, 0.00463466,
       0.00876711, 0.0332938 , 0.15831694])

In [8]:
from sklearn.svm import LinearSVC
## Linear SVM, 5-fold cross-validation with the estimator's default scorer
svm = LinearSVC(max_iter=2000)
cv = cross_validate(svm, X_vote, y, cv=5, return_train_score=True)

print('Linear SVM')
## Report the mean score over the folds, train first and test second
for label, key in (('TRAIN score:', 'train_score'), ('TEST score:', 'test_score')):
    fold_scores = cv[key]
    print(label, sum(fold_scores) / len(fold_scores))

Linear SVM
TRAIN score: 0.8720238095238096
TEST score: 0.8142857142857144


In [9]:
from sklearn.neighbors import KNeighborsClassifier
## k-nearest neighbours (k=10), 5-fold cross-validation with the estimator's default scorer
knn = KNeighborsClassifier(n_neighbors=10)
cv = cross_validate(knn, X_vote, y, cv=5, return_train_score=True)

print('KNN')
## Report the mean score over the folds, train first and test second
for label, key in (('TRAIN score:', 'train_score'), ('TEST score:', 'test_score')):
    fold_scores = cv[key]
    print(label, sum(fold_scores) / len(fold_scores))

KNN
TRAIN score: 0.8375
TEST score: 0.8380952380952381


In [10]:
from sklearn.linear_model import LogisticRegressionCV
## Logistic regression with built-in selection of C (Cs=10),
## wrapped in an outer 5-fold cross-validation
logist = LogisticRegressionCV(Cs=10)
cv = cross_validate(logist, X_vote, y, cv=5, return_train_score=True)

print('Logistic Regression Classifier')
## Report the mean score over the outer folds, train first and test second
for label, key in (('TRAIN score:', 'train_score'), ('TEST score:', 'test_score')):
    fold_scores = cv[key]
    print(label, sum(fold_scores) / len(fold_scores))



Logistic Regression Classifier
TRAIN score: 0.8577380952380953
TEST score: 0.8333333333333334




In [11]:
from sklearn.ensemble import GradientBoostingClassifier
## Gradient boosting, 5-fold cross-validation with the estimator's default scorer.
## random_state is fixed (same seed as rf_clf) so repeated runs print the same scores.
gboost = GradientBoostingClassifier(
    n_estimators=50, min_samples_leaf=5, min_samples_split=5, max_depth=5,
    random_state=42,
)
cv = cross_validate(gboost, X_vote, y, cv=5, return_train_score=True)

print('Gradient Boosting')
print('TRAIN score:', sum(cv['train_score']) / len(cv['train_score']))
print('TEST score:', sum(cv['test_score']) / len(cv['test_score']))

Gradient Boosting
TRAIN score: 0.9827380952380953
TEST score: 0.8190476190476191


In [12]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
## AdaBoost over depth-1 decision trees (stumps), 5-fold cross-validation.
## The weak learner is passed positionally because its keyword was renamed from
## `base_estimator` to `estimator` in newer scikit-learn releases; the positional
## form works with both. random_state is fixed so repeated runs print the same scores.
adaboost = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=50,
    random_state=42,
)
cv = cross_validate(adaboost, X_vote, y, cv=5, return_train_score=True)

## Label fixed: this cell previously printed 'Gradient Boosting' for the AdaBoost scores.
print('AdaBoost')
print('TRAIN score:', sum(cv['train_score']) / len(cv['train_score']))
print('TEST score:', sum(cv['test_score']) / len(cv['test_score']))

Gradient Boosting
TRAIN score: 0.8738095238095237
TEST score: 0.8047619047619048


In [13]:
### 
### The Random Forest classifier performs better than an MLP or LinearSVM, so we'll continue with this classifier.
###