## Libraries

In [17]:
from collections import Counter,defaultdict
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pprint import pprint
import json
from bs4 import BeautifulSoup
import sys
import os

os.environ['KERAS_BACKEND']='theano'

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout
from keras.models import Model
from keras.callbacks import Callback

from sklearn import metrics
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB

import re

## Load-In Data/Pre-process

### Selection

In [3]:
testing = ["CNN", "SGD", "NB"]

### Combined

In [4]:
def preprocess(article):
    article = article.encode('utf-8')
    article = str(article.lower())
    return removeTagsInterpuntion(article)

def removeTagsInterpuntion(article): #remove URL's, HTML-tags and interpuntion
    article = re.sub("^https?:\/\/.*[\r\n]*", '', article)
    article = re.sub('\\\\x\w\w', '', article)
    article = re.sub('[^a-z\s]', '', article)
    return article

def loadData(path, texts, labels, highLevelLabels = True):
    with open(path) as data_file:    
        data = json.load(data_file)
    for article in data:
        if highLevelLabels:
            label = article["category"].split("|")
            labels.append(label[0])
        else:
            labels.append(article["category"])
        texts.append(preprocess(article["content"]))
    return texts, labels

In [12]:
texts = []
labels = []
paths = [r"C:\Users\PC-Axel\Documents\github\thesis\Data\PoliFLW Data\kamerstukken_topics_first.json",
        r"C:\Users\PC-Axel\Documents\github\thesis\Data\PoliFLW Data\kamerstukken_topics_second.json"]

for path in paths:
    texts, labels = loadData(path, texts, labels)
    

#Prepare splitting
VALIDATION_SPLIT = 0.2
indices = np.arange(len(labels))
np.random.shuffle(indices)
texts = np.array(texts)[indices]
labels = np.array(labels)[indices]
nb_validation_samples = int(VALIDATION_SPLIT * labels.shape[0])

### CNN

In [21]:
if "CNN" in testing or "CNN+" in testing:
    MAX_SEQUENCE_LENGTH = 1000
    MAX_NB_WORDS = 20000
    EMBEDDING_DIM = 160
    
    lb = preprocessing.LabelBinarizer()
    labelsCNN = lb.fit_transform(labels)

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labelsCNN.shape)

    x_train = data[:-nb_validation_samples]
    y_train = labelsCNN[:-nb_validation_samples]
    x_val = data[-nb_validation_samples:]
    y_val = labelsCNN[-nb_validation_samples:]
    
    
    print('Class distribution in training and validation set ')
    print(y_train.sum(axis=0))
    print(y_val.sum(axis=0))

Found 159977 unique tokens.
Shape of data tensor: (28294, 1000)
Shape of label tensor: (28294, 18)
Class distribution in training and validation set 
[ 242  231  424  234  132  562  182  197    3  350  315  707  327   54
  157  356  175 1010]


### Baselines

In [36]:
if "NB" in testing or "SGD" in testing:
    xTrain = texts[:-nb_validation_samples]
    yTrain = labels[:-nb_validation_samples]
    xVal = texts[-nb_validation_samples:]
    yVal = labels[-nb_validation_samples:]
    
    transformer = TfidfVectorizer(smooth_idf=True, min_df=0.00000001, max_df=0.2, sublinear_tf=True)
    
    xTrain = transformer.fit_transform(xTrain)
    xVal = transformer.transform(xVal)

In [39]:
def evaluation(yVal,yPred):
    print("Accuracy is " + str(accuracy_score(yVal, yPred)))
    print(f1_score(yVal, yPred, average='macro'),f1_score(yVal, yPred, average='micro'), 
        f1_score(yVal, yPred, average='weighted'))
    print(precision_score(yVal, yPred, average='macro'),precision_score(yVal, yPred, average='micro'), 
        precision_score(yVal, yPred, average='weighted'))
    print(recall_score(yVal, yPred, average='macro'),recall_score(yVal, yPred, average='micro'), 
        recall_score(yVal, yPred, average='weighted'))

In [40]:
if "NB" in testing:
    clf = MultinomialNB()
    clf = OneVsRestClassifier(clf).fit(xTrain, yTrain)
    yPred = clf.predict(xVal)
    evaluation(yVal,yPred)

Accuracy is 0.4536938847649346
0.23335726121724246 0.4536938847649346 0.3742497128864561
0.6974315367404772 0.4536938847649346 0.687346054243276
0.2337264032451163 0.4536938847649346 0.4536938847649346


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [41]:
if "SGD" in testing:
    clf = SGDClassifier(loss='log', penalty='l1', alpha=1e-6, random_state=42, max_iter=10, tol=None)
    clf = OneVsRestClassifier(clf).fit(xTrain, yTrain)
    yPred = clf.predict(xVal)
    evaluation(yVal,yPred)

Accuracy is 0.8253799929303641
0.7604934520356192 0.8253799929303641 0.8237672150883184
0.7701495221609246 0.8253799929303641 0.8239666232852711
0.7547381229678735 0.8253799929303641 0.8253799929303641


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Train Models and evaluate

### CNN

In [25]:
class Metrics(Callback):
    def on_train_begin(self, logs={}):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs={}):
        val_predict = (np.asarray(self.model.predict(self.model.validation_data[0]))).round()
        val_targ = self.model.validation_data[1]
        _val_f1 = f1_score(val_targ, val_predict)
        _val_recall = recall_score(val_targ, val_predict)
        _val_precision = precision_score(val_targ, val_predict)
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print("— val_f1: %f — val_precision: %f — val_recall %f" %(_val_f1, _val_precision, _val_recall))
        return

metrics = Metrics()

In [26]:
if "CNN" in testing or "CNN+" in testing:
    GLOVE_DIR = r"C:\Users\PC-Axel\Documents\Codeer projecten\Word2Vec Vectoren\Nederlandse word2vec\combined-160.txt"
    embeddings_index = {}
    f = open(GLOVE_DIR, encoding="utf8")
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()

    print('Total %s word vectors in Glove 6B 100d.' % len(embeddings_index))

Total 1442951 word vectors in Glove 6B 100d.


In [31]:
if "CNN" in testing:
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)

    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    l_cov1= Conv1D(128, 5, activation='relu')(embedded_sequences)
    l_pool1 = MaxPooling1D(5)(l_cov1)
    l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
    l_pool2 = MaxPooling1D(5)(l_cov2)
    l_cov3 = Conv1D(128, 5, activation='relu')(l_pool2)
    l_pool3 = MaxPooling1D(35)(l_cov3)  # global max pooling
    l_flat = Flatten()(l_pool3)
    l_dense = Dense(128, activation='relu')(l_flat)
    preds = Dense(labelsCNN.shape[1], activation='softmax')(l_dense)

    modelCNN = Model(sequence_input, preds)
    modelCNN.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])

In [None]:
if "CNN" in testing:
    print("modelCNN fitting - simplified convolutional neural network")
    modelCNN.summary()
    modelCNN.fit(x_train, y_train, validation_data=(x_val, y_val),
              epochs=10, batch_size=128)

In [33]:
if "CNN+" in testing:
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)

    # applying a more complex convolutional approach
    convs = []
    filter_sizes = [3,4,5]

    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    for fsz in filter_sizes:
        l_conv = Conv1D(nb_filter=128,filter_length=fsz,activation='relu')(embedded_sequences)
        l_pool = MaxPooling1D(5)(l_conv)
        convs.append(l_pool)

    l_merge = Merge(mode='concat', concat_axis=1)(convs)
    l_cov1= Conv1D(128, 5, activation='relu')(l_merge)
    l_pool1 = MaxPooling1D(5)(l_cov1)
    l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
    l_pool2 = MaxPooling1D(30)(l_cov2)
    l_flat = Flatten()(l_pool2)
    l_dense = Dense(128, activation='relu')(l_flat)
    preds = Dense(labelsCNN.shape[1], activation='softmax')(l_dense)

    modelCNNPlus = Model(sequence_input, preds)
    modelCNNPlus.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])

In [34]:
if "CNN+" in testing:
    print("modelCNNPlus fitting - more complex convolutional neural network")
    modelCNNPlus.summary()
    modelCNNPlus.fit(x_train, y_train, validation_data=(x_val, y_val),
              nb_epoch=20, batch_size=50)

## Evaluate Models