In [1]:
from collections import Counter,defaultdict
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from pprint import pprint
import json
from bs4 import BeautifulSoup
import sys
import os

#os.environ['KERAS_BACKEND']='theano'

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout
from keras.models import Model, load_model
from keras.callbacks import Callback

from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

import re

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
def preprocess(article):
    article = article.encode('utf-8')
    article = str(article.lower())
    return removeTagsInterpuntion(article)

def removeTagsInterpuntion(article): #remove URL's, HTML-tags and interpuntion
    article = re.sub("^https?:\/\/.*[\r\n]*", '', article)
    article = re.sub('\\\\x\w\w', '', article)
    article = re.sub('[^a-z\s]', '', article)
    return article

def loadData(path, texts, labels, urls, highLevelLabels = True):
    with open(path) as data_file:    
        data = json.load(data_file)
    for article in data:
        text = preprocess(article["content"])
        if len(text.split()) > 9 and article["category"] != "NOCAT": #Remove small sentences
            if article["url"] not in urls:
                urls.append(article["url"])
                texts.append(text)
                if highLevelLabels:
                    label = article["category"].split("|")
                    labels.append(label[0])
                else:
                    labels.append(article["category"])
    return texts, labels, urls

In [4]:
texts = []
labels = []
urls = []
paths = [r"C:\Users\PC-Axel\Documents\github\thesis\Data\PoliFLW Data\kamerstukken_topics_first.json",
         r"C:\Users\PC-Axel\Documents\github\thesis\Data\PoliFLW Data\kamerstukken_topics_second.json",
        r"C:\Users\PC-Axel\Documents\github\thesis\Data\PoliFLW Data\kamerstukken_topics_2017.json",
        r"C:\Users\PC-Axel\Documents\github\thesis\Data\PoliFLW Data\kamerstukken_topics_2016.json"]

for path in paths:
    texts, labels, urls = loadData(path, texts, labels, urls)
    

#Prepare splitting
VALIDATION_SPLIT = 0.2
indices = np.arange(len(labels))
np.random.shuffle(indices)
texts = np.array(texts)[indices]
labels = np.array(labels)[indices]
nb_validation_samples = int(VALIDATION_SPLIT * labels.shape[0])

In [5]:
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 160

lb = preprocessing.LabelBinarizer()
labelsCNN = lb.fit_transform(labels)

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)


Found 161810 unique tokens.


In [6]:
GLOVE_DIR = r"C:\Users\PC-Axel\Documents\Codeer projecten\Word2Vec Vectoren\Nederlandse word2vec\combined-160.txt"
embeddings_index = {}
f = open(GLOVE_DIR, encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors in Glove 6B 100d.' % len(embeddings_index))

Total 1442951 word vectors in Glove 6B 100d.


In [7]:
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_cov1= Conv1D(128, 5, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(5)(l_cov1)
l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(5)(l_cov2)
l_cov3 = Conv1D(128, 5, activation='relu')(l_pool2)
l_pool3 = MaxPooling1D(35)(l_cov3)  # global max pooling
l_flat = Flatten()(l_pool3)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(labelsCNN.shape[1], activation='softmax')(l_dense)

modelCNN = Model(sequence_input, preds)
modelCNN.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead


In [11]:
for percentage in [.1,.2,.3,.4,.5,.6,.7,.8,.9,1]:
    np.random.shuffle(indices)
    data = np.array(data)[indices]
    labelsCNN = np.array(labelsCNN)[indices]
    x_train = data[:-nb_validation_samples]
    y_train = labelsCNN[:-nb_validation_samples]
    x_val = data[-nb_validation_samples:]
    y_val = labelsCNN[-nb_validation_samples:]
    
    print()
    x_train = x_train[:round(percentage*x_train.shape[0])]
    y_train = y_train[:round(percentage*y_train.shape[0])]
    x_val = x_val[:round(percentage*x_val.shape[0])]
    y_val = y_val[:round(percentage*y_val.shape[0])]
    
    modelCNN = Model(sequence_input, preds)
    modelCNN.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
      
    historyCNN = modelCNN.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=20, batch_size=128)#,callbacks=[metrics])
    
    modelCNN.save('models/CNN-Model-17-' + str(percentage).replace(".",""))  # creates a HDF5 file 'my_model.h5'
    with open('models/CNN-Model-17-history-' + str(percentage).replace(".",""), 'wb') as file_pi:
        pickle.dump(historyCNN.history, file_pi)


Train on 233 samples, validate on 58 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Train on 465 samples, validate on 116 samples
Epoch 1/3
Epoch 2/3

KeyboardInterrupt: 

In [19]:
print(histories[0].history["loss"])
losses = []
for history in histories:
    print(min(history.history["val_acc"]))

[2.804887858592854, 2.6581932932817223, 2.659051361747909, 2.639732904479285, 2.626328833686163, 2.607622105623449, 2.5767467257594365, 2.530161949509349, 2.476153380685371, 2.3935008974202363, 2.3110649102124525, 2.185770446566691, 2.018958732709897, 1.8451103406962683, 1.662295182426446, 1.439953331226026, 1.2460666195128063, 1.006705300255707, 0.8300762046648803, 0.6590682172601091]
0.04295532660129963
0.12037833195150872
0.19369627478129542
