In [1]:
import numpy as np
import data_helpers
from w2v import train_word2vec
import tensorflow as tf
import pandas as pd
sess = tf.Session()
from keras.callbacks import ModelCheckpoint

from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, Convolution1D, MaxPooling1D , Conv1D
from keras import backend as K
K.set_session(sess)

np.random.seed(2)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
class GetOutOfLoop( Exception ):
    pass

In [3]:
def runtest(model, test_file, seqeunce_length, label=0):
    acc = 0.0
    t = 0.0
    all = 0.0
    for text in test_file:
        x_test = data_helpers.process_sentence_vocabulary(text, vocabulary , sequence_length)
        #print(x_test)
        y_test = model.predict(x_test)
        #print(y_test)
        result = model.predict_proba(x_test)
        res = 0
        if y_test[0][label] >= 0.5:
            res = 1
            t=t+1
        else:
            #print(text)
            pass
        all=all+1
        #break
    acc = t*100.0/all
    return acc,t,(all-t)

In [4]:
def createModel(config, x, vocabulary, vocabulary_inv):
    # Step to create Word2Vec
    if config['model_variation']=='CNN-non-static' or config['model_variation']=='CNN-static':
        embedding_weights = train_word2vec(x, vocabulary_inv, config['embedding_dim'], 
                                           config['min_word_count'], config['context'] , config['seg'])
        if config['model_variation']=='CNN-static':
            x = embedding_weights[0][x]
    elif config['model_variation']=='CNN-rand':
        embedding_weights = None
    else:
        raise ValueError('Unknown model variation')
        
    # Step to create convolution layer    
    graph_in = Input(shape=(config['sequence_length'], config['embedding_dim']))
    convs = []
    for fsz in config['filter_sizes']:
        conv = Conv1D(       kernel_size=config['num_filters'],
                             filters=fsz,
                             padding='valid',
                             activation='relu', strides=1 ,
                             )(graph_in)
        pool = MaxPooling1D(pool_size=2)(conv)
        flatten = Flatten()(pool)
        convs.append(flatten)

    # Step to create Merge Layer    
    if len(config['filter_sizes'])>1:
        out = Merge(mode='concat')(convs)
    else:
        out = convs[0]

    graph = Model(input=graph_in, output=out)

    # main sequential model
    model = Sequential()
    if not config['model_variation']=='CNN-static':
        model.add(Embedding(len(vocabulary), config['embedding_dim'], input_length=config['sequence_length'],
                            weights=embedding_weights))

    # Step to create Fully Connect Neural Network    
    model.add(Dropout(dropout_prob[0], input_shape=(config['sequence_length'], config['embedding_dim'])))
    model.add(graph)
    model.add(Dense(config['hidden_dims']))
    model.add(Dropout(config['dropout_prob'][1]))
    model.add(Activation('relu'))
    model.add(Dense(2))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

In [5]:
def writeResult(config):
    index = ['seg', 'model_variation', 'filter_sizes','num_filters',
         'context','embedding_dim','hidden_dims','batch_size',
         'min_word_count','dropout_prob','num_epochs',
         'sequence_length','val_split','Acc positive','Acc negative',
         'True positive','False positive','True negative','False negative']
    df = pd.DataFrame(columns=index)
    df.loc[0]=config
    df.to_csv(directory+'/Result-2/result_clickbait_cnn.csv', mode='a' , header=False)

In [6]:
# initiat parameter:
seg_types = ['seg_tlex','seg_sylSeg','seg_lextoplus','seg_tcc','seg_icu']
model_variations = ['CNN-non-static' , 'CNN-non-static', 'CNN-static'] 
max_words = 10000
embedding_dims = [10,25,50,100,150]
filter_sizes_array = [(1,),(1,2),(1,2,3),(1,2,3,4),(1,2,3,4,5),(1,2,3,4,5,6),(1,2,3,4,5,6,7)]
num_filters_array = [32,64,128,256]
dropout_prob_array = [(0.1,0.1),(0.1,0.25),(0.25,0,25),(0.25, 0.5),(0.5,0,5)]
hidden_dims_array = [32,64,128,256] 
batch_size_array = [32,64,128,256,512] 
num_epochs_array = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
# Word2Vec parameters, see train_word2vec
min_word_count = 1  # Minimum word count
context = 10        # Context window size
val_split = 0.1
directory = 'Dataset/'


In [8]:
for seg in seg_types:
    clickbait_train = directory +  'Train/all_clickbait_' + seg + '_Train'
    non_clickbait_train = directory + 'Train/all_non_clickbait_' + seg + '_Train'
    
    #process data
    x, y, vocabulary, vocabulary_inv = data_helpers.load_data(clickbait_train,non_clickbait_train)
    np.save(directory + 'vocabulary/vocabulary-' + seg_types[0], vocabulary)
    np.save(directory + 'vocabulary/vocabulary_inv-' + seg_types[0], vocabulary_inv)
    print ('max sequence lenght:', max(len(l) for l in x))
    sequence_length =  max(len(l) for l in x)
    try:
        for model_variation in model_variations:
            for embedding_dim in embedding_dims:
                for filter_sizes in filter_sizes_array:
                    for num_filters in num_filters_array:
                        for dropout_prob in dropout_prob_array:
                            #for hidden_dims in hidden_dims_array:
                                #for batch_size in batch_size_array:
                                    #for num_epochs in num_epochs_array:
                            hidden_dims=128
                            batch_size=32
                            num_epochs=20
                            config = {}
                            config['seg']=seg
                            config['embedding_dim']=embedding_dim
                            config['filter_sizes']=filter_sizes
                            config['num_filters']=num_filters
                            config['dropout_prob']=dropout_prob
                            config['hidden_dims']=hidden_dims
                            config['batch_size']=batch_size
                            config['num_epochs']=num_epochs
                            config['min_word_count']=min_word_count
                            config['context']=context
                            config['val_split']=val_split
                            config['model_variation']=model_variation
                            config['sequence_length']=sequence_length
                            print('start model:',config)
                            model = createModel(config, x, vocabulary, vocabulary_inv)

                            # Save model to file

                            name = "{}_{:d}embedding_dim_{}filter_sizes_{:d}num_filters_{:d}hidden_dims_{:d}batch_size_{}dropout_prob".format(seg,embedding_dim, str(filter_sizes), num_filters,hidden_dims,batch_size,dropout_prob)
                            filepath = directory+'Model-2/save_clickbait_'+ name + '{epoch:02d}num_epochs_{val_acc:.2f}val_acc.h5'
                            checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
                            callbacks_list = [checkpoint]
                            model.fit(x, y, batch_size=batch_size, 
                                      epochs=num_epochs,validation_split=val_split ,callbacks=callbacks_list, verbose=1)


                            #model.save(directory+'Model-2/save_clickbait_'+ name +'.h5')
                            print('Saved Model',name)
                            # Evaluate the model
                            print('Start Evaluate:')
                            clickbait_test = directory+'Test/all_clickbait_'+seg+'_Test'
                            non_clickbait_test = directory+'Test/all_non_clickbait_'+seg+'_Test'
                            test_clickbait = list(open(clickbait_test,'r').read().splitlines())
                            test_non_clickbait = list(open(non_clickbait_test,'r').read().splitlines())
                            acc_t,tp,fp = runtest(model, test_clickbait, sequence_length, 0)
                            acc_n,tn,fn = runtest(model, test_non_clickbait, sequence_length, 1)
                            config['Acc positive'] = acc_t
                            config['Acc negative'] = acc_n
                            config['True positive'] = tp
                            config['False positive'] = fp
                            config['True negative'] = tn
                            config['False negative'] = fn
                            writeResult(config)
                                        #raise GetOutOfLoop
    except GetOutOfLoop:
        pass

max sequence lenght: 182
start model: {'batch_size': 32, 'num_filters': 32, 'num_epochs': 50, 'min_word_count': 1, 'dropout_prob': (0.1, 0.1), 'val_split': 0.1, 'embedding_dim': 10, 'seg': 'seg_tlex', 'hidden_dims': 128, 'sequence_length': 182, 'model_variation': 'CNN-non-static', 'context': 10, 'filter_sizes': (1,)}
Load existing Word2Vec model '10features_1minwords_10context_seg_tlex_'
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead




Train on 104382 samples, validate on 11599 samples
Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.94422, saving model to seg_tlex_10embedding_dim_(1,)filter_sizes_32num_filters_128hidden_dims_32batch_size_50num_epochs_(0.1, 0.1)dropout_prob01-0.94.h5
Epoch 2/50

Epoch 00002: val_acc improved from 0.94422 to 0.96530, saving model to seg_tlex_10embedding_dim_(1,)filter_sizes_32num_filters_128hidden_dims_32batch_size_50num_epochs_(0.1, 0.1)dropout_prob02-0.97.h5
Epoch 3/50

Epoch 00003: val_acc did not improve
Epoch 4/50

Epoch 00004: val_acc improved from 0.96530 to 0.96676, saving model to seg_tlex_10embedding_dim_(1,)filter_sizes_32num_filters_128hidden_dims_32batch_size_50num_epochs_(0.1, 0.1)dropout_prob04-0.97.h5
Epoch 5/50

Epoch 00005: val_acc did not improve
Epoch 6/50

Epoch 00006: val_acc did not improve
Epoch 7/50

Epoch 00007: val_acc did not improve
Epoch 8/50

Epoch 00008: val_acc did not improve
Epoch 9/50

Epoch 00009: val_acc did not improve
Epoch 10/50

Epoch 

KeyboardInterrupt: 