In [45]:
import pandas as pd
from keras import layers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import helpers

In [46]:
# helpers.createCsv()
data = pd.read_csv("17k-tweets.csv")

features = data.iloc[:,0].values
labels = data.iloc[:,1].values

stop_words = helpers.get_external_stopwords()
features = helpers.cleanFeatures(features, stop_words)

le = preprocessing.LabelEncoder()
labels = le.fit_transform(labels) 
labels = to_categorical(labels)


In [63]:
RANDOM_STATE = 1
MAX_NB_WORDS = 2500

def find_max_length(features):
    length = 0    
    for sentence in features:
        if len(sentence) > length:
            length = len(sentence)
    return length

maxlen = find_max_length(x_train)
print (maxlen)

# Split train & test
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=RANDOM_STATE)

# Tokenize and transform to integer index
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(x_train)

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

x_train = pad_sequences(x_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)

VOCAB_SIZE = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

30


In [64]:
# Define CNN architecture


EMBEDING_DIM = 100
EPOCHS = 8
BATCH_SIZE = 200
filters = 20
kernel_size = 3
hidden_dims = 250


model = Sequential()
model.add(layers.Embedding(VOCAB_SIZE, EMBEDING_DIM, input_length=maxlen))
model.add(Dropout(0.5))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(layers.Dense(3, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

history = model.fit(x_train, y_train,
                    epochs=EPOCHS,
                    verbose=1,
                    validation_data=(x_test, y_test),
                    batch_size=BATCH_SIZE)
loss, accuracy = model.evaluate(x_train, y_train, verbose=1)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(x_test, y_test, verbose=1)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 30, 100)           1924200   
_________________________________________________________________
dropout_12 (Dropout)         (None, 30, 100)           0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 26, 128)           64128     
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 128)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 64)                8256      
_________________________________________________________________
dropout_13 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 3)                

In [31]:
# Define CNN architecture
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D

EMBEDING_DIM = 100
EPOCHS = 8
BATCH_SIZE = 200
filters = 20
kernel_size = 3
hidden_dims = 250

# # channel 1
# 	inputs1 = Input(shape=(length,))
# 	embedding1 = Embedding(vocab_size, 100)(inputs1)
# 	conv1 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding1)
# 	drop1 = Dropout(0.5)(conv1)
# 	pool1 = MaxPooling1D(pool_size=2)(drop1)
# 	flat1 = Flatten()(pool1)

model = Sequential()
model.add(layers.Embedding(VOCAB_SIZE, EMBEDING_DIM, input_length=maxlen))
# model.add(Dropout(0.5))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(Dropout(0.2))
model.add(layers.MaxPooling1D(pool_size=2))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))

model.add(layers.Dense(3, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

history = model.fit(x_train, y_train,
                    epochs=EPOCHS,
                    verbose=1,
                    validation_data=(x_test, y_test),
                    batch_size=BATCH_SIZE)
loss, accuracy = model.evaluate(x_train, y_train, verbose=1)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(x_test, y_test, verbose=1)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 44, 100)           1924200   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 40, 128)           64128     
_________________________________________________________________
dropout_7 (Dropout)          (None, 40, 128)           0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 20, 128)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 2560)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 64)                163904    
_________________________________________________________________
dense_8 (Dense)              (None, 3)                