In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Embedding, Dropout
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from operator import itemgetter

import pandas as pd
from collections import Counter,OrderedDict

from sklearn.model_selection import train_test_split
#from lib.get_top_xwords import filter_to_top_x


In [1]:
from nltk import word_tokenize
from collections import defaultdict

def count_top_x_words(corpus, top_x, skip_top_n):
    count = defaultdict(lambda: 0)
    for c in corpus:
        for w in word_tokenize(c):
            count[w] += 1
    count_tuples = sorted([(w, c) for w, c in count.items()], key=lambda x: x[1], reverse=True)
    return [i[0] for i in count_tuples[skip_top_n: skip_top_n + top_x]]


def replace_top_x_words_with_vectors(corpus, top_x):
    topx_dict = {top_x[i]: i for i in range(len(top_x))}

    return [
        [topx_dict[w] for w in word_tokenize(s) if w in topx_dict]
        for s in corpus
    ], topx_dict


def filter_to_top_x(corpus, n_top, skip_n_top=0):
    top_x = count_top_x_words(corpus, n_top, skip_n_top)
    return replace_top_x_words_with_vectors(corpus, top_x)

In [4]:
df = pd.read_csv('../data/raw/train.csv')

In [5]:
df.head()


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [31]:
df.shape

(7613, 5)

In [6]:
df.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [30]:
len(description_list)

7613

In [29]:
len(word_Dict)

2500

In [32]:
description_list[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [50]:
sentLengthDist = pd.DataFrame([len(sent) for sent in mapped_list],columns=["sentLength"])["sentLength"].value_counts()
sentLengthDist.head()

7     585
9     570
8     546
10    534
6     526
Name: sentLength, dtype: int64

In [28]:
len(mapped_list)

7613

In [51]:
targetCategorical

array([[1., 0.],
       [0., 1.]], dtype=float32)

In [69]:
"""
changing lables into one hot encoded vectors
topn_labelsIndex: is a dictionary with key is labels and value as index
"""

topn = 10
freqCountLabel = Counter(df['target'].tolist())
topn_labelsIndex = {i[0]: idx for idx, i in enumerate(freqCountLabel.most_common(topn))}
labelIndexList = [topn_labelsIndex[i] for i in df['target'].tolist()]
labelOHE = to_categorical(labelIndexList)

In [70]:
"""
pad each input sequence to make it a fixed length
padding and truncating can be post or pre: How to decide??

https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/pad_sequences
"""

nVocab = 5000
topNIgnore = 10
listText = df['text'].tolist()
TextIndex, vocab_Dict = filter_to_top_x(listText, nVocab, topNIgnore)

maxTextLength = 27
paddedTextIndex = sequence.pad_sequences(TextIndex, maxlen=maxTextLength, \
                                        padding='post',truncating='post')

In [71]:
"""split the data into test and train 
"""
train_x, test_x, train_y, test_y = train_test_split(paddedTextIndex, labelOHE, test_size=0.3)

In [73]:


"""
sequential model has sequence of layers

input squence
7613X27X2500
NText*maxTextLength*nVocab

oooo...o1oo-2500
oo1o...oooo
.
.
.
oooo...o1oo
27

weight matrix
2500X100


embedding layer output
7613X27X100
NText*maxTextLength*nVocab

number of filter:hyper parameter to tune (10 for starting)

kernal
3x100



conv1D output
27X10



"""

embedding_vector_length = 500
model = Sequential()

model.add(Embedding(nVocab, embedding_vector_length, input_length=maxTextLength))

#https://www.tensorflow.org/api_docs/python/tf/keras/layers/Conv1D
model.add(Conv1D(10, 3,padding='same',strides=1))

model.add(Flatten())

model.add(Dropout(0.5))

model.add(Dense(100, activation='relu'))

model.add(Dropout(0.5))

model.add(Dense(50, activation='relu'))


model.add(Dense(20, activation='relu'))

nLables = max(labelIndexList) + 1

model.add(Dense(nLables, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_x, train_y, epochs=15, batch_size=32)

y_score = model.predict(test_x)

y_score = [[1 if i == max(sc) else 0 for i in sc] for sc in y_score]
n_right = 0
for i in range(len(y_score)):
    if all(y_score[i][j] == test_y[i][j] for j in range(len(y_score[i]))):
        n_right += 1

print("Accuracy: %.2f%%" % ((n_right/float(len(test_y)) * 100)))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Accuracy: 73.99%
