1. Import libraries
Import the libraries used throughout the notebook. Later sections read the data from CSV and apply some basic pre-processing (remove non-ASCII characters, convert our target variable to an integer label).

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
## Plotly
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
# Others
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import re
from sklearn.manifold import TSNE
from nltk.stem import SnowballStemmer
import codecs
import csv

2. Function to clean the text data

In [49]:
# Ordered (pattern, replacement) pairs applied by text_to_wordlist.
# Order matters: contractions must be expanded before the bare apostrophe is
# dropped, and "/" must become a space before " 9 11 " can be recognised.
_CLEANUP_RULES = (
    # Everything outside this whitelist becomes a space (this also drops
    # non-ASCII characters). NOTE: "+-=" inside the class is a character
    # *range* ('+' through '='), so ":", ";" and "<" are kept as well; the
    # ":" rule further down relies on that, so the class is left unchanged.
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    # Expand common English contractions.
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    # Drop or isolate punctuation so symbols become separate tokens.
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    # "5k" -> "5000".
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    # Re-join abbreviations that the punctuation rules above split apart.
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # "1990s" -> "1990". The original pattern r"\0s" matched a NUL character
    # followed by "s" (\0 is an octal escape), so it could never fire.
    (r"0s\b", "0"),
    # "9/11" has become " 9 11 " by now. Keep the surrounding spaces so the
    # token is not glued to its neighbours ("the911attacks").
    (r" 9 11 ", " 911 "),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    # Collapse runs of whitespace introduced by the substitutions above.
    (r"\s{2,}", " "),
)


def text_to_wordlist(text):
    """Normalise a raw text string for tokenisation.

    The text is lower-cased, whitespace-normalised and passed through the
    ordered substitutions in ``_CLEANUP_RULES`` (contraction expansion,
    punctuation isolation, a few abbreviation fix-ups).

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text as a single space-separated string. Leading or
        trailing single spaces produced by the substitutions are not stripped.
    """
    # The original called ``text.translate(string.punctuation)`` here. In
    # Python 3 that uses the punctuation string as a lookup table indexed by
    # code point: it removes nothing and instead rewrites control characters
    # (tab -> "*", newline -> "+", carriage return -> "."). Punctuation is
    # handled by the rules below, so the call is dropped.

    ## Convert words to lower case and split them
    words = text.lower().split()

    ## Remove stop words
    #stops = set(stopwords.words("english"))
    #words = [w for w in words if not w in stops and len(w) >= 3]

    text = " ".join(words)

    ## Clean the text
    for pattern, replacement in _CLEANUP_RULES:
        text = re.sub(pattern, replacement, text)
    return text

3. Read data
Read the data from CSV, apply the preprocessing steps to each document, and collect the texts and labels into two separate lists.

In [188]:
# Read the labelled messages: `docs` receives the cleaned text of each row and
# `labels` the raw class string.
# Assumes column 0 is the message text and column 1 the class label — confirm
# against the CSV header.
docs = []
labels = []
# The csv module requires files to be opened with newline='' so that line
# breaks embedded in quoted fields are parsed correctly; codecs.open does its
# own line splitting and can break such rows.
with open(r'D:\Practice\email_classification.csv', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader, None)  # skip the header row (None if the file is empty)
    for values in reader:
        # Blank or truncated rows would raise IndexError below; skip them.
        if len(values) < 2:
            continue
        docs.append(text_to_wordlist(values[0]))
        labels.append(values[1])

4. Encoding label class as 1 if class is 'spam' else 0

In [189]:

# Encode the class strings read from the CSV as integers: 'spam' -> 1,
# anything else -> 0. The original used `df['Class']`, but no DataFrame `df`
# is ever created in this notebook; the labels live in the `labels` list.
# Accepting an already-encoded 1 keeps the cell safe to re-run.
labels = [1 if label in (1, 'spam') else 0 for label in labels]

5. Tokenize the text data

In [190]:
# Build the word -> integer index from the cleaned documents; the resulting
# mapping is available afterwards as `tokenizer.word_index`.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)

6. Calculate the vocabulary size

In [191]:
# Number of rows needed for the embedding matrix: one per known word plus one.
# NOTE(review): the +1 presumably accounts for the Keras Tokenizer numbering
# words from 1, leaving index 0 for padding — confirm against the Keras docs.
vocab_size = len(tokenizer.word_index) + 1

In [192]:
# Display the number of distinct words the tokenizer indexed.
len(tokenizer.word_index)

50496

7. Making sequence of the documents

In [194]:
# Replace every document by the list of integer indices of its words.
sequences = tokenizer.texts_to_sequences(docs)
# `word_index` was never defined in the original cell, which raised NameError
# on a fresh kernel; bind it explicitly from the fitted tokenizer.
word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

Found 50496 unique tokens


8. Pad (or truncate) every sequence to a fixed length of 20

In [196]:
# Bring every sequence to exactly 20 indices; shorter ones are zero-padded at
# the end (padding='post').
padded_docs = pad_sequences(sequences, maxlen=20, padding='post')
labels = np.array(labels)
# The original printed `data.shape` / `len(data)`, but `data` is never
# defined; the padded tensor is `padded_docs`.
print('Shape of data tensor:', padded_docs.shape)
print('Shape of label tensor:', labels.shape)
len(padded_docs)

Shape of data tensor: (5172, 20)
Shape of label tensor: (5172,)


5172

9. Loading pre-trained dictionary of word embeddings that translates each word into a 100 dimensional vector.
More info on the project that created this dataset https://nlp.stanford.edu/projects/glove/

In [197]:
# Load the pre-trained GloVe vectors into a dict: word -> float32 vector.
# Each non-empty line of the file is "<word> <v1> <v2> ...".
embeddings_index = dict()
# `with` guarantees the file is closed even if a line fails to parse.
with open(r'D:\Practice\glove.6B.100d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


10. We only need the subset of these 400,000 words that appear in our docs, so we create a weight matrix containing only the words found in the training docs.

In [198]:
# One row per vocabulary index and 100 columns (the GloVe vector size used
# here). Rows start at zero and stay zero for words without a GloVe vector.
embedding_matrix = np.zeros((vocab_size, 100))

In [199]:
# Copy the GloVe vector of every vocabulary word into its row of the matrix.
# Words missing from GloVe keep their all-zero row.
for word, index in tokenizer.word_index.items():
    # The original compared against `nb_words`, which is never defined
    # (NameError on a fresh kernel). Bound on the matrix itself instead, and
    # `continue` rather than `break` so the result does not depend on the
    # iteration order of word_index.
    if index >= embedding_matrix.shape[0]:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [200]:
# Sanity check: expect (vocab_size, 100).
print(embedding_matrix.shape)

(50497, 100)


Creating neural network layers

In [201]:
# 1-D convolutional binary classifier on top of the GloVe embeddings.
model = Sequential()
# Embedding layer initialised from the GloVe weight matrix; trainable=True
# lets the vectors be fine-tuned during training.
model.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=20, trainable=True))
# Three Conv1D -> MaxPooling1D -> Dropout stages with 64, 128 and 256 filters
# (kernel size 2); each pooling halves the sequence length.
model.add(Conv1D(64, 2, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Dropout(0.3))
model.add(Conv1D(128, 2, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Dropout(0.3))
model.add(Conv1D(256, 2, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Dropout(0.3))
model.add(Flatten())
# Single sigmoid unit: the output is the predicted probability of class 1.
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_47 (Embedding)     (None, 20, 100)           5049700   
_________________________________________________________________
conv1d_107 (Conv1D)          (None, 19, 64)            12864     
_________________________________________________________________
max_pooling1d_104 (MaxPoolin (None, 9, 64)             0         
_________________________________________________________________
dropout_71 (Dropout)         (None, 9, 64)             0         
_________________________________________________________________
conv1d_108 (Conv1D)          (None, 8, 128)            16512     
_________________________________________________________________
max_pooling1d_105 (MaxPoolin (None, 4, 128)            0         
_________________________________________________________________
dropout_72 (Dropout)         (None, 4, 128)            0         
__________

Dividing the data into test and train

In [202]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the padded documents for testing; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    padded_docs,
    labels,
    test_size=0.2,
    random_state=42,
)

Training and evaluating the model

In [203]:
# Train for 100 epochs without progress output.
model.fit(
    X_train,
    y_train,
    epochs=100,
    verbose=0,
)

# Score the held-out split; evaluate() returns the loss followed by the 'acc'
# metric requested at compile time.
loss, accuracy = model.evaluate(
    X_test,
    y_test,
    verbose=0,
)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 94.106280


In [None]:
Saving the trained model for future use

In [87]:

 # serialize model to JSON
# The trained network in this notebook is `model`; the original referenced
# `model_glove`, which is never defined here (NameError on a fresh kernel).
# Architecture goes to model.json, weights to model.h5.
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")
 
# later...
 



Saved model to disk


In [88]:
from keras.models import model_from_json

# Rebuild the architecture from the JSON description. `with` closes the file
# even if reading fails.
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")


Loaded model from disk
