In [68]:
import pandas as pd

# split
from sklearn.model_selection import StratifiedShuffleSplit

# from spam
from collections import Counter

# word embedding
from gensim.models import Word2Vec
import multiprocessing

# keras
from keras.preprocessing import sequence
from keras.regularizers import l2
from keras.models import Model
from keras.layers import Flatten
from keras.layers.merge import concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.layers import Dense, GlobalMaxPooling1D, Activation, Dropout, GaussianNoise
from keras.layers import Embedding, Input, BatchNormalization, SpatialDropout1D, Conv1D
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [2]:
data = pd.read_pickle('../X_train.pickle')

In [3]:
data.head()

Unnamed: 0,tokenized_text,target
1039139,"[desperate, stay, hi, i, wan, na, start, ackno...",1
1472691,"[you, eat, moment, i, surviving, cereal, cold,...",1
1058630,"[you, break, repetitive, cycle, i, generally, ...",1
181797,"[fairly, quiet, weekend, my, son, his, friend,...",0
1073356,"[white, trash, zilch, i, fat, white, trash, zi...",1


In [100]:
sentences = list(data['tokenized_text'].values)

In [111]:
EMBED_DIM = 300
emb = Word2Vec(sentences, size=EMBED_DIM, window=3, 
               min_count=3, negative=15, iter=1, 
               workers=multiprocessing.cpu_count())
# get the word vector
word_vec = emb.wv

In [112]:
print(emb)

Word2Vec(vocab=85971, size=300, alpha=0.025)


In [113]:
emb.save('../CBOW300.bin')

In [4]:
# load model
new_model = Word2Vec.load('../CBOW300.bin')
print(new_model)

Word2Vec(vocab=85971, size=300, alpha=0.025)


In [None]:
EMBED_DIMS = [200, 400, 600, 800, 1000]
for dim in EMBED_DIMS:
    emb = Word2Vec(sentences, size=dim, window=3, 
                   min_count=3, negative=15, iter=1, 
                   workers=multiprocessing.cpu_count())
    emb.save(f'../CBOW{dim}.bin')

In [5]:
def remove_spam(data):
    dic_counter = Counter(data)
    if len(dic_counter)<=10 and len(data)>100:
        return 0
    return 1

In [6]:
data['spam'] = data['tokenized_text'].apply(lambda x: remove_spam(x))
index_spam = data[data['spam']==0].index
data.loc[list(index_spam), 'tokenized_text'] = data.loc[list(index_spam), 
                                                        'tokenized_text'].apply(lambda x: list(set(x)))

In [85]:
data.loc[list(index_spam), 'tokenized_text']

1345318    [i, fucking, hate, you, i, fucking, hate, you,...
1625692    [holy, fucking, shit, dude, get, me, get, me, ...
1381157    [teach, me, speak, teach, me, share, teach, me...
1536230    [i, care, post, deleted, express, my, feelings...
1220478    [i, think, everything, universe, multiverse, f...
1302575    [i, want, die, end, my, life, end, my, life, e...
1352931    [i, hate, myself, much, i, wan, na, die, i, ha...
782941     [days, till, new, top, gear, î, î, î, î, î, î,...
1409221    [it, like, silent, mantra, i, ca, stop, i, wis...
1345352    [stupid, bitch, stupid, fucking, ignorant, ung...
1131699    [i, happgy, kidding, we, begin, i, hate, mysel...
993573     [she, love, you, anymore, she, loves, someone,...
1513776    [i, wan, na, die, i, wan, na, die, i, wan, na,...
1187917    [i, wish, i, dead, i, wish, i, dead, i, wish, ...
1481237    [nothing, ever, get, better, nothing, ever, ge...
1607257    [my, little, poem, i, wish, i, dead, i, wish, ...
1623245    [i, wish, my,

In [89]:
set(data.loc[1364167, 'tokenized_text'])

{'everything', 'fuck', 'i', 'knew', 'man', 'my', 'name', 'oh', 'wish', 'you'}

In [67]:
data['spam'].value_counts()

In [91]:
data[data['spam']==0]

Unnamed: 0,tokenized_text,target,spam
1345318,"[fucking, you, die, hope, hate, i]",1,0
1625692,"[dude, me, fucking, get, holy, shit]",1,0
1381157,"[tell, me, go, share, aaaaaaaaaaaaaaaaaaaaaaaa...",1,0
1536230,"[care, feelings, my, i, fuck, express, deleted...",1,0
1220478,"[think, universe, multiverse, everything, fuck...",1,0
1302575,"[life, my, die, end, i, want]",1,0
1352931,"[much, na, die, wan, i, hate, myself]",1,0
782941,"[till, top, î, new, gear, days]",0,0
1409221,"[ca, dead, silent, like, wish, it, etc, mantra...",1,0
1345352,"[ignorant, fucking, ungrateful, cunt, stupid, ...",1,0


In [109]:
# GET MAX LEN IN INPUT
len_sent = data['tokenized_text'].apply(lambda x: len(x))
print(max(len_sent))

4640


In [93]:
max_word_sent = len_sent[len_sent>4000].index
max_word_sent

Int64Index([1168251, 1131836,  807483, 1125164, 1348083, 1048175, 1312185,
            1041726, 1634451, 1362631,  834659],
           dtype='int64')

In [7]:
y = pd.DataFrame(data['target'])
X = data[['tokenized_text']].copy()

In [8]:
X.reset_index(inplace = True)
y.reset_index(inplace = True)

In [9]:
X.drop(labels='index', axis=1, inplace = True)
y.drop(labels='index', axis=1, inplace = True)

Source: https://www.kaggle.com/danielsafai/cnn-implementation-of-yoon-kim-s-model

In [10]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
sss.get_n_splits(X, y)
for train_index, test_index in sss.split(X, y):
    X_train, X_dev = X.loc[train_index, :], X.loc[test_index, :]
    y_train, y_dev = y.loc[train_index], y.loc[test_index]

In [11]:
word_vec = new_model.wv

In [59]:
word_vec.get_keras_embedding()

<keras.layers.embeddings.Embedding at 0x7fdd9c95e160>

In [35]:
X_train['text'] = X_train['tokenized_text'].apply(lambda x: " ".join(x))
X_dev['text'] = X_dev['tokenized_text'].apply(lambda x: " ".join(x))

In [43]:
list_X_train = list(X_train['text'].values)
list_X_dev = list(X_dev['text'].values)

In [48]:
# set the parameters
EMBED_SIZE = 300
MAX_WORDS = 85973
MAX_WORDS_IN_SENT = 4640

t = Tokenizer(num_words=MAX_WORDS)
t.fit_on_texts(list_X_train)
vocab_size = len(t.word_index) + 1

list_tokenized_train = t.texts_to_sequences(list_X_train)
list_tokenized_test = t.texts_to_sequences(list_X_dev)

In [49]:
X_train_pad = pad_sequences(list_tokenized_train, maxlen=MAX_WORDS_IN_SENT, padding='post')
X_test_pad = pad_sequences(list_tokenized_test, maxlen=MAX_WORDS_IN_SENT, padding='post')

In [71]:
y = pd.get_dummies(y_train['target']).values

In [73]:
y.shape[1]

2

In [None]:
# load embedding as a dict
def load_embedding(filename):
    # load embedding into memory, skip first line
    file = open(filename,'r')
    lines = file.readlines()[1:]
    file.close()
    # create a map of words to vectors
    embedding = dict()
    for word in word_vec.vocab.keys():
        # key is string word, value is numpy array for vector
        embedding[word] = asarray(parts[1:], dtype='float32')
    return embedding

# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab):
    # total vocabulary size plus 0 for unknown words
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = zeros((vocab_size, 100))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
        weight_matrix[i] = embedding.get(word)
    return weight_matrix

# load embedding from file
raw_embedding = load_embedding('embedding_word2vec.txt')
# get vectors in the right order
embedding_vectors = get_weight_matrix(raw_embedding, t.word_index)

In [75]:
inp = Input(shape=(X_train_pad.shape[1],), dtype='int64')
emb = word_vec.get_keras_embedding()(inp)
conv_filters = 100

# Specify each convolution layer and their kernel siz i.e. n-grams 
conv1_1 = Conv1D(filters=conv_filters, kernel_size=3, activation='relu')(emb)
#btch1_1 = BatchNormalization()(conv1_1)
#drp1_1  = Dropout(0.2)(btch1_1)
glmp1_1 = GlobalMaxPooling1D()(conv1_1)

conv1_2 = Conv1D(filters=conv_filters, kernel_size=4, activation='relu')(emb)
#btch1_2 = BatchNormalization()(conv1_2)
#drp1_2  = Dropout(0.2)(btch1_2)
#actv1_2 = Activation('relu')(drp1_2)
glmp1_2 = GlobalMaxPooling1D()(conv1_2)

conv1_3 = Conv1D(filters=conv_filters, kernel_size=5, activation='relu')(emb)
#btch1_3 = BatchNormalization()(conv1_3)
#drp1_3  = Dropout(0.2)(btch1_3)
#actv1_3 = Activation('relu')(drp1_3)
glmp1_3 = GlobalMaxPooling1D()(conv1_3)

# Gather all convolution layers
cnct = concatenate([glmp1_1, glmp1_2, glmp1_3], axis=1)
#flatten = Flatten()(cnct)
drp1 = Dropout(0.5)(cnct)

dns1  = Dense(100, activation='relu')(drp1)
#btch1 = BatchNormalization()(dns1)
#drp2  = Dropout(0.2)(btch1)

out = Dense(y.shape[1], activation='softmax')(dns1)

In [76]:
model_1 = Model(inputs=inp, outputs=out)
model_1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_1.summary())

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 4640)         0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 4640, 300)    25791300    input_5[0][0]                    
__________________________________________________________________________________________________
conv1d_11 (Conv1D)              (None, 4638, 100)    90100       embedding_6[0][0]                
__________________________________________________________________________________________________
conv1d_12 (Conv1D)              (None, 4637, 100)    120100      embedding_6[0][0]                
____________________________________________________________________________________________

In [79]:
history_1 = model_1.fit(X_train_pad, y, validation_split=0.1, verbose=1, epochs=1, batch_size=50, shuffle=True)

Train on 1064628 samples, validate on 118292 samples
Epoch 1/1
  10000/1064628 [..............................] - ETA: 45:54:55 - loss: 0.1766 - accuracy: 0.9308

KeyboardInterrupt: 