In [1]:
import pandas as pd
import numpy as np

# split
from sklearn.model_selection import StratifiedShuffleSplit

# from spam
from collections import Counter

# word embedding
from gensim.models import Word2Vec
import multiprocessing

# count vectorizer
from sklearn.feature_extraction.text import CountVectorizer

# keras
from keras.preprocessing import sequence
from keras.regularizers import l2
from keras.models import Model
from keras.layers import Flatten
from keras.layers.merge import concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.layers import Dense, GlobalMaxPooling1D, Activation, Dropout, GaussianNoise
from keras.layers import Embedding, Input, BatchNormalization, SpatialDropout1D, Conv1D
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# score
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix, roc_auc_score

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
data = pd.read_pickle('../X_train_temp.pickle')

In [5]:
def remove_spam(data):
    dic_counter = Counter(data)
    if len(dic_counter)<=10 and len(data)>100:
        return 0
    return 1

In [6]:
data['spam'] = data['tokenized_text'].apply(lambda x: remove_spam(x))
index_spam = data[data['spam']==0].index
data.loc[list(index_spam), 'tokenized_text'] = data.loc[list(index_spam), 
                                                        'tokenized_text'].apply(lambda x: list(set(x)))

In [7]:
data['sent'] = data['tokenized_text'].apply(lambda x: " ".join(x))

In [8]:
data.to_pickle('../X_train_temp.pickle')

In [9]:
data.head()

Unnamed: 0,tokenized_text,target,spam,sent
800000,"[i, love, u, guys, r, best]",0,1,i love u guys r best
800001,"[im, meeting, one, my, besties, tonight, cant,...",0,1,im meeting one my besties tonight cant wait gi...
800002,"[thanks, twitter, add, sunisa, i, got, meet, y...",0,1,thanks twitter add sunisa i got meet you hin s...
800003,"[sick, really, cheap, it, hurts, much, eat, re...",0,1,sick really cheap it hurts much eat real food ...
800004,"[he, effect, everyone]",0,1,he effect everyone


In [None]:
sentences = list(data['tokenized_text'].values)

### Train a word embedding using CBOW

In [None]:
EMBED_DIM = 300
emb = Word2Vec(sentences, size=EMBED_DIM, window=3, 
               min_count=3, negative=15, iter=1, 
               workers=multiprocessing.cpu_count())
# get the word vector
word_vec = emb.wv

In [None]:
print(emb)

In [None]:
emb.save('../CBOW300.bin')

In [None]:
# load model
new_model = Word2Vec.load('../CBOW300.bin')
print(new_model)

In [None]:
EMBED_DIMS = [200, 400, 600, 800, 1000]
for dim in EMBED_DIMS:
    emb = Word2Vec(sentences, size=dim, window=3, 
                   min_count=3, negative=15, iter=1, 
                   workers=multiprocessing.cpu_count())
    emb.save(f'../CBOW{dim}.bin')

### Train word embedding using skip-gram

In [None]:
EMBED_DIM = 300
emb = Word2Vec(sentences, size=EMBED_DIM, window=3, 
               min_count=3, negative=15, iter=1, sg=1, 
               workers=multiprocessing.cpu_count())
# get the word vector
word_vec = emb.wv

In [None]:
print(emb)

In [None]:
emb.save('../SKIP-GRAM300.bin')

In [None]:
# GET MAX LEN IN INPUT
len_sent = data['tokenized_text'].apply(lambda x: len(x))
print(max(len_sent))

In [None]:
max_word_sent = len_sent[len_sent>4000].index
max_word_sent

### Split the train to train and dev

In [None]:
y = pd.DataFrame(data['target'])
X = data[['tokenized_text']].copy()

In [None]:
X.reset_index(inplace = True)
y.reset_index(inplace = True)

In [None]:
X.drop(labels='index', axis=1, inplace = True)
y.drop(labels='index', axis=1, inplace = True)

Source: https://www.kaggle.com/danielsafai/cnn-implementation-of-yoon-kim-s-model

In [None]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
sss.get_n_splits(X, y)
for train_index, test_index in sss.split(X, y):
    X_train, X_dev = X.loc[train_index, :], X.loc[test_index, :]
    y_train, y_dev = y.loc[train_index], y.loc[test_index]

### Performance Template

In [130]:
def performance(y_true, y_hat):
    
    y_true = list(map(lambda x: np.argmax(x), y_true))
    y_hat = list(map(lambda x: np.argmax(x), y_hat))
    
    print('-'*40)
    # accuracy
    print('Accuracy: ', accuracy_score(y_true,y_hat))
    # confusion matrix
    print('\n')
    print('Confusion Matrix: \n', confusion_matrix(y_true,y_hat))
    print('\n')
    # precision score of the model 
    print('Precision: ', precision_score(y_true, y_hat))
    # recall score of the model 
    print('Recall: ', recall_score(y_true, y_hat))
    # area under the ROC curve
    print('Area under ROC curve: ', roc_auc_score(y_true, y_hat))
    print('-'*40)


### Design the CNN Non-Static Kim Yoon

In [None]:
word_vec = new_model.wv

In [None]:
X_train['text'] = X_train['tokenized_text'].apply(lambda x: " ".join(x))
X_dev['text'] = X_dev['tokenized_text'].apply(lambda x: " ".join(x))

In [None]:
list_X_train = list(X_train['text'].values)
list_X_dev = list(X_dev['text'].values)

In [63]:
# set the parameters
EMBED_SIZE = 300
MAX_WORDS = 85971
#MAX_WORDS_IN_SENT = 4640
SET_LIMIT_SENTENCE = 150

t = Tokenizer(num_words=MAX_WORDS)
t.fit_on_texts(list_X_train)
vocab_size = len(t.word_index) + 1

list_tokenized_train = t.texts_to_sequences(list_X_train)
list_tokenized_test = t.texts_to_sequences(list_X_dev)

In [64]:
X_train_pad = pad_sequences(list_tokenized_train, maxlen=SET_LIMIT_SENTENCE, padding='post')
X_test_pad = pad_sequences(list_tokenized_test, maxlen=SET_LIMIT_SENTENCE, padding='post')

In [None]:
# get the dummy for y values
y = pd.get_dummies(y_train['target']).values
y_test = pd.get_dummies(y_dev['target']).values

In [75]:
sliced_X_train_pad = X_train_pad[:50000]
sliced_y = y[:50000]

In [66]:
# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab):
    # total vocabulary size plus 0 for unknown words
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = np.zeros((vocab_size, 300))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
        try:
            weight_matrix[i] = embedding[word]
        except:
            pass
    return weight_matrix

# get vectors in the right order
embedding_vectors = get_weight_matrix(word_vec, t.word_index)

In [67]:
inp = Input(shape=(X_train_pad.shape[1],), dtype='int64')
#emb = word_vec.get_keras_embedding()(inp)
emb = Embedding(vocab_size, EMBED_SIZE, weights=[embedding_vectors])(inp)
conv_filters = 100

# Specify each convolution layer and their kernel siz i.e. n-grams 
conv1_1 = Conv1D(filters=conv_filters, kernel_size=3, activation='relu')(emb)
#btch1_1 = BatchNormalization()(conv1_1)
#drp1_1  = Dropout(0.2)(btch1_1)
glmp1_1 = GlobalMaxPooling1D()(conv1_1)

conv1_2 = Conv1D(filters=conv_filters, kernel_size=4, activation='relu')(emb)
#btch1_2 = BatchNormalization()(conv1_2)
#drp1_2  = Dropout(0.2)(btch1_2)
#actv1_2 = Activation('relu')(drp1_2)
glmp1_2 = GlobalMaxPooling1D()(conv1_2)

conv1_3 = Conv1D(filters=conv_filters, kernel_size=5, activation='relu')(emb)
#btch1_3 = BatchNormalization()(conv1_3)
#drp1_3  = Dropout(0.2)(btch1_3)
#actv1_3 = Activation('relu')(drp1_3)
glmp1_3 = GlobalMaxPooling1D()(conv1_3)

# Gather all convolution layers
cnct = concatenate([glmp1_1, glmp1_2, glmp1_3], axis=1)
#flatten = Flatten()(cnct)
drp1 = Dropout(0.5)(cnct)

dns1  = Dense(100, activation='relu')(drp1)
#btch1 = BatchNormalization()(dns1)
#drp2  = Dropout(0.2)(btch1)

out = Dense(y.shape[1], activation='softmax')(dns1)

In [68]:
model_1 = Model(inputs=inp, outputs=out)
model_1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_1.summary())

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 150, 300)     73851900    input_3[0][0]                    
__________________________________________________________________________________________________
conv1d_7 (Conv1D)               (None, 148, 100)     90100       embedding_3[0][0]                
__________________________________________________________________________________________________
conv1d_8 (Conv1D)               (None, 147, 100)     120100      embedding_3[0][0]                
____________________________________________________________________________________________

In [76]:
history_1 = model_1.fit(sliced_X_train_pad, sliced_y, validation_split=0.1, verbose=1, epochs=1, batch_size=50, shuffle=True)

Train on 45000 samples, validate on 5000 samples
Epoch 1/1


Dev model using the split validation

In [77]:
y_hat = model_1.predict(X_test_pad)

In [86]:
y_hat

array([[9.99886513e-01, 1.13512004e-04],
       [6.58603339e-03, 9.93413985e-01],
       [9.99992490e-01, 7.46752630e-06],
       ...,
       [9.99996066e-01, 3.89573006e-06],
       [9.27640258e-08, 9.99999881e-01],
       [9.99997020e-01, 3.00331840e-06]], dtype=float32)

In [82]:
accuracy_score(list(map(lambda x: np.argmax(x), y_test)), list(map(lambda x: np.argmax(x), y_hat)))

0.981565172403299

Try on all twitter 

In [83]:
# get the test data on twitter
data_test = pd.read_csv('sentiment_tweets3.csv')

In [85]:
data_test = data_test[['message', 'label']]
data_test.drop(10313)

Unnamed: 0,message,label
0,just had a real good moment. i missssssssss hi...,0
1,is reading manga http://plurk.com/p/mzp1e,0
2,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,@lapcat Need to send 'em to my accountant tomo...,0
4,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0
...,...,...
10308,Many sufferers of depression aren't sad; they ...,1
10309,No Depression by G Herbo is my mood from now o...,1
10310,What do you do when depression succumbs the br...,1
10311,Ketamine Nasal Spray Shows Promise Against Dep...,1


In [100]:
list_X_twit_test = list(data_test['message'].values)
list_tokenized_twit_test = t.texts_to_sequences(list_X_twit_test)
X_twit_test = pad_sequences(list_tokenized_twit_test, maxlen=SET_LIMIT_SENTENCE, padding='post')

In [103]:
y_twit_test =  pd.get_dummies(data_test['label']).values

In [104]:
y_hat_twit = model_1.predict(X_twit_test)
accuracy_score(list(map(lambda x: np.argmax(x), y_twit_test)), list(map(lambda x: np.argmax(x), y_hat_twit)))

0.9623812293969362

In [132]:
performance(y_twit_test, y_hat_twit)

----------------------------------------
Accuracy:  0.9623812293969362


Confusion Matrix: 
 [[7740  260]
 [ 128 2186]]


Precision:  0.8937040065412919
Recall:  0.9446845289541919
Area under ROC curve:  0.9560922644770959
----------------------------------------


### Design the CNN Multi Channel Kim Yoon

In [None]:
####################### Channel ONE ##################################
# channel 1 (CBOW)
inputs1 = Input(shape=(X_train_pad.shape[1],))
embedding1 = Embedding(vocab_size, EMBED_SIZE)(inputs1)
conv1 = Conv1D(filters=conv_filters, kernel_size=4, activation='relu')(embedding1)
drop1 = Dropout(0.5)(conv1)
pool1 = GlobalMaxPooling1D()(drop1)

conv1_2 = Conv1D(filters=conv_filters, kernel_size=5, activation='relu')(embedding1)
drop1_2 = Dropout(0.5)(conv1_2)
glmp1_2 = GlobalMaxPooling1D()(drop1_2)

conv1_3 = Conv1D(filters=conv_filters, kernel_size=6, activation='relu')(embedding1)
drop1_3 = Dropout(0.5)(conv1_3)
glmp1_3 = GlobalMaxPooling1D()(drop1_3)

# Gather all convolution layers
cnct = concatenate([glmp1_1, glmp1_2, glmp1_3], axis=1)

####################### Channel TWO ##################################
# channel 2 (SKIP-GRAM)
inputs1 = Input(shape=(X_train_pad.shape[1],))
embedding1 = Embedding(vocab_size, EMBED_SIZE)(inputs1)
conv1 = Conv1D(filters=conv_filters, kernel_size=4, activation='relu')(embedding1)
drop1 = Dropout(0.5)(conv1)
pool1 = GlobalMaxPooling1D()(drop1)

conv1_2 = Conv1D(filters=conv_filters, kernel_size=5, activation='relu')(embedding1)
drop1_2 = Dropout(0.5)(conv1_2)
glmp1_2 = GlobalMaxPooling1D()(drop1_2)

conv1_3 = Conv1D(filters=conv_filters, kernel_size=6, activation='relu')(embedding1)
drop1_3 = Dropout(0.5)(conv1_3)
glmp1_3 = GlobalMaxPooling1D()(drop1_3)

# Gather all convolution layers
cnct = concatenate([glmp1_1, glmp1_2, glmp1_3], axis=1)


# merge
merged = concatenate([flat1, flat2])
# interpretation
dense1 = Dense(10, activation='relu')(merged)
outputs = Dense(1, activation='sigmoid')(dense1)
model = Model(inputs=[inputs1, inputs2, inputs3], outputs=outputs)
# compile
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summarize
print(model.summary())

In [None]:
model_2 = Model(inputs=inp, outputs=out)
model_2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_2.summary())