In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# split
from sklearn.model_selection import StratifiedShuffleSplit

# from spam
from collections import Counter

# word embedding
from gensim.models import Word2Vec
import multiprocessing

# count vectorizer
from sklearn.feature_extraction.text import CountVectorizer

# keras
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers.merge import concatenate
from keras.layers import Dense, GlobalMaxPooling1D, Activation, Dropout, Embedding, Input, Conv1D
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# score
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix, roc_auc_score

In [2]:
data = pd.read_csv('../X_train.csv')

In [3]:
data.dropna(inplace = True)

In [18]:
sentences = list(data['tokenized_text'].apply(lambda x: x.split(' ')))

### Train a word embedding using CBOW

In [20]:
EMBED_DIM = 300
emb = Word2Vec(sentences, size=EMBED_DIM, window=3, 
               min_count=3, negative=15, iter=1, 
               workers=multiprocessing.cpu_count())
# get the word vector
word_vec = emb.wv

KeyboardInterrupt: 

In [None]:
print(emb)

In [None]:
emb.save('../CBOW300.bin')

In [3]:
# load model
new_model = Word2Vec.load('../CBOW300.bin')
print(new_model)

Word2Vec(vocab=95581, size=300, alpha=0.025)


In [None]:
EMBED_DIMS = [200, 400, 600, 800, 1000]
for dim in EMBED_DIMS:
    emb = Word2Vec(sentences, size=dim, window=3, 
                   min_count=3, negative=15, iter=1, 
                   workers=multiprocessing.cpu_count())
    emb.save(f'../CBOW{dim}.bin')

### Train word embedding using skip-gram

In [73]:
EMBED_DIM = 300
emb = Word2Vec(sentences, size=EMBED_DIM, window=3, 
               min_count=3, negative=15, iter=1, sg=1, 
               workers=multiprocessing.cpu_count())
# get the word vector
word_vec = emb.wv

In [None]:
print(emb)

In [74]:
emb.save('../SKIP-GRAM300.bin')

In [4]:
# load model
new_model_2 = Word2Vec.load('../CBOW300.bin')
print(new_model_2)

Word2Vec(vocab=95581, size=300, alpha=0.025)


### Split the train to train and dev

In [4]:
y = pd.DataFrame(data['mental_state'])
X = data[['tokenized_text']].copy()

In [5]:
X.reset_index(inplace = True)
y.reset_index(inplace = True)

In [6]:
X.drop(labels='index', axis=1, inplace = True)
y.drop(labels='index', axis=1, inplace = True)

Source: https://www.kaggle.com/danielsafai/cnn-implementation-of-yoon-kim-s-model

In [7]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
sss.get_n_splits(X, y)
for train_index, test_index in sss.split(X, y):
    X_train, X_dev = X.loc[train_index, :], X.loc[test_index, :]
    y_train, y_dev = y.loc[train_index], y.loc[test_index]

### Performance Template

In [8]:
def performance(y_true, y_hat):
    
    y_true = list(map(lambda x: np.argmax(x), y_true))
    y_hat = list(map(lambda x: np.argmax(x), y_hat))
    
    print('-'*40)
    # accuracy
    print('Accuracy: ', accuracy_score(y_true,y_hat))
    # confusion matrix
    print('\n')
    print('Confusion Matrix: \n', confusion_matrix(y_true,y_hat))
    print('\n')
    # precision score of the model 
    print('Precision: ', precision_score(y_true, y_hat))
    # recall score of the model 
    print('Recall: ', recall_score(y_true, y_hat))
    # area under the ROC curve
    print('Area under ROC curve: ', roc_auc_score(y_true, y_hat))
    print('-'*40)


### Train Preparation

In [10]:
#CBOW
word_vec = new_model.wv
#SKIP GRAM
word_vec_2 = new_model_2.wv

In [9]:
list_X_train = list(X_train['tokenized_text'].values)
list_X_dev = list(X_dev['tokenized_text'].values)

In [10]:
# set the parameters
EMBED_SIZE = 300
#MAX_WORDS = 85971
#MAX_WORDS_IN_SENT = 4640
SET_LIMIT_SENTENCE = 150

t = Tokenizer()
t.fit_on_texts(list_X_train)
vocab_size = len(t.word_index) + 1

list_tokenized_train = t.texts_to_sequences(list_X_train)
list_tokenized_test = t.texts_to_sequences(list_X_dev)

In [11]:
X_train_pad = pad_sequences(list_tokenized_train, maxlen=SET_LIMIT_SENTENCE, padding='post')
X_test_pad = pad_sequences(list_tokenized_test, maxlen=SET_LIMIT_SENTENCE, padding='post')

In [12]:
# get the dummy for y values
y = pd.get_dummies(y_train['mental_state']).values
y_test = pd.get_dummies(y_dev['mental_state']).values

In [13]:
sliced_X_train_pad = np.concatenate((X_train_pad[:50000], X_train_pad[-50000:]))
sliced_y = np.concatenate((y[:50000], y[-50000:]))

### Open Glove 

In [40]:
import zipfile
with zipfile.ZipFile('../glove.42B.300d.zip', 'r') as zip_ref:
    zip_ref.extractall('../glove.42B.300d.txt')

In [14]:
embedding_vector = {}
f = open('../glove.42B.300d.txt/glove.42B.300d.txt')

# get the mapping words and coefficient here
for line in tqdm(f):
    value = line.split(' ')
    word = value[0]
    coef = np.array(value[1:],dtype = 'float32')
    embedding_vector[word] = coef

# create the embedding matrix
embedding_matrix = np.zeros((vocab_size,300))
# check if the word exist in tokenizer
for word,i in tqdm(t.word_index.items()):
    embedding_value = embedding_vector.get(word)
    # some words may not exist in Glove
    if embedding_value is not None:
        embedding_matrix[i] = embedding_value

1917495it [02:59, 10688.46it/s]
100%|██████████| 242825/242825 [00:01<00:00, 210762.44it/s]


### This is for CBOW and Skip-gram

In [17]:
# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab):
    # total vocabulary size plus 0 for unknown words
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = np.zeros((vocab_size, 300))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
        try:
            weight_matrix[i] = embedding[word]
        except:
            pass
    return weight_matrix

# get vectors in the right order
embedding_vectors = get_weight_matrix(word_vec, t.word_index)
embedding_vectors_2 = get_weight_matrix(word_vec_2, t.word_index)

### Design the CNN Non-Static Kim Yoon

In [16]:
inp = Input(shape=(X_train_pad.shape[1],), dtype='int64')

# set the embedding layer (use Glove)
emb = Embedding(vocab_size, EMBED_SIZE, weights=[embedding_matrix], trainable=False)(inp)
conv_filters = 10

# Specify each convolution layer and their kernel size i.e. n-grams 
conv1_1 = Conv1D(filters=conv_filters, kernel_size=3, activation='relu')(emb)
glmp1_1 = GlobalMaxPooling1D()(conv1_1)

conv1_2 = Conv1D(filters=conv_filters, kernel_size=4, activation='relu')(emb)
glmp1_2 = GlobalMaxPooling1D()(conv1_2)

conv1_3 = Conv1D(filters=conv_filters, kernel_size=5, activation='relu')(emb)
glmp1_3 = GlobalMaxPooling1D()(conv1_3)

# Gather all convolution layers
cnct = concatenate([glmp1_1, glmp1_2, glmp1_3], axis=1)
drp1 = Dropout(0.5)(cnct)

# fully connected layer
dns1  = Dense(15, activation='relu')(drp1)
drp_last  = Dropout(0.2)(dns1)

# softmax
out = Dense(y.shape[1], activation='softmax', kernel_regularizer='l2')(drp_last)

In [17]:
model_1 = Model(inputs=inp, outputs=out)
model_1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_1.summary())

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 150)]        0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 150, 300)     72847800    input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_3 (Conv1D)               (None, 148, 10)      9010        embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 147, 10)      12010       embedding_1[0][0]                
______________________________________________________________________________________________

In [None]:
history_1 = model_1.fit(sliced_X_train_pad, sliced_y, validation_data=(X_test_pad, y_test), verbose=2, 
                        epochs=1, batch_size=32, shuffle=True)


In [21]:
sliced_X_train_pad.shape

(50000, 150)

### Result

Predict model using validation set

In [35]:
y_hat = model_1.predict(X_test_pad)

In [86]:
y_hat

array([[9.99886513e-01, 1.13512004e-04],
       [6.58603339e-03, 9.93413985e-01],
       [9.99992490e-01, 7.46752630e-06],
       ...,
       [9.99996066e-01, 3.89573006e-06],
       [9.27640258e-08, 9.99999881e-01],
       [9.99997020e-01, 3.00331840e-06]], dtype=float32)

In [36]:
performance(y_test, y_hat)

----------------------------------------
Accuracy:  0.9804236337902174


Confusion Matrix: 
 [[76932  2211]
 [  984 83080]]


Precision:  0.974076983503535
Recall:  0.9882946326608298
Area under ROC curve:  0.980178929991762
----------------------------------------


Predict model on test set

In [48]:
# get the test data on twitter
data_test = pd.read_pickle('../twitter_test.pickle')

In [45]:
data_test = data_test[['tokenized_text', 'label']]

In [51]:
data_test.reset_index(inplace = True)

In [55]:
data_test['text'] = data_test['tokenized_text'].apply(lambda x: " ".join(x))

In [57]:
list_X_twit_test = list(data_test['text'].values)
list_tokenized_twit_test = t.texts_to_sequences(list_X_twit_test)
X_twit_test = pad_sequences(list_tokenized_twit_test, maxlen=SET_LIMIT_SENTENCE, padding='post')

In [58]:
y_twit_test =  pd.get_dummies(data_test['label']).values

In [59]:
y_hat_twit = model_1.predict(X_twit_test)
#accuracy_score(list(map(lambda x: np.argmax(x), y_twit_test)), list(map(lambda x: np.argmax(x), y_hat_twit)))

In [60]:
performance(y_twit_test, y_hat_twit)

----------------------------------------
Accuracy:  0.9629593716668283


Confusion Matrix: 
 [[7765  235]
 [ 147 2166]]


Precision:  0.9021241149521033
Recall:  0.9364461738002594
Area under ROC curve:  0.9535355869001296
----------------------------------------


### Design the CNN Multi Channel Kim Yoon

In [20]:
conv_filters = 100

In [21]:
####################### Channel ONE ##################################
# channel 1 (CBOW)
inputs1 = Input(shape=(X_train_pad.shape[1],))
embedding1 = Embedding(vocab_size, EMBED_SIZE, weights=[embedding_vectors])(inputs1)
conv1_1 = Conv1D(filters=conv_filters, kernel_size=3, activation='relu')(embedding1)
drop1_1 = Dropout(0.5)(conv1_1)
glmp1_1 = GlobalMaxPooling1D()(drop1_1)

conv1_2 = Conv1D(filters=conv_filters, kernel_size=4, activation='relu')(embedding1)
drop1_2 = Dropout(0.5)(conv1_2)
glmp1_2 = GlobalMaxPooling1D()(drop1_2)

conv1_3 = Conv1D(filters=conv_filters, kernel_size=5, activation='relu')(embedding1)
drop1_3 = Dropout(0.5)(conv1_3)
glmp1_3 = GlobalMaxPooling1D()(drop1_3)

# Gather all convolution layers
cnct_1 = concatenate([glmp1_1, glmp1_2, glmp1_3], axis=1)
drp1 = Dropout(0.5)(cnct_1)

####################### Channel TWO ##################################
# channel 2 (SKIP-GRAM)
inputs2 = Input(shape=(X_train_pad.shape[1],))
embedding2 = Embedding(vocab_size, EMBED_SIZE, weights=[embedding_vectors_2])(inputs2)
conv2_1 = Conv1D(filters=conv_filters, kernel_size=3, activation='relu')(embedding2)
drop2_1 = Dropout(0.5)(conv2_1)
glmp2_1 = GlobalMaxPooling1D()(drop2_1)

conv2_2 = Conv1D(filters=conv_filters, kernel_size=4, activation='relu')(embedding2)
drop2_2 = Dropout(0.5)(conv2_2)
glmp2_2 = GlobalMaxPooling1D()(drop2_2)

conv2_3 = Conv1D(filters=conv_filters, kernel_size=5, activation='relu')(embedding2)
drop2_3 = Dropout(0.5)(conv2_3)
glmp2_3 = GlobalMaxPooling1D()(drop2_3)

# Gather all convolution layers
cnct_2 = concatenate([glmp2_1, glmp2_2, glmp2_3], axis=1)
drp2 = Dropout(0.5)(cnct_2)

#################### Combine both channel ##############################
# merge
merged = concatenate([drp1, drp2])

# interpretation
dense1 = Dense(100, activation='relu')(merged)
drp_last  = Dropout(0.2)(dense1)
outputs = Dense(y.shape[1], activation='softmax', kernel_regularizer='l2')(drp_last)

In [22]:
model_2 = Model(inputs=[inputs1, inputs2], outputs=outputs)
model_2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_2.summary())

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 150)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 150)]        0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 150, 300)     83672700    input_2[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 150, 300)     83672700    input_3[0][0]                    
____________________________________________________________________________________________

In [None]:
history_2 = model_2.fit([sliced_X_train_pad,sliced_X_train_pad] , sliced_y, 
                        validation_data=([X_test_pad, X_test_pad], y_test), verbose=1, 
                        epochs=1, batch_size=64, shuffle=True)

### Result

Predict on extra validation set

In [None]:
y_hat = model_2.predict(X_test_pad)
performance(y_test, y_hat)

Predict on test dataset

In [None]:
y_hat_twit = model_1.predict(X_twit_test)
performance(y_twit_test, y_hat_twit)