In [1]:
import numpy as np
import pandas as pd

import gensim
from keras import backend as K
from keras.models import Sequential, Model, load_model
from keras.layers import Input, Dense, CuDNNLSTM, Embedding, Bidirectional
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
df = pd.read_csv('D:/Datasets/mc-sent-2/dataset/hm_train.csv', low_memory=False)
df.cleaned_hm = df.cleaned_hm.apply(str.lower)
df.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence,predicted_category
0,27673,24h,i went on a successful date with someone i fel...,1,affection
1,27674,24h,i was happy when my son got 90% marks in his e...,1,affection
2,27675,24h,i went to the gym this morning and did yoga.,1,exercise
3,27676,24h,we had a serious talk with some friends of our...,2,bonding
4,27677,24h,i went with grandchildren to butterfly display...,1,affection


In [3]:
labels = df.predicted_category
df.drop(['reflection_period', 'num_sentence', 'predicted_category'], axis=1, inplace=True)
classes = sorted(labels.unique())

In [4]:
classes

['achievement',
 'affection',
 'bonding',
 'enjoy_the_moment',
 'exercise',
 'leisure',
 'nature']

In [5]:
df_train, df_val, y_train, y_val = train_test_split(df, labels, test_size=0.2, random_state=7)

In [6]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train.cleaned_hm)
num_words = len(tokenizer.word_index) + 1

In [7]:
encoded_train_set = tokenizer.texts_to_sequences(df_train.cleaned_hm)
len(encoded_train_set)

48256

In [8]:
df_train['tokens'] = encoded_train_set
df_train.drop(['cleaned_hm'], axis=1, inplace=True)
df_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,hmid,tokens
13715,41471,"[674, 777, 4837, 9, 1775]"
571,28244,"[1, 23, 5, 404, 12, 2, 210]"
49217,77149,"[183, 2, 4213, 185]"
7125,34848,"[1, 66, 3, 528, 713, 8, 3, 307, 3941, 81, 432,..."
47654,75580,"[1, 380, 78, 3, 38, 16, 6, 3224, 13, 1, 35, 65..."


In [9]:
y_train.head()

13715    achievement
571        affection
49217      affection
7125     achievement
47654        bonding
Name: predicted_category, dtype: object

Let maximum sequence length = 100 words
<br>
Zero-pad the remaining sentence

In [10]:
max_len = 100
padded = pad_sequences(encoded_train_set, maxlen=max_len, padding='post')
trainset = [list(doc) for doc in padded]
df_train['tokens'] = trainset
df_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,hmid,tokens
13715,41471,"[674, 777, 4837, 9, 1775, 0, 0, 0, 0, 0, 0, 0,..."
571,28244,"[1, 23, 5, 404, 12, 2, 210, 0, 0, 0, 0, 0, 0, ..."
49217,77149,"[183, 2, 4213, 185, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
7125,34848,"[1, 66, 3, 528, 713, 8, 3, 307, 3941, 81, 432,..."
47654,75580,"[1, 380, 78, 3, 38, 16, 6, 3224, 13, 1, 35, 65..."


### Prepare validation data

In [11]:
encoded_val_set = tokenizer.texts_to_sequences(df_val.cleaned_hm)
len(encoded_val_set)

12065

In [12]:
df_val['tokens'] = encoded_val_set
df_val.drop(['cleaned_hm'], axis=1, inplace=True)
df_val.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,hmid,tokens
19748,47530,"[1, 66, 3, 160, 1113, 365, 16, 411, 54]"
26811,54629,"[1, 17, 5, 685, 2734, 6, 2072]"
52605,80546,"[1, 91, 2, 130, 3, 27, 4123]"
7848,35577,"[518, 42, 59, 19, 2903, 13, 1, 19, 74, 85, 42,..."
3435,31130,"[2, 232, 7, 613, 8, 6, 328]"


In [13]:
padded_val = pad_sequences(encoded_val_set, maxlen=max_len, padding='post')
valset = [list(doc) for doc in padded_val]
df_val['tokens'] = valset;
df_val.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,hmid,tokens
19748,47530,"[1, 66, 3, 160, 1113, 365, 16, 411, 54, 0, 0, ..."
26811,54629,"[1, 17, 5, 685, 2734, 6, 2072, 0, 0, 0, 0, 0, ..."
52605,80546,"[1, 91, 2, 130, 3, 27, 4123, 0, 0, 0, 0, 0, 0,..."
7848,35577,"[518, 42, 59, 19, 2903, 13, 1, 19, 74, 85, 42,..."
3435,31130,"[2, 232, 7, 613, 8, 6, 328, 0, 0, 0, 0, 0, 0, ..."


In [14]:
inputs = Input(shape=(max_len,), dtype='int32')
embedding = Embedding(num_words, 200, input_length=max_len, trainable=True)(inputs)
x = CuDNNLSTM(256, return_sequences=True)(embedding)
x = CuDNNLSTM(64)(x)
x = Dense(64, activation='relu')(x)
outputs = Dense(7, activation='softmax')(x)
model = Model(inputs, outputs)

In [15]:
x_train = np.array([np.array(x) for x in df_train.tokens])
x_val = np.array([np.array(x) for x in df_val.tokens])
print(x_train.shape, x_val.shape)

label_to_ohv = dict()
for i, cls in enumerate(classes):
    ohv = np.zeros((7), dtype='int8')
    ohv[i] = 1
    label_to_ohv[cls] = tuple(ohv)
    
ohv_to_label = dict()
for k, v in label_to_ohv.items():
    ohv_to_label[v] = k

y_train = np.array([np.array(label_to_ohv[label]) for label in y_train])
y_val = np.array([np.array(label_to_ohv[label]) for label in y_val])
print(y_train.shape, y_val.shape)

(48256, 100) (12065, 100)
(48256, 7) (12065, 7)


In [16]:
checkpoint = ModelCheckpoint('D:/Datasets/mc-sent-2/embedding_v1.h5', save_best_only=True, monitor='val_categorical_accuracy', mode='max')
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 200)          3885200   
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (None, 100, 256)          468992    
_________________________________________________________________
cu_dnnlstm_2 (CuDNNLSTM)     (None, 64)                82432     
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 455       
Total params: 4,441,239
Trainable params: 4,441,239
Non-trainable params: 0
_________________________________________________________________


In [17]:
model.fit(x_train, y_train,
          validation_data=(x_val, y_val),
          callbacks=[checkpoint],
          epochs=12,
          verbose=1)

Train on 48256 samples, validate on 12065 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x1ee4afe69e8>

## Testing
Load latest checkpoint model

In [18]:
model = load_model('D:/Datasets/mc-sent-2/embedding_v1.h5')

In [19]:
df_test = pd.read_csv('D:/Datasets/mc-sent-2/dataset/hm_test.csv', low_memory=False)
df_test.drop(['reflection_period', 'num_sentence'], axis=1, inplace=True)
df_test.cleaned_hm = df_test.cleaned_hm.apply(str.lower)
df_test.head()

Unnamed: 0,hmid,cleaned_hm
0,88305,i spent the weekend in chicago with my friends.
1,88306,we moved back into our house after a remodel. ...
2,88307,my fiance proposed to me in front of my family...
3,88308,i ate lobster at a fancy restaurant with some ...
4,88309,i went out to a nice restaurant on a date with...


In [20]:
encoded_test_set = tokenizer.texts_to_sequences(df_test.cleaned_hm)
len(encoded_test_set)

40213

In [21]:
df_test['tokens'] = encoded_test_set
df_test.drop(['cleaned_hm'], axis=1, inplace=True)
df_test.head()

Unnamed: 0,hmid,tokens
0,88305,"[1, 207, 6, 178, 9, 1909, 12, 2, 48]"
1,88306,"[21, 435, 105, 145, 52, 117, 44, 3, 4923, 21, ..."
2,88307,"[2, 709, 1799, 5, 10, 9, 594, 11, 2, 50, 9, 6,..."
3,88308,"[1, 165, 4752, 20, 3, 1563, 251, 12, 42, 48]"
4,88309,"[1, 23, 29, 5, 3, 87, 251, 16, 3, 327, 12, 2, ..."


In [22]:
padded_test = pad_sequences(encoded_test_set, maxlen=max_len, padding='post')
testset = [list(doc) for doc in padded_test]
df_test['tokens'] = testset;
df_test.head()

Unnamed: 0,hmid,tokens
0,88305,"[1, 207, 6, 178, 9, 1909, 12, 2, 48, 0, 0, 0, ..."
1,88306,"[21, 435, 105, 145, 52, 117, 44, 3, 4923, 21, ..."
2,88307,"[2, 709, 1799, 5, 10, 9, 594, 11, 2, 50, 9, 6,..."
3,88308,"[1, 165, 4752, 20, 3, 1563, 251, 12, 42, 48, 0..."
4,88309,"[1, 23, 29, 5, 3, 87, 251, 16, 3, 327, 12, 2, ..."


In [23]:
x_test = np.array([np.array(x) for x in df_test.tokens])
print(x_test.shape)

(40213, 100)


In [24]:
preds = model.predict(x_test, batch_size=32, verbose=2)
preds.shape

(40213, 7)

In [25]:
ohvs = []
for pred in preds:
    ohv = np.zeros((7), dtype='int8')
    ohv[np.argmax(pred)] = 1
    ohvs.append(ohv)

In [27]:
predictions = [ohv_to_label[tuple(vec)] for vec in ohvs]
df_test['predicted_category'] = predictions

In [30]:
df_test.drop(['tokens'], axis=1, inplace=True)
df_test.head()

Unnamed: 0,hmid,predicted_category
0,88305,bonding
1,88306,achievement
2,88307,affection
3,88308,bonding
4,88309,affection


In [31]:
df_test.to_csv('D:/Datasets/mc-sent-2/sub_2.csv', index=False)