In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.sequence import pad_sequences
import operator
from sklearn.model_selection import train_test_split
from glove import Corpus, Glove
from keras.models import Sequential
from keras import layers
from keras.layers import Embedding

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# read the dataset 
data = pd.read_csv("dataset.csv")
print(data.shape)
print(data.head())

(10560, 12)
    hmid                                             moment          concepts  \
0  27674  I was happy when my son got 90% marks in his e...  education|family   
1  27685          went to movies with my friends it was fun     entertainment   
2  27691  A hot kiss with my girl friend last night made...           romance   
3  27701  My son woke me up to a fantastic breakfast of ...       family|food   
4  27712  My older daughter keeps patting my younger dau...            family   

  agency social   age country gender  married parenthood reflection  \
0     no    yes  29.0     IND      m  married          y        24h   
1    yes    yes  29.0     IND      m   single          y        24h   
2    yes    yes  25.0     IND      m  married          y        24h   
3     no    yes    79     USA      f  widowed          y        24h   
4     no    yes    30     USA      f  married          y        24h   

                      duration  
0                   half_a_day  
1       

In [3]:
# Get Moment, agency and social
target_data = data[['moment', 'agency', 'social']]
print(target_data)

                                                  moment agency social
0      I was happy when my son got 90% marks in his e...     no    yes
1              went to movies with my friends it was fun    yes    yes
2      A hot kiss with my girl friend last night made...    yes    yes
3      My son woke me up to a fantastic breakfast of ...     no    yes
4      My older daughter keeps patting my younger dau...     no    yes
5          I cooked my girlfriend a wonderful breakfast.    yes    yes
6         My Mother gave me a surprise visit at my home.     no    yes
7      There was hardly any traffic on my way to work...    yes     no
8                     I came to my office at right time.    yes     no
9      The day I got my degree in industrial engineering    yes     no
10     I went to office hour of one of my professors,...    yes    yes
11     We all ladies member from my family went for a...    yes    yes
12     When my wife came home from work and we shared...    yes    yes
13    

In [4]:
# Data Analysis
global agency
global social
agency = list(target_data['agency'])
moments = list(target_data['moment'])
social = list(target_data['social'])
yy = 0
yn = 0
nn = 0
ny = 0
# print("Len: ",len(moments))
for i in range(len(moments)):
    if agency[i] == 'yes' and social[i] =='yes':
        yy += 1
    elif agency[i] == 'yes' and social[i] == 'no':
        yn += 1
    elif agency[i] == 'no' and social[i] =='yes':
        ny += 1
    else:
        nn += 1

print("------------------ Social -------------------")
print("               YES        NO      SUM")
print("Agency YES    ",yy,"     ",yn,"  ",yy+yn)
print("       NO     ",ny,"     ",nn,"   ",ny+nn)
print("       SUM    ",yy+ny,"     ",yn+nn,"  ",yy+nn+ny+yn)


------------------ Social -------------------
               YES        NO      SUM
Agency YES     3554       4242    7796
       NO      2071       693     2764
       SUM     5625       4935    10560


In [5]:
# Observation: Data is positive in high proportion although there is an imbalance as more positive data
# for social than for agency. This might result in poor accuracy while predicting the label agency.

In [6]:
from nltk.tokenize import RegexpTokenizer
import statistics
def tokenizer(sentence):
    tokenizer = RegexpTokenizer('[a-zA-Z0-9\']+')
    words = tokenizer.tokenize(sentence)
    return words

In [7]:
from nltk.corpus import stopwords
def stopWordRemoval(words):
    filtered_data = []
    stop_words = set(stopwords.words('english'))
    for word in words:
        if not word in stop_words:
            if word.isnumeric() or word.isalpha():
                filtered_data.append(word)
    return filtered_data
    

In [8]:
# Data Pre-processing
global sentence_list
global sequenced_vocab
global sentences
global vocab
# 1. Split the sentences in moments into words (using regex tokenizer)
vocab = list()
sequenced_vocab = dict()
index = 1
sentence_list = list()
seq_list = list()
sentences = list()

for i in range(len(moments)):
    moments[i] = moments[i].lower()
    bag_of_words = tokenizer(moments[i])
    #remove stop words
#     bag_of_words = stopWordRemoval(bag_of_words)
    sentence_list.append(bag_of_words)
    vocab += bag_of_words

# sentences = sentence_list

# Form sequence dictionary and convert each sentence into a number sequence 
for word in vocab:
    if word not in sequenced_vocab.keys():
        sequenced_vocab[word] = index
        index += 1
print("Unique words in Vocabulary : ",len(sequenced_vocab))
# print("Sequenced Vocab : ")
# print(sequenced_vocab)

# Avg length of each sentence
sentence_length = [ len(sentence_list[i]) for i in range(len(sentence_list))]
sentence_length.sort()
print("------------------ SENTENCE STATISTICS OF MOMENTS -----------------")
print("Minimum Length : ",sentence_length[0])
print("Maximum Length : ",sentence_length[len(sentence_length)-1])
print("Average Length : ",round(sum(sentence_length)/len(sentence_length)))
print("Median Length  : ", round(statistics.median(sentence_length)))
# print(sentence_list)

Unique words in Vocabulary :  7604
------------------ SENTENCE STATISTICS OF MOMENTS -----------------
Minimum Length :  2
Maximum Length :  70
Average Length :  13
Median Length  :  12


In [9]:
# 2. Assign a sequence of numbers to each of the sentences 
global sequenced_vocab
global sentence_list
for i in range(len(sentence_list)):
    for word in sentence_list[i]:
        if word in sequenced_vocab.keys():
            seq_list.append(sequenced_vocab[word])
    sentence_list[i] = seq_list
    seq_list = []
    
    
#Printing Sentence Stats
count_len = dict()
for length in sentence_length:
    if length not in count_len.keys():
        count_len[length] = 1
    else:
        count_len[length] += 1

print("Total Sentences : ",len(sentence_list))
print("Length of majority sentences: ",max(count_len.items(), key=operator.itemgetter(1))[0])

# # 3. Pad with 0s if sentence length is not same (at the beginning)
padded_sentence = pad_sequences(sentence_list, maxlen=20)
print(" ================= PADDED SENTENCE =================")
print(padded_sentence[9999])

Total Sentences :  10560
Length of majority sentences:  8
[  0   0   0   0   0   0   0   0   0   0   0   1 337  20 209 338  78   2
 333 736]


In [10]:
#transform agency and social to binary labels
global agency_label
global social_label
agency_label = [ 0 if agency[i] == "no" else 1 for i in range(len(agency))]
social_label = [ 0 if social[i] == "no" else 1 for i in range(len(social))]
# print(agency_label)
# print(social_label)

In [11]:
def createEmbMatrix(vectors, word_dict):
  max_words = len(list(word_dict.values()))
  embedding_matrix = np.zeros((max_words, 100))
  for i in range(max_words):
    embedding_vector = vectors[i]
    embedding_matrix[i] = embedding_vector
  return embedding_matrix

In [12]:
def runCNNModel(sentences_train, y_train, sentences_val, y_val, sentences_test, y_test):
  model = Sequential()
  model.add(Embedding(7920,100,input_length=20))
  model.add(layers.Dropout(0.2))
  model.add(layers.Conv1D(64,3,padding='valid',activation='relu',strides=1))
  model.add(layers.GlobalMaxPooling1D())
  model.add(layers.Dense(256))
  # model.add(layers.Dropout(0.2))
  model.add(layers.Activation('relu'))
  model.add(layers.Dense(1))
  model.add(layers.Activation('sigmoid'))
  model.summary()
  model.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])

  history = model.fit(np.array(sentences_train), np.array(y_train),
                      epochs=10,
                      batch_size=32,
                      verbose=1,
                      validation_data=(np.array(sentences_test), np.array(y_test)))
  loss, accuracy = model.evaluate(np.array(sentences_test), np.array(y_test), verbose=1)
  print("Loss: ", loss)
  print("Accuracy : ",accuracy*100)

In [13]:
def gloveCNNModel(sentences_train, y_train, sentences_val, y_val, sentences_test, y_test, embedding_matrix):
  model = Sequential()
  model.add(Embedding(7605,100,weights=[embedding_matrix],input_length=20,trainable=False))
  model.add(layers.Dropout(0.2))
  model.add(layers.Conv1D(64,3,padding='valid',activation='relu',strides=1))
  model.add(layers.GlobalMaxPooling1D())
  model.add(layers.Dense(256))
#   model.add(layers.Dropout(0.2))
  model.add(layers.Activation('relu'))
  model.add(layers.Dense(1))
  model.add(layers.Activation('sigmoid'))
  model.summary()
  model.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])

  history = model.fit(np.array(sentences_train), np.array(y_train),
                      epochs=10,
                      batch_size=32,
                      verbose=1,
                      validation_data=(np.array(sentences_val), np.array(y_val)))
  loss, accuracy = model.evaluate(np.array(sentences_test), np.array(y_test), verbose=1)
  print("Loss: ", loss)
  print("Accuracy : ",accuracy*100)

In [15]:
#Using Trained Embedding GLoVe (on the sentences) for the Embedding Layer
global sentence_list
sentence_list[0].insert(0,0)
corpus = Corpus()
corpus.fit(sentence_list, window=5)
glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=10, no_threads=4, verbose=False)
# Add the object to the dictionary
glove.save('glove.model')
glove.add_dictionary(corpus.dictionary)
print("VOCAB SIZE: ",len(list(glove.dictionary.keys())))
vocab_size = len(list(glove.dictionary.keys()))
# print(glove.dictionary)

# Glove Embedding Matrix
embedding_matrix = createEmbMatrix(glove.word_vectors, glove.dictionary)
print("Embedding Matrix : ",embedding_matrix.shape)

print("================================ For SOCIAL ==============================")
# Split the data into train and validate
sentences_train,sentences_val,y_train,y_val = train_test_split(
                                                padded_sentence.tolist(), social_label,  
                                                test_size=0.40,  
                                                random_state=1000)
sentences_validate, sentences_test, y_validate, y_test = train_test_split(
                                                sentences_val, y_val,  
                                                test_size=0.20)

print("----------- TRAIN ---------------")
print("Shape: ",len(sentences_train))
print("----------- VALIDATE --------------")
print("Shape: ",len(sentences_test))

# CNN for Social
runCNNModel(sentences_train, y_train, sentences_validate, y_validate, sentences_test, y_test)
print("Using Glove trained on Model ===> ")
gloveCNNModel(sentences_train, y_train,sentences_validate, y_validate, sentences_test, y_test, embedding_matrix)

print("============================= For AGENCY =================================")
sentences_train,sentences_val,y_train,y_val = train_test_split(
                                                padded_sentence.tolist(), agency_label,  
                                                test_size=0.40,  
                                                random_state=1000)
sentences_validate, sentences_test, y_validate, y_test = train_test_split(
                                                sentences_val, y_val,  
                                                test_size=0.20)
print("----------- TRAIN ---------------")
print("Shape: ",len(sentences_train))
print("----------- VALIDATE --------------")
print("Shape: ",len(sentences_test))
runCNNModel(sentences_train, y_train, sentences_validate, y_validate, sentences_test, y_test)
print("Using Glove trained on Model ===> ")
gloveCNNModel(sentences_train, y_train, sentences_validate, y_validate, sentences_test, y_test, embedding_matrix)

VOCAB SIZE:  7605
Embedding Matrix :  (7605, 100)
----------- TRAIN ---------------
Shape:  6336
----------- VALIDATE --------------
Shape:  845
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 100)           792000    
_________________________________________________________________
dropout_2 (Dropout)          (None, 20, 100)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 18, 64)            19264     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 256)               16640     
_________________________________________________________________
activation_3 (Activation)    (None, 256)               0       

Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss:  0.7804342894159125
Accuracy :  83.07692309808449
Using Glove trained on Model ===> 
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 20, 100)           760500    
_________________________________________________________________
dropout_5 (Dropout)          (None, 20, 100)           0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 18, 64)            19264     
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 64)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 256)               16640     
_________________________________________________________________
activation_9 (Activation)    (None, 256)          