In [40]:
from numpy import array
from keras.preprocessing.sequence import pad_sequences
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM
from keras.models import Model
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.models import load_model
from nltk.tokenize import word_tokenize
import pandas as pd 
import numpy as np
import re
import tensorflow as tf
from matplotlib import pyplot as plt
from numpy import array
from numpy import asarray
from numpy import zeros

### Loading Pickle files 

In [41]:
train_df = pd.read_pickle('train.pkl')
train_df.name = 'Train Dataset'
train_df.head()

Unnamed: 0,hmid,moment,social,agency
0,27674,i happy my son got marks his examination,1,0
1,27685,went movies my friends it fun,1,1
2,27691,hot kiss my girl friend last night made my day,1,1
3,27701,my son woke me fantastic breakfast eggs his sp...,1,0
4,27712,my older daughter keeps patting my younger dau...,1,0


In [42]:
test_df = pd.read_pickle('unlabelled_data.pkl')
test_df.name = "Test Dataset"
test_df.head()

Unnamed: 0,hmid,moment
0,27673,i went successful date someone i felt sympathy...
1,27675,i went gym morning yoga
2,27678,i meditated last night
3,27679,i made new recipe peasant bread it came specta...
4,27680,i got gift my elder brother really surprising me


In [43]:
print(test_df.shape)
test_df.tail()

(69833, 2)


Unnamed: 0,hmid,moment
69828,128761,i spent time my daughter
69829,128762,my husband announced he getting decent bonus q...
69830,128763,pepsi drink
69831,128764,cuddling my girlfriend last night
69832,128766,i great workout last night


In [44]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69833 entries, 0 to 69832
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   hmid    69833 non-null  int64 
 1   moment  69833 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.1+ MB


Seperating the X and y features for both the train and test dataset

In [6]:
X_train = train_df['moment'].values
y_train_social = train_df['social'].values
y_train_agency = train_df['agency'].values

In [7]:
X_test = test_df['moment'].values
X_test

array(['i went successful date someone i felt sympathy connection ',
       'i went gym morning yoga ', 'i meditated last night ', ...,
       'pepsi drink ', 'cuddling my girlfriend last night ',
       'i great workout last night '], dtype=object)

In [8]:
X_train.shape, y_train_agency.shape, y_train_social.shape, X_test.shape

((9921,), (9921,), (9921,), (69833,))

#### Some statistical insights about our datasets

The TRAIN dataset

In [9]:
import statistics
length = 0
val = ""
train_avg = []
for sentence in X_train:
    sen_len = len(word_tokenize(sentence))
    train_avg.append(sen_len)
    if sen_len > length:
        length = sen_len
        val = sentence
print(f'Sentence : {val}\nLength :{length}')
print(f'Average Sentence Length : {sum(train_avg) / len(train_avg)}')
print(f'Standard Dev : {statistics.pstdev(train_avg)}') 

Sentence : it my th birthday i slept night i woke birthday morning i found my parents travelled my native my place my husband my kid arranged surprise birthday party i really moved my familys affection love they me made me really happy proud protected 
Length :43
Average Sentence Length : 9.144340288277391
Standard Dev : 4.58961222102695


The TEST dataset

In [10]:
length = 0
val = ""
test_avg = []
for sentence in X_test:
    sen_len = len(word_tokenize(sentence))
    test_avg.append(sen_len)
    if sen_len > length:
        length = sen_len
        val = sentence
print(f'Sentence : {val}\nLength :{length}')
print(f'Average Sentence Length : {sum(test_avg) / len(test_avg)}')

Sentence : morning i coming home shop i saw little boy standing quite looking parents who caring children buying something them little boy watching them i watching him i went him asked hey u little boy who you you standing he replied i lost my parents church i bring boy police station i told boy police all us sudden parents came embresed little boy i felt like something i something good describe me made my day i happy see them happy i really happy see smile face boy he looked me he smiled me gave fly kiss 
Length :95
Average Sentence Length : 9.073060014606275


#### The following code block is used to : 
- Convert texts to sequence
- Then pad these sequences based on the max_len variable which is 25 in our case
- The max_len = 25 variable was choosen based on the criteria that majority of the sentences did fit under this value, which eventually enables us to have a dense matrix.
- Values 9 and 10 were also considered as the max_len val but did not produce promising results. They were choosen based on the fact that the average length of a sentence in this corpora is 9.36 (after text pre-processing)

In [11]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1

maxlen = 25

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [12]:
X_train.shape, X_test.shape

((9921, 25), (69833, 25))

#### Printing the top rows both text and numerical
- This gives us an idea of how the 1st value is converted to vector of 25 dimensions


In [13]:
print(test_df['moment'][0])
print(X_test[0])

i went successful date someone i felt sympathy connection 
[   1    8  690  217  266    1   84 2149    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0]


#### The following code block is used to build an embedding matrix with the following configuration

- Glove Embedding with 300 dimensions

In [14]:
embeddings_dictionary = dict()

glove_file = open('glove.6B/glove.6B.300d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

embedding_matrix = zeros((vocab_size, 300))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

### Method to test the Accuracy, Precision, Recall and F1 Score


The following method transforms the output to binary values before calculating the scores

In [15]:
# def transformResults(predictions):
#     print("""The output here is of the form float, thus we are converting it to binary output based on the following 
# condition if a value is > .5 we assign it with 1 and if a value is <.5 we assign it with 0""")
    
#     print("\nConverting the ouput to 1D array, and then transformimg the values based on the above condition\n")
#     predictions = predictions[:, 0]
#     print(f'First 5 values before conversion : {predictions[:5]}')
    
#     predictions = [1 if val>0.5 else 0 for val in predictions]
#     print(f'First 5 values after the conversion : {predictions[:5]}')
#     return predictions

In [16]:
# def getResults(y_test, model_prediction):
#     model_prediction = transformResults(model_prediction)
#     print("\n========== RESULTS ===========\n")
#     accuracy = accuracy_score(y_test, model_prediction)
#     precision = precision_score(y_test, model_prediction)
#     f1 = f1_score(y_test, model_prediction)
#     recall = recall_score(y_test, model_prediction)
#     print(f'Accuracy : {accuracy}\nPrecision : {precision}\nF1_score : {f1}\nRecall : {recall}\n')
    
#     print("===== CONFUSION MATRIX =====")
#     cf_matrix = confusion_matrix(y_test_social, model_prediction)
#     print(cf_matrix)
    
#     return accuracy, precision, f1, recall

### Building the LSTM model 
with Embedding layer, LSTM layer and Dense layer with a single unit

This will be seperately evaluated for both the social and agency labels

In [17]:
def getLSTMModel():
    deep_inputs = Input(shape=(maxlen,))
    embedding_layer = Embedding(vocab_size, 300, weights=[embedding_matrix], trainable=False)(deep_inputs)
    LSTM_Layer_1 = LSTM(128)(embedding_layer)
    dense_layer_1 = Dense(1, activation='sigmoid')(LSTM_Layer_1)
    model = Model(inputs=deep_inputs, outputs=dense_layer_1)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    print(model.summary())
    return model

In [18]:
model = getLSTMModel()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 25)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 25, 300)           2178900   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               219648    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 2,398,677
Trainable params: 219,777
Non-trainable params: 2,178,900
_________________________________________________________________
None


### Fit the model for the Social label

In [19]:
history = model.fit(X_train, y_train_social, batch_size=128, epochs=10, verbose=1, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [20]:
social_predictions = model.predict(X_test)

In [21]:
social_predictions = social_predictions[:,0]

In [22]:
social_predictions

array([0.10023057, 0.02747902, 0.03188443, ..., 0.0282737 , 0.99396795,
       0.02831995], dtype=float32)

In [23]:
len(social_predictions)

69833

### Fit the model for the Agency Label

In [24]:
history = model.fit(X_train, y_train_agency, batch_size=128, epochs=10, verbose=1, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
agency_predictions = model.predict(X_test)

In [26]:
agency_predictions = agency_predictions[:,0]
agency_predictions

array([0.9788944 , 0.99270964, 0.9705154 , ..., 0.99407226, 0.99158716,
       0.9912778 ], dtype=float32)

In [27]:
len(agency_predictions)

69833

In [28]:
def getBinary(val):
    if val > .5:
        return 'yes'
    else:
        return 'no'

In [29]:
social_predictions = pd.DataFrame(social_predictions, columns=['social'])
social_predictions['social'] = social_predictions['social'].apply(getBinary) 
social_predictions

Unnamed: 0,social
0,no
1,no
2,no
3,no
4,yes
...,...
69828,yes
69829,yes
69830,no
69831,yes


In [30]:
agency_predictions = pd.DataFrame(agency_predictions, columns=['agency'])
agency_predictions['agency'] = agency_predictions['agency'].apply(getBinary)
agency_predictions

Unnamed: 0,agency
0,yes
1,yes
2,yes
3,yes
4,no
...,...
69828,yes
69829,no
69830,yes
69831,yes


In [31]:
results = pd.read_pickle('unlabelled_data.pkl')

In [32]:
print(results.shape)
results

(69833, 2)


Unnamed: 0,hmid,moment
0,27673,i went successful date someone i felt sympathy...
1,27675,i went gym morning yoga
2,27678,i meditated last night
3,27679,i made new recipe peasant bread it came specta...
4,27680,i got gift my elder brother really surprising me
...,...,...
69828,128761,i spent time my daughter
69829,128762,my husband announced he getting decent bonus q...
69830,128763,pepsi drink
69831,128764,cuddling my girlfriend last night


In [33]:
results['social'] = social_predictions['social']
results

Unnamed: 0,hmid,moment,social
0,27673,i went successful date someone i felt sympathy...,no
1,27675,i went gym morning yoga,no
2,27678,i meditated last night,no
3,27679,i made new recipe peasant bread it came specta...,no
4,27680,i got gift my elder brother really surprising me,yes
...,...,...,...
69828,128761,i spent time my daughter,yes
69829,128762,my husband announced he getting decent bonus q...,yes
69830,128763,pepsi drink,no
69831,128764,cuddling my girlfriend last night,yes


In [34]:
results['agency'] = agency_predictions['agency']
results

Unnamed: 0,hmid,moment,social,agency
0,27673,i went successful date someone i felt sympathy...,no,yes
1,27675,i went gym morning yoga,no,yes
2,27678,i meditated last night,no,yes
3,27679,i made new recipe peasant bread it came specta...,no,yes
4,27680,i got gift my elder brother really surprising me,yes,no
...,...,...,...,...
69828,128761,i spent time my daughter,yes,yes
69829,128762,my husband announced he getting decent bonus q...,yes,no
69830,128763,pepsi drink,no,yes
69831,128764,cuddling my girlfriend last night,yes,yes


### Converting the output 'yes' and 'no' to binary values

In [47]:
results['social'] = results['social'].replace({'yes':1,'no':0})
results

Unnamed: 0,hmid,moment,social,agency
0,27673,i went successful date someone i felt sympathy...,0,yes
1,27675,i went gym morning yoga,0,yes
2,27678,i meditated last night,0,yes
3,27679,i made new recipe peasant bread it came specta...,0,yes
4,27680,i got gift my elder brother really surprising me,1,no
...,...,...,...,...
69828,128761,i spent time my daughter,1,yes
69829,128762,my husband announced he getting decent bonus q...,1,no
69830,128763,pepsi drink,0,yes
69831,128764,cuddling my girlfriend last night,1,yes


In [48]:
results['agency'] = results['agency'].replace({'yes':1,'no':0})
results

Unnamed: 0,hmid,moment,social,agency
0,27673,i went successful date someone i felt sympathy...,0,1
1,27675,i went gym morning yoga,0,1
2,27678,i meditated last night,0,1
3,27679,i made new recipe peasant bread it came specta...,0,1
4,27680,i got gift my elder brother really surprising me,1,0
...,...,...,...,...
69828,128761,i spent time my daughter,1,1
69829,128762,my husband announced he getting decent bonus q...,1,0
69830,128763,pepsi drink,0,1
69831,128764,cuddling my girlfriend last night,1,1


In [49]:
pd.to_pickle(results, 'predicted_unlabeled_data.pkl')