In [41]:
import pandas as pd 
import numpy as np

from tensorflow.python.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.python.keras.layers import Conv2D, Flatten, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers import Input
from tensorflow.keras.models import load_model

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

import tensorflow as tf
tf.config.run_functions_eagerly(True)

In [43]:
train_df = pd.read_pickle('train.pkl')
train_df.name = 'Train Dataset'
train_df.head()

Unnamed: 0,hmid,moment,social,agency
0,27674,i happy my son got marks his examination,1,0
1,27685,went movies my friends it fun,1,1
2,27691,hot kiss my girl friend last night made my day,1,1
3,27701,my son woke me fantastic breakfast eggs his sp...,1,0
4,27712,my older daughter keeps patting my younger dau...,1,0


In [44]:
test_df = pd.read_pickle('test.pkl')
test_df.name = "Test Dataset"
test_df.head()

Unnamed: 0,hmid,moment,social,agency
0,0,i able spend day my fiance shopping yesterday,0,1
1,1,i able play my cat,0,1
2,2,i able clean my room gold my laundry,0,1
3,3,i spend day party beach i happy see friends i ...,1,1
4,4,my cat greeting me i got home work she sweet c...,0,0


In [45]:
X_train = train_df['moment'].values
y_train_social = train_df['social'].values
y_train_agency = train_df['agency'].values

In [46]:
X_test = test_df['moment'].values
y_test_social = test_df['social'].values
y_test_agency = test_df['agency'].values

###### using the CountVectorizer provided by the scikit-learn library to vectorize sentences. 

In [47]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(X_train)
vectorizer.vocabulary_
X_train  = vectorizer.transform(X_train).toarray()

In [48]:
X_test  = vectorizer.transform(X_test).toarray()
X_test.shape,X_train.shape

((17215, 7246), (9921, 7246))

In [49]:
# Our vectorized labels
y_train_social = np.asarray(y_train_social)
y_train_agency = np.asarray(y_train_agency)

y_test_social = np.asarray(y_test_social)
y_test_agency  = np.asarray(y_test_agency )

y_test_social.shape,y_train_agency.shape,y_test_social.shape,y_train_agency.shape

((17215,), (9921,), (17215,), (9921,))

In [50]:
def transformResults(predictions):
    
    predictions = predictions[:, 0]
    print(f'First 5 values before conversion : {predictions[:5]}')
    
    predictions = [1 if val>0.5 else 0 for val in predictions]
    print(f'First 5 values after the conversion : {predictions[:5]}')
    return predictions

In [51]:
# craeting lists to store results
models =[]
social_accuracy=[]
agency_acuracy=[]

In [52]:
def getResults(y_test, model_prediction,name,temp):
    model_prediction = transformResults(model_prediction)
    print("\n========== RESULTS ===========\n")
    accuracy = accuracy_score(y_test, model_prediction)
    precision = precision_score(y_test, model_prediction)
    f1 = f1_score(y_test, model_prediction)
    recall = recall_score(y_test, model_prediction)
    
    if(temp==1):
        social_accuracy.append(accuracy)
        models.append(name)
    else:
        agency_acuracy.append(accuracy)
    
    print(f'Accuracy : {accuracy}\nPrecision : {precision}\nF1_score : {f1}\nRecall : {recall}\n')
    
    print("===== CONFUSION MATRIX =====")
    cf_matrix = confusion_matrix(y_test_social, model_prediction)
    print(cf_matrix)
    
    return accuracy, precision, f1, recall

### Sequential model

###### Keras Model
Keras accepts two types of models. There are two APIs: the Sequential model API and the functional API, which can do everything the Sequential model can do but can also be used for advanced models with complex network architectures. The Sequential model is a linear stack of layers in which you can use any of the Keras layers. The Dense layer is the most common, and it is your standard densely connected neural network layer with all of the weights and biases that you are already familiar with. We need to know the input dimension of our feature vectors before we can build our model. This occurs only in the first layer, as the subsequent layers can perform automatic shape inference. You can build the Sequential model by adding layers one by one.

A Sequential model is appropriate for a plain stack of layers where each layer has exactly one input tensor and one output tensor.

In [53]:
input_dimention = X_train.shape[1]  # Number of features

Kerasmodel = Sequential()
Kerasmodel.add(layers.Dense(10, input_dim=input_dimention, activation='relu'))
Kerasmodel.add(layers.Dense(1, activation='sigmoid'))

In [54]:
Kerasmodel.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
Kerasmodel.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 10)                72470     
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 11        
Total params: 72,481
Trainable params: 72,481
Non-trainable params: 0
_________________________________________________________________


###### Model fit for  target Label "social" 

In [55]:
history = Kerasmodel.fit(X_train, y_train_social,
                    epochs=10,
                    verbose=True,
                    validation_split=0.2,
                    batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [56]:
score = Kerasmodel.evaluate(X_test, y_test_social, verbose=1)
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 0.4320727288722992
Test Accuracy: 0.8777810335159302


In [57]:
predictions = Kerasmodel.predict(X_test)
print(predictions[:,0])
name ="Sequential model"
social_results = getResults(y_test_social, predictions,name,1)

[0.99709654 0.30441827 0.9371798  ... 0.99876904 0.01901272 0.11965746]
First 5 values before conversion : [0.99709654 0.30441827 0.9371798  0.9658755  0.62366444]
First 5 values after the conversion : [1, 0, 1, 1, 1]


Accuracy : 0.8777810049375545
Precision : 0.8963527714815578
F1_score : 0.8921246923707958
Recall : 0.8879363135333742

===== CONFUSION MATRIX =====
[[6411 1006]
 [1098 8700]]


###### Model fit for target Label "agency"

In [58]:
history = Kerasmodel.fit(X_train, y_train_agency,
                    epochs=10,
                    verbose=True,
                    validation_split=0.2,
                    batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [59]:
score = Kerasmodel.evaluate(X_test, y_test_agency, verbose=1)
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 0.5739454627037048
Test Accuracy: 0.7564914226531982


In [60]:
predictions = Kerasmodel.predict(X_test)
print(predictions[:,0])
name ="Sequential model"
agency_results = getResults(y_test_agency, predictions,name,0)

[0.9953363  0.9348954  0.9443674  ... 0.5293682  0.59180975 0.99659425]
First 5 values before conversion : [0.9953363 0.9348954 0.9443674 0.99998   0.2606855]
First 5 values after the conversion : [1, 1, 1, 1, 0]


Accuracy : 0.7564914318907929
Precision : 0.8194288464623777
F1_score : 0.8297457558281212
Recall : 0.8403257650542941

===== CONFUSION MATRIX =====
[[1118 6299]
 [3631 6167]]


###### Word Embedding
Unlike one-hot encoding, which is hardcoded, this method represents words as dense word vectors (also known as word embeddings) that are trained. This means that word embeddings store more information in fewer dimensions.

In [61]:
MAX_SEQUENCE_LENGTH=500
EMBEDDING_DIM=300 
dropout=0.5

def loadData_Tokenizer(X_train, X_test):
    
    tokenizer = Tokenizer(num_words=5000, lower = False)

    X_train = train_df['moment'].values
    X_test = test_df['moment'].values

    tokenizer.fit_on_texts(X_train)

    X_train = tokenizer.texts_to_sequences(X_train)
    X_test = tokenizer.texts_to_sequences(X_test)
    
    vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index
    
    word_inndex =tokenizer.word_index

    
    return (X_train, X_test,vocab_size,word_inndex )

X_train,X_test, vocab_size,word_inndex= loadData_Tokenizer(X_train,X_test)

In [62]:
maxlen = 25


X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

X_train.shape, X_test.shape

((9921, 25), (17215, 25))

### Sequential model + Keras Embedding Layer

###### Keras Embedding Layer
Now you can use Keras Embedding Layer to map the previously calculated integers to a dense vector of the embedding. The following parameters are required: input_dim: the size of the vocabulary output_dim: the size of the dense vector input_length: the length of the sequence We now have a couple of options with the Embedding layer. One approach would be to plug the embedding layers output into a Dense layer. To accomplish this, insert a Flatten layer between them that prepares the sequential input for the Dense layer.

In [63]:
embedding_dim = 50

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 25, 50)            363150    
_________________________________________________________________
flatten_1 (Flatten)          (None, 1250)              0         
_________________________________________________________________
dense_10 (Dense)             (None, 10)                12510     
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 11        
Total params: 375,671
Trainable params: 375,671
Non-trainable params: 0
_________________________________________________________________


###### Model fit for target label "social"

In [64]:
history = model.fit(X_train, y_train_social,
                    epochs=10,
                    verbose=True,
                    validation_split=0.2,
                    batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [65]:
score = model.evaluate(X_test, y_test_social, verbose=1)
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

predictions = model.predict(X_test)
print(predictions[:,0])
name="Sequentialmodel + KerasEmbeddingLayer"
social_results = getResults(y_test_social, predictions,name,1)

Test Score: 0.6495831608772278
Test Accuracy: 0.8615742325782776
[9.9997830e-01 4.2547777e-01 8.0466348e-01 ... 1.0000000e+00 1.7039120e-02
 9.0593100e-04]
First 5 values before conversion : [0.9999783  0.42547777 0.8046635  0.9960786  0.99927855]
First 5 values after the conversion : [1, 0, 1, 1, 1]


Accuracy : 0.8615742085390647
Precision : 0.8804515135967163
F1_score : 0.8780637568438827
Recall : 0.8756889161053276

===== CONFUSION MATRIX =====
[[6252 1165]
 [1218 8580]]


###### Model fit for target label "agency"

In [66]:
history = model.fit(X_train, y_train_agency,
                    epochs=10,
                    verbose=True,
                    validation_split=0.2,
                    batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [67]:
score = model.evaluate(X_test, y_test_agency, verbose=1)
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

predictions = model.predict(X_test)
print(predictions[:,0])
agency_results = getResults(y_test_agency, predictions,name,0)

Test Score: 0.7709935307502747
Test Accuracy: 0.7688062787055969
[0.99999017 0.9999995  0.99992967 ... 0.0659101  0.9793694  0.9999591 ]
First 5 values before conversion : [0.99999017 0.9999995  0.99992967 0.99971825 0.53618723]
First 5 values after the conversion : [1, 1, 1, 1, 1]


Accuracy : 0.7688062735986059
Precision : 0.8446880269814503
F1_score : 0.8342771485676215
Recall : 0.8241197762421849

===== CONFUSION MATRIX =====
[[1382 6035]
 [3973 5825]]


### Sequential model + Keras Embedding Layer + GlobalMaxPooling1D layer + CNN

###### Convolutional Neural Networks (CNN)
Convolutional layers are the hidden layers in a CNN. These convolutional layers can detect edges, corners, and other types of textures, making them a unique tool. The convolutional layer is made up of multiple filters that are slid across the image to detect specific features.

In [68]:
embedding_dim = 100

cnn_model = Sequential()
cnn_model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
cnn_model.add(layers.Conv1D(128, 5, activation='relu'))
cnn_model.add(layers.GlobalMaxPooling1D())
cnn_model.add(layers.Dense(10, activation='relu'))
cnn_model.add(layers.Dense(1, activation='sigmoid'))
cnn_model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
cnn_model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 25, 100)           726300    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 21, 128)           64128     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 10)                1290      
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 11        
Total params: 791,729
Trainable params: 791,729
Non-trainable params: 0
_________________________________________________________________


###### Model fit for label "social"

In [69]:
history = cnn_model.fit(X_train, y_train_social,
                    epochs=10,
                    verbose=True,
                    validation_split=0.2,
                    batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [70]:
score = cnn_model.evaluate(X_test, y_test_social, verbose=1)
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

predictions = cnn_model.predict(X_test)
print(predictions[:,0])
name='Sequential model + Keras Embedding Layer + CNN'
social_results = getResults(y_test_social, predictions,name,1)

Test Score: 0.7287238240242004
Test Accuracy: 0.8812082409858704
[1.0000000e+00 6.5449476e-03 7.0615321e-02 ... 9.9999976e-01 3.3387840e-03
 1.6718799e-07]
First 5 values before conversion : [1.         0.00654495 0.07061532 0.99993676 0.22685966]
First 5 values after the conversion : [1, 0, 0, 1, 0]


Accuracy : 0.8812082486203892
Precision : 0.8833943230145386
F1_score : 0.8972826359937716
Recall : 0.9116146152275975

===== CONFUSION MATRIX =====
[[6238 1179]
 [ 866 8932]]


###### Model fit for label "agency"

In [71]:
history = cnn_model.fit(X_train, y_train_agency,
                    epochs=10,
                    verbose=True,
                    validation_split=0.2,
                    batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [72]:
score = cnn_model.evaluate(X_test, y_test_agency, verbose=1)
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

predictions = cnn_model.predict(X_test)
print(predictions[:,0])
agency_results = getResults(y_test_agency, predictions,name,0)

Test Score: 1.3628473281860352
Test Accuracy: 0.78530353307724
[1.         1.         1.         ... 0.9999978  0.99999917 1.        ]
First 5 values before conversion : [1.         1.         1.         1.         0.06686971]
First 5 values after the conversion : [1, 1, 1, 1, 0]


Accuracy : 0.7853035143769969
Precision : 0.8221139201949437
F1_score : 0.853843720341664
Recall : 0.8881210924646266

===== CONFUSION MATRIX =====
[[ 902 6515]
 [3181 6617]]


### Sequential model + Glove Embedding Layer + GlobalMaxPooling1D layer + CNN

In [73]:
X_train = train_df['moment'].values
X_test = test_df['moment'].values

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1

maxlen = 25

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [74]:
embeddings_dictionary = dict()

glove_file = open('glove.6B.300d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = np.asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

embedding_matrix = np.zeros((vocab_size, 300))
for word, index in word_inndex.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [75]:
embedding_dim = 100

cnn_g_model = Sequential()
cnn_g_model.add(layers.Embedding(vocab_size,
                            300,
                            weights=[embedding_matrix],
                            trainable=False))
cnn_g_model.add(layers.Conv1D(128, 5, activation='relu'))
cnn_g_model.add(layers.GlobalMaxPooling1D())
cnn_g_model.add(layers.Dense(10, activation='relu'))
cnn_g_model.add(layers.Dense(1, activation='sigmoid'))
cnn_g_model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
cnn_g_model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 300)         2178900   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, None, 128)         192128    
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 128)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 10)                1290      
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 11        
Total params: 2,372,329
Trainable params: 193,429
Non-trainable params: 2,178,900
_________________________________________________________________


In [76]:
history = cnn_g_model.fit(X_train, y_train_social,
                    epochs=10,
                    verbose=True,
                    validation_split=0.2,
                    batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [77]:
score = cnn_g_model.evaluate(X_test, y_test_social, verbose=1)
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

predictions = cnn_g_model.predict(X_test)
print(predictions[:,0])
name = 'Sequential model + Glove Embedding Layer + CNN'
social_results = getResults(y_test_social, predictions,name,1)

Test Score: 0.772632360458374
Test Accuracy: 0.8867266774177551
[9.9999762e-01 1.7213821e-04 9.8169858e-07 ... 9.9997830e-01 4.2708934e-06
 1.7663101e-09]
First 5 values before conversion : [9.9999762e-01 1.7213821e-04 9.8169858e-07 3.0062079e-02 4.5708328e-02]
First 5 values after the conversion : [1, 0, 0, 0, 0]


Accuracy : 0.8867266918385129
Precision : 0.9551148225469729
F1_score : 0.8941368078175896
Recall : 0.8404776484996939

===== CONFUSION MATRIX =====
[[7030  387]
 [1563 8235]]


In [78]:
history = cnn_g_model.fit(X_train, y_train_agency,
                    epochs=10,
                    verbose=True,
                    validation_split=0.2,
                    batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [79]:
score = cnn_g_model.evaluate(X_test, y_test_agency, verbose=1)
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

predictions = cnn_g_model.predict(X_test)
print(predictions[:,0])
social_acency = getResults(y_test_agency, predictions,name,0)

Test Score: 1.011755347251892
Test Accuracy: 0.7965146899223328
[1.         1.         1.         ... 0.999987   0.85839355 1.        ]
First 5 values before conversion : [1.         1.         1.         0.9999967  0.02557856]
First 5 values after the conversion : [1, 1, 1, 1, 0]


Accuracy : 0.796514667441185
Precision : 0.8546602180506598
F1_score : 0.8561691644426195
Recall : 0.857683448502797

===== CONFUSION MATRIX =====
[[1111 6306]
 [3905 5893]]


In [80]:
df = pd.DataFrame(list(zip(models, social_accuracy,agency_acuracy)), 
               columns =['model', 'social_accuracy','agency_acuracy']) 

In [81]:
df

Unnamed: 0,model,social_accuracy,agency_acuracy
0,Sequential model,0.877781,0.756491
1,Sequentialmodel + KerasEmbeddingLayer,0.861574,0.768806
2,Sequential model + Keras Embedding Layer + CNN,0.881208,0.785304
3,Sequential model + Glove Embedding Layer + CNN,0.886727,0.796515
