In [None]:
import pandas as pd 
import numpy as np

from tensorflow.python.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.python.keras.layers import Conv2D, Flatten, Dense
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers import Input
from tensorflow.keras.models import load_model

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

import tensorflow as tf
tf.config.run_functions_eagerly(True)

In [None]:
from google.colab import drive

drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
cd '/content/drive/MyDrive/project'

/content/drive/MyDrive/project


In [None]:
import pickle

with open('df_train.pkl', 'rb') as f:
    df_train = pickle.load(f)

with open('df_validation.pkl', 'rb') as f:
    df_validation = pickle.load(f)

In [None]:
from sklearn.utils import shuffle

def xy_data(data):
  
  data.body = data.body + data.clean_title
  data.drop(columns=["clean_title"], axis = 1, inplace=True)


  data = shuffle(data)

  data.reset_index(inplace=True, drop=True)

  x_train_features = data['body']
  
  y_train = data['2_way_label']
  
  display(x_train_features.head())
  display(y_train.head())

  return x_train_features, y_train

In [None]:
x_train, y_train = xy_data(df_train)

0    deleted userspacecowboyPlease post comments ph...
1    What robocall I’ve never heard term Does count...
2    Mixed drink belongs shower Am I shit got real ...
3    Clearly Arkenshoes returned erf capable hands ...
4    Is satire site Seems written Greetings NEARBEE...
Name: body, dtype: object

0    1
1    1
2    0
3    1
4    1
Name: 2_way_label, dtype: int64

In [None]:
x_validation, y_validation = xy_data(df_validation)

0    This poster frequently shopped like many other...
1    Thats canceltheonionasian guy separate group a...
2                    Oooo piece candynew gummy candies
3    Removed Previously submitted articles articles...
4    Thank submitting post rfakealbumcovers Please ...
Name: body, dtype: object

0    0
1    0
2    0
3    1
4    0
Name: 2_way_label, dtype: int64

Using the CountVectorizer provided by the scikit-learn library to vectorize sentences.

In [None]:
#Using CountVectorizer for feature extraction

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer( stop_words='english', lowercase=True, token_pattern=r'\w{1,}', min_df=10)
vectorizer.fit(x_train)

CountVectorizer(min_df=10, stop_words='english', token_pattern='\\w{1,}')

In [None]:
vectorizer.vocabulary_


{'ginger': 7807,
 'thank': 18904,
 'posting': 14100,
 'rfakealbumcovers': 15773,
 'post': 14088,
 'removed': 15427,
 'image': 9095,
 'square': 17778,
 'familiarize': 6704,
 'subreddits': 18238,
 'rules': 16157,
 'wish': 20871,
 'resubmit': 15665,
 'fixing': 7069,
 'issueif': 9787,
 'believe': 1627,
 'error': 6255,
 'reviewed': 15738,
 'reply': 15502,
 'notre': 12535,
 'dame': 4498,
 'black': 1804,
 'metal': 11486,
 'looking': 10847,
 'moments': 11838,
 'suicide': 18319,
 'bunker': 2370,
 'username': 20038,
 'checks': 2940,
 'need': 12248,
 'know': 10234,
 'guess': 8170,
 'leonardo': 10556,
 'dicaprio': 5058,
 'madonna': 11002,
 'chilling': 3012,
 'hungarian': 8938,
 'traffic': 19346,
 'hitler': 8687,
 'eva': 6347,
 'braun': 2142,
 'nsfw': 12559,
 'mean': 11324,
 'belongs': 1644,
 'male': 11066,
 'n': 12135,
 'f': 6619,
 'w': 20446,
 'lets': 10568,
 'unit': 19867,
 'magic': 11012,
 'moment': 11836,
 'little': 10757,
 'girl': 7811,
 'right': 15840,
 'thats': 18923,
 'perfect': 13507,
 'e

In [None]:
X_train = vectorizer.fit_transform(x_train).toarray()

display(X_train)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
X_validation = vectorizer.transform(x_validation).toarray()

display(X_validation)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
def transformResults(predictions):
    
    predictions = predictions[:, 0]
    print(f'First 5 values before conversion : {predictions[:5]}')
    
    predictions = [1 if val>0.5 else 0 for val in predictions]
    print(f'First 5 values after the conversion : {predictions[:5]}')
    return predictions

In [None]:
# craeting lists to store results
models =[]
accuracy=[]
f1_scores=[]

In [None]:
def getResults(y_test, model_prediction,name):
    model_prediction = transformResults(model_prediction)

    print("\n========== RESULTS ===========\n")
    acc = accuracy_score(y_test, model_prediction)
    precision = precision_score(y_test, model_prediction)
    f1 = f1_score(y_test, model_prediction)
    recall = recall_score(y_test, model_prediction)
    
    accuracy.append(acc)
    models.append(name)
    f1_scores.append(f1)
    
    print(f'Accuracy : {acc}\nPrecision : {precision}\nF1_score : {f1}\nRecall : {recall}\n')
    
    print("===== CONFUSION MATRIX =====")
    cf_matrix = confusion_matrix(y_test, model_prediction)
    print(cf_matrix)
    
    return acc, precision, f1, recall

# **Sequential model Keras Model**

Keras accepts two types of models. There are two APIs: the Sequential model API and the functional API, which can do everything the Sequential model can do but can also be used for advanced models with complex network architectures. The Sequential model is a linear stack of layers in which you can use any of the Keras layers. The Dense layer is the most common, and it is your standard densely connected neural network layer with all of the weights and biases that you are already familiar with. We need to know the input dimension of our feature vectors before we can build our model. This occurs only in the first layer, as the subsequent layers can perform automatic shape inference. You can build the Sequential model by adding layers one by one.

A Sequential model is appropriate for a plain stack of layers where each layer has exactly one input tensor and one output tensor.

In [None]:
input_dimention = X_train.shape[1]  # Number of features

Kerasmodel = Sequential()
Kerasmodel.add(layers.Dense(10, input_dim=input_dimention, activation='relu'))
Kerasmodel.add(layers.Dense(1, activation='sigmoid'))
Kerasmodel.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])


In [None]:
input_shape = X_train.shape  
Kerasmodel.build(input_shape)

Kerasmodel.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 module_wrapper (ModuleWrapp  (55890, 10)              212280    
 er)                                                             
                                                                 
 module_wrapper_1 (ModuleWra  (55890, 1)               11        
 pper)                                                           
                                                                 
Total params: 212,291
Trainable params: 212,291
Non-trainable params: 0
_________________________________________________________________


In [None]:
#Model fit 
history = Kerasmodel.fit(X_train, y_train,
                    epochs=1,
                    verbose=True,
                    validation_split=0.2,
                    batch_size=10)





In [None]:
score = Kerasmodel.evaluate(X_validation, y_validation, verbose=1)
print("Validation Score:", score[0])
print("Validation Accuracy:", score[1])

Validation Score: 0.3182376027107239
Validation Accuracy: 0.8717331290245056


In [None]:
predictions = Kerasmodel.predict(X_validation)
print(predictions[:,0])

name ="Sequential model"
results = getResults(y_validation, predictions,name)

[0.44661272 0.9999867  0.04047155 ... 0.97558784 0.03917297 0.9999996 ]
First 5 values before conversion : [0.44661272 0.9999867  0.04047155 0.95581454 0.04525505]
First 5 values after the conversion : [0, 1, 0, 1, 0]


Accuracy : 0.8717331499312242
Precision : 0.8613744075829384
F1_score : 0.7958401751505199
Recall : 0.7395727365208545

===== CONFUSION MATRIX =====
[[1808  117]
 [ 256  727]]


# Word Embedding

Unlike one-hot encoding, which is hardcoded, this method represents words as dense word vectors (also known as word embeddings) that are trained. This means that word embeddings store more information in fewer dimensions.

It is an advance over the more conventional bag-of-word encoding techniques, in which each word was represented by a big sparse vector, or a complete vocabulary was represented by scoring each word within the vector. Due to the enormous vocabularies and the fact that most words and documents were represented by large vectors largely made up of zero values, these representations were sparse. 

In contrast, words are represented in an embedding by dense vectors, where a vector is the word's projection into a continuous vector space. 

Words that surround a word when it is used determine its position inside the vector space, which is learned through text.

In [None]:
MAX_SEQUENCE_LENGTH=500
EMBEDDING_DIM=300 
dropout=0.5

def loadData_Tokenizer(Xtrain, Xvalidation):
    
    tokenizer = Tokenizer(num_words=5000, lower = False)

    tokenizer.fit_on_texts(Xtrain)

    train = tokenizer.texts_to_sequences(Xtrain)
    validation = tokenizer.texts_to_sequences(Xvalidation)
    
    vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index
    
    word_inndex =tokenizer.word_index

    
    return (train, validation,vocab_size,word_inndex )

X_train_t,X_validation_t, vocab_size,word_inndex= loadData_Tokenizer(x_train,x_validation)

In [None]:
maxlen = 300


X_train_t = pad_sequences(X_train_t, padding='post', maxlen=maxlen)
X_validation_t = pad_sequences(X_validation_t, padding='post', maxlen=maxlen)

X_train_t.shape, X_validation_t.shape

((55890, 300), (2908, 300))

## Sequential model + Keras Embedding Layer Keras Embedding Layer **bold text** bold text

Now you can use Keras Embedding Layer to map the previously calculated integers to a dense vector of the embedding. The following parameters are required: input_dim: the size of the vocabulary output_dim: the size of the dense vector input_length: the length of the sequence We now have a couple of options with the Embedding layer. One approach would be to plug the embedding layers output into a Dense layer. To accomplish this, insert a Flatten layer between them that prepares the sequential input for the Dense layer.

In [None]:
embedding_dim = 50
input_shape = X_train_t.shape 


model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='sigmoid'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.build(input_shape)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 module_wrapper (ModuleWrapp  (55890, 300, 50)         14299800  
 er)                                                             
                                                                 
 module_wrapper_1 (ModuleWra  (55890, 15000)           0         
 pper)                                                           
                                                                 
 module_wrapper_2 (ModuleWra  (55890, 10)              150010    
 pper)                                                           
                                                                 
 module_wrapper_3 (ModuleWra  (55890, 1)               11        
 pper)                                                           
                                                                 
Total params: 14,449,821
Trainable params: 14,449,821
No

In [None]:
#model fit
history = model.fit(X_train_t, y_train,
                    epochs=5,
                    verbose=True,
                    validation_split=0.2,
                    batch_size=10)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


The result of the model shows high overfitting over epochs. Just using one epoch for model fit in the next step.

In [None]:
score = model.evaluate(X_validation_t, y_validation, verbose=1)
print("Validation Score:", score[0])
print("Validation Accuracy:", score[1])

predictions = model.predict(X_validation_t)
print(predictions[:,0])

name="Sequentialmodel + KerasEmbeddingLayer"
results = getResults(y_validation, predictions,name)

Validation Score: 0.7501960396766663
Validation Accuracy: 0.8094910383224487
[9.9969923e-01 9.9983799e-01 6.6398137e-04 ... 9.1773475e-04 2.0504526e-03
 9.9981153e-01]
First 5 values before conversion : [9.9969923e-01 9.9983799e-01 6.6398137e-04 1.0312310e-02 9.9992412e-01]
First 5 values after the conversion : [1, 1, 0, 0, 1]


Accuracy : 0.8094910591471802
Precision : 0.7092682926829268
F1_score : 0.7241035856573705
Recall : 0.7395727365208545

===== CONFUSION MATRIX =====
[[1627  298]
 [ 256  727]]


## **Sequential model + Keras Embedding Layer + GlobalMaxPooling1D layer + CNN**
Convolutional Neural Networks (CNN)

Convolutional layers are the hidden layers in a CNN. These convolutional layers can detect edges, corners, and other types of textures, making them a unique tool. The convolutional layer is made up of multiple filters that are slid across the image to detect specific features.

In [None]:
embedding_dim = 100
input_shape = X_train_t.shape 

cnn_model = Sequential()
cnn_model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
cnn_model.add(layers.Conv1D(128, 5, activation='relu'))
cnn_model.add(layers.GlobalMaxPooling1D())
cnn_model.add(layers.Dense(10, activation='relu'))
cnn_model.add(layers.Dense(1, activation='sigmoid'))
cnn_model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
cnn_model.build(input_shape)
cnn_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 module_wrapper_5 (ModuleWra  (55890, 300, 100)        28599600  
 pper)                                                           
                                                                 
 module_wrapper_6 (ModuleWra  (55890, 296, 128)        64128     
 pper)                                                           
                                                                 
 module_wrapper_7 (ModuleWra  (55890, 128)             0         
 pper)                                                           
                                                                 
 module_wrapper_8 (ModuleWra  (55890, 10)              1290      
 pper)                                                           
                                                                 
 module_wrapper_9 (ModuleWra  (55890, 1)              

In [None]:
history = cnn_model.fit(X_train_t, y_train,
                    epochs=5,
                    verbose=True,
                    validation_split=0.2,
                    batch_size=10)

Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
score = cnn_model.evaluate(X_validation_t, y_validation, verbose=1)
print("Validation Score:", score[0])
print("Validation Accuracy:", score[1])

predictions = cnn_model.predict(X_validation_t)
print(predictions[:,0])

name="Sequential model + Keras Embedding Layer + CNN"
results = getResults(y_validation, predictions,name)

Validation Score: 0.6799298524856567
Validation Accuracy: 0.8603851199150085
[3.1274660e-06 1.9786984e-01 1.2681275e-07 ... 1.3303191e-05 2.7337816e-02
 2.4430705e-03]
First 5 values before conversion : [3.1274660e-06 1.9786984e-01 1.2681275e-07 3.0185636e-02 1.6995553e-08]
First 5 values after the conversion : [0, 0, 0, 0, 0]


Accuracy : 0.8603851444291609
Precision : 0.8374269005847953
F1_score : 0.7791077257889009
Recall : 0.728382502543235

===== CONFUSION MATRIX =====
[[1786  139]
 [ 267  716]]


# **Sequential model + Glove Embedding Layer + GlobalMaxPooling1D layer + CNN**

word embedding-  

A word embedding learned somewhere else can also be used by the Keras Embedding layer. 

In the field of natural language processing, word embeddings are frequently learned, saved, and made freely available.

Use pretrained GloVe model from Stanford http://nlp.stanford.edu/data/glove.6B.zip

Contains 300-dimensional vectors for 0.4 million words and phrases

In [None]:
embeddings_dictionary = dict()

glove_file = open('glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = np.asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

embedding_matrix = np.zeros((vocab_size, 100))
for word, index in word_inndex.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
embedding_dim = 100
input_shape = X_train_t.shape

cnn_g_model = Sequential()
cnn_g_model.add(layers.Embedding(vocab_size,
                            100,
                            weights=[embedding_matrix],
                            trainable=False))
cnn_g_model.add(layers.Conv1D(128, 5, activation='relu'))
cnn_g_model.add(layers.GlobalMaxPooling1D())
cnn_g_model.add(layers.Dense(10, activation='relu'))
cnn_g_model.add(layers.Dense(1, activation='sigmoid'))
cnn_g_model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
cnn_g_model.build(input_shape)
cnn_g_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 module_wrapper (ModuleWrapp  (55890, 300, 100)        28599600  
 er)                                                             
                                                                 
 module_wrapper_1 (ModuleWra  (55890, 296, 128)        64128     
 pper)                                                           
                                                                 
 module_wrapper_2 (ModuleWra  (55890, 128)             0         
 pper)                                                           
                                                                 
 module_wrapper_3 (ModuleWra  (55890, 10)              1290      
 pper)                                                           
                                                                 
 module_wrapper_4 (ModuleWra  (55890, 1)               1

In [None]:
history = cnn_g_model.fit(X_train_t, y_train,
                    epochs=3,
                    verbose=True,
                    validation_split=0.2,
                    batch_size=10)

Epoch 1/3




Epoch 2/3
Epoch 3/3


In [None]:
score = cnn_g_model.evaluate(X_validation_t, y_validation, verbose=1)
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

predictions = cnn_g_model.predict(X_validation_t)
print(predictions[:,0])
name = 'Sequential model + Glove Embedding Layer + CNN'
social_results = getResults(y_validation, predictions,name)


Test Score: 0.36749014258384705
Test Accuracy: 0.8277166485786438
[0.19762613 0.27602813 0.15953471 ... 0.22039804 0.85972637 0.02786459]
First 5 values before conversion : [1.9762613e-01 2.7602813e-01 1.5953471e-01 9.9345404e-01 1.8120518e-05]
First 5 values after the conversion : [0, 0, 0, 1, 0]


Accuracy : 0.8277166437414031
Precision : 0.7190909090909091
F1_score : 0.7594815170427267
Recall : 0.8046795523906409

===== CONFUSION MATRIX =====
[[1616  309]
 [ 192  791]]


In [None]:
df_results = pd.DataFrame(list(zip(models, accuracy, f1_scores)), 
               columns =['model', 'accuracy','f1_score']) 
df_results

Unnamed: 0,model,accuracy,f1_score
0,Sequential model + Glove Embedding Layer + CNN,0.827717,0.759482


In [None]:
s