In [1]:
!wget -q --show-progress https://nlp.stanford.edu/data/glove.42B.300d.zip




In [10]:
!mkdir data

In [11]:
!mv /content/train.csv /content/data


In [3]:
!unzip /content/glove.42B.300d.zip

Archive:  /content/glove.42B.300d.zip
  inflating: glove.42B.300d.txt      


In [4]:
!rm /content/glove.42B.300d.zip

In [5]:
import numpy as np
import pandas as pd
# import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Activation, Input
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score

In [6]:
glove_path = "/content/glove.42B.300d.txt"

In [7]:
embeddings_index = {}
f = open(glove_path, encoding = "utf-8")

for line in f:
  line = line.strip()
  values = line.split()
  word = values[0]
  coefs = np.asarray(values[1:], dtype = 'float32')
  embeddings_index[word] = coefs

f.close()

In [12]:
dataset = pd.read_csv(r"/content/data/train.csv")
dataset.head(5)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [13]:
dataset["target"].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,1225312
1,80810


In [14]:
dataset.isnull().sum()

Unnamed: 0,0
qid,0
question_text,0
target,0


In [15]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.utils.class_weight import compute_class_weight

In [16]:
X=dataset['question_text']
Y=dataset['target']
X.shape, Y.shape, dataset.shape

((1306122,), (1306122,), (1306122, 3))

In [17]:
dataset.columns

Index(['qid', 'question_text', 'target'], dtype='object')

In [18]:
dataset.dtypes

Unnamed: 0,0
qid,object
question_text,object
target,int64


In [19]:
x_train,x_test,y_train,y_test=train_test_split(X, Y, test_size=0.2, random_state=42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((1044897,), (261225,), (1044897,), (261225,))

In [20]:
class_weights = compute_class_weight("balanced", classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: np.float64(class_weights[i]) for i in range(len(class_weights))}

In [21]:
class_weight_dict

{0: np.float64(0.5331417235492268), 1: np.float64(8.04336145579949)}

In [22]:
class_weights

array([0.53314172, 8.04336146])

In [23]:
for each in enumerate(class_weights):
  print(each)

(0, np.float64(0.5331417235492268))
(1, np.float64(8.04336145579949))


In [24]:
class_weight_dict1 = dict(enumerate(class_weights))
class_weight_dict1

{0: np.float64(0.5331417235492268), 1: np.float64(8.04336145579949)}

In [25]:
x_train.head(5)

Unnamed: 0,question_text
298773,How is strategic positioning is different from...
815475,What is the best way for promote Facebook mark...
1133453,How much energized proton radiation does the I...
1076426,Would any Indian men want to marry a women tha...
203792,Which is the best business for startups in Ind...


In [26]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [27]:
from nltk import word_tokenize

In [28]:
i=0
for x in x_train:
  print(len(word_tokenize(x)))
  print(word_tokenize(x))
  i+=1
  if i==5:
    break

10
['How', 'is', 'strategic', 'positioning', 'is', 'different', 'from', 'marketing', 'positioning', '?']
10
['What', 'is', 'the', 'best', 'way', 'for', 'promote', 'Facebook', 'marketing', '?']
27
['How', 'much', 'energized', 'proton', 'radiation', 'does', 'the', 'ISS', 'experience', '?', 'If', 'it', 'is', 'harmful', 'levels', 'to', 'the', 'astronauts', ',', 'what', 'kind', 'of', 'shielding', 'does', 'it', 'use', '?']
15
['Would', 'any', 'Indian', 'men', 'want', 'to', 'marry', 'a', 'women', 'that', 'does', "n't", 'want', 'children', '?']
13
['Which', 'is', 'the', 'best', 'business', 'for', 'startups', 'in', 'India', 'with', 'sure', 'profit', '?']


In [29]:
lengths=[len(word_tokenize(x)) for x in x_train]
print(max(lengths))

412


In [30]:
np.percentile(lengths,95)

np.float64(31.0)

In [31]:
max_len=35

tokenizer=Tokenizer(char_level=False, split=' ', oov_token="<OOV>")

tokenizer.fit_on_texts(x_train)
vocab_size=len(tokenizer.word_index)

x_train_seq= pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=max_len)
x_test_seq= pad_sequences(tokenizer.texts_to_sequences(x_test),maxlen=max_len)

In [32]:
vocab_size, tokenizer.word_index

(196193,
 {'<OOV>': 1,
  'the': 2,
  'what': 3,
  'is': 4,
  'a': 5,
  'to': 6,
  'in': 7,
  'of': 8,
  'i': 9,
  'how': 10,
  'and': 11,
  'do': 12,
  'are': 13,
  'for': 14,
  'you': 15,
  'can': 16,
  'why': 17,
  'it': 18,
  'my': 19,
  'that': 20,
  'if': 21,
  'with': 22,
  'or': 23,
  'on': 24,
  'have': 25,
  'be': 26,
  'does': 27,
  'from': 28,
  'your': 29,
  'an': 30,
  'which': 31,
  'should': 32,
  'get': 33,
  'when': 34,
  'best': 35,
  'would': 36,
  'as': 37,
  'people': 38,
  'some': 39,
  'there': 40,
  'who': 41,
  'will': 42,
  'like': 43,
  'not': 44,
  'at': 45,
  'about': 46,
  'by': 47,
  'they': 48,
  'did': 49,
  'was': 50,
  'any': 51,
  'we': 52,
  'so': 53,
  'good': 54,
  'me': 55,
  'their': 56,
  'one': 57,
  'india': 58,
  'has': 59,
  'after': 60,
  'most': 61,
  'where': 62,
  'make': 63,
  'this': 64,
  'but': 65,
  'more': 66,
  'all': 67,
  'think': 68,
  'many': 69,
  'between': 70,
  'than': 71,
  'time': 72,
  'much': 73,
  'other': 74,
  'lif

In [33]:
x_train_seq.shape, x_train.shape

((1044897, 35), (1044897,))

In [34]:
x_test_seq

array([[   0,    0,    0, ...,   54,  259, 1229],
       [   0,    0,    0, ...,  219,   28, 2009],
       [   0,    0,    0, ...,    5,  146, 3083],
       ...,
       [   0,    0,    0, ..., 5453,  101, 1908],
       [   0,    0,    0, ...,  822, 1459, 4634],
       [   0,    0,    0, ...,    2, 4994,  490]], dtype=int32)

In [35]:
x_test_seq.shape, x_test.shape

((261225, 35), (261225,))

In [36]:
tokenizer.word_index

{'<OOV>': 1,
 'the': 2,
 'what': 3,
 'is': 4,
 'a': 5,
 'to': 6,
 'in': 7,
 'of': 8,
 'i': 9,
 'how': 10,
 'and': 11,
 'do': 12,
 'are': 13,
 'for': 14,
 'you': 15,
 'can': 16,
 'why': 17,
 'it': 18,
 'my': 19,
 'that': 20,
 'if': 21,
 'with': 22,
 'or': 23,
 'on': 24,
 'have': 25,
 'be': 26,
 'does': 27,
 'from': 28,
 'your': 29,
 'an': 30,
 'which': 31,
 'should': 32,
 'get': 33,
 'when': 34,
 'best': 35,
 'would': 36,
 'as': 37,
 'people': 38,
 'some': 39,
 'there': 40,
 'who': 41,
 'will': 42,
 'like': 43,
 'not': 44,
 'at': 45,
 'about': 46,
 'by': 47,
 'they': 48,
 'did': 49,
 'was': 50,
 'any': 51,
 'we': 52,
 'so': 53,
 'good': 54,
 'me': 55,
 'their': 56,
 'one': 57,
 'india': 58,
 'has': 59,
 'after': 60,
 'most': 61,
 'where': 62,
 'make': 63,
 'this': 64,
 'but': 65,
 'more': 66,
 'all': 67,
 'think': 68,
 'many': 69,
 'between': 70,
 'than': 71,
 'time': 72,
 'much': 73,
 'other': 74,
 'life': 75,
 'someone': 76,
 'use': 77,
 'he': 78,
 'out': 79,
 'way': 80,
 'am': 81,
 '

In [37]:
embedding_dim = 300
embedding_matrix=np.zeros((vocab_size+1,embedding_dim))

In [38]:
embedding_matrix.shape, vocab_size

((196194, 300), 196193)

In [39]:
for word,i in tokenizer.word_index.items():
    embed_vector=embeddings_index.get(word)
    if embed_vector is not None:
        embedding_matrix[i]=embed_vector

In [40]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.20838   , -0.14932001, -0.017528  , ..., -0.54066002,
         0.21199   , -0.0094357 ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.017604  , -0.24662   , -0.50321001, ..., -0.46059   ,
        -0.098323  , -0.50858998]])

In [41]:
embedding_matrix.shape

(196194, 300)

In [42]:
# Model


inputs = Input(shape=[max_len], name = 'inputs')
embeddings=Embedding(input_dim=vocab_size+1, output_dim=embedding_dim, input_length=max_len, mask_zero=True,
                weights=[embedding_matrix], trainable=False, name = "embeddings")(inputs)

lstm1 = LSTM(512, use_cudnn=False, name="lstm1")(embeddings)
drop1 = Dropout(0.2, name="drop1")(lstm1)

dense1 = Dense(256, activation='relu', name = "dense1")(drop1)
drop2 = Dropout(0.2, name = "drop2")(dense1)

dense2 = Dense(128, activation='relu', name = "dense2")(drop2)
drop3 = Dropout(0.2, name = "drop3")(dense2)

dense3 = Dense(32, activation='relu', name = "dense3")(drop3)
drop4 = Dropout(0.2, name = "drop4")(dense3)

output_layer = Dense(1, activation='sigmoid', name = "output_layer")(drop4)

model = Model(inputs=inputs,outputs=output_layer)



In [43]:
model.summary()

In [44]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [45]:
filepath='/content/weights-{epoch:02d}-{val_loss:.4f}.h5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1,
                             save_best_only=True)

In [46]:
earlystop = EarlyStopping(monitor='val_loss', patience=3,
                          verbose=1)

In [47]:
class_weight_dict

{0: np.float64(0.5331417235492268), 1: np.float64(8.04336145579949)}

In [48]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [49]:
model.fit(x_train_seq, y_train,
              epochs=10,
              class_weight={0: 0.5331417235492268, 1: 8.04336145579949},
              batch_size=1000,
              validation_data=(x_test_seq, y_test),
              callbacks = [earlystop, checkpoint])

Epoch 1/10
[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - accuracy: 0.8511 - loss: 0.3332
Epoch 1: val_loss improved from inf to 0.24919, saving model to /content/weights-01-0.2492.h5




[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 168ms/step - accuracy: 0.8511 - loss: 0.3332 - val_accuracy: 0.8905 - val_loss: 0.2492
Epoch 2/10
[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - accuracy: 0.8918 - loss: 0.2557
Epoch 2: val_loss improved from 0.24919 to 0.23380, saving model to /content/weights-02-0.2338.h5




[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 156ms/step - accuracy: 0.8918 - loss: 0.2557 - val_accuracy: 0.9020 - val_loss: 0.2338
Epoch 3/10
[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - accuracy: 0.9002 - loss: 0.2322
Epoch 3: val_loss did not improve from 0.23380
[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 161ms/step - accuracy: 0.9002 - loss: 0.2322 - val_accuracy: 0.8972 - val_loss: 0.2510
Epoch 4/10
[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step - accuracy: 0.9051 - loss: 0.2093
Epoch 4: val_loss did not improve from 0.23380
[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 154ms/step - accuracy: 0.9051 - loss: 0.2093 - val_accuracy: 0.9021 - val_loss: 0.2587
Epoch 5/10
[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step - accuracy: 0.9107



[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 159ms/step - accuracy: 0.9107 - loss: 0.1866 - val_accuracy: 0.9197 - val_loss: 0.2019
Epoch 6/10
[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - accuracy: 0.9188 - loss: 0.1654
Epoch 6: val_loss did not improve from 0.20189
[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 154ms/step - accuracy: 0.9188 - loss: 0.1654 - val_accuracy: 0.9103 - val_loss: 0.2126
Epoch 7/10
[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - accuracy: 0.9259 - loss: 0.1467
Epoch 7: val_loss did not improve from 0.20189
[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 161ms/step - accuracy: 0.9259 - loss: 0.1467 - val_accuracy: 0.9175 - val_loss: 0.2039
Epoch 8/10
[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - accuracy: 0.9327

<keras.src.callbacks.history.History at 0x7a8bd5bd2b50>

In [50]:
y_pred_prob = model.predict(x_test_seq)
score = roc_auc_score(y_test, y_pred_prob)

[1m8164/8164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 3ms/step


In [51]:
score

np.float64(0.9626282282002923)

In [52]:
model.save('spam_filter_for_quora_questions_model.h5')



In [53]:
best_cutoff = 0.5
cutoff_values = np.arange(0.1, 0.9, 0.05)
best_f1 = 0
for cutoff in cutoff_values:
    y_pred = (y_pred_prob > cutoff).astype(int)
    _, _, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
    if f1 > best_f1:
        best_f1 = f1
        best_cutoff = cutoff

In [54]:
y_pred = (y_pred_prob > best_cutoff).astype(int)

In [56]:
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
roc_auc = roc_auc_score(y_test, y_pred_prob)


In [57]:

print(f"Best Cutoff: {best_cutoff:.2f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")

Best Cutoff: 0.85
Accuracy: 0.9471
Precision: 0.5433
Recall: 0.8018
F1 Score: 0.6477
ROC AUC Score: 0.9626
