# Quora Insincere Question Classification

## Imports and Preprocessing

In [1]:
import nltk
import numpy as np
import pandas as pd
import sklearn
import sklearn.model_selection
import tensorflow as tf
import tensorflow.keras as keras
from tqdm import tqdm
# Register tqdm with pandas so that `progress_apply` is available on Series.
tqdm.pandas()

In [2]:
# Read the training CSV, then discard the question-id column.
train_df = pd.read_csv("train.csv")
train_df = train_df.drop(columns=["qid"])

In [3]:
# Preview the first rows of the loaded frame.
train_df.head()

Unnamed: 0,question_text,target
0,How did Quebec nationalists see their province...,0
1,"Do you have an adopted dog, how would you enco...",0
2,Why does velocity affect time? Does velocity a...,0
3,How did Otto von Guericke used the Magdeburg h...,0
4,Can I convert montra helicon D to a mountain b...,0


In [4]:
def _tokenize_once(question):
    """Tokenize a raw question string; return an existing token list unchanged.

    This cell overwrites ``question_text`` in place. Without the guard, running
    the cell a second time would pass token lists to ``nltk.word_tokenize``,
    which expects a string. Any other non-string value is still forwarded to
    the tokenizer so that bad input keeps failing loudly.
    """
    if isinstance(question, list):
        return question
    return nltk.word_tokenize(question)


train_df["question_text"] = train_df["question_text"].progress_apply(_tokenize_once)

100%|█████████████████████████| 1306122/1306122 [01:28<00:00, 14785.80it/s]


In [5]:
# Number of tokens kept per question after truncation / padding.
PAD_LENGTH = 50

In [6]:
def _fit_to_pad_length(tokens):
    """Return ``tokens`` cut to PAD_LENGTH items and right-padded with '' up to PAD_LENGTH."""
    kept = tokens[:PAD_LENGTH]
    # A non-positive repeat count yields an empty list, so long inputs get no filler.
    filler = [''] * (PAD_LENGTH - len(tokens))
    return kept + filler


train_df["question_text"] = train_df["question_text"].progress_apply(_fit_to_pad_length)

100%|████████████████████████| 1306122/1306122 [00:03<00:00, 375675.12it/s]


In [7]:
# Vocabulary: every distinct token that occurs in the (padded) questions.
# Sorting makes the token -> index assignment identical on every run; the
# iteration order of a set of strings changes between interpreter sessions
# because string hashing is randomized. The empty string sorts before every
# other string, so the '' padding token (when present) always gets index 0.
qt_corpus_list = sorted({token for tokens in train_df["question_text"] for token in tokens})

In [8]:
class WordIndexerNumpy():
    """Two-way lookup between words and integer indices, applied element-wise.

    The index of a word is its position in ``embedding_corpus``. If a word
    occurs more than once in the corpus, ``to_idx`` maps it to its last
    position while ``to_word`` still resolves every position.

    Attributes:
        to_idx_dict: dict mapping word -> index.
        to_word_lst: list mapping index -> word.
        vectorized_get_idx: element-wise version of ``to_idx_dict[...]``.
        vectorized_get_word: element-wise version of ``to_word_lst[...]``.
    """
    def __init__(self, embedding_corpus):
        """Build both lookup tables from an iterable of words."""
        self.to_idx_dict = {}
        self.to_word_lst = []
        for i, word in enumerate(embedding_corpus):
            self.to_idx_dict[word] = i
            self.to_word_lst.append(word)
        # Explicit otypes: without them np.vectorize has to call the function
        # on the first element to infer the output dtype, and therefore
        # raises on size-0 inputs.
        self.vectorized_get_idx = np.vectorize(self.to_idx_dict.__getitem__, otypes=[int])
        self.vectorized_get_word = np.vectorize(self.to_word_lst.__getitem__, otypes=[str])
    def to_idx(self, words_2d_array):
        """Return an integer array of the same shape holding each word's index.

        Accepts any array-like of words (including empty ones).

        Raises:
            KeyError: if a word is not part of the corpus.
        """
        return self.vectorized_get_idx(words_2d_array)
    def to_word(self, indices_2d_array):
        """Return a string array of the same shape holding the word at each index.

        Accepts any array-like of integer indices (including empty ones).

        Raises:
            IndexError: if an index is outside the corpus.
        """
        return self.vectorized_get_word(indices_2d_array)

In [9]:
# Word <-> index lookup built over the question vocabulary.
qt_word_indexer = WordIndexerNumpy(embedding_corpus=qt_corpus_list)

In [10]:
# Convert each question's token list to its array of vocabulary indices,
# then stack the per-question arrays into one array (one row per question).
encoded_questions = train_df["question_text"].progress_apply(qt_word_indexer.to_idx)
X = np.stack(encoded_questions.to_list())


100%|█████████████████████████| 1306122/1306122 [00:22<00:00, 56825.47it/s]


In [11]:
# Labels as a column vector: one row per question, a single column.
y = train_df["target"].to_numpy()[:, np.newaxis]

In [12]:
# Hold out 40% of the rows, stratified on the label so both parts keep the
# same class ratio. The fixed seed makes the split, and every metric computed
# on it, repeatable across kernel restarts.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, stratify=y, test_size=0.4, random_state=SPLIT_SEED
)

## Model construction

In [13]:
def get_angles(pos, i, d_model):
    """Compute the raw (pre-sin/cos) positional-encoding angles.

    Args:
        pos: position index or array of position indices.
        i: embedding-dimension index or array of indices; dimensions 2k and
            2k+1 share the same rate because only ``i // 2`` is used.
        d_model: embedding width used to scale the exponent.

    Returns:
        ``pos / 10000 ** (2 * (i // 2) / d_model)``, broadcast over ``pos`` and ``i``.
    """
    pair_index = i // 2
    exponent = (2 * pair_index) / np.float32(d_model)
    return pos * (1 / np.power(10000, exponent))
def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
        position: number of positions (rows) to generate.
        d_model: embedding width (columns).

    Returns:
        A float32 tensor of shape ``(1, position, d_model)`` in which even
        columns hold the sine and odd columns the cosine of the angles
        produced by ``get_angles``.
    """
    row_positions = np.arange(position)[:, np.newaxis]
    column_dims = np.arange(d_model)[np.newaxis, :]
    angles = get_angles(row_positions, column_dims, d_model)

    # even columns (2i) take the sine, odd columns (2i+1) the cosine
    is_even_column = column_dims % 2 == 0
    table = np.where(is_even_column, np.sin(angles), np.cos(angles))

    # leading axis of size 1 so the table broadcasts over the batch dimension
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)


class PositionEncodedSelfAttentionWithFeedforward(keras.layers.Layer):
    """Self-attention block that first adds a sinusoidal positional encoding.

    Per call: add the positional encoding to the input, run 4-head
    self-attention, add a residual connection and layer-normalize, then apply
    a single ReLU dense layer of width ``d_model`` with a second residual
    connection and layer normalization.

    Args:
        d_model: width of the incoming embeddings; also used as the per-head
            ``key_dim`` of the attention layer and as the dense layer width.
        max_sequence_length: number of positions in the precomputed encoding
            table; inputs must not be longer than this.
        regularizer: optional regularizer (instance, identifier or serialized
            config) applied to the kernels and biases of the attention and
            dense sub-layers.
        **kwargs: standard ``keras.layers.Layer`` arguments such as ``name``.
    """
    def __init__(self, d_model, max_sequence_length = 5000, regularizer=None, **kwargs):
        # Forward base-layer options (name, dtype, trainable, ...) instead of dropping them.
        super().__init__(**kwargs)
        self._d_model = d_model
        self._max_sequence_length = max_sequence_length
        # Normalize to a regularizer object (or None) so get_config can serialize it
        # regardless of whether an instance or a config dict was passed in.
        self._regularizer = keras.regularizers.get(regularizer)
        self.attention = keras.layers.MultiHeadAttention(num_heads=4, key_dim = d_model, kernel_regularizer=self._regularizer, bias_regularizer=self._regularizer)
        # NOTE: despite the attribute names, both normalizers are LayerNormalization layers.
        self.batch_norm_1 = keras.layers.LayerNormalization()
        self.feed_forward = keras.layers.Dense(d_model, activation= "relu", kernel_regularizer=self._regularizer, bias_regularizer=self._regularizer)
        self.batch_norm_2 = keras.layers.LayerNormalization()
        self.positional_encoding_matrix = positional_encoding(max_sequence_length, d_model)
    def call(self, X):
        """Apply the block to ``X``; the sequence axis is axis 1."""
        # Prefer the static length; fall back to the runtime shape when the
        # sequence axis is unknown at trace time (a static None would select
        # the whole table and break the broadcasted addition).
        seq_len = X.shape[1]
        if seq_len is None:
            seq_len = tf.shape(X)[1]
        X = X + self.positional_encoding_matrix[:, :seq_len, :]
        X1 = self.attention(X, X)
        X1 = X1 + X
        X1 = self.batch_norm_1(X1)
        X2 = self.feed_forward(X1)
        X2 = X2 + X1
        X2 = self.batch_norm_2(X2)
        return X2
    def get_config(self):
        """Return the constructor arguments so the layer can be saved and re-created."""
        config = super().get_config()
        config.update({
            "d_model": self._d_model,
            "max_sequence_length": self._max_sequence_length,
            "regularizer": (
                None if self._regularizer is None
                else keras.regularizers.serialize(self._regularizer)
            ),
        })
        return config


In [39]:
# One embedding row per vocabulary entry.
NUM_EMBEDDINGS = len(qt_corpus_list)

def modelmaker(regularizer, d_model=256):
    """Build the (uncompiled) binary classifier.

    Architecture: token embedding -> two position-encoded self-attention
    blocks -> mean over the sequence axis -> dense head (256, 64, 8, ReLU)
    -> single sigmoid output.

    Args:
        regularizer: regularizer applied to the kernels and biases of the
            attention blocks and of every dense layer (may be None).
        d_model: embedding width shared by the embedding layer and both
            attention blocks; they must agree for the residual additions.

    Returns:
        A ``keras.Sequential`` model.
    """
    return keras.Sequential([
        keras.layers.Embedding(input_dim=NUM_EMBEDDINGS, output_dim=d_model),
        PositionEncodedSelfAttentionWithFeedforward(d_model=d_model, regularizer=regularizer),
        PositionEncodedSelfAttentionWithFeedforward(d_model=d_model, regularizer=regularizer),
        # Mean over the sequence axis. No mask is produced upstream (the
        # embedding does not set mask_zero), so this equals reduce_mean(axis=1),
        # and unlike a Lambda wrapping a Python lambda it can be serialized.
        keras.layers.GlobalAveragePooling1D(),
        keras.layers.Dense(256, activation='relu', kernel_regularizer=regularizer, bias_regularizer=regularizer),
        keras.layers.Dense(64, activation='relu', kernel_regularizer=regularizer, bias_regularizer=regularizer),
        keras.layers.Dense(8, activation='relu', kernel_regularizer=regularizer, bias_regularizer=regularizer),
        keras.layers.Dense(1, activation='sigmoid', kernel_regularizer=regularizer, bias_regularizer=regularizer)
    ])

In [40]:
# Training hyper-parameters.
BATCH_SIZE = 2048        # samples per gradient step
L2_REG_AMOUNT = 0.05     # L2 penalty factor for kernels and biases
GRAD_CLIP = 10000        # gradient-norm clipping threshold passed to Adam
LEARNING_RATE = 0.01     # Adam step size
N_EPOCHS = 1000          # number of epochs requested from fit

# Build the network with a shared L2 regularizer.
l2_penalty = tf.keras.regularizers.L2(l2=L2_REG_AMOUNT)
model = modelmaker(l2_penalty)

# Binary cross-entropy objective, tracked with accuracy, precision and recall.
adam = keras.optimizers.Adam(learning_rate=LEARNING_RATE, clipnorm=GRAD_CLIP)
model.compile(
    optimizer=adam,
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy', keras.metrics.Precision(), keras.metrics.Recall()],
)

In [41]:
# N_EPOCHS is only an upper bound: training stops once the validation loss
# has not improved for EARLY_STOPPING_PATIENCE consecutive epochs, and the
# weights of the best epoch are restored. This replaces interrupting the
# kernel by hand, which leaves the model at whatever epoch it happened to
# be in.
EARLY_STOPPING_PATIENCE = 5
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=EARLY_STOPPING_PATIENCE,
    restore_best_weights=True,
)

# Keep the History object so the learning curves can be inspected afterwards.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=N_EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stopping],
)

Epoch 1/1000
Epoch 2/1000

KeyboardInterrupt: 