In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torchvision import datasets, transforms, models
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import os
from tqdm import tqdm
import math
from zipfile import ZipFile

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names_in_folder)
    for full_path in full_paths:
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/quora-insincere-questions-classification/sample_submission.csv
/kaggle/input/quora-insincere-questions-classification/embeddings.zip
/kaggle/input/quora-insincere-questions-classification/test.csv
/kaggle/input/quora-insincere-questions-classification/train.csv


In [3]:
# Fixed seed so the train / hold-out partition is the same on every run
# (without it, train_test_split reshuffles differently each time).
SPLIT_SEED = 42

# Keep the raw frame under its own name instead of rebinding `train_set`,
# so the variable always means "the 90% training partition".
full_train_df = pd.read_csv("/kaggle/input/quora-insincere-questions-classification/train.csv")
train_set, test_set = train_test_split(full_train_df, test_size=0.1, random_state=SPLIT_SEED)
print(len(train_set))
print(len(test_set))
train_set.head()

1175509
130613


Unnamed: 0,qid,question_text,target
444135,5706141e18a912aeccba,Has todays' technology made us impersonal and ...,0
796936,9c2a6938781d7d123281,What is the effect of mobile computing on the ...,0
763343,958cdb9a7a5326fc4bb5,How is Kairos related to writing?,0
584637,7289b6ba075ca6468614,What happened after Harry Potter and Ginny Wea...,0
208082,28b33d319557c4c6a7e7,Why do most girls hate boys who watch porn?,0


In [4]:
# Preprocessing data pipeline
import nltk  #Natural Language Toolkit
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')

def to_lower(question_text):
    """Return ``question_text`` converted to lowercase."""
    lowered = question_text.lower()
    return lowered
    
def remove_numbers(question_text):
    """Return ``question_text`` with every digit character stripped out."""
    kept_chars = []
    for char in question_text:
        if char.isdigit():
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)

def remove_punctuation(question_text):
    r"""Tokenize ``question_text`` with the pattern ``\w+``.

    Characters that the pattern does not match (punctuation, whitespace)
    are dropped. Note that the result is the list of tokens produced by
    the tokenizer, not a re-joined string.
    """
    word_tokenizer = nltk.RegexpTokenizer(r"\w+")
    return word_tokenizer.tokenize(question_text)

def remove_stopwords(question_text):
    """Drop stop words from an iterable of tokens.

    Args:
        question_text: iterable of word tokens (as returned by
            ``remove_punctuation``).

    Returns:
        A list with the tokens that are not in ``stopwords.words()``,
        in their original order.
    """
    # stopwords.words() is evaluated once and cached on the function as a
    # frozenset; the original re-evaluated it (and scanned the resulting
    # list) for every single token.
    stop_set = getattr(remove_stopwords, "_stop_set", None)
    if stop_set is None:
        stop_set = frozenset(stopwords.words())
        remove_stopwords._stop_set = stop_set

    return [word for word in question_text if word not in stop_set]

def preproccesing(question_text):
    """Run the full cleaning pipeline on one question.

    Steps, in order: lowercase, strip digits, tokenize (dropping
    punctuation), remove stop words, then join the surviving tokens
    with single spaces.
    """
    normalised = remove_numbers(to_lower(question_text))
    tokens = remove_stopwords(remove_punctuation(normalised))
    return ' '.join(tokens)

# Smoke test of the pipeline on one sentence; the recorded cell output
# for this input is 'testing question'.
test = 'a testing for the question.'
preproccesing(test)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'testing question'

In [None]:
# Clean the first 10000 training questions.
print(train_set['question_text'].shape)
filtered_training_set = [
    preproccesing(text) for text in train_set['question_text'][:10000]
]

# Analysis: length (in whitespace-separated tokens) of the longest cleaned question.
train_set_numpy = np.array(filtered_training_set)
# Stored as `max_num_tokens` rather than `max`, so the built-in max() is not
# shadowed for the rest of the kernel session. default=0 covers an empty sample.
max_num_tokens = max((len(question.split()) for question in train_set_numpy), default=0)
print(max_num_tokens)

In [5]:
# Archive member holding the pre-trained word2vec vectors; it is extracted
# relative to the current working directory.
embedding_member = 'GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'

with ZipFile('/kaggle/input/quora-insincere-questions-classification/embeddings.zip', 'r') as zipObj:
    # Names of all archived files (kept available for inspection).
    listOfFileNames = zipObj.namelist()
    # Skip the extraction when a complete copy is already on disk, so that
    # re-running the cell does not unpack the same file again. Comparing
    # sizes also catches a partial file left by an interrupted run.
    expected_size = zipObj.getinfo(embedding_member).file_size
    already_extracted = (
        os.path.exists(embedding_member)
        and os.path.getsize(embedding_member) == expected_size
    )
    if not already_extracted:
        zipObj.extract(embedding_member)

In [6]:
from gensim.models import KeyedVectors

# Load the binary word2vec model and build a plain {word: vector} lookup.
filepath = "GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin"
wv_from_bin = KeyedVectors.load_word2vec_format(filepath, binary=True)

# Pair each word with its row of `vectors` through the index-ordered word
# list instead of iterating the `vocab` dict: the list is ordered by row
# index by construction, and `vocab` no longer exists in gensim >= 4.
try:
    vocab_words = wv_from_bin.index_to_key   # gensim >= 4
except AttributeError:
    vocab_words = wv_from_bin.index2word     # gensim 3.x

embeddings_index = {
    word: np.asarray(vector, dtype='float32')
    for word, vector in zip(vocab_words, wv_from_bin.vectors)
}
    


In [7]:
def word_embedding(text, max_num_word=30, embeddings_dim=300, embeddings=None):
    """Turn a whitespace-separated text into a fixed-size embedding matrix.

    The text is split on whitespace and truncated to ``max_num_word``
    tokens. Each token is looked up in the embedding table; tokens that are
    missing from the table, and the padding rows after the last token, are
    all-zero vectors.

    Args:
        text: the (already cleaned) question text.
        max_num_word: number of rows in the result; longer texts are truncated.
        embeddings_dim: length of each word vector.
        embeddings: mapping ``word -> vector``. Defaults to the notebook-level
            ``embeddings_index``.

    Returns:
        ``np.ndarray`` of shape ``(max_num_word, embeddings_dim)`` and dtype
        float32. (The original returned float64 whenever padding or an
        unknown word was present and float32 otherwise.)
    """
    if embeddings is None:
        embeddings = embeddings_index

    # Pre-allocating zeros gives padding and unknown words for free and
    # guarantees a consistent shape/dtype, including for empty text.
    matrix = np.zeros((max_num_word, embeddings_dim), dtype='float32')
    for row, word in enumerate(text.split()[:max_num_word]):
        vector = embeddings.get(word)
        if vector is not None:
            matrix[row] = vector
    return matrix

In [8]:
# Training hyper-parameters shared by the cells below.
batch_size = 64          # examples per generated batch
epochs = 10              # passes requested from the training call
steps_per_epoch = 500    # batches drawn from the generator per epoch
# NOTE(review): 0.1 is far above the 0.001 default of Keras' Adam — confirm this is intended.
learning_rate = 0.1

In [9]:
def batch_generation(train_set):
    """Yield ``(embedded_texts, targets)`` batches forever.

    The frame is reshuffled at the start of every pass. Each batch holds up
    to ``batch_size`` rows (the notebook-level constant); the last batch of
    a pass may be smaller.

    Args:
        train_set: DataFrame with ``question_text`` and ``target`` columns.

    Yields:
        Tuple ``(batch_texts_array, batch_targets)``: the stacked
        ``word_embedding`` matrices of the batch's questions and the
        matching array of targets.

    Raises:
        ValueError: if ``train_set`` has no rows. Without this check the
            ``while True`` loop would spin forever and never yield.
    """
    if len(train_set) == 0:
        raise ValueError("train_set is empty; cannot generate batches")

    num_batches = math.ceil(len(train_set) / batch_size)
    while True:
        # Fresh random order for every pass over the data.
        shuffled = train_set.sample(frac=1.)
        for i in range(num_batches):
            # One positional row slice feeds both texts and targets, and both
            # columns are selected by name, so they cannot drift out of
            # alignment or depend on column order.
            batch = shuffled.iloc[i * batch_size:(i + 1) * batch_size]
            batch_texts_array = np.array(
                [word_embedding(text) for text in batch["question_text"]]
            )
            batch_targets = np.array(batch["target"])
            yield batch_texts_array, batch_targets

In [10]:
# Validation tensors: embed the first 2000 hold-out questions and collect
# their labels.
holdout_questions = test_set["question_text"][:2000]
holdout_targets = test_set["target"][:2000]
test_x = np.array(list(map(word_embedding, holdout_questions)))
test_y = np.array(holdout_targets)
print(test_x.shape)

(2000, 30, 300)


In [11]:
# Import everything from tensorflow.keras so the layers, the model and the
# optimizer (built from `keras.optimizers` below) all come from the same
# Keras implementation.
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Dropout, Embedding

# Binary classifier over pre-embedded questions: input is (30 tokens, 300 dims).
model = Sequential()

# return_sequences is left at its default (False) so the recurrent layer emits
# one vector per question. With return_sequences=True the final Dense layer
# produced a prediction per time step, shape (batch, 30, 1), which does not
# match the one-label-per-question targets of shape (batch,).
model.add(Bidirectional(LSTM(64), input_shape=(30, 300)))

model.add(Dropout(0.2))

# Single sigmoid unit: one probability per question.
model.add(Dense(1, activation="sigmoid"))

In [12]:
# Adam optimizer driven by the notebook-level learning rate; binary
# cross-entropy loss with accuracy reported as the metric.
opt = keras.optimizers.Adam(learning_rate=learning_rate)

model.compile(
    optimizer=opt,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [13]:
# Model.fit accepts a Python generator directly, so the deprecated
# fit_generator wrapper is not needed. Because the generator never ends,
# steps_per_epoch tells Keras how many batches make up one epoch.
train_loaders = batch_generation(train_set)
model.fit(train_loaders,
          epochs=epochs,
          steps_per_epoch=steps_per_epoch,
          validation_data=(test_x, test_y),
          verbose=1)    # show training progress

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f45f7d62b10>