In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv('train.csv')
df.target.value_counts()

0    1225312
1      80810
Name: target, dtype: int64

We have an imbalanced dataset. 

I will undersample to deal with this problem because I have less computational power.

One could also use oversampling techniques like SMOTE.

In [3]:
sample_0=df[df['target']==0].sample(df.target.value_counts()[1])
sample_1=df[df['target']==1]
df=pd.concat([sample_0,sample_1])

In [4]:
from sklearn.model_selection import train_test_split
df, test_df = train_test_split(df, test_size=0.1,stratify=df['target'])

We use stratify to maintain the same ratio of classes in both train and test sets.

In [5]:
embed_size = 300 # how big is each word vector
max_features = 95000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 70 # max number of words in a question to use

In [6]:
from keras.preprocessing.text import Tokenizer
tokenizer=Tokenizer(num_words=max_features,lower=True,filters='')
tokenizer.fit_on_texts(df['question_text'])

Using TensorFlow backend.


Tokenizer will convert tokenize the text and convert into a series of numbers unique to each word.

In [7]:
#Loading embeddings
EMBEDDING_FILE = 'glove.840B.300d/glove.840B.300d.txt'
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE,encoding="utf-8"))


all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embeddings_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
#Initialized with random normally distributed vectors with mean and standard deviation same as that of embeddings

#We are loading the word vectors for the words which they are available.
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embeddings_matrix[i] = embedding_vector

  import sys


In [8]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }

In [9]:
def clean_contractions(text, mapping):
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")])
    return text

In [10]:
df['question_text']=df['question_text'].apply(lambda x:clean_contractions(x,contraction_mapping))

In [11]:
def clean_text(x):

    x = str(x)
    for punct in "/-":
        x = x.replace(punct, ' ')
    for punct in '&':
        x = x.replace(punct, ' & ')
    for punct in '?!.,"#$%\()*+-/:;<=>@[\\]^_{|}~“”':
        x = x.replace(punct, '')
    return x

In [12]:
import re

def clean_numbers(x):

    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

In the word embedding numbers are replaced by #. So have done the same.

In [13]:
df['question_text']=df['question_text'].apply(clean_text).apply(clean_numbers)

In [14]:
X_train=tokenizer.texts_to_sequences(df['question_text'])
X_test=tokenizer.texts_to_sequences(test_df['question_text'])
y_train=df['target']
y_test=test_df['target']

In [15]:
#All sequences must be of the same length.
from keras.preprocessing.sequence import pad_sequences
X_train=pad_sequences(X_train,maxlen=maxlen)
X_test=pad_sequences(X_test,maxlen=maxlen)

In [16]:
import keras
from keras.models import Sequential
from keras.layers import Embedding,Dense,LSTM,Dropout,GRU,Conv1D,GlobalMaxPooling1D,CuDNNLSTM,Bidirectional

In [18]:
from keras.callbacks import ModelCheckpoint
mc = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', verbose=1,save_best_only=True)
callbacks=[mc]

The above code helps in creating model checkpoints. We monitor the loss on the validation set and save the model for which it is minimum.

In [1]:
# load a saved model
#from keras.models import load_model
#model = load_model('best_model.h5')

In [21]:
model=Sequential()
#We load the embedding weights into the embedding layer and set trainable=False to prevent the weights from changing.
model.add(Embedding(max_features, embed_size, weights=[embeddings_matrix], trainable=False))
model.add(Bidirectional(CuDNNLSTM(64, return_sequences=True)))
model.add(Bidirectional(CuDNNLSTM(64)))
model.add(Dense(1, activation="sigmoid"))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

W1217 12:43:59.187619  8656 deprecation_wrapper.py:119] From C:\Users\Dell\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1217 12:43:59.429501  8656 deprecation_wrapper.py:119] From C:\Users\Dell\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1217 12:43:59.464385  8656 deprecation_wrapper.py:119] From C:\Users\Dell\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1217 12:43:59.489473  8656 deprecation_wrapper.py:119] From C:\Users\Dell\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W1217 12:43:59.490270  8656 deprecation_wrapper.py:119] From

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 300)         28500000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 128)         187392    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               99328     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 28,786,849
Trainable params: 286,849
Non-trainable params: 28,500,000
_________________________________________________________________


In [22]:
model.fit(X_train,y_train,epochs=20,batch_size=128,validation_data=(X_test,y_test),callbacks=callbacks)

Train on 145458 samples, validate on 16162 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.32020, saving model to best_undersample_model.h5
Epoch 2/20

Epoch 00002: val_loss improved from 0.32020 to 0.30735, saving model to best_undersample_model.h5
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.30735
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.30735
Epoch 5/20

Epoch 00005: val_loss did not improve from 0.30735
Epoch 6/20

Epoch 00006: val_loss did not improve from 0.30735
Epoch 7/20

Epoch 00007: val_loss did not improve from 0.30735
Epoch 8/20

Epoch 00008: val_loss did not improve from 0.30735
Epoch 9/20

Epoch 00009: val_loss did not improve from 0.30735
Epoch 10/20

KeyboardInterrupt: 

In [47]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,test_pred)

0.9176882588111667

0.91 is a good ROC-AUC score