## Genrate one of the magic features

#### WHAT???

We have done some word correction approach in data pre-processing phase. And it suddenly comes to me that the administator of this competition once said that "there exists some machine generated questions in the data set". And I guess some of those machine generated questions and simply adding typos. So the count of typos may shows some evidence of data distribution.

In [2]:
import os
import re
import csv
import codecs
import pickle
import numpy as np
import pandas as pd

from string import punctuation
from collections import defaultdict

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import RMSprop, Nadam

from sklearn.preprocessing import StandardScaler

import sys

Using TensorFlow backend.


In [3]:
########################################
## set directories and parameters
########################################
BASE_DIR = '../input/'
EMBEDDING_FILE = '../../dataset/glove/glove.6B.300d.txt'
TRAIN_DATA_FILE = '../../dataset/quora-question-pairs/train.csv'
TEST_DATA_FILE = '../../dataset/quora-question-pairs/test.csv'
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1

re_weight = True # whether to re-weight classes to fit the 17.5% share in test set

In [None]:
########################################
## index word vectors
########################################
print('Indexing word vectors')

embeddings_index = {}
f = open(EMBEDDING_FILE, encoding='utf-8')
count = 0
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %d word vectors of glove.' % len(embeddings_index))

In [27]:
########################################
## Spell Checker with Glove
########################################

# Import spacy corpus, glove embeddings.
import spacy
import textacy

print('Loading SpaCy `en_core_web_md` corpus...')
nlp = spacy.load('en_core_web_md')

def word_prob(word):
    "Probability of word."
    return nlp.vocab[word].prob

def correction(word):
    "Most probable spelling correction for word."
    if nlp.vocab[word].prob > -15.0: 
        return word, False
    else:
        return max(candidates(word), key=word_prob), True

def candidates(word):
    "Generate possible spelling corrections for word."
    return set([word] + known([word]) + known(edits1(word)) + known(edits2(word)))

def known(words):
    "The subset of `words` that appear in the vocabulary."
    return [w for w in words if w in nlp.vocab]

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word):
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

abbr_dict = {
    "\'s":" is",
    "\'re":" are",

    "i'm":"i am",
    "it's":"it is",
    "\'ve":" have",

    "\'ll":" will",

    "won't":"will not",
    "can't":"can not",
    "\'t":" not",
    
}

def text_to_wordlist(text, remove_stopwords=False, stem_words=False, correct_words=True):
    
    if type(text)!=str:
        text = ''
    
    # Remove spaces.
    text = textacy.preprocess_text(text, lowercase=True)

    # Handle Abbreviation.
    for k, v in abbr_dict.items():
        text = re.sub(r"{}".format(k), v, text)

    if correct_words:
        # Spelling correction
        corrected_words = []
        modified_count = 0
        for doc in nlp(text):
            # Don't correct special words
            if doc.ent_type_ is not '' or doc.pos_ in ['PROPN', 'PUNCT']:
                corrected_words.append(doc.text)
            else:
                corrected, is_modified = correction(doc.text)
                corrected_words.append(corrected)
                if is_modified:
                    modified_count += 1

        # Re-merge corrected words
        text = " ".join(corrected_words)
        

    # Text normalization
    text = re.sub('[\!\?\@\^\+\*\/\,\~\|\`\=\:\;\.\#\\\\(\)\[\]\{\}\<\>\'\"]', ' ', text)

    # Convert to lower case, remove punctuations and further text normalization.
    text = textacy.preprocess_text(text, lowercase=True, no_punct=True, no_numbers=True, no_currency_symbols=True)

    return text

Loading SpaCy `en_core_web_md` corpus...


In [7]:
# load raw data
df_train = pd.read_csv('../dataset/quora-question-pairs/train.csv')
df_test = pd.read_csv('../dataset/quora-question-pairs/test.csv')

In [5]:
# laod corrected lists
[texts_1, texts_2, labels] = pickle.load(open('./lystdo_kernel/train_text_processed.pkl','rb'))
[test_texts_1, test_texts_2, test_ids] = pickle.load(open('./lystdo_kernel/test_text_processed.pkl','rb'))

In [59]:
def get_diff(t1, t2, debug=False):
    t1 = t1.split(' ')
    t2 = t2.split(' ')
    
    count = 0
    min_len = min(len(t1),len(t2))
    
    if len(t1)!=len(t2):

        idx_1 = 0
        idx_2 = 0

        for i in range(min_len):
            
            # reaches end
            if idx_1>=min_len or idx_2>=min_len:
                break
                
            if debug:
                print(i,  t1[idx_1], t2[idx_2])

            if t1[idx_1]!=t2[idx_2]:

                is_match = False

                # try to find two words afterward on q1
                try:
                    for extra in range(1,2):
                        if t1[idx_1+extra]==t2[idx_2]:
                            idx_1 += extra+1
                            idx_2 += 1
                            count += extra
                            is_match = True
                            break
                    if is_match:
                        continue
                except:
                    pass # out of index

                # try to find two words afterward on q2
                try:
                    for extra in range(1,2):
                        if t1[idx_1]==t2[idx_2+extra]:
                            idx_1 += 1
                            idx_2 += extra+1
                            count += extra
                            is_match = True
                            break
                    if is_match:
                        continue
                except:
                    pass # out of index

                count += 1
            idx_1 += 1
            idx_2 += 1

    else:
        for i in range(min_len):
            if t1[i]!=t2[i]:
                count += 1
    return count

# a = 'how would you find the zero of the function for math y frac 4x number number xnumber math'
# b = 'how would you find the zero of the function for math y frac 4x number 36x x number math'

a = 'how do i enter a new email i d which i can access to recover my facebook i d'
b = 'how do i enter a new email id which i can access to recover my facebook id'

get_diff(a,b, debug=True)

0 how how
1 do do
2 i i
3 enter enter
4 a a
5 new new
6 email email
7 i id
8 d which
9 i i
10 can can
11 access access
12 to to
13 recover recover
14 my my
15 facebook facebook


2

In [60]:
# process training set

SET = 'train'
# SET = 'test'

if SET == 'train':
    df = df_train
    corrected_1 = texts_1
    corrected_2 = texts_2
else:
    df = df_test
    corrected_1 = test_texts_1
    corrected_2 = test_texts_2

result_q1 = []
result_q2 = []


total_len = len(df)
for i,series in df.iterrows():
    
    without_correction_1 = text_to_wordlist(series['question1'], correct_words=False)
    without_correction_2 = text_to_wordlist(series['question2'], correct_words=False)
    
    try:
        diff_1 = get_diff(corrected_1[i],without_correction_1)
        diff_2 = get_diff(corrected_2[i],without_correction_2)
    except:
        print('Exception at idx', i)
        print(corrected_1[i] + '\n' + without_correction_1)
        print()
        print(corrected_2[i] + '\n' + without_correction_2)
        break
    
    result_q1.append(diff_1)
    result_q2.append(diff_2)
    
    if i%100000==0:
        print(i, '/', total_len)

0 / 404290
100000 / 404290


KeyboardInterrupt: 

In [61]:
df_result = pd.DataFrame(np.array([result_q1,result_q2]).reshape(-1,2), columns=['correction_count1', 'correction_count2'])
# df_result.to_csv('./features_from_model/'+SET+'/correction_count.csv')
df_result

Unnamed: 0,correction_count1,correction_count2
0,0,0
1,0,0
2,1,0
3,1,0
4,1,1
5,2,0
6,0,0
7,0,2
8,0,0
9,1,1


In [None]:
 text_to_wordlist(text, remove_stopwords=False, stem_words=False, correct_words=True)