In [7]:
from itertools import islice
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import re

nltk.download("stopwords")
stop_words = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\puttuk1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
def load_glove_cc():

    word2vec = {}
    with open(r'C:\Users\puttuk1\Downloads\glove.840B.300d.txt', 'r', encoding='utf8') as f:
        for line in islice(f, 0, None):
            line = line.strip().split(' ')
            if len(line) > 300:
                words = line[:-300]
                vec = [float(x) for x in line[-300:]]
                for word in words:
                    word = word.lower()
                    word = re.sub('[^a-z0-9]', '', word)
                    if word:
                        word2vec[word] = vec
    return word2vec

print('Loading pre-trained word vectors from https://nlp.stanford.edu/projects/glove/...')
word2vec = load_glove_cc()
print('Loaded Common Crawl (840B, 300D) word vectors')

Loading pre-trained word vectors from https://nlp.stanford.edu/projects/glove/...
Loaded Common Crawl (840B, 300D) word vectors


In [9]:
print('Reading Quora question pairs...')
samples = []
with open(r'C:\Users\puttuk1\Downloads\quora_duplicate_questions.tsv', 'r', encoding='utf8') as f:
    for line in islice(f, 1, None):
        line = line.strip().split('\t')
        if len(line) == 6:
            samples.append(line[3:])            
print('Read ' + str(len(samples)) + ' samples')

Reading Quora question pairs...
Read 404279 samples


In [10]:
def preprocess(text):

    x = text.lower()
    x = re.sub("[^a-z0-9']", " ", x)
    x = re.sub("([0-9]+)000000000", "billion", x)
    x = re.sub("([0-9]+)000000", "million", x)
    x = re.sub("([0-9]+)000", "thousand", x)
    x = x.replace("'m", " am")\
        .replace("'s", " is")\
        .replace("'ll", " will")\
        .replace("'re", " are")\
        .replace("'ve", " have")\
        .replace("can't", "can not")\
        .replace("won't", "will not")\
        .replace("n't", " not")\
        .replace("'", " ")
    x = re.sub(" +", " ", x)
    return x

def compute_similarity(q1, q2, w2v):

    dflt = [0 for i in range(300)]
    v1 = dflt
    for x in q1:
        v = w2v.get(x, dflt)
        v1 = [max([v1[i], v[i]]) for i in range(len(v))]
    v2 = dflt
    for x in q2:
        v = w2v.get(x, dflt)
        v2 = [max([v2[i], v[i]]) for i in range(len(v))]
    dist = cosine_similarity([v1], [v2])[0][0]
    return dist

print('Generating features for question pairs...')
print('1. ratio of the difference in token sizes to the mean token size')
print('2. ratio of the count of common tokens to the mean token size')
print('3. ratio of the count of common tokens (non-stop words) to the mean token size (non-stop words)')
print('4. flag represnting the equality of the first token')
print('5. flag represnting the equality of the last token')
print('6. cosine similarity between the question pair')
print('...')
features = []
for line in samples:
    q1 = preprocess(line[0]).split(' ')
    q2 = preprocess(line[1]).split(' ')
    q1_non_stop = [x for x in q1 if x not in stop_words]
    q2_non_stop = [x for x in q2 if x not in stop_words]
    f1 = float(len(set(q1)) + len(set(q2))) / 2
    f2 = float(len(set(q1_non_stop)) + len(set(q2_non_stop))) / 2
    f3 = abs(len(set(q1)) - len(set(q2))) / f1
    f4 = len(set(q1).intersection(set(q2))) / f1
    f5 = len(set(q1_non_stop).intersection(set(q2_non_stop))) / f2
    f6 = int(q1[0] == q2[0])
    f7 = int(q1[-1] == q2[-1])
    f8 = compute_similarity(q1, q2, word2vec)
    x = ','.join(list(map(str, [f3, f4, f5, f6, f7, f8])))
    y = line[2]
    features.append(x + ',' + y)
    
with open(r'C:\Users\puttuk1\Downloads\quora_feature_space.csv', 'w') as f:
    for line in features:
        f.write(line + '\n')
print('Stored features for question pairs')

Generating features for question pairs...
1. ratio of the difference in token sizes to the mean token size
2. ratio of the count of common tokens to the mean token size
3. ratio of the count of common tokens (non-stop words) to the mean token size (non-stop words)
4. flag represnting the equality of the first token
5. flag represnting the equality of the last token
6. cosine similarity between the question pair
...
Stored features for question pairs


In [15]:
X, Y = [], []
with open(r'C:\Users\puttuk1\Downloads\quora_feature_space.csv', 'r') as f:
    for line in islice(f, 0, None):
        line = line.strip().split(',')
        X.append(list(map(float, line[:-1])))
        Y.append(int(line[-1]))

print('Training GradientBoostingClassifier...')
x_train, y_train, x_test, y_test = X[:350000], Y[:350000], X[350000:], Y[350000:]
clf = GradientBoostingClassifier(verbose=1).fit(x_train, y_train)
score = clf.score(x_train, y_train)
print('Training score: ' + str(score))
score = clf.score(x_test, y_test)
print('Test score: ' + str(score))

Training GradientBoostingClassifier...
      Iter       Train Loss   Remaining Time 
         1           1.2778            1.55m
         2           1.2424            1.25m
         3           1.2133            1.04m
         4           1.1880           55.33s
         5           1.1664           52.78s
         6           1.1478           52.11s
         7           1.1321           51.49s
         8           1.1183           51.52s
         9           1.1060           53.40s
        10           1.0953           54.53s
        20           1.0372           59.34s
        30           1.0166           57.81s
        40           1.0084           53.26s
        50           1.0037           45.68s
        60           1.0006           36.86s
        70           0.9986           26.28s
        80           0.9966           16.53s
        90           0.9952            7.86s
       100           0.9935            0.00s
Training score: 0.711197142857
Test score: 0.712227564988
