In [40]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from collections import Counter
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional     scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search
%matplotlib inline

pal = sns.color_palette()

In [41]:
df_train = pd.read_csv('input/quora/train.csv')
df_test = pd.read_csv('input/quora/test.csv')

In [42]:
train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist()).astype(str)
test_qs = pd.Series(df_test['question1'].tolist() + df_test['question2'].tolist()).astype(str)

In [46]:
# If a word appears only once, we ignore it completely (likely a typo)
# Epsilon defines a smoothing constant, which makes the effect of extremely rare words smaller
def get_weight(count, eps=10000, min_count=2):
    if count < min_count:
        return 0
    else:
        return 1.0 / (count + eps)

eps = 5000 
words = (" ".join(train_qs)).lower().split()
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

stops = set(stopwords.words("english"))

def word_match_share(row):
    q1words = {}
    q2words = {}
    for word in str(row['question1']).lower().split():
        if word not in stops:
            q1words[word] = 1
    for word in str(row['question2']).lower().split():
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = float((len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words)))
    return R

def tfidf_word_match_share(row):
    q1words = {}
    q2words = {}
    for word in str(row['question1']).lower().split():
        if word not in stops:
            q1words[word] = 1
    for word in str(row['question2']).lower().split():
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = float(np.sum(shared_weights) / np.sum(total_weights))
    return R

In [1]:
print('Most common words and weights: \n')
print(sorted(weights.items(), key=lambda x: x[1] if x[1] > 0 else 9999)[:10])
print('\nLeast common words and weights: ')
(sorted(weights.items(), key=lambda x: x[1], reverse=True)[:10])

Most common words and weights: 



NameError: name 'weights' is not defined

In [48]:
train_word_match = df_train.apply(word_match_share, axis=1, raw=True)
tfidf_train_word_match = df_train.apply(tfidf_word_match_share, axis=1, raw=True)



In [52]:
tfidf_train_word_match.value_counts()

0.000000    64377
1.000000    13826
1.000000       49
1.000000       48
0.893039       44
0.812253       36
1.000000       31
0.720358       21
0.653908       20
0.535150       20
0.789070       18
0.689557       16
0.769277       14
0.892802       13
0.673958       13
0.714628       13
0.693475       12
0.766908       12
0.855855       12
0.807560       12
0.595289       12
0.423964       12
0.507819       12
0.759408       12
0.624701       12
0.538326       12
0.805334       12
0.426455       11
0.549478       11
0.624715       11
            ...  
0.810970        1
0.746370        1
0.528710        1
0.834572        1
0.309853        1
0.337922        1
0.471321        1
0.580077        1
0.265327        1
0.628743        1
0.899223        1
0.105347        1
0.199025        1
0.648512        1
0.206956        1
0.271274        1
0.216511        1
0.886133        1
0.249378        1
0.775835        1
0.520043        1
0.658748        1
0.095845        1
0.448970        1
0.397379  

In [53]:
# First we create our training and testing data
x_train = pd.DataFrame()
x_test = pd.DataFrame()
x_train['word_match'] = train_word_match
x_train['tfidf_word_match'] = tfidf_train_word_match
x_test['word_match'] = df_test.apply(word_match_share, axis=1, raw=True)
x_test['tfidf_word_match'] = df_test.apply(tfidf_word_match_share, axis=1, raw=True)

y_train = df_train['is_duplicate'].values



In [54]:
pos_train = x_train[y_train == 1]
neg_train = x_train[y_train == 0]

# Now we oversample the negative class
# There is likely a much more elegant way to do this...
p = 0.165
scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
while scale > 1:
    neg_train = pd.concat([neg_train, neg_train])
    scale -=1
neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
print(len(pos_train) / (len(pos_train) + len(neg_train)))

x_train = pd.concat([pos_train, neg_train])
y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()
del pos_train, neg_train

0


In [55]:
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,objective= 'binary:logistic', nthread=4,     scale_pos_weight=1, seed=27), 
 param_grid = param_test1,     scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch1.fit(x_train,y_train)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: 0.78721, std: 0.00199, params: {'max_depth': 3, 'min_child_weight': 1},
  mean: 0.78719, std: 0.00209, params: {'max_depth': 3, 'min_child_weight': 3},
  mean: 0.78718, std: 0.00213, params: {'max_depth': 3, 'min_child_weight': 5},
  mean: 0.78874, std: 0.00177, params: {'max_depth': 5, 'min_child_weight': 1},
  mean: 0.78871, std: 0.00203, params: {'max_depth': 5, 'min_child_weight': 3},
  mean: 0.78880, std: 0.00186, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: 0.78982, std: 0.00225, params: {'max_depth': 7, 'min_child_weight': 1},
  mean: 0.78983, std: 0.00213, params: {'max_depth': 7, 'min_child_weight': 3},
  mean: 0.78983, std: 0.00207, params: {'max_depth': 7, 'min_child_weight': 5},
  mean: 0.79098, std: 0.00205, params: {'max_depth': 9, 'min_child_weight': 1},
  mean: 0.79077, std: 0.00221, params: {'max_depth': 9, 'min_child_weight': 3},
  mean: 0.79083, std: 0.00216, params: {'max_depth': 9, 'min_child_weight': 5}],
 {'max_depth': 9, 'min_child_weight': 1

In [57]:
param_test2 = {
 'max_depth':[8,9,10],
 'min_child_weight':[1,2,3]
}
gsearch2 = GridSearchCV(estimator = XGBClassifier(     learning_rate=0.1, n_estimators=140, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test2, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch2.fit(x_train,y_train)
gsearch2.grid_scores_, gsearch2.best_params_,     gsearch2.best_score_



([mean: 0.79045, std: 0.00211, params: {'max_depth': 8, 'min_child_weight': 1},
  mean: 0.79044, std: 0.00204, params: {'max_depth': 8, 'min_child_weight': 2},
  mean: 0.79043, std: 0.00217, params: {'max_depth': 8, 'min_child_weight': 3},
  mean: 0.79098, std: 0.00205, params: {'max_depth': 9, 'min_child_weight': 1},
  mean: 0.79087, std: 0.00213, params: {'max_depth': 9, 'min_child_weight': 2},
  mean: 0.79077, std: 0.00221, params: {'max_depth': 9, 'min_child_weight': 3},
  mean: 0.79105, std: 0.00204, params: {'max_depth': 10, 'min_child_weight': 1},
  mean: 0.79107, std: 0.00220, params: {'max_depth': 10, 'min_child_weight': 2},
  mean: 0.79085, std: 0.00202, params: {'max_depth': 10, 'min_child_weight': 3}],
 {'max_depth': 10, 'min_child_weight': 2},
 0.7910727733412053)

In [58]:
param_test3 = {
 'max_depth':[9,10,11],
 'min_child_weight':[1,2,3]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier(     learning_rate=0.1, n_estimators=140, max_depth=5,
 min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test2, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch3.fit(x_train,y_train)
gsearch3.grid_scores_, gsearch3.best_params_,     gsearch3.best_score_



([mean: 0.79045, std: 0.00211, params: {'max_depth': 8, 'min_child_weight': 1},
  mean: 0.79044, std: 0.00204, params: {'max_depth': 8, 'min_child_weight': 2},
  mean: 0.79043, std: 0.00217, params: {'max_depth': 8, 'min_child_weight': 3},
  mean: 0.79098, std: 0.00205, params: {'max_depth': 9, 'min_child_weight': 1},
  mean: 0.79087, std: 0.00213, params: {'max_depth': 9, 'min_child_weight': 2},
  mean: 0.79077, std: 0.00221, params: {'max_depth': 9, 'min_child_weight': 3},
  mean: 0.79105, std: 0.00204, params: {'max_depth': 10, 'min_child_weight': 1},
  mean: 0.79107, std: 0.00220, params: {'max_depth': 10, 'min_child_weight': 2},
  mean: 0.79085, std: 0.00202, params: {'max_depth': 10, 'min_child_weight': 3}],
 {'max_depth': 10, 'min_child_weight': 2},
 0.7910727733412053)

In [61]:
param_test4 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
gsearch4 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=10, min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8, objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), param_grid = param_test4, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch4.fit(x_train,y_train)
gsearch4.grid_scores_, gsearch4.best_params_,     gsearch4.best_score_



([mean: 0.79107, std: 0.00220, params: {'gamma': 0.0},
  mean: 0.79105, std: 0.00214, params: {'gamma': 0.1},
  mean: 0.79104, std: 0.00212, params: {'gamma': 0.2},
  mean: 0.79095, std: 0.00212, params: {'gamma': 0.3},
  mean: 0.79100, std: 0.00216, params: {'gamma': 0.4}],
 {'gamma': 0.0},
 0.7910727733412053)

In [64]:
param_test5 = {
 'learning_rate':[i/100.0 for i in range(5,30,5)]
}
gsearch5 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.05, n_estimators=500, max_depth=10, min_child_weight=2, gamma=0,
                                                  subsample=0.75, colsample_bytree=0.85, objective= 'binary:logistic', nthread=4, 
                                                  scale_pos_weight=1,seed=27), param_grid = param_test5, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch5.fit(x_train,y_train)
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_



([mean: 0.79179, std: 0.00194, params: {'learning_rate': 0.05},
  mean: 0.79156, std: 0.00209, params: {'learning_rate': 0.1},
  mean: 0.79109, std: 0.00222, params: {'learning_rate': 0.15},
  mean: 0.79018, std: 0.00167, params: {'learning_rate': 0.2},
  mean: 0.78935, std: 0.00176, params: {'learning_rate': 0.25}],
 {'learning_rate': 0.05},
 0.7917916997966594)

In [63]:
param_test6 = {
 'subsample':[i/100.0 for i in range(75,90,5)],
 'colsample_bytree':[i/100.0 for i in range(75,90,5)]
}
gsearch6 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.05, n_estimators=500, max_depth=10, min_child_weight=2, gamma=0,
                                                  subsample=0.8, colsample_bytree=0.8, objective= 'binary:logistic', nthread=4, 
                                                  scale_pos_weight=1,seed=27), param_grid = param_test6, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch6.fit(x_train,y_train)
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_



([mean: 0.79179, std: 0.00194, params: {'subsample': 0.75, 'colsample_bytree': 0.75},
  mean: 0.79181, std: 0.00196, params: {'subsample': 0.8, 'colsample_bytree': 0.75},
  mean: 0.79182, std: 0.00191, params: {'subsample': 0.85, 'colsample_bytree': 0.75},
  mean: 0.79179, std: 0.00194, params: {'subsample': 0.75, 'colsample_bytree': 0.8},
  mean: 0.79181, std: 0.00196, params: {'subsample': 0.8, 'colsample_bytree': 0.8},
  mean: 0.79182, std: 0.00191, params: {'subsample': 0.85, 'colsample_bytree': 0.8},
  mean: 0.79179, std: 0.00194, params: {'subsample': 0.75, 'colsample_bytree': 0.85},
  mean: 0.79181, std: 0.00196, params: {'subsample': 0.8, 'colsample_bytree': 0.85},
  mean: 0.79182, std: 0.00191, params: {'subsample': 0.85, 'colsample_bytree': 0.85}],
 {'colsample_bytree': 0.75, 'subsample': 0.85},
 0.7918152182432465)