In [4]:
import numpy as np
import pandas as pd

In [7]:
import xgboost as xgb

In [9]:
df_train = pd.read_csv('./train.csv', 
                       dtype={
                           'question1': np.str,
                           'question2': np.str
                       })
df_train['test_id'] = -1
df_test = pd.read_csv('./test.csv', 
                      dtype={
                          'question1': np.str,
                          'question2': np.str
                      })
df_test['id'] = -1
df_test['qid1'] = -1
df_test['qid2'] = -1
df_test['is_duplicate'] = -1

df = pd.concat([df_train, df_test])
df['question1'] = df['question1'].fillna('')
df['question2'] = df['question2'].fillna('')
df['uid'] = np.arange(df.shape[0])
df = df.set_index(['uid'])
print (df.dtypes)
del(df_train, df_test)

id               int64
is_duplicate     int64
qid1             int64
qid2             int64
question1       object
question2       object
test_id          int64
dtype: object


In [10]:
df['len1'] = df['question1'].str.len()
df['len2'] = df['question2'].str.len()

In [11]:
df['is_first_word_equal'] = df.apply(
    lambda r: int(r['question1'].split(' ')[0] == r['question2'].split(' ')[0]), axis=1)

In [12]:
df['abs_diff_len1_len2'] = np.abs(df['len1'] - df['len2'])

In [6]:
ix_train = df['is_duplicate'] != -1
ix_test = df['is_duplicate'] == -1

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [7]:
from sklearn.model_selection import GridSearchCV, cross_val_score

tree_params = {'max_depth': range(2,10),
'max_features': range(1,5)}

tree_grid = GridSearchCV(tree, tree_params,
cv=5, n_jobs=-1,
verbose=True)

tree_grid.fit(df[features][ix_train], df['is_duplicate'][ix_train])

NameError: name 'tree' is not defined

In [10]:
features = ['len1', 'len2', 'is_first_word_equal', 'abs_diff_len1_len2']

In [11]:
from sklearn.ensemble import RandomForestClassifier

forest_params = {'max_depth': range(1,9),
'max_features': range(2,5)}

forest = RandomForestClassifier(n_estimators=20, n_jobs=-1, random_state=17)
forest_grid = GridSearchCV(forest, forest_params,
cv=5, n_jobs=-1,
verbose=True)

model = forest_grid.fit(df[features][ix_train], df['is_duplicate'][ix_train])

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  8.4min finished


In [12]:
forest_grid.best_params_

{'max_depth': 8, 'max_features': 4}

In [13]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(max_depth=8, max_features=4, n_estimators=24, random_state=17)
model = forest.fit(df[features][ix_train], df['is_duplicate'][ix_train])

In [15]:
model.feature_importances_

array([ 0.12936173,  0.14048305,  0.39143388,  0.33872133])

In [16]:
prediction = model.predict_proba(df[features][ix_test])[:, 1]

In [17]:
prediction

array([ 0.29610066,  0.26117698,  0.41353768, ...,  0.27645182,
        0.36642456,  0.29286985])

In [18]:
print (prediction.shape)

(2345796,)


In [19]:
df_submit = df.loc[ix_test].copy()
df_submit['is_duplicate'] = prediction
df_submit[['test_id', 'is_duplicate']].to_csv('../submit.csv', index=False)

In [79]:
from textblob import TextBlob
from textblob import Word

In [22]:
import nltk

In [120]:
def remove_stopwodrs(text):
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in TextBlob(text).words if w.lower() not in stopwords]
    return content

In [123]:
df['unique_words_number'] = df.apply(
    lambda r: len(set(remove_stopwodrs(r['question1']) + remove_stopwodrs(r['question2']))), axis = 1)

In [None]:
# разделить unique_words_number на количество слов
# искать общие синонимы (пересечение множест?), делить на длину
# посмотреть отдельно общие глаголы и существительные, относительно кол-ва слов, глянуть на значимость

In [124]:
df['unique_words_density'] = df.apply(
    lambda r: r['unique_words_number'] / (r['len1'] + r['len2']), axis = 1)

In [125]:
df.head()

Unnamed: 0_level_0,id,is_duplicate,qid1,qid2,question1,question2,test_id,len1,len2,is_first_word_equal,abs_diff_len1_len2,equal_words_number,unique_words_number,unique_words_density
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,-1,66,57,1,9,13,6,0.04878
1,1,0,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,-1,51,88,1,37,16,11,0.079137
2,2,0,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,-1,73,59,1,14,21,10,0.075758
3,3,0,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,-1,50,65,0,15,19,10,0.086957
4,4,0,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,-1,76,39,1,37,18,13,0.113043


In [126]:
def syn_words(text):
    syns = []
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in TextBlob(text).words if w.lower() not in stopwords]
    for x in content:
        for synset in Word(x).synsets:
            syns.append(synset.name().split('.')[0])
    return syns

In [119]:
%%time
syn_words('My mom is younger than yours, but she always looks beatyful')

Wall time: 0 ns


['ma',
 'younger',
 'young',
 'new',
 'youthful',
 'young',
 'unseasoned',
 'always',
 'constantly',
 'constantly',
 'always',
 'always',
 'expression',
 'look',
 'look',
 'spirit',
 'look',
 'look',
 'look',
 'search',
 'front',
 'attend',
 'look',
 'expect',
 'look',
 'count']

In [127]:
%%time
df['unique_syn'] = df.apply(
    lambda r: len(set(syn_words(r['question1'])).intersection(set(syn_words(r['question1'])))) / (r['len1'] + r['len2']), axis = 1)

Wall time: 15h 59min 27s


In [128]:
ix_train = df['is_duplicate'] != -1
ix_test = df['is_duplicate'] == -1
features = ['len1', 'len2', 'is_first_word_equal', 'abs_diff_len1_len2', 'unique_words_density', 'unique_syn']

In [135]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=10, max_features=1.0, n_estimators=24)
model = rf.fit(df[features][ix_train], df['is_duplicate'][ix_train])

In [136]:
prediction = model.predict_proba(df[features][ix_test])[:, 1]

In [137]:
df_submit = df.loc[ix_test].copy()
df_submit['is_duplicate'] = prediction
df_submit[['test_id', 'is_duplicate']].to_csv('submit.csv', index=False)

In [133]:
df.head()

Unnamed: 0_level_0,id,is_duplicate,qid1,qid2,question1,question2,test_id,len1,len2,is_first_word_equal,abs_diff_len1_len2,equal_words_number,unique_words_number,unique_words_density,unique_syn
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,-1,66,57,1,9,13,6,0.04878,0.243902
1,1,0,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,-1,51,88,1,37,16,11,0.079137,0.071942
2,2,0,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,-1,73,59,1,14,21,10,0.075758,0.113636
3,3,0,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,-1,50,65,0,15,19,10,0.086957,0.06087
4,4,0,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,-1,76,39,1,37,18,13,0.113043,0.147826
