This notebook is used for outputting the design matrix as <code>X.csv</code>.

In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import marisa_count_vectorizer
import marisa_vectorizer
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tag import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.metrics.pairwise import paired_cosine_distances
from sklearn.model_selection import train_test_split
import xgboost as xgb

import scorer
%matplotlib inline



In [2]:
df_train = pd.read_csv("train.csv")
df_train.replace(np.nan, "", regex=True, inplace=True)
y = df_train["is_duplicate"].values.astype(int)

In [3]:
try:
    X_df = pd.read_csv("X_df.csv")
except:
    X_df = pd.DataFrame()

# cosine-similarity of word count

In [2]:
all_qns = pd.read_csv("all_qns.csv", encoding="latin1", header=None)

In [3]:
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    for token in tokens:
        stems.append(SnowballStemmer().stem(token))
    return stems

In [4]:
vectorizer = HashingVectorizer(tokenizer=tokenize, stop_words=set(stopwords.words("english")))

In [5]:
vectorizer.fit(np.asarray(all_qns.values.astype(str)))

MemoryError: 

ModuleNotFoundError: No module named 'wmi'

# cosine-similarity of TF-IDF transform (X1)

In [2]:
all_qns = pd.read_csv("all_qns.csv", encoding="latin1", header=None)

In [3]:
vectorizer = marisa_vectorizer.MarisaTfidfVectorizer(ngram_range=(1, 2), stop_words=stopwords.words("english"))
vectorizer

MarisaTfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 2), norm='l2', preprocessor=None,
           smooth_idf=True,
           stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', '...aven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn'],
           strip_accents=None, sublinear_tf=False,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)

In [4]:
vectorizer.fit(all_qns[0].dropna().values)

MarisaTfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 2), norm='l2', preprocessor=None,
           smooth_idf=True,
           stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', '...aven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn'],
           strip_accents=None, sublinear_tf=False,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)

In [6]:
v1 = vectorizer.transform(df_train["question1"].values.astype(str))
v2 = vectorizer.transform(df_train["question2"].values.astype(str))
X = paired_cosine_distances(v1, v2).reshape(-1, 1)
y = df_train["is_duplicate"].values.astype(int)

In [7]:
scorer.scorer(X, y)

[0]	train-logloss:0.688714	valid-logloss:0.688762
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.652626	valid-logloss:0.653075
[20]	train-logloss:0.627356	valid-logloss:0.628104
[30]	train-logloss:0.609121	valid-logloss:0.610089
[40]	train-logloss:0.595677	valid-logloss:0.596832
[50]	train-logloss:0.585652	valid-logloss:0.586937
[60]	train-logloss:0.578099	valid-logloss:0.579491
[70]	train-logloss:0.572388	valid-logloss:0.573862
[80]	train-logloss:0.56803	valid-logloss:0.569574
[90]	train-logloss:0.564691	valid-logloss:0.566303
[100]	train-logloss:0.562122	valid-logloss:0.563779
[110]	train-logloss:0.560128	valid-logloss:0.56183
[120]	train-logloss:0.55859	valid-logloss:0.560331
[130]	train-logloss:0.557396	valid-logloss:0.559167
[140]	train-logloss:0.556465	valid-logloss:0.558273
[150]	train-logloss:0.555747	valid-logloss:0.557584
[160]	train-logloss:0.555189	va

0.55524520044421399

In [7]:
X_df["tf_idf"] = X

Unnamed: 0,tf_idf,exists_number_qn1,exists_number_qn2,number_similarity,word_difference,harsh_proper_noun_eq,proportional_proper_noun_eq,feature_hashing
0,0.053528,0,0,1.0,2,1.0,1.0,0.055089
1,0.391289,0,0,1.0,5,0.0,1.0,0.416667
2,0.911846,0,0,1.0,4,0.0,0.6,0.634852
3,1.0,0,1,0.0,6,0.0,0.333333,1.0
4,0.929178,0,0,1.0,8,0.0,1.0,0.580686


# cosine-similarity of feature hashing

In [14]:
vectorizer = HashingVectorizer(non_negative=False)
w1 = vectorizer.transform(df_train["question1"].values.astype(str))
w2 = vectorizer.transform(df_train["question2"].values.astype(str))
X2 = paired_cosine_distances(w1, w2).reshape(-1, 1)

In [18]:
X_df["feature_hashing"] = X2

In [10]:
scorer.scorer(X_df.values, y)

[0]	train-logloss:0.688152	valid-logloss:0.688198
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.647206	valid-logloss:0.647643
[20]	train-logloss:0.61826	valid-logloss:0.619048
[30]	train-logloss:0.597305	valid-logloss:0.59838
[40]	train-logloss:0.581771	valid-logloss:0.583076
[50]	train-logloss:0.570039	valid-logloss:0.571527
[60]	train-logloss:0.561117	valid-logloss:0.562774
[70]	train-logloss:0.554227	valid-logloss:0.556021
[80]	train-logloss:0.548918	valid-logloss:0.550827
[90]	train-logloss:0.544761	valid-logloss:0.546764
[100]	train-logloss:0.541555	valid-logloss:0.543631
[110]	train-logloss:0.538985	valid-logloss:0.541127
[120]	train-logloss:0.536938	valid-logloss:0.539134
[130]	train-logloss:0.535267	valid-logloss:0.537519
[140]	train-logloss:0.533898	valid-logloss:0.536209
[150]	train-logloss:0.532862	valid-logloss:0.535235
[160]	train-logloss:0.53203	va

# Existence of numbers

In [53]:
num_checker_1 = lambda row: any(str.isdigit(char) for char in row["question1"])
num_checker_2 = lambda row: any(str.isdigit(char) for char in row["question2"])
exists_num_1 = df_train.apply(num_checker_1, axis=1, raw=True)
exists_num_2 = df_train.apply(num_checker_2, axis=1, raw=True)

In [54]:
X_df["exists_number_qn1"] = exists_num_1.astype(int)
X_df["exists_number_qn2"] = exists_num_2.astype(int)

In [62]:
scorer.scorer(X_df.values, y)

[0]	train-logloss:0.688152	valid-logloss:0.688198
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.647206	valid-logloss:0.647643
[20]	train-logloss:0.618253	valid-logloss:0.619048
[30]	train-logloss:0.597276	valid-logloss:0.598373
[40]	train-logloss:0.581705	valid-logloss:0.583049
[50]	train-logloss:0.56992	valid-logloss:0.571469
[60]	train-logloss:0.560991	valid-logloss:0.56271
[70]	train-logloss:0.554084	valid-logloss:0.555947
[80]	train-logloss:0.548749	valid-logloss:0.550731
[90]	train-logloss:0.544612	valid-logloss:0.54669
[100]	train-logloss:0.541406	valid-logloss:0.543562
[110]	train-logloss:0.538833	valid-logloss:0.541063
[120]	train-logloss:0.536778	valid-logloss:0.539072
[130]	train-logloss:0.535113	valid-logloss:0.53748
[140]	train-logloss:0.53371	valid-logloss:0.536112
[150]	train-logloss:0.532656	valid-logloss:0.535126
[160]	train-logloss:0.531821	vali

array([1, 1, 0, ..., 1, 0, 1])

The score after adding the presence of numbers is 0.525

# Number similarity

In [98]:
def number_similarity(row):
    str1 = row["question1"]
    str2 = row["question2"]
    str1 = "".join(c for c in str1 if (c.isspace() or c.isdecimal()))
    str2 = "".join(c for c in str2 if (c.isspace() or c.isdecimal()))
    numbers1 = str1.split()
    numbers2 = str2.split()
    numbers1.sort()
    numbers2.sort()
    
    if len(numbers1) == 0 and len(numbers2) == 0:
        return 1.
    
    elif len(numbers1) == 0 or len(numbers2) == 0:
        return 0.
    
    if len(numbers1) > len(numbers2):
        numbers1, numbers2 = numbers2, numbers1
        
    def digit_similarity(num1, num2):
        num1 = str(num1)
        num2 = str(num2)
        if len(num1) > len(num2):
            num1, num2 = num2, num1
        
        l = len(num2)
        score = 0
        for i, d in enumerate(num1):
            if num2[i] == d:
                score += 1
            else:
                break
        return score/l
    
    result_list = []
    for i, num in enumerate(numbers1):
        result_list.append(max(map(lambda x: digit_similarity(x, num), numbers2)))
    return sum(map(float, result_list))/len(result_list)

In [99]:
ns = df_train.apply(number_similarity, axis=1)

In [100]:
print(ns)

0         1.0
1         1.0
2         1.0
3         0.0
4         1.0
5         1.0
6         1.0
7         1.0
8         1.0
9         1.0
10        1.0
11        1.0
12        1.0
13        1.0
14        1.0
15        0.0
16        1.0
17        1.0
18        1.0
19        1.0
20        1.0
21        1.0
22        1.0
23        0.0
24        1.0
25        1.0
26        1.0
27        1.0
28        1.0
29        1.0
         ... 
404260    0.2
404261    1.0
404262    1.0
404263    0.0
404264    1.0
404265    1.0
404266    0.0
404267    1.0
404268    1.0
404269    1.0
404270    1.0
404271    0.0
404272    1.0
404273    1.0
404274    1.0
404275    1.0
404276    1.0
404277    1.0
404278    1.0
404279    0.0
404280    1.0
404281    1.0
404282    1.0
404283    1.0
404284    1.0
404285    1.0
404286    1.0
404287    1.0
404288    1.0
404289    1.0
dtype: float64


In [102]:
X_df["number_similarity"] = ns

In [103]:
scorer.scorer(X_df.values, y)

[0]	train-logloss:0.688122	valid-logloss:0.688168
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.646925	valid-logloss:0.647344
[20]	train-logloss:0.61779	valid-logloss:0.618564
[30]	train-logloss:0.596657	valid-logloss:0.597742
[40]	train-logloss:0.580852	valid-logloss:0.582123
[50]	train-logloss:0.568755	valid-logloss:0.570177
[60]	train-logloss:0.559455	valid-logloss:0.561021
[70]	train-logloss:0.552199	valid-logloss:0.553874
[80]	train-logloss:0.546565	valid-logloss:0.548336
[90]	train-logloss:0.542183	valid-logloss:0.544026
[100]	train-logloss:0.538621	valid-logloss:0.540526
[110]	train-logloss:0.535743	valid-logloss:0.537694
[120]	train-logloss:0.533426	valid-logloss:0.535409
[130]	train-logloss:0.531541	valid-logloss:0.533555
[140]	train-logloss:0.52998	valid-logloss:0.532022
[150]	train-logloss:0.528757	valid-logloss:0.530831
[160]	train-logloss:0.527721	v

array([1, 1, 0, ..., 1, 0, 1])

The score is improved very slightly to 0.520.

# Number of words?

In [107]:
tokenizer = StringTokenizer()


In [125]:
num_words = df_train.apply(lambda row: abs(len(nltk.word_tokenize(row["question1"])) - len(nltk.word_tokenize(row["question2"]))), axis=1)

In [128]:
X_df["word_difference"] = num_words 

In [134]:
y_pred = scorer.scorer(X_df.values, y)

[0]	train-logloss:0.688091	valid-logloss:0.688131
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.646582	valid-logloss:0.646912
[20]	train-logloss:0.617087	valid-logloss:0.617714
[30]	train-logloss:0.595648	valid-logloss:0.596553
[40]	train-logloss:0.579481	valid-logloss:0.580555
[50]	train-logloss:0.567027	valid-logloss:0.568234
[60]	train-logloss:0.557361	valid-logloss:0.558708
[70]	train-logloss:0.549755	valid-logloss:0.551178
[80]	train-logloss:0.543739	valid-logloss:0.545224
[90]	train-logloss:0.538838	valid-logloss:0.540349
[100]	train-logloss:0.534939	valid-logloss:0.536488
[110]	train-logloss:0.531733	valid-logloss:0.53331
[120]	train-logloss:0.529004	valid-logloss:0.530599
[130]	train-logloss:0.526759	valid-logloss:0.528371
[140]	train-logloss:0.524979	valid-logloss:0.52661
[150]	train-logloss:0.523472	valid-logloss:0.525101
[160]	train-logloss:0.522279	v

Score improved to 0.513.

In [130]:
X_df.head()

Unnamed: 0,tf_idf,featue_hashing,exists_number_qn1,exists_number_qn2,number_similarity,word_difference
0,0.053528,0.055089,0,0,1.0,2
1,0.391289,0.416667,0,0,1.0,5
2,0.911846,0.634852,0,0,1.0,4
3,1.0,1.0,0,1,0.0,6
4,0.929178,0.580686,0,0,1.0,8


# Proper nouns

In [140]:
sentence = "There are two optional keyword-only arguments. The key argument specifies a one-argument ordering function like that used for list.sort(). The default argument specifies an object to return if the provided iterable is empty. If the iterable is empty and default is not provided, a ValueError is raised."
tagged_sent = pos_tag(sentence.split())
print(tagged_sent)

[('There', 'EX'), ('are', 'VBP'), ('two', 'CD'), ('optional', 'JJ'), ('keyword-only', 'JJ'), ('arguments.', 'IN'), ('The', 'DT'), ('key', 'JJ'), ('argument', 'NN'), ('specifies', 'VBZ'), ('a', 'DT'), ('one-argument', 'JJ'), ('ordering', 'NN'), ('function', 'NN'), ('like', 'IN'), ('that', 'DT'), ('used', 'VBD'), ('for', 'IN'), ('list.sort().', 'IN'), ('The', 'DT'), ('default', 'NN'), ('argument', 'NN'), ('specifies', 'VBZ'), ('an', 'DT'), ('object', 'NN'), ('to', 'TO'), ('return', 'VB'), ('if', 'IN'), ('the', 'DT'), ('provided', 'VBN'), ('iterable', 'NN'), ('is', 'VBZ'), ('empty.', 'JJ'), ('If', 'IN'), ('the', 'DT'), ('iterable', 'NN'), ('is', 'VBZ'), ('empty', 'JJ'), ('and', 'CC'), ('default', 'NN'), ('is', 'VBZ'), ('not', 'RB'), ('provided,', 'VB'), ('a', 'DT'), ('ValueError', 'NNP'), ('is', 'VBZ'), ('raised.', 'JJ')]


In [141]:
for tup in tagged_sent:
    if tup[1] == "NNP":
        print(tup)

('ValueError', 'NNP')


In [35]:
def harsh_proper_noun_equality(row):
    global n
    proper_nouns_set1 = set(word for word, wordtype in pos_tag(row["question1"]) if wordtype == "NNP")
    proper_nouns_set2 = set(word for word, wordtype in pos_tag(row["question2"]) if wordtype == "NNP")
    n += 1
    if n % 1000 == 0:
        print(n // 1000)
        
    if len(proper_nouns_set1) == 0 and len(proper_nouns_set2) == 0:
        return np.nan
    
    if set(proper_nouns_set1) == set(proper_nouns_set2):
        return 1
    
    else:
        return 0
    
def proportional_proper_noun_equality(row):
    global n
    n += 1
    if n % 1000 == 0:
        print(n // 1000)
        
    if not (type(row["question1"]) == str and type(row["question2"])) == str:
        return 0
        
    proper_nouns_set1 = set(word for word, wordtype in pos_tag(row["question1"]) if wordtype == "NNP")
    proper_nouns_set2 = set(word for word, wordtype in pos_tag(row["question2"]) if wordtype == "NNP")
    if len(proper_nouns_set1) == 0 and len(proper_nouns_set2) == 0:
        return np.nan
    
    elif len(proper_nouns_set1) == 0 or len(proper_nouns_set2) == 0:
        return 0
    
    intersect = proper_nouns_set1 & proper_nouns_set2
    numerator = len(intersect)
    denominator = min(map(len, [proper_nouns_set1, proper_nouns_set2]))
    
    return numerator/denominator
    
n = 0

In [151]:
hpne = df_train.apply(harsh_proper_noun_equality, axis=1)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [152]:
X_df["harsh_proper_noun_eq"] = hpne

In [76]:
X_df.to_csv("X_df.csv", index=False)

In [155]:
scorer.scorer(X_df.values, y)

[0]	train-logloss:0.688034	valid-logloss:0.688062
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.646112	valid-logloss:0.646383
[20]	train-logloss:0.616026	valid-logloss:0.616589
[30]	train-logloss:0.593923	valid-logloss:0.59474
[40]	train-logloss:0.577308	valid-logloss:0.578325
[50]	train-logloss:0.564601	valid-logloss:0.565761
[60]	train-logloss:0.554738	valid-logloss:0.556032
[70]	train-logloss:0.546863	valid-logloss:0.548275
[80]	train-logloss:0.540589	valid-logloss:0.542079
[90]	train-logloss:0.535447	valid-logloss:0.536991
[100]	train-logloss:0.531276	valid-logloss:0.532876
[110]	train-logloss:0.52781	valid-logloss:0.529472
[120]	train-logloss:0.524973	valid-logloss:0.526685
[130]	train-logloss:0.522612	valid-logloss:0.524354
[140]	train-logloss:0.520688	valid-logloss:0.522437
[150]	train-logloss:0.519044	valid-logloss:0.520812
[160]	train-logloss:0.517663	v

array([1, 1, 0, ..., 1, 0, 1])

Score improved to 0.5072

In [None]:
from sklearn.svm import SVC
svc = SVC(verbose=100, C=0.1)
svc.fit(X_df.values, y)


[LibSVM]

In [45]:
X_df["harsh_proper_noun_eq"].fillna(X_df["harsh_proper_noun_eq"].mean(), inplace=True)

In [176]:
svc.score(X_df.values, y)

0.66226718444680799

In [7]:
X_df = pd.read_csv("X_df.csv")
df_train = pd.read_csv("train.csv")

In [3]:
X_df.shape

(404290, 7)

In [36]:
ppne = df_train.apply(proportional_proper_noun_equality, axis=1)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [42]:
X_df["proportional_proper_noun_eq"].fillna(X_df["proportional_proper_noun_eq"].mean(), inplace=True)

In [21]:
df_train.loc[105780, "question1"]

'How can I develop android app?'

In [37]:
X_df["proportional_proper_noun_eq"] = ppne

In [39]:
scorer.scorer(X_df.values, df_train["is_duplicate"].values)

[0]	train-logloss:0.688034	valid-logloss:0.688062
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.645861	valid-logloss:0.646142
[20]	train-logloss:0.615713	valid-logloss:0.616252
[30]	train-logloss:0.593532	valid-logloss:0.594312
[40]	train-logloss:0.576912	valid-logloss:0.577879
[50]	train-logloss:0.564043	valid-logloss:0.565153
[60]	train-logloss:0.554048	valid-logloss:0.555292
[70]	train-logloss:0.546098	valid-logloss:0.547457
[80]	train-logloss:0.539833	valid-logloss:0.541256
[90]	train-logloss:0.53461	valid-logloss:0.536089
[100]	train-logloss:0.53035	valid-logloss:0.531894
[110]	train-logloss:0.526866	valid-logloss:0.528448
[120]	train-logloss:0.524016	valid-logloss:0.525643
[130]	train-logloss:0.521592	valid-logloss:0.52323
[140]	train-logloss:0.519616	valid-logloss:0.521301
[150]	train-logloss:0.517917	valid-logloss:0.519621
[160]	train-logloss:0.516575	va

array([1, 1, 0, ..., 1, 0, 1])

In [11]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(hidden_layer_sizes=(100, 100))

In [12]:
clf.fit(X_df.values, df_train["is_duplicate"].values)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [63]:
X_df.describe()

Unnamed: 0,tf_idf,featue_hashing,exists_number_qn1,exists_number_qn2,number_similarity,word_difference,harsh_proper_noun_eq,proportional_proper_noun_eq
count,404290.0,404290.0,404290.0,404290.0,404290.0,404290.0,404290.0,404290.0
mean,0.667479,0.495216,0.116451,0.120092,0.896305,4.098682,0.318765,0.855398
std,0.282922,0.256946,0.320765,0.32507,0.298716,5.519718,0.465958,0.222409
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.480763,0.292893,0.0,0.0,1.0,1.0,0.0,0.666667
50%,0.744054,0.496047,0.0,0.0,1.0,2.0,0.0,1.0
75%,0.898292,0.683772,0.0,0.0,1.0,5.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,257.0,1.0,1.0


In [14]:
y_res = clf.predict_proba(X_df.values)

In [16]:
from sklearn.metrics import log_loss
print(log_loss(y, y_res))

0.493570603419


# Question type

In [6]:
def determine_qn_type(row):
    qn1 = nltk.word_tokenize(row["question1"])
    qn2 = nltk.word_tokenize(row["question2"])
    result = 1
    for qn_identifier in ["who", "where", "when", "how", "why"]:
        if not bool(qn_identifier in qn1) == bool(qn_identifier in qn2):
            result = 0
            return result
        
    for qn in [qn1, qn2]:
        if "what" in qn:
            result = 

SyntaxError: invalid syntax (<ipython-input-6-ba3db5f4537d>, line 12)

In [8]:
nltk.word_tokenize("How is your day?")

['How', 'is', 'your', 'day', '?']

In [12]:
pos_tag(nltk.word_tokenize("How is your day?"))

[('How', 'WRB'), ('is', 'VBZ'), ('your', 'PRP$'), ('day', 'NN'), ('?', '.')]