In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
new_df = data.sample(50000)

In [5]:
new_df['q1_len'] = new_df['question1'].str.len()

In [6]:
new_df['q2_len'] = new_df['question2'].str.len()

In [7]:
new_df['q1_words'] = new_df['question1'].apply(lambda x: len(x.split(" ")))

In [8]:
new_df['q2_words'] = new_df['question2'].apply(lambda x: len(x.split(" ")))

In [9]:
def common_words(row):
    l1 = set(row['question1'].lower().strip().split(" "))
    l2 = set(row['question2'].lower().strip().split(" "))
    return len(l1 & l2)

In [10]:
def total_words(row):
    k1 = set(row['question1'].lower().strip().split(" "))
    k2 = set(row['question2'].lower().strip().split(" "))
    return len(k1) + len(k2)

In [11]:
new_df['common_words'] = new_df.apply(common_words, axis=1)

In [12]:
new_df['total_words'] = new_df.apply(total_words, axis=1)

In [13]:
new_df['word_share'] = round(new_df['common_words']/new_df['total_words'],2)

In [14]:
new_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_words,q2_words,common_words,total_words,word_share
321083,321083,446740,446741,What is the future business for india?,What is the future business in India?,1,38,37,7,7,6,14,0.43
260493,260493,56865,59223,Which is the best earphone under 1000rs? Is so...,Which is the best earphone under 1000 INR?,1,80,42,15,8,6,22,0.27
158464,158464,247550,247551,B.To be able to get ideas blended by the other...,What would happen to a non profit corporation ...,0,91,86,19,15,1,30,0.03
328182,328182,454691,454692,Has the cost of the F-35 reached the 'too big ...,Why is Singapore buying the F-35 to safeguard ...,0,61,59,13,10,3,21,0.14
76506,76506,130777,130778,Which is the best processor i3 6098 or i3 6320?,Which is the best processor: i3 or i5?,0,47,38,10,8,6,17,0.35


In [15]:
new_df.shape

(50000, 13)

In [16]:
temp_df = new_df.iloc[:,5:]

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
cv = CountVectorizer(max_features=3000)

In [19]:
q1 = cv.fit_transform(new_df['question1']).toarray()

In [20]:
q2 = cv.fit_transform(new_df['question2']).toarray()

In [21]:
q1_df = pd.DataFrame(q1, index=new_df.index)

In [22]:
q2_df = pd.DataFrame(q2, index=new_df.index)

In [23]:
question_df = pd.concat([q1_df,q2_df], axis=1)

In [24]:
question_df.shape

(50000, 6000)

In [25]:
final_df = pd.concat([temp_df,question_df], axis=1)

In [26]:
final_df.head()

Unnamed: 0,is_duplicate,q1_len,q2_len,q1_words,q2_words,common_words,total_words,word_share,0,1,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
321083,1,38,37,7,7,6,14,0.43,0,0,...,0,0,0,0,0,0,0,0,0,0
260493,1,80,42,15,8,6,22,0.27,0,0,...,0,0,0,0,0,0,0,0,0,0
158464,0,91,86,19,15,1,30,0.03,0,0,...,0,0,0,0,0,0,0,0,0,0
328182,0,61,59,13,10,3,21,0.14,0,0,...,0,0,0,0,0,0,0,0,0,0
76506,0,47,38,10,8,6,17,0.35,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
from sklearn.model_selection import train_test_split

In [28]:
X = final_df.iloc[:,1:].values

In [30]:
X.shape

(50000, 6007)

In [31]:
y = final_df.iloc[:,0].values

In [32]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)

In [33]:
from sklearn.ensemble import RandomForestClassifier

In [34]:
rdf = RandomForestClassifier()

In [35]:
rdf.fit(X_train,y_train)

In [36]:
y_pred = rdf.predict(X_test)

In [37]:
from sklearn.metrics import accuracy_score

In [38]:
accuracy_score(y_test, y_pred)

0.77856

In [39]:
from xgboost import XGBClassifier

In [40]:
xg = XGBClassifier()

In [41]:
xg.fit(X_train,y_train)

In [42]:
y_pred = xg.predict(X_test)

In [43]:
accuracy_score(y_test, y_pred)

0.77008