## Using Bag of Words

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("train.csv")

In [3]:
df.shape

(404290, 6)

In [4]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [22]:
new_df = df.sample(30000)

In [23]:
new_df.isnull().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [24]:
new_df.duplicated().sum()

0

In [25]:
ques_df = new_df[['question1','question2']]
ques_df

Unnamed: 0,question1,question2
385822,Can you lose weight without exercising?,How do I lose weight without doing exercise or...
192722,Where is the Milky Way Galaxy?,What is the Milky Way?
71279,I have been cheated by Quikr. What happened wa...,What do you call someone who neither supports ...
42190,How would a Spartan do in the UFC?,How would an MMA fighter fare against a Sparta...
266708,Is C++ slower than C?,Why is C or C++ faster than Python for HFT?
...,...,...
329335,How should I utilize the four-month break befo...,How should I utilize the 3 months break before...
188883,Do you skip the questions with long answers in...,Do you skip long answers on Quora?
195442,Why did humans alone evolve to become intellig...,What technicality results in humans being more...
36093,"Can I grow a weeping willow tree in Bangalore,...",Which is are the best trees to grow around the...


In [26]:
from sklearn.feature_extraction.text import CountVectorizer
# merge texts
questions = list(ques_df['question1']) + list(ques_df['question2'])

cv = CountVectorizer(max_features = 3000)
q1_arr,q2_arr = np.vsplit(cv.fit_transform(questions).toarray(),2)

In [27]:
temp_df1 = pd.DataFrame(q1_arr, index = ques_df.index)
temp_df2 = pd.DataFrame(q2_arr, index = ques_df.index)
temp_df = pd.concat([temp_df1,temp_df2], axis = 1)
temp_df.shape

(30000, 6000)

In [28]:
temp_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
385822,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
192722,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71279,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
42190,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
266708,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329335,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
188883,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
195442,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36093,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
temp_df["is_duplicate"] = new_df["is_duplicate"]

In [30]:
temp_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2991,2992,2993,2994,2995,2996,2997,2998,2999,is_duplicate
385822,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
192722,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71279,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
42190,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
266708,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329335,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
188883,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
195442,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
36093,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(temp_df.iloc[:,0:-1].values,temp_df.iloc[:,-1].values,test_size=0.2,random_state=42)

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

0.7448333333333333

In [37]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train,y_train)
y_pred = xgb.predict(X_test)
accuracy_score(y_test,y_pred)

0.7316666666666667