In [23]:
import pandas as pd
import numpy as np

In [2]:
# Load the question-pair training data from the local CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='train.csv')

## **Basic Preprocessing**

In [9]:
# Preview ten randomly chosen rows (no seed is set, so the rows differ per run).
df.sample(n=10)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
120147,120147,194915,194916,What/who is the most transparent startup/found...,What are some of the most transparent startups?,0
236874,236874,347902,347903,How do I get rid of red bumps on my forearms?,How do I get rid of red bumps on my legs?,0
22431,22431,26823,42093,"What do you mean by ""Thug Life""?","What did Dave Barry mean by ""You should not co...",0
176652,176652,147975,271746,Most useful language?,What are the most useful computer programming/...,1
130342,130342,209202,66836,How do I become a professor?,How does one become a professor?,1
219165,219165,32774,91655,How much time does it take to learn JavaScript?,How much time do I need to learn JavaScript?,1
351238,351238,80462,165078,How does chlorophyll helps plants?,What does chlorophyll do?,1
265169,265169,307051,59464,What is good with the Indian education system?,What is good about Indian education system?,1
202898,202898,305337,305338,What was the worst job ever?,What was the worst job you ever had to take?,0
359332,359332,488969,488970,Is organic food really any better than normal ...,Is organic food better for health?,1


In [10]:
# Count the missing values in each column.
df.isna().sum()

id              0
qid1            0
qid2            0
question1       1
question2       2
is_duplicate    0
dtype: int64

In [11]:
# Discard every row that has a missing value in any column.
df = df.dropna()

In [None]:
# Normalise question1: lower-case the text and trim surrounding whitespace.
df['question1'] = df['question1'].str.lower().str.strip()
# Preview the first rows. `DataFrame.shape` is a tuple attribute, not a method,
# so calling it raises TypeError; head() gives the intended table preview.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,what is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,what is the story of kohinoor (koh-i-noor) dia...,What would happen if the Indian government sto...,0
2,2,5,6,how can i increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,why am i mentally very lonely? how can i solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [14]:
# Normalise question2: lower-case the text and trim surrounding whitespace.
df['question2'] = (
    df['question2']
    .str.lower()
    .str.strip()
)
# Preview the first five rows to confirm the transformation.
df.head(n=5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0
1,1,3,4,what is the story of kohinoor (koh-i-noor) dia...,what would happen if the indian government sto...,0
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0
3,3,7,8,why am i mentally very lonely? how can i solve...,find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"which one dissolve in water quikly sugar, salt...",which fish would survive in salt water?,0


## **Loading Pretrained Model for Encoding**

In [None]:
# Dependency: the sentence-transformers package must be installed before the
# SentenceTransformer import in the next cell will work. Uncomment the line
# below to install it from within the notebook.
# ! pip install sentence-transformers

In [19]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
# Instantiate the pretrained 'all-MiniLM-L6-v2' sentence encoder.
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

In [None]:
# Encode every question1 string into an embedding vector.
# This assignment had been commented out, which left `q1_vecs` undefined for
# the cells that save, inspect and stack it when the notebook is run on a
# fresh kernel (NameError). Encoding the full column is expected to be slow.
q1_vecs = model.encode(df['question1'].tolist(), show_progress_bar=True)

In [27]:
# Persist the question1 embeddings to disk as a NumPy .npy file.
np.save(file='q1_embed.npy', arr=q1_vecs)

In [None]:
# Encode every question2 string into an embedding vector.
# This assignment had been commented out, which left `q2_vecs` undefined for
# the cells that save, inspect and stack it when the notebook is run on a
# fresh kernel (NameError). The recorded run took roughly 30 minutes.
q2_vecs = model.encode(df['question2'].tolist(), show_progress_bar=True)

Batches: 100%|██████████| 12634/12634 [30:23<00:00,  6.93it/s]   


In [28]:
# Persist the question2 embeddings to disk as a NumPy .npy file.
np.save(file='q2_embed.npy', arr=q2_vecs)

In [30]:
# Show the shape of each embedding matrix, one per line.
print(q1_vecs.shape, q2_vecs.shape, sep='\n')

(404287, 384)
(404287, 384)


In [31]:
# Feature matrix: each row is the question1 embedding followed by the
# question2 embedding (column-wise concatenation of the two 2-D arrays).
X = np.concatenate((q1_vecs, q2_vecs), axis=1)
# Target vector: the is_duplicate label column as a NumPy array.
y = df['is_duplicate'].to_numpy()

## **Training Model**

In [32]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [37]:
from xgboost import XGBClassifier
# Gradient-boosted tree classifier with 100 trees, evaluated with log loss.
# `use_label_encoder` has been dropped: the installed XGBoost reports it as an
# unused parameter, so passing it only produced a warning.
xgb = XGBClassifier(n_estimators=100, eval_metric='logloss')
xgb.fit(X_train, y_train)
# Class predictions for the held-out rows, scored in the next cell.
y_pred1 = xgb.predict(X_test)

Parameters: { "use_label_encoder" } are not used.



In [38]:
from sklearn.metrics import accuracy_score
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred1)

0.8417596279898093

## **Saving Model**

In [None]:
import pickle
# Serialise the sentence encoder (`model`) to disk. The `with` block guarantees
# the file handle is flushed and closed even if pickling fails; the previous
# bare open() call never closed it.
# NOTE(review): only the encoder is saved here; the trained classifier `xgb`
# is not persisted anywhere in this notebook - confirm that is intended.
with open('hf_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(model, encoder_file)
