In [3]:
# List every file found under the Kaggle input directory so we know the
# names of the data sets that are available.

import os

input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

/kaggle/input/quora-question-pairs/train.csv.zip
/kaggle/input/quora-question-pairs/sample_submission.csv.zip
/kaggle/input/quora-question-pairs/test.csv
/kaggle/input/quora-question-pairs/test.csv.zip


***Data Preprocessing***

In this step we begin analysing our data. What we do is:
- Import the libraries we need
- Load our datasets using pandas
- Check whether there are any null or missing values using `.isnull().sum()`
- Convert the question columns to lists so that we can use them in our model

In [6]:
#Importing Libraries
import pandas as pd
import numpy as np
# Load the train and test sets into DataFrames
train_path = '/kaggle/input/quora-question-pairs/train.csv.zip'
test_path = '/kaggle/input/quora-question-pairs/test.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [7]:
# Count the missing values in each column of the training set
train_df.isna().sum()


id              0
qid1            0
qid2            0
question1       1
question2       2
is_duplicate    0
dtype: int64

In [8]:
# Drop every row that has a missing value in any column
train_df = train_df.dropna(axis=0, how='any')


In [9]:
# Pull each question column out into a plain Python list for the encoder
cor1 = train_df['question1'].tolist()
cor2 = train_df['question2'].tolist()

In [10]:
# Sanity check: show the first entry of each question list
for corpus in (cor1, cor2):
    print(corpus[0])

What is the step by step guide to invest in share market in india?
What is the step by step guide to invest in share market?


***Modelling***

We will use SentenceBERT to make our model. SentenceBERT is a technique for sentence embeddings. Sentence embedding techniques represent entire sentences and their semantic information as vectors. This helps the machine in understanding the context, intention, and other nuances in the entire text.

So, what we will do is:
- Import the libraries
- Define our model
- Encode our questions into sentence embeddings using the model
- Calculate the accuracy


In [11]:
# Import sentence-transformers and load the SentenceBERT checkpoint we use
from sentence_transformers import SentenceTransformer, util

checkpoint = 'all-MiniLM-L6-v2'
model = SentenceTransformer(checkpoint)

In [12]:
# Encode both question lists into sentence embeddings, returned as tensors.
# This only calls model.encode on the questions; no training step is run here.
embed1, embed2 = (
    model.encode(questions, convert_to_tensor=True)
    for questions in (cor1, cor2)
)

Batches:   0%|          | 0/12634 [00:00<?, ?it/s]

Batches:   0%|          | 0/12634 [00:00<?, ?it/s]

In [18]:
# Cosine similarity of each (question1, question2) embedding pair, collected
# as a list of plain Python floats so it can be added to the data frame.
# The pairs are scored one slice of rows at a time with a single batched
# call per slice, rather than one Python-level call per pair; slicing keeps
# the temporary tensors small.
# Assumes embed1 / embed2 are 2-D tensors with one row per question
# (model.encode was called with convert_to_tensor=True).
import torch

chunk_size = 100_000
similarity = []
for start in range(0, len(embed1), chunk_size):
    stop = start + chunk_size
    chunk_scores = torch.nn.functional.cosine_similarity(
        embed1[start:stop], embed2[start:stop], dim=1
    )
    similarity.extend(chunk_scores.tolist())

In [20]:
# Store the similarity scores as a new column of the training data frame
train_df = train_df.assign(Similarity=similarity)

In [21]:
# Display the data frame to inspect the new Similarity column
train_df

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,Similarity
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,0.912277
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,0.655141
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,0.515561
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,0.104022
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,0.325348
...,...,...,...,...,...,...,...
404285,404285,433578,379845,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0,0.653643
404286,404286,18840,155606,Do you believe there is life after death?,Is it true that there is life after death?,1,0.915327
404287,404287,537928,537929,What is one coin?,What's this coin?,0,0.725527
404288,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0,-0.066817


In [23]:
# Turn each similarity score into a binary prediction stored in a new 'pred'
# column: 1 ("true, they are similar") when the score is higher than the
# threshold, 0 ("no, they aren't") otherwise.
# A score exactly equal to the threshold maps to 0, so every entry of `pred`
# is 0 or 1. The `similarity` list itself is not modified, so the raw scores
# remain available after this cell runs.
threshold = 0.75
pred = [1 if score > threshold else 0 for score in similarity]
train_df['pred'] = pred
    
        

In [24]:
# Display the data frame to inspect the new pred column
train_df

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,Similarity,pred
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,0.912277,1
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,0.655141,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,0.515561,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,0.104022,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,0.325348,0
...,...,...,...,...,...,...,...,...
404285,404285,433578,379845,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0,0.653643,0
404286,404286,18840,155606,Do you believe there is life after death?,Is it true that there is life after death?,1,0.915327,1
404287,404287,537928,537929,What is one coin?,What's this coin?,0,0.725527,0
404288,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0,-0.066817,0


In [31]:
# Compare the thresholded predictions against the true labels:
# per-class precision/recall/F1 first, then the overall accuracy.
from sklearn.metrics import classification_report,accuracy_score

y_true = train_df['is_duplicate']
y_pred = train_df['pred']
report = classification_report(y_true, y_pred)
print(report)
accuracy = accuracy_score(y_true, y_pred)
print("This is your accuracy score : ", accuracy)

              precision    recall  f1-score   support

           0       0.91      0.71      0.79    255024
           1       0.64      0.87      0.74    149263

    accuracy                           0.77    404287
   macro avg       0.77      0.79      0.77    404287
weighted avg       0.81      0.77      0.77    404287

This is your accuracy score :  0.7694113340275597


***Predicting the test data***

Here we will use the same steps as above to predict on the test data.

In [29]:
# Display the test data to see which columns it contains
test_df

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?
...,...,...,...
2345791,2345791,How do Peaks (TV series): Why did Leland kill ...,What is the most study scene in twin peaks?
2345792,2345792,"What does be ""in transit"" mean on FedEx tracking?",How question FedEx packages delivered?
2345793,2345793,What are some famous Romanian drinks (alcoholi...,Can a non-alcoholic restaurant be a huge success?
2345794,2345794,What were the best and worst things about publ...,What are the best and worst things examination...


In [32]:
# Pull the two test question columns out into plain Python lists for the encoder.
# NOTE(review): no missing-value handling is applied to test_df before this
# step -- confirm the test questions contain no nulls.
test_cor1, test_cor2 = (
    test_df[column].to_list() for column in ('question1', 'question2')
)

In [33]:
# Encode both test question lists into sentence embeddings, returned as tensors
test_embed1, test_embed2 = (
    model.encode(questions, convert_to_tensor=True)
    for questions in (test_cor1, test_cor2)
)

Batches:   0%|          | 0/73307 [00:00<?, ?it/s]

Batches:   0%|          | 0/73307 [00:00<?, ?it/s]

In [None]:
# Cosine similarity of each (question1, question2) test embedding pair,
# collected as a list of plain Python floats and stored as a new column.
# The pairs are scored one slice of rows at a time with a single batched
# call per slice, rather than one Python-level call per pair; slicing keeps
# the temporary tensors small.
# Assumes test_embed1 / test_embed2 are 2-D tensors with one row per question
# (model.encode was called with convert_to_tensor=True).
import torch

chunk_size = 100_000
test_similarity = []
for start in range(0, len(test_embed1), chunk_size):
    stop = start + chunk_size
    chunk_scores = torch.nn.functional.cosine_similarity(
        test_embed1[start:stop], test_embed2[start:stop], dim=1
    )
    test_similarity.extend(chunk_scores.tolist())

test_df['Similarity'] = test_similarity

In [37]:
# Turn each test similarity score into a binary prediction stored in a new
# 'pred' column: 1 when the score is higher than the threshold, 0 otherwise.
# A score exactly equal to the threshold maps to 0, so every entry of
# `test_pred` is 0 or 1. The `test_similarity` list itself is not modified,
# so the raw scores remain available after this cell runs.
threshold = 0.75
test_pred = [1 if score > threshold else 0 for score in test_similarity]

test_df['pred'] = test_pred

In [38]:
# Display the test data frame with its Similarity and pred columns
test_df

Unnamed: 0,test_id,question1,question2,Similarity,pred
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...,0.557410,0
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?,0.815193,1
2,2,What but is the best way to send money from Ch...,What you send money to China?,0.813327,1
3,3,Which food not emulsifiers?,What foods fibre?,0.498818,0
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?,0.544112,0
...,...,...,...,...,...
2345791,2345791,How do Peaks (TV series): Why did Leland kill ...,What is the most study scene in twin peaks?,0.403489,0
2345792,2345792,"What does be ""in transit"" mean on FedEx tracking?",How question FedEx packages delivered?,0.554308,0
2345793,2345793,What are some famous Romanian drinks (alcoholi...,Can a non-alcoholic restaurant be a huge success?,0.310840,0
2345794,2345794,What were the best and worst things about publ...,What are the best and worst things examination...,0.824781,1


For the submission, I will make a new dataframe with the id of each test row and its prediction.

In [42]:
# Keep only the id and prediction columns for the submission
submission_df = test_df.loc[:, ['test_id', 'pred']]

In [46]:
# Relabel the two columns for the submission, then write the frame to a
# CSV file without the index column.
submission_columns = ['test_id', 'is_duplicate']
submission_df.columns = submission_columns
submission_df.to_csv(path_or_buf='submission.csv', index=False)

In [47]:
# Display the submission data frame
submission_df

Unnamed: 0,test_id,is_duplicate
0,0,0
1,1,1
2,2,1
3,3,0
4,4,0
...,...,...
2345791,2345791,0
2345792,2345792,0
2345793,2345793,0
2345794,2345794,1
