In [23]:
import numpy as np
import pandas as pd
import spacy
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

%matplotlib inline

# Loading and Preprocessing data

In [24]:
# Loading the large model to get the vectors
nlp = spacy.load('en_core_web_lg')

# Loading dataset
question_data = pd.read_csv('../input/question-pairs-dataset/questions.csv')
question_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


dropng NaN values and comining the question to create a single feature to vectorize

In [25]:
question_data = question_data.dropna()

In [26]:
question_data['combined'] = question_data['question1'] + question_data['question2']
question_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,combined
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,What is the step by step guide to invest in sh...
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,How can I increase the speed of my internet co...
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,Why am I mentally very lonely? How can I solve...
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"Which one dissolve in water quikly sugar, salt..."


loading propotion of rows from original data

In [27]:
questions = question_data[:10000]
# disabling other components in the pipeline since only the vectorization is conducted
with nlp.disable_pipes():
    vectors = np.array([nlp(question.combined).vector for idx, question in questions.iterrows()])
    
vectors.shape

(10000, 300)

# Training a Model on Document Vectors

LinearSVC is used since it can be trained and load fast and our main objectve is to get cosin similarity. 

In [28]:
X_train, X_test, y_train, y_test = train_test_split(vectors, questions.is_duplicate, 
                                                    test_size=0.1, random_state=1)


model = LinearSVC(random_state=1, dual=False)

model.fit(X_train, y_train)


print(f'Model test accuracy: {model.score(X_test, y_test)*100:.3f}%')

Model test accuracy: 67.700%


# Find the most similar question

Finding the most similar sentances for each sentances given, using the cosine similarity.

In [29]:
q1 = ['Donald trump is the president of USA', 
      'I love samsung phones but they are lagging', 
      'I know fast food is not healthy but tasty']

q2 = ['Tell me more about apple mobile accessories', 
      'Barack obama is a black by birth', 
      ' Pizza is great when served in hot']

In [30]:
questions = q2
# disabling other components in the pipeline since only the vectorization is conducted
with nlp.disable_pipes():
    vectors = np.array([nlp(question).vector for question in questions])
    
vectors.shape

(3, 300)

In [31]:
similarity = []
similar = []

for question in q1:

    def cosine_similarity(a, b):
        return np.dot(a, b)/np.sqrt(a.dot(a)*b.dot(b))

    question_vec = nlp(question).vector

    ## Center the document vectors
    # Calculate the mean for the document vectors
    vec_mean = vectors.mean(axis=0)
    
    # Subtract the mean from the vectors
    centered = vectors - vec_mean

    # Calculating similarities for each document in the dataset
    sims = np.array([cosine_similarity(question_vec - vec_mean, vec) for vec in centered])

    # Get the index for the most similar document
    most_similar = sims.argmax()
    
    # List the results
    similarity.append(sims)
    similar.append(most_similar)

In [32]:
display(similarity)
display(similar)

[array([-0.30900285,  0.32573262, -0.04206667], dtype=float32),
 array([ 0.43282998, -0.32563302, -0.09692521], dtype=float32),
 array([ 0.05160733, -0.36558956,  0.37828922], dtype=float32)]

[1, 0, 2]