### Taking necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import spacy
nlp = spacy.load('en_core_web_lg')

# spaCy (an open-source NLP library) is used here to tokenize sentences and to
# score how similar two sentences are; spaCy's similarity is a cosine similarity.

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the reference FAQ table and the separate table of test questions.
faq_csv, faq_test_csv = "../data/FAQs.csv", "../data/FAQs_test.csv"
df = pd.read_csv(faq_csv)
df_faq_test = pd.read_csv(faq_test_csv)

In [3]:
# Display the loaded FAQ table (Question / Answer columns).
df

Unnamed: 0,Question,Answer
0,When was Albert Einstein born?,Albert Einstein was born on 14 March 1879.
1,Where was he born?,"He was born in Ulm, Germany."
2,When did he die?,"He died 18 April 1955 in Princeton, New Jersey..."
3,Who were his parents?,His father was Hermann Einstein and his mother...
4,Did he have any sisters and brothers?,He had one sister named Maja.
5,Did he marry and have children?,He was married to Mileva Marić between 1903 an...
6,Where did he receive his education?,He received his main education at the followin...
7,When was Albert Einstein awarded the Nobel Pri...,"The Nobel Prize Awarding Institution, the Roya..."
8,Did Albert Einstein attend the Nobel Prize Awa...,The Nobel Prize was announced on 9 November 19...
9,For what did he receive the Nobel Prize?,Einstein was rewarded for his many contributio...


In [4]:
# Pull out the text columns that are parsed in the next step: the FAQ
# questions and answers, plus the questions from the test file.
df_questions, df_answer = df["Question"], df["Answer"]
df_test_questions = df_faq_test["Question"]

In [5]:
# `nlp` is the pretrained spaCy pipeline loaded above. Running a text through
# it yields a Doc object that can be compared with other Docs via
# Doc.similarity(). nlp.pipe() processes the texts as a batch and yields the
# Docs in input order, so each list lines up with its source column.
df_questions_list = list(nlp.pipe(df_questions))
df_test_questions_list = list(nlp.pipe(df_test_questions))
df_answer_list = list(nlp.pipe(df_answer))

In [6]:
# Show the parsed FAQ questions (each element is the result of nlp(...) on one question).
df_questions_list

[When was Albert Einstein born?,
 Where was he born?,
 When did he die?,
 Who were his parents?,
 Did he have any sisters and brothers?,
 Did he marry and have children?,
 Where did he receive his education?,
 When was Albert Einstein awarded the Nobel Prize in Physics?,
 Did Albert Einstein attend the Nobel Prize Award Ceremony?,
 For what did he receive the Nobel Prize?]

In [7]:
# Print every (test question, FAQ question) pair whose similarity lies
# strictly between 0.6 and 0.85.
for test_doc in df_test_questions_list:
    for faq_doc in df_questions_list:
        # Score each pair once and reuse the value for the filter and the
        # printout instead of recomputing it up to three times.
        score = test_doc.similarity(faq_doc)
        if 0.6 < score < 0.85:
            print((test_doc.text, faq_doc.text), "Similarity", score)


('What is the date of his death?', 'Where did he receive his education?') Similarity 0.6065511509364785
('Did Einstein have siblings?', 'When did he die?') Similarity 0.6473136742138257
('Did Einstein have siblings?', 'Who were his parents?') Similarity 0.6271320370519711
('Did Einstein have siblings?', 'Did he have any sisters and brothers?') Similarity 0.7630316131006155
('Did Einstein have siblings?', 'Did he marry and have children?') Similarity 0.7607719432144754
('Who was his wife?', 'When was Albert Einstein born?') Similarity 0.6741952411421724
('Who was his wife?', 'Where was he born?') Similarity 0.7922938250010239
('Who was his wife?', 'When did he die?') Similarity 0.6919347098073266
('Who was his wife?', 'Did he have any sisters and brothers?') Similarity 0.6730928040848374
('Who was his wife?', 'Did he marry and have children?') Similarity 0.7232488376712831
('Who was his wife?', 'Where did he receive his education?') Similarity 0.7769483705620077
('Who was his wife?', 'F

In [8]:
# Collect (test question, FAQ question, similarity) triples for further
# processing. A pair is kept when its similarity lies strictly between 0.58
# and 0.85. Note that this lower bound is looser than the 0.6 used by the
# exploratory print loop, so this list can contain pairs that loop did not print.
question_list = []
for test_doc in df_test_questions_list:
    for faq_doc in df_questions_list:
        # Score each pair once; the value is used for both the filter and the triple.
        score = test_doc.similarity(faq_doc)
        if 0.58 < score < 0.85:
            question_list.append((test_doc.text, faq_doc.text, score))

In [9]:
# Inspect the collected (test question, FAQ question, similarity) triples.
question_list

[('What is the date of his death?',
  'Who were his parents?',
  0.5951033086160062),
 ('What is the date of his death?',
  'Where did he receive his education?',
  0.6065511509364785),
 ('Did Einstein have siblings?', 'When did he die?', 0.6473136742138257),
 ('Did Einstein have siblings?', 'Who were his parents?', 0.6271320370519711),
 ('Did Einstein have siblings?',
  'Did he have any sisters and brothers?',
  0.7630316131006155),
 ('Did Einstein have siblings?',
  'Did he marry and have children?',
  0.7607719432144754),
 ('Did Einstein have siblings?',
  'Where did he receive his education?',
  0.5914591444349934),
 ('Who was his wife?', 'When was Albert Einstein born?', 0.6741952411421724),
 ('Who was his wife?', 'Where was he born?', 0.7922938250010239),
 ('Who was his wife?', 'When did he die?', 0.6919347098073266),
 ('Who was his wife?',
  'Did he have any sisters and brothers?',
  0.6730928040848374),
 ('Who was his wife?', 'Did he marry and have children?', 0.723248837671283

In [10]:
# Turn the list of (test question, FAQ question, similarity) triples into a
# DataFrame; the columns are positional (0, 1, 2) at this point.
df_modified_question_list = pd.DataFrame(data=question_list)
df_modified_question_list

Unnamed: 0,0,1,2
0,What is the date of his death?,Who were his parents?,0.595103
1,What is the date of his death?,Where did he receive his education?,0.606551
2,Did Einstein have siblings?,When did he die?,0.647314
3,Did Einstein have siblings?,Who were his parents?,0.627132
4,Did Einstein have siblings?,Did he have any sisters and brothers?,0.763032
5,Did Einstein have siblings?,Did he marry and have children?,0.760772
6,Did Einstein have siblings?,Where did he receive his education?,0.591459
7,Who was his wife?,When was Albert Einstein born?,0.674195
8,Who was his wife?,Where was he born?,0.792294
9,Who was his wife?,When did he die?,0.691935


In [11]:
# Replace the positional column labels with descriptive names:
# test question, matched FAQ question, and their similarity score.
similarity_columns = ['FaTestQuestions', 'FaQuestions', 'SimilarityScore']
df_modified_question_list.columns = similarity_columns

In [12]:
# Separate the similarity table into one frame per test question.
def _rows_for_test_question(test_question):
    """Return the rows of df_modified_question_list for one test question.

    The subset is taken directly from the DataFrame (not via `.values`), so the
    column names and dtypes are preserved; in particular SimilarityScore stays
    numeric instead of being upcast to object. The result gets a fresh
    0..n-1 index.
    """
    is_question = df_modified_question_list['FaTestQuestions'] == test_question
    return df_modified_question_list.loc[is_question].reset_index(drop=True)


df_s1 = _rows_for_test_question('What is the date of his death?')
df_s2 = _rows_for_test_question('Did Einstein have siblings?')
df_s3 = _rows_for_test_question('Who was his wife?')
df_s4 = _rows_for_test_question("What was Einstein's father's name?")
df_s5 = _rows_for_test_question('At what institutions did he study?')

In [13]:
# Give every per-question frame the same three column labels.
for per_question_df in (df_s1, df_s2, df_s3, df_s4, df_s5):
    per_question_df.columns = ['FaTestQuestions', 'FaQuestions', 'SimilarityScore']

In [14]:
# Add an "Answer" column to the matches of the first test question. The
# notebook treats this question as having no matching FAQ entry, so the answer
# is an empty string.
# assign() is used instead of insert() so the cell can be re-run safely:
# insert() raises ValueError once the "Answer" column already exists.
df_s1 = df_s1.assign(Answer="")
df_s1

Unnamed: 0,FaTestQuestions,FaQuestions,SimilarityScore,Answer
0,What is the date of his death?,Who were his parents?,0.595103,
1,What is the date of his death?,Where did he receive his education?,0.606551,


In [15]:
# Attach the answer of FAQ row 4 ("Did he have any sisters and brothers?") to
# every match of this test question. The row index is hand-picked, not derived
# from the scores; in the recorded run it is also the top-scoring match.
# assign() keeps the cell re-runnable (insert() raises ValueError when the
# "Answer" column already exists).
df_s2 = df_s2.assign(Answer=df.loc[4, 'Answer'])
df_s2

Unnamed: 0,FaTestQuestions,FaQuestions,SimilarityScore,Answer
0,Did Einstein have siblings?,When did he die?,0.647314,He had one sister named Maja.
1,Did Einstein have siblings?,Who were his parents?,0.627132,He had one sister named Maja.
2,Did Einstein have siblings?,Did he have any sisters and brothers?,0.763032,He had one sister named Maja.
3,Did Einstein have siblings?,Did he marry and have children?,0.760772,He had one sister named Maja.
4,Did Einstein have siblings?,Where did he receive his education?,0.591459,He had one sister named Maja.


In [16]:
# Attach the answer of FAQ row 5 ("Did he marry and have children?") to every
# match of this test question. The row index is hand-picked, not derived from
# the scores: in the recorded run the top-scoring FAQ question is
# "Where was he born?", so choosing by maximum similarity would return a
# different answer.
# assign() keeps the cell re-runnable (insert() raises ValueError when the
# "Answer" column already exists).
df_s3 = df_s3.assign(Answer=df.loc[5, 'Answer'])
df_s3

Unnamed: 0,FaTestQuestions,FaQuestions,SimilarityScore,Answer
0,Who was his wife?,When was Albert Einstein born?,0.674195,He was married to Mileva Marić between 1903 an...
1,Who was his wife?,Where was he born?,0.792294,He was married to Mileva Marić between 1903 an...
2,Who was his wife?,When did he die?,0.691935,He was married to Mileva Marić between 1903 an...
3,Who was his wife?,Did he have any sisters and brothers?,0.673093,He was married to Mileva Marić between 1903 an...
4,Who was his wife?,Did he marry and have children?,0.723249,He was married to Mileva Marić between 1903 an...
5,Who was his wife?,Where did he receive his education?,0.776948,He was married to Mileva Marić between 1903 an...
6,Who was his wife?,For what did he receive the Nobel Prize?,0.6064,He was married to Mileva Marić between 1903 an...


In [17]:
# Attach the answer of FAQ row 3 ("Who were his parents?") to every match of
# this test question. The row index is hand-picked, not derived from the
# scores: in the recorded run the top-scoring FAQ question is
# "When was Albert Einstein born?", so choosing by maximum similarity would
# return a different answer.
# assign() keeps the cell re-runnable (insert() raises ValueError when the
# "Answer" column already exists).
df_s4 = df_s4.assign(Answer=df.loc[3, 'Answer'])
df_s4

Unnamed: 0,FaTestQuestions,FaQuestions,SimilarityScore,Answer
0,What was Einstein's father's name?,When was Albert Einstein born?,0.655847,His father was Hermann Einstein and his mother...
1,What was Einstein's father's name?,Where was he born?,0.63054,His father was Hermann Einstein and his mother...
2,What was Einstein's father's name?,Who were his parents?,0.618958,His father was Hermann Einstein and his mother...
3,What was Einstein's father's name?,Where did he receive his education?,0.590276,His father was Hermann Einstein and his mother...


In [18]:
# Attach the answer of FAQ row 6 ("Where did he receive his education?") to
# every match of this test question. The row index is hand-picked, not derived
# from the scores; in the recorded run it is also the top-scoring match.
# assign() keeps the cell re-runnable (insert() raises ValueError when the
# "Answer" column already exists).
df_s5 = df_s5.assign(Answer=df.loc[6, 'Answer'])
df_s5

Unnamed: 0,FaTestQuestions,FaQuestions,SimilarityScore,Answer
0,At what institutions did he study?,Where was he born?,0.632309,He received his main education at the followin...
1,At what institutions did he study?,When did he die?,0.729897,He received his main education at the followin...
2,At what institutions did he study?,Did he have any sisters and brothers?,0.695282,He received his main education at the followin...
3,At what institutions did he study?,Did he marry and have children?,0.718233,He received his main education at the followin...
4,At what institutions did he study?,Where did he receive his education?,0.747372,He received his main education at the followin...
5,At what institutions did he study?,For what did he receive the Nobel Prize?,0.738179,He received his main education at the followin...


In [19]:
# Stack the per-question frames into a single table of similar questions with
# their answers. Each frame keeps its own index, so index labels repeat in the
# combined table.
per_question_frames = [df_s1, df_s2, df_s3, df_s4, df_s5]
df_new = pd.concat(per_question_frames)
df_new

Unnamed: 0,FaTestQuestions,FaQuestions,SimilarityScore,Answer
0,What is the date of his death?,Who were his parents?,0.595103,
1,What is the date of his death?,Where did he receive his education?,0.606551,
0,Did Einstein have siblings?,When did he die?,0.647314,He had one sister named Maja.
1,Did Einstein have siblings?,Who were his parents?,0.627132,He had one sister named Maja.
2,Did Einstein have siblings?,Did he have any sisters and brothers?,0.763032,He had one sister named Maja.
3,Did Einstein have siblings?,Did he marry and have children?,0.760772,He had one sister named Maja.
4,Did Einstein have siblings?,Where did he receive his education?,0.591459,He had one sister named Maja.
0,Who was his wife?,When was Albert Einstein born?,0.674195,He was married to Mileva Marić between 1903 an...
1,Who was his wife?,Where was he born?,0.792294,He was married to Mileva Marić between 1903 an...
2,Who was his wife?,When did he die?,0.691935,He was married to Mileva Marić between 1903 an...


In [20]:
# Show the answer stored for each of the given test questions, starting with
# the first one: select its rows, then take the first "Answer" value.
is_question = df_new['FaTestQuestions'] == 'What is the date of his death?'
df_new.loc[is_question, "Answer"].values[0]
# No matching FAQ entry was assigned for this question, so the answer is empty.

''

In [21]:
# First stored answer among the rows for this test question.
is_question = df_new['FaTestQuestions'] == 'Did Einstein have siblings?'
df_new.loc[is_question, "Answer"].values[0]

'He had one sister named Maja.'

In [22]:
# First stored answer among the rows for this test question.
is_question = df_new['FaTestQuestions'] == 'Who was his wife?'
df_new.loc[is_question, "Answer"].values[0]

'He was married to Mileva Marić between 1903 and 1919. They had three children, Lieserl (born 1902), Hans Albert (born 1904) and Eduard (born 1910). He married Elsa Löwenthal in 1919 and they lived together until her death in 1936.'

In [23]:
# First stored answer among the rows for this test question.
is_question = df_new['FaTestQuestions'] == "What was Einstein's father's name?"
df_new.loc[is_question, "Answer"].values[0]

'His father was Hermann Einstein and his mother was Pauline Einstein (born Koch).'

In [24]:
# First stored answer among the rows for this test question.
is_question = df_new['FaTestQuestions'] == 'At what institutions did he study?'
df_new.loc[is_question, "Answer"].values[0]

'He received his main education at the following schools: Catholic elementary school in Munich, Germany (1885-1888)Luitpold Gymnasium in Munich, Germany (1888-1894) Cantonal school in Aarau, Switzerland (1895-1896) Swiss Federal Institute of Technology in Zurich, Switzerland (1896-1900) Ph.D. from Zurich University, Switzerland (1905)'

In [26]:
# Also the similarity scores shows the matching questions as well.