# Question-Answer System

#### Import necessary packages

In [74]:
import numpy as np
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
import re
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import gensim.downloader as api

#### Read text files into dataframes

In [75]:
df1 = pd.read_csv('S08_question_answer_pairs.txt', sep='\t')
df2 = pd.read_csv('S09_question_answer_pairs.txt', sep='\t')
df3 = pd.read_csv('S10_question_answer_pairs.txt', sep='\t', encoding = 'ISO-8859-1')

In [76]:
df1

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,easy,easy,S08_set3_a4
1,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,Yes.,easy,easy,S08_set3_a4
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,easy,medium,S08_set3_a4
3,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,Yes.,easy,easy,S08_set3_a4
4,Abraham_Lincoln,Did his mother die of pneumonia?,no,easy,medium,S08_set3_a4
...,...,...,...,...,...,...
1710,Woodrow_Wilson,Was Wilson president of the American Political...,Yes,,easy,S08_set3_a8
1711,Woodrow_Wilson,Did he not cast his ballot for John M. Palmer ...,Yes,,easy,S08_set3_a8
1712,Woodrow_Wilson,Did Wilson not spend 1914 through the beginnin...,Yes,,easy,S08_set3_a8
1713,Woodrow_Wilson,"Was Wilson , a staunch opponent of antisemitis...",Yes,,easy,S08_set3_a8


#### Combine the three dataframes

In [77]:
data=df1.append([df2,df3])

In [78]:
data.shape

(3998, 6)

#### Question,Answer columns retrieved

In [79]:
columns=['Question','Answer']
data=data.loc[:,columns]

In [80]:
data.head()

Unnamed: 0,Question,Answer
0,Was Abraham Lincoln the sixteenth President of...,yes
1,Was Abraham Lincoln the sixteenth President of...,Yes.
2,Did Lincoln sign the National Banking Act of 1...,yes
3,Did Lincoln sign the National Banking Act of 1...,Yes.
4,Did his mother die of pneumonia?,no


#### Remove duplicate questions

In [81]:
#remove duplicate column 
data =data.drop_duplicates(subset='Question')

In [82]:
data.shape

(2457, 2)

#### Deleting those rows that doesnt have a question or answer

In [83]:
data.isnull().sum()

Question      1
Answer      272
dtype: int64

In [84]:
data = data.dropna(axis=0)

In [85]:
data.shape

(2185, 2)

#### Clean a sentence by converting to lowercase,remove any letter other than alphabets/numbers and then removing stopwords if needed

In [86]:
def clean_sentence(sentence,stopwords=False):
    sentence=sentence.lower().strip()
    sentence=re.sub(r'[^a-z0-9\s]','',sentence)
    if (stopwords):
        sentence=remove_stopwords(sentence)
    return(sentence)

#### Given the complete data,method to clean data

In [87]:
def get_cleaned_sentences(df,stopwords=False):
    sents=df[["Question"]]
    cleaned_sentences=[]
    for index,row in df.iterrows():
        cleaned=clean_sentence(row['Question'],stopwords)
        cleaned_sentences.append(cleaned)
    return(cleaned_sentences)

#### Cleaning the complete data and removing stopwords

In [88]:
cleaned_sentence=get_cleaned_sentences(data,stopwords=True)
print(cleaned_sentence)

['abraham lincoln sixteenth president united states', 'lincoln sign national banking act 1863', 'mother die pneumonia', 'long lincolns formal education', 'lincoln begin political career', 'legal tender act 1862 establish', 'suggested lincoln grow beard', 'gettysburg address argue america born', 'lincoln beat john c breckinridge 1860 election', 'abraham lincoln president united states', 'lincoln start political career 1832', 'lincoln represent alton sangamon railroad', 'county lincoln born', 'lincoln serve president', 'assassinated lincoln', 'lincoln win election 1860', 'general charge battle antietam', 'lincoln issue emancipation proclamation', 'noted contributions theory molarity molecular weight', 'graduated ecclesiastical law early age 20 began practice', 'publish memoria', 'professor', 'true professor 1820', 'lorenzo romano amedeo carlo avogadro italian savant', 'amedeo avogadro born turin august 9th 1776 noble ancient family piedmont italy', 'noted contributions theory molarity mo

#### Cleaning the complete data and keeping stopwords

In [89]:
cleaned_sentence_sp=get_cleaned_sentences(data,stopwords=False)
print(cleaned_sentence_sp)

['was abraham lincoln the sixteenth president of the united states', 'did lincoln sign the national banking act of 1863', 'did his mother die of pneumonia', 'how many long was lincolns formal education', 'when did lincoln begin his political career', 'what did the legal tender act of 1862 establish', 'who suggested lincoln grow a beard', 'when did the gettysburg address argue that america was born', 'did lincoln beat john c breckinridge in the 1860 election', 'was abraham lincoln the first president of the united states', 'did lincoln start his political career in 1832', 'did lincoln ever represent alton  sangamon railroad', 'which county was lincoln born in', 'when did lincoln first serve as president', 'who assassinated lincoln', 'did lincoln win the election of 1860', 'who was the general in charge at the battle of antietam', 'why did lincoln issue the emancipation proclamation', 'who is most noted for his contributions to the theory of molarity and molecular weight', 'who graduated

## Bag of words model

#### For each unique word in the document a key(numeric identifier) assigned

In [90]:
sentences=cleaned_sentence_sp
sentence_words=[[word for word in document.split()] for document in sentences]
dictionary=corpora.Dictionary(sentence_words)
for key,value in dictionary.items():
    print(key," : ",value)

0  :  abraham
1  :  lincoln
2  :  of
3  :  president
4  :  sixteenth
5  :  states
6  :  the
7  :  united
8  :  was
9  :  1863
10  :  act
11  :  banking
12  :  did
13  :  national
14  :  sign
15  :  die
16  :  his
17  :  mother
18  :  pneumonia
19  :  education
20  :  formal
21  :  how
22  :  lincolns
23  :  long
24  :  many
25  :  begin
26  :  career
27  :  political
28  :  when
29  :  1862
30  :  establish
31  :  legal
32  :  tender
33  :  what
34  :  a
35  :  beard
36  :  grow
37  :  suggested
38  :  who
39  :  address
40  :  america
41  :  argue
42  :  born
43  :  gettysburg
44  :  that
45  :  1860
46  :  beat
47  :  breckinridge
48  :  c
49  :  election
50  :  in
51  :  john
52  :  first
53  :  1832
54  :  start
55  :  alton
56  :  ever
57  :  railroad
58  :  represent
59  :  sangamon
60  :  county
61  :  which
62  :  as
63  :  serve
64  :  assassinated
65  :  win
66  :  antietam
67  :  at
68  :  battle
69  :  charge
70  :  general
71  :  emancipation
72  :  issue
73  :  proclamati

674  :  working
675  :  forested
676  :  prefer
677  :  they
678  :  regulation
679  :  very
680  :  recorded
681  :  shot
682  :  smaller
683  :  abnormality
684  :  rare
685  :  tusklessness
686  :  1947
687  :  1948
688  :  signed
689  :  treaties
690  :  ukraine
691  :  balticfinnic
692  :  finnish
693  :  subgroup
694  :  uralic
695  :  finland
696  :  newspaper
697  :  read
698  :  taloussanomat
699  :  1809
700  :  army
701  :  conquered
702  :  russian
703  :  expectancy
704  :  life
705  :  men
706  :  games
707  :  held
708  :  olympiad
709  :  xv
710  :  body
711  :  lies
712  :  south
713  :  conservation
714  :  due
715  :  growing
716  :  recent
717  :  cell
718  :  people
719  :  phone
720  :  subscriptions
721  :  matti
722  :  minister
723  :  prime
724  :  vanhanen
725  :  capital
726  :  city
727  :  oslo
728  :  norway
729  :  biggest
730  :  popular
731  :  rock
732  :  economy
733  :  finlands
734  :  located
735  :  weather
736  :  enthusiasts
737  :  favorite
73

1297  :  also
1298  :  developed
1299  :  discovered
1300  :  exposed
1301  :  less
1302  :  percent
1303  :  virus
1304  :  different
1305  :  cars
1306  :  dazzled
1307  :  engine
1308  :  front
1309  :  headlights
1310  :  known
1311  :  leap
1312  :  noise
1313  :  startled
1314  :  capable
1315  :  collision
1316  :  vehicle
1317  :  ball
1318  :  game
1319  :  grook
1320  :  kurnai
1321  :  marn
1322  :  using
1323  :  balance
1324  :  feet
1325  :  hind
1326  :  leaping
1327  :  legs
1328  :  muscular
1329  :  powerful
1330  :  tail
1331  :  foot
1332  :  macropodidae
1333  :  macropods
1334  :  marsupial
1335  :  better
1336  :  changes
1337  :  leopard
1338  :  members
1339  :  panthera
1340  :  cheetah
1341  :  lanky
1342  :  leopards
1343  :  pythons
1344  :  mistaken
1345  :  hybrid
1346  :  puma
1347  :  resulting
1348  :  union
1349  :  kills
1350  :  distinguish
1351  :  naturalists
1352  :  panthers
1353  :  colloquially
1354  :  dark
1355  :  ros08settes
1356  :  felid

1898  :  156
1899  :  2001
1900  :  contribute
1901  :  gdp
1902  :  media
1903  :  bus
1904  :  daily
1905  :  network
1906  :  laut
1907  :  orang
1908  :  mrt
1909  :  characterized
1910  :  climate
1911  :  contribution
1912  :  increase
1913  :  medias
1914  :  seeking
1915  :  days
1916  :  imperial
1917  :  singapore
1918  :  korean
1919  :  penisula
1920  :  tip
1921  :  citystate
1922  :  nila
1923  :  sang
1924  :  utama
1925  :  singa
1926  :  invaded
1927  :  britains
1928  :  churchill
1929  :  defeat
1930  :  greatest
1931  :  janpan
1932  :  singapor
1933  :  kuan
1934  :  lee
1935  :  successful
1936  :  yew
1937  :  chok
1938  :  goh
1939  :  succeed
1940  :  tong
1941  :  source
1942  :  supply
1943  :  garden
1944  :  orchid
1945  :  raffles
1946  :  sir
1947  :  stamford
1948  :  firstdegree
1949  :  murder
1950  :  punishment
1951  :  1819
1952  :  island
1953  :  post
1954  :  trading
1955  :  route
1956  :  strategic
1957  :  centres
1958  :  commercial
1959  :  

2523  :  copenhagen
2524  :  bridge
2525  :  transnational
2526  :  hub
2527  :  1251
2528  :  1255
2529  :  surrounded
2530  :  things
2531  :  occupy
2532  :  bacchanal
2533  :  cymbals
2534  :  historically
2535  :  suggest
2536  :  origins
2537  :  traced
2538  :  moden
2539  :  orchestras
2540  :  orchestral
2541  :  accompanied
2542  :  clash
2543  :  cymbal
2544  :  hit
2545  :  accentuate
2546  :  note
2547  :  dynamic
2548  :  enormous
2549  :  expert
2550  :  obtain
2551  :  player
2552  :  range
2553  :  back
2554  :  hihat
2555  :  derived
2556  :  germanic
2557  :  crash
2558  :  drum
2559  :  instruments
2560  :  kit
2561  :  greek
2562  :  mythical
2563  :  technically
2564  :  bangladesh
2565  :  dhaka
2566  :  banks
2567  :  buriganga
2568  :  eastern
2569  :  cover
2570  :  1970
2571  :  bhola
2572  :  cyclone
2573  :  arrive
2574  :  mughals
2575  :  speakers
2576  :  intonation
2577  :  israel
2578  :  jure
2579  :  stressed
2580  :  syllable
2581  :  grammar
2582  

3098  :  130
3099  :  ancestors
3100  :  diversified
3101  :  flowering
3102  :  midcretaceous
3103  :  rise
3104  :  wasplike
3105  :  easily
3106  :  elbowed
3107  :  identified
3108  :  nodelike
3109  :  slender
3110  :  structure
3111  :  waist
3112  :  antwerp
3113  :  municipality
3114  :  fashion
3115  :  semitic
3116  :  alongside
3117  :  classified
3118  :  calligraphy
3119  :  invented
3120  :  hassaniya
3121  :  mauritania
3122  :  acquired
3123  :  numerals
3124  :  academies
3125  :  genre
3126  :  hassan
3127  :  massoudy
3128  :  berlin
3129  :  germany
3130  :  charlottenburg
3131  :  existing
3132  :  palace
3133  :  schloss
3134  :  building
3135  :  site
3136  :  events
3137  :  olympiastadion
3138  :  berliner
3139  :  dom
3140  :  1990s
3141  :  architect
3142  :  foster
3143  :  norman
3144  :  remodel
3145  :  cleaner
3146  :  reliance
3147  :  renewable
3148  :  shifting
3149  :  sources
3150  :  towards
3151  :  vattenfall
3152  :  zoos
3153  :  springer
3154 

3586  :  15
3587  :  studio
3588  :  verroccios
3589  :  author
3590  :  baldassare
3591  :  castiglione
3592  :  cortegiano
3593  :  courtier
3594  :  il
3595  :  flee
3596  :  venice
3597  :  botticelli
3598  :  contemporary
3599  :  domenico
3600  :  ghirlandaio
3601  :  perugino
3602  :  attributed
3603  :  archetype
3604  :  curiosity
3605  :  equaled
3606  :  renaissance
3607  :  unquenchable
3608  :  diversely
3609  :  painters
3610  :  talented
3611  :  depth
3612  :  gardner
3613  :  helen
3614  :  himself
3615  :  historian
3616  :  interests
3617  :  mind
3618  :  mysterious
3619  :  personality
3620  :  precedent
3621  :  remote
3622  :  scope
3623  :  seem
3624  :  superhuman
3625  :  invertebrates
3626  :  lobsters
3627  :  kosher
3628  :  mercury
3629  :  pain
3630  :  moche
3631  :  worship
3632  :  caridoid
3633  :  escape
3634  :  reaction
3635  :  greeks
3636  :  recitations
3637  :  hollow
3638  :  constellation
3639  :  resemble
3640  :  said
3641  :  extending
364

4088  :  absolute
4089  :  lewis
4090  :  nominative
4091  :  phonetic
4092  :  exception
4093  :  harmony
4094  :  patterns
4095  :  comparatively
4096  :  vietnam
4097  :  clusters
4098  :  consonant
4099  :  formally
4100  :  writing
4101  :  reunification
4102  :  borrow
4103  :  2nd
4104  :  bc
4105  :  predominate
4106  :  britishaustralian
4107  :  circle
4108  :  frequented
4109  :  russell
4110  :  1913
4111  :  annotated
4112  :  goghbonger
4113  :  johanna
4114  :  theos
4115  :  van
4116  :  widow
4117  :  eightyearold
4118  :  gogh
4119  :  kee
4120  :  37
4121  :  anxiety
4122  :  gunshot
4123  :  illness
4124  :  mental
4125  :  selfinflicted
4126  :  unknown
4127  :  wound
4128  :  appreciated
4129  :  contributor
4130  :  foundations
4131  :  historys
4132  :  shoulder
4133  :  hourglass
4134  :  vitula
4135  :  repairs
4136  :  voice
4137  :  1980s
4138  :  boyd
4139  :  tinsley
4140  :  violinist
4141  :  antonio
4142  :  stradivari
4143  :  24
4144  :  amati
4145  :

#### For each sentence corresponding Bag of word vectors stored

In [91]:
bow_corpus=[dictionary.doc2bow(text) for text in sentence_words]
for sent,embedding in zip(sentences,bow_corpus):
    print(sent)
    print(embedding)

was abraham lincoln the sixteenth president of the united states
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1)]
did lincoln sign the national banking act of 1863
[(1, 1), (2, 1), (6, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1)]
did his mother die of pneumonia
[(2, 1), (12, 1), (15, 1), (16, 1), (17, 1), (18, 1)]
how many long was lincolns formal education
[(8, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1)]
when did lincoln begin his political career
[(1, 1), (12, 1), (16, 1), (25, 1), (26, 1), (27, 1), (28, 1)]
what did the legal tender act of 1862 establish
[(2, 1), (6, 1), (10, 1), (12, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1)]
who suggested lincoln grow a beard
[(1, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1)]
when did the gettysburg address argue that america was born
[(6, 1), (8, 1), (12, 1), (28, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1)]
did lincoln beat john c breckinridge in the 1860 election
[(1, 1), (

did the election of 1800 not become a bitter and volatile battle  with each side expressing extraordinary fear of the other party and its policies 
[(2, 2), (6, 2), (12, 1), (34, 1), (49, 1), (68, 1), (75, 2), (98, 1), (139, 1), (161, 1), (239, 1), (301, 1), (583, 1), (956, 1), (1224, 1), (1243, 1), (1244, 1), (1245, 1), (1246, 1), (1247, 1), (1248, 1), (1249, 1)]
what happened in 1764
[(33, 1), (50, 1), (154, 1), (1250, 1)]
is a kangaroo a reptile
[(34, 2), (78, 1), (1251, 1), (1252, 1)]
is a kangaroo on the australian coat of arms
[(2, 1), (6, 1), (34, 1), (78, 1), (187, 1), (1251, 1), (1253, 1), (1254, 1), (1255, 1)]
was james cook the first to record the name kangooroo
[(6, 2), (8, 1), (52, 1), (84, 1), (329, 1), (1111, 1), (1223, 1), (1256, 1), (1257, 1)]
what is a collective noun for kangaroos
[(33, 1), (34, 1), (77, 1), (78, 1), (1258, 1), (1259, 1), (1260, 1)]
where do joeys complete postnatal development
[(215, 1), (286, 1), (358, 1), (1261, 1), (1262, 1), (1263, 1)]
what do k

can a panther be any of several species of large felid
[(2, 2), (34, 1), (143, 1), (144, 1), (247, 1), (310, 1), (472, 1), (1294, 1), (1404, 1), (1405, 1)]
is the leopard an agile and graceful predator
[(6, 1), (75, 1), (78, 1), (105, 1), (1337, 1), (1406, 1), (1407, 1), (1408, 1)]
is liechtenstein bordered by switzerland
[(78, 1), (274, 1), (592, 1), (1409, 1), (1410, 1)]
is liechtenstein the smallest germanspeaking country in the world
[(6, 2), (50, 1), (78, 1), (445, 1), (447, 1), (1402, 1), (1409, 1), (1411, 1)]
was liechtenstein part of the ancient roman province of raetia
[(2, 2), (6, 1), (8, 1), (114, 1), (443, 1), (1409, 1), (1412, 1), (1413, 1), (1414, 1)]
when was the first factory opened
[(6, 1), (8, 1), (28, 1), (52, 1), (1415, 1), (1416, 1)]
how many municipalities is liechtenstein divided into
[(21, 1), (24, 1), (78, 1), (147, 1), (856, 1), (1409, 1), (1417, 1)]
what is the national currency of liechtenstein
[(2, 1), (6, 1), (13, 1), (33, 1), (78, 1), (1409, 1), (1418, 1)

what is the highest mountain in romania
[(6, 1), (33, 1), (50, 1), (78, 1), (1447, 1), (1846, 1), (1850, 1)]
what is the official language of romania
[(2, 1), (6, 1), (33, 1), (78, 1), (401, 1), (402, 1), (1846, 1)]
is the romanian economy doing well
[(6, 1), (78, 1), (732, 1), (1275, 1), (1851, 1), (1852, 1)]
are there many roma in romania
[(24, 1), (50, 1), (126, 1), (208, 1), (1846, 1), (1853, 1)]
how many territories joined to form romania
[(21, 1), (24, 1), (84, 1), (281, 1), (439, 1), (1846, 1), (1854, 1)]
does romania share a border with ukraine
[(34, 1), (161, 1), (437, 1), (631, 1), (690, 1), (878, 1), (1846, 1)]
did romania declare neutrality during world war i
[(12, 1), (304, 1), (447, 1), (602, 1), (1484, 1), (1846, 1), (1855, 1), (1856, 1)]
does romania share the same language with moldova
[(6, 1), (161, 1), (401, 1), (437, 1), (878, 1), (1846, 1), (1857, 1), (1858, 1)]
who was the first gymnast to score a perfect ten
[(6, 1), (8, 1), (34, 1), (38, 1), (52, 1), (84, 1), (1

do arabs consider literary arabic
[(215, 1), (1559, 1), (1789, 1), (2354, 1), (2355, 1)]
the sociolinguistic situation of arabic provides a prime example of what
[(2, 2), (6, 1), (33, 1), (34, 1), (723, 1), (1789, 1), (2356, 1), (2357, 1), (2358, 1), (2359, 1)]
are pronouns in literary arabic marked for person  number and gender 
[(50, 1), (75, 1), (77, 1), (131, 1), (208, 1), (1559, 1), (1789, 1), (2360, 1), (2361, 1), (2362, 1), (2363, 1)]
however  are nonhuman plural nouns grammatically considered to be feminine singular 
[(84, 1), (143, 1), (208, 1), (245, 1), (1466, 1), (2364, 1), (2365, 1), (2366, 1), (2367, 1), (2368, 1), (2369, 1)]
is a bee an insect
[(34, 1), (78, 1), (105, 1), (237, 1), (2370, 1)]
are bees related to ants
[(84, 1), (208, 1), (788, 1), (2371, 1), (2372, 1)]
have managed populations of european honey bees experienced substantial declines
[(2, 1), (298, 1), (457, 1), (660, 1), (2372, 1), (2373, 1), (2374, 1), (2375, 1), (2376, 1), (2377, 1)]
where do bees live
[

[(12, 1), (33, 1), (1530, 1), (2050, 1), (2691, 1), (2797, 1)]
what did the university of oxford grant faraday
[(2, 1), (6, 1), (12, 1), (33, 1), (770, 1), (2094, 1), (2791, 1), (2798, 1)]
who was made to travel outside the coach
[(6, 1), (8, 1), (38, 1), (84, 1), (546, 1), (2465, 1), (2739, 1), (2799, 1)]
did faraday construct the ancestor of modern power generators
[(2, 1), (6, 1), (12, 1), (203, 1), (2791, 1), (2800, 1), (2801, 1), (2802, 1), (2803, 1)]
did faraday lecture on education in 1854
[(12, 1), (19, 1), (50, 1), (187, 1), (2791, 1), (2804, 1), (2805, 1)]
did faraday receive an extensive formal education
[(12, 1), (19, 1), (20, 1), (105, 1), (618, 1), (2791, 1), (2806, 1)]
did faraday discover the chemical substance benzene
[(6, 1), (12, 1), (128, 1), (1046, 1), (2691, 1), (2791, 1), (2807, 1)]
was faraday born in england
[(8, 1), (42, 1), (50, 1), (2314, 1), (2791, 1)]
what do some historians of science refer to faraday as
[(2, 1), (33, 1), (62, 1), (84, 1), (215, 1), (504,

was charlesaugustin de coulomb ever employed at la rochelle
[(8, 1), (56, 1), (67, 1), (409, 1), (2467, 1), (2468, 1), (3208, 1), (3209, 1), (3210, 1)]
what is charlesaugustin de coulomb best known for
[(33, 1), (77, 1), (78, 1), (409, 1), (745, 1), (1310, 1), (2467, 1), (2468, 1)]
whose ideas inspired charlesaugustin de coulombs experiments on the resistance of masonries
[(2, 1), (6, 1), (187, 1), (191, 1), (409, 1), (2075, 1), (2427, 1), (2467, 1), (2476, 1), (2759, 1), (3211, 1), (3212, 1)]
when did charlesaugustin de coulomb join his fathers family in montpeillier
[(12, 1), (16, 1), (28, 1), (50, 1), (116, 1), (409, 1), (1116, 1), (2221, 1), (2467, 1), (2468, 1), (3213, 1)]
did charlesaugustin de coulomb come from a poor family
[(12, 1), (34, 1), (116, 1), (206, 1), (409, 1), (489, 1), (2165, 1), (2467, 1), (2468, 1)]
what is coulombs law
[(33, 1), (78, 1), (92, 1), (2476, 1)]
where is collge des quatrenations located
[(78, 1), (358, 1), (734, 1), (1020, 1), (3214, 1), (3215, 1)]
w

could malay have originated from sumatra island
[(206, 1), (298, 1), (1068, 1), (1952, 1), (1987, 1), (2414, 1), (3646, 1)]
is malay an agglutinative language
[(78, 1), (105, 1), (401, 1), (1987, 1), (3647, 1)]
what family is malay a member of
[(2, 1), (33, 1), (34, 1), (78, 1), (116, 1), (377, 1), (1987, 1)]
what is the basic word order in malay
[(6, 1), (33, 1), (50, 1), (78, 1), (509, 1), (1657, 1), (1987, 1), (3303, 1)]
what languages has malay borrowed words from
[(33, 1), (206, 1), (420, 1), (477, 1), (1238, 1), (1987, 1), (2350, 1)]
are there many words in malay that use natural gender
[(24, 1), (44, 1), (50, 1), (126, 1), (208, 1), (507, 1), (1238, 1), (1276, 1), (1987, 1), (2360, 1)]
how are new words formed in malay
[(21, 1), (50, 1), (208, 1), (297, 1), (925, 1), (1238, 1), (1987, 1)]
is malay in the austronesian family of languages
[(2, 1), (6, 1), (50, 1), (78, 1), (116, 1), (477, 1), (1987, 1), (3648, 1)]
is there one negation word in malay
[(50, 1), (78, 1), (126, 1), (4

#### Each provided question would be converted to corresponding Bag of vector after cleaning 

In [92]:
original_ques="When Abraham Lincoln started his political career?"
question=clean_sentence(original_ques,stopwords=False)
question_embedding=dictionary.doc2bow(question.split())
print(original_ques,question_embedding,sep="\n")

When Abraham Lincoln started his political career?
[(0, 1), (1, 1), (16, 1), (26, 1), (27, 1), (28, 1)]


#### Method to retrieve question and answer after comparing cosine similarity of question and each of the questions in database,that with most cosine similarity would be selected and answer displayed

In [93]:
def retrieve_answer(question_embedding,sentence_embeddings,df,sentences):
    max_sim=-1
    index_sim=-1
    for index,faq_embedding in enumerate(sentence_embeddings):
        sim=cosine_similarity(faq_embedding,question_embedding)[0][0]
        print(index,sim,sentences[index])
        if (sim>max_sim):
            max_sim=sim
            index_sim=index
    #max_similarity=np.argmax(sim, axis=None)
    print("Question :",question)
    print("Retreived Question:",df.iloc[index_sim,0])
    print("Retreived Answer:",df.iloc[index_sim,1])

In [94]:
retrieve_answer(question_embedding,bow_corpus,data,sentences)

0 1.0 was abraham lincoln the sixteenth president of the united states
1 0.7071067811865475 did lincoln sign the national banking act of 1863
2 0.4472135954999579 did his mother die of pneumonia
3 0.12403473458920847 how many long was lincolns formal education
4 0.7071067811865475 when did lincoln begin his political career
5 0.4472135954999579 what did the legal tender act of 1862 establish
6 0.7071067811865475 who suggested lincoln grow a beard
7 0.1643989873053573 when did the gettysburg address argue that america was born
8 0.7071067811865475 did lincoln beat john c breckinridge in the 1860 election
9 1.0 was abraham lincoln the first president of the united states
10 0.7071067811865475 did lincoln start his political career in 1832
11 0.7071067811865475 did lincoln ever represent alton  sangamon railroad
12 0.7071067811865475 which county was lincoln born in
13 0.7071067811865475 when did lincoln first serve as president
14 0.7071067811865475 who assassinated lincoln
15 0.70710678

156 0.004807636746383068 are adult ducks fast fliers
157 0.01999600119960014 are ducks an accepted presence in some populated areas
158 0.4472135954999579 what types of unrelated water birds with similar forms are ducks sometimes confused with
159 0.013512279797026393 why are ducklings particularly vulnerable
160 0.030289126640769135 what are some economic uses for duck
161 0.7071067811865475 what allows a duck to filter water out of the side of their beaks and keep food inside
162 0.4472135954999579 what expression is part of a conceptual framework for testing computer systems
163 0.1643989873053573 where did the expression a sitting duck come from
164 0.1643989873053573 is egypt bordered by the gaza strip
165 0.1643989873053573 is egypt the most populated country in africa
166 0.1643989873053573 does egypt have political influence in the middle east
167 0.1643989873053573 when was the six day war
168 0.030289126640769135 what religions has egypt outlawed
169 0.4472135954999579 what i

277 0.1643989873053573 what does the word ghana mean
278 0.12403473458920847 who was kwame nkrumah
279 0.01999600119960014 where is old ghana in relation to present ghana
280 0.02630668208823282 who makes up ghanas parliament
281 0.19611613513818404 what european countries established states in ghana
282 0.31622776601683794 does the gray wolf share an ancestry with the domestic dog
283 0.011903918403013953 do wolf pups tend to have darker fur than adults
284 0.004651112481478345 do wolves leave their pack
285 0.4472135954999579 what kinds of coats do wolves have
286 0.004651112481478345 where do wolves have scent glands
287 0.047565149415449405 how much do wolves weigh
288 0.013512279797026393 why do wolves howl
289 0.030289126640769135 what is surplus killing
290 0.02499219116020307 are gray wolves native to north america
291 0.1643989873053573 is the gray wolf a mammal
292 0.029399051601892736 are a wolfs teeth its main weapons
293 0.01298591791445427 are wolves built for stamina
294

416 0.08304547985373997 did john adams go to harvard
417 0.4472135954999579 did john adams support the stamp act of 1765
418 0.4472135954999579 is adams birthplace part of a national park
419 0.31622776601683794 when did john adams serve as vice president
420 0.08304547985373997 with what party did adams run for presidency
421 0.01281945932506583 where is adams buried
422 0.1643989873053573 who were the midnight judges
423 0.12403473458920847 in what ways was adams opposed by anderw hamilton
424 0.08304547985373997 what information did he record in his diary
425 0.4472135954999579 who was defeated for reelection in the revolution of 1800 by thomas jefferson
426 0.1643989873053573 who represented the continental congress in europe
427 0.4472135954999579 what is now part of adams national historical park
428 0.1643989873053573 is it true that adams had spent some time as the ambassador
429 0.022721405353294154 is it true that massachus08setts sent him in 1774
430 0.08304547985373997 who 

549 0.1643989873053573 who was first apprenticed to a fuller to learn the clothmaking trade
550 0.4472135954999579 he founded the private university of buffalo on what date
551 0.030289126640769135 who or what fell in love with abigail powers
552 0.8320502943378437 was fillmore one of the founders of the university of buffalo
553 0.4472135954999579 was another primary objective of fillmore to preserve the union from the intensifying slavery debate
554 0.5547001962252291 was fillmore the second chancellor a position he maintained while both vice president and president
555 0.4472135954999579 is fillmore the first of two presidents to have been an indentured servant 
556 0.31622776601683794 is the comic strip mallard fillmore named after the president 
557 0.4472135954999579 was fillmore the first us president born after the death of a former president 
558 0.4472135954999579 did fillmore not turn down the honor  explaining that he had neither the  literary nor scientific attainment  to 

661 0.026657190238980555 have thumbpolar bears been made both controversial and famous for their distinctive white fur and their habitat 
662 0.4472135954999579 did mitchell taylor  the nunavut government manager of wildlife research  not write a letter to the us fish and wildlife service arguing that local studies are insufficient evidence for global protection at this time 
663 0.1643989873053573 are cubs born in december without awakening the mother 
664 0.31622776601683794 does qatar rank as the eighth richest country in the world per capita
665 0.004651112481478345 do nearly all qataris profess islam
666 0.01999600119960014 can women legally drive in qatar
667 0.08304547985373997 when did qatar become an independent state
668 0.01281945932506583 where is qatars telecommunication system centered
669 0.08304547985373997 where did a suicidebombing kill a teacher in 2005
670 0.4472135954999579 does the native pronunciation of qatar sound like cutter
671 0.1643989873053573 what is the 

761 0.08304547985373997 why did roosevelt start boxing lessons
762 0.4472135954999579 what was the nickname of theodore roosevelts sister anna
763 0.4472135954999579 what was roosevelts justification for supporting desegregation of schools in new york
764 0.1643989873053573 how did newspapers respond to roosevelt inviting brooker t washington to dinner at the white house
765 0.08304547985373997 did theodore roosevelt study judo
766 0.4472135954999579 did roosevelt receive a medal of honor
767 0.08304547985373997 did roosevelt study biology
768 0.12403473458920847 where was theodore roosevelt from
769 0.12403473458920847 who was roosevelts presidential hero
770 0.08304547985373997 when did roosevelt die
771 0.02630668208823282 who helped to fund roosevelts african safari
772 0.08304547985373997 did roosevelt support racial integration in schools
773 0.12403473458920847 was roosevelts family rich
774 0.004807636746383068 are turtles ectothermic
775 0.31622776601683794 does the mother car

878 0.08304547985373997 what field did woodrow wilson leave law practice to study
879 0.1643989873053573 what caused wilson to ask congress to declare war on the central powers
880 0.12403473458920847 what was more damaging than moving students into colleges
881 0.4472135954999579 was wilson a member of the phi kappa psi fraternity
882 0.12403473458920847 was wilson an automobile enthusiast
883 0.08304547985373997 did wilsons father own slaves
884 0.01281945932506583 where is wilson buried
885 0.08304547985373997 where did wilson attend law school
886 0.12403473458920847 where was woodrow wilson born
887 0.08304547985373997 did wilson support desegregation
888 0.1643989873053573 did wilson support the committee system
889 0.08304547985373997 did wilson have any siblings
890 0.12403473458920847 what was scotsirish and scottish
891 0.030289126640769135 what defended slavery owned slaves and s08set up a sunday school for them
892 0.08304547985373997 who did wilson win in 1917
893 0.164398

1022 0.4472135954999579 what is cello an abbreviation of
1023 0.1643989873053573 what position is used to play the cello
1024 0.1643989873053573 what is a person who plays the cello called
1025 0.029399051601892736 why is there purling on a cello
1026 0.31622776601683794 when did the first educational works appear for the cello
1027 0.030289126640769135 what cello manufacturer should i buy from if i want to play outside
1028 0.4472135954999579 was charlesaugustin de coulomb a member of the national institute
1029 0.08304547985373997 did charlesaugustin de coulomb find any relationship between electric charges and magnetic poles
1030 0.12403473458920847 was charlesaugustin de coulombs fathers family in montpellier
1031 0.08304547985373997 where did charlesaugustin de coulomb die
1032 0.12403473458920847 when was charlesaugustin de coulomb permanently stationed in paris
1033 0.4472135954999579 what contribution did charlesaugustin de coulomb make to the field of geotechnical engineering


1158 0.12403473458920847 was henri becquerel a french physicist
1159 0.08304547985373997 in what year did henri becquerel die
1160 0.12403473458920847 where was henri becquerel born
1161 0.1643989873053573 in what year did henri becquerel win the nobel prize in physics
1162 0.31622776601683794 was henri becquerel first in his family to occupy the physics chair at the museum national dhistoire naturelle
1163 0.4472135954999579 was henri becquerel the sole winner of the 1903 nobel prize in physics
1164 0.08304547985373997 did henri becquerel intentionally discover radioactivity
1165 0.12403473458920847 if henri becquerel was alive today how old would he have been
1166 0.08304547985373997 for how many years did henri becquerel live
1167 0.08304547985373997 did henri becquerel live to be 80 years old
1168 0.12403473458920847 what was henri becquerels profession
1169 0.12403473458920847 how old was henri becquerel when he died
1170 0.4472135954999579 how many years ago was it when he became

1320 0.030289126640769135 what is santiago
1321 0.002380945632249836 has swahili no diphthongs
1322 0.1643989873053573 is a third  prefix the object  prefix
1323 0.01281945932506583 is       ref swahili unusual among subsaharan languages
1324 0.4472135954999579 are vowels never reduced  regardless of stress 
1325 0.1643989873053573 does the australian black swan have white feathers on its wings
1326 0.1643989873053573 do swans belong to the family anatidae
1327 0.006944277001512321 can black swans swim with only one leg
1328 0.030289126640769135 what are young swans known as
1329 0.7071067811865475 what is the irish legend of the children of lir about
1330 0.1643989873053573 what is the sanskrit word for swan
1331 0.31622776601683794 which album was the song the bonny swans from
1332 0.1643989873053573 what is the polish word for swan
1333 0.12403473458920847 which ballet by pyotr tchaikovsky is partially based on an ancient german legend about a princess who was turned into a swan
133

1462 0.03224129401095805 are termites actually more closely related to cockroaches as well as mantids
1463 0.03224129401095805 do some caterpillars produce vibrations as well as sounds
1464 0.4472135954999579 did ants evolve from wasplike ancestors in the midcretaceous period between 110 and 130 million years ago and diversified after the rise of flowering plants
1465 0.1643989873053573 is the ant a marsupial
1466 0.058722021951470346 are they easily identified by their elbowed antennae and a distinctive nodelike structure that forms a slender waist
1467 0.029399051601892736 is antwerp a city
1468 0.029399051601892736 is antwerp a municipality
1469 0.01999600119960014 is antwerp in belgium
1470 0.030289126640769135 what is antwerp
1471 0.7071067811865475 what is the population of the city of antwerp
1472 0.4472135954999579 where is the city of antwerp
1473 0.013512279797026393 why is antwerp important to fashion
1474 0.4472135954999579 antwerp is to the east of what river
1475 0.047565

1610 0.1643989873053573 does every drumhead make the same sound
1611 0.01999600119960014 arent drums often used in music therapy
1612 0.1643989873053573 arent drums usually played by the hands
1613 0.1643989873053573 are drums usually played by the hands
1614 0.04163054471218133 do many such drums have six to ten tension rods
1615 0.8320502943378437 do drums consist of at least one membrane called a drumhead or drum skin that is stretched over a shell and struck either directly with parts of a players body or with some sort of implement such as a drumstick to produce sound
1616 0.1643989873053573 have other techniques been used to cause drums to make sound such as the thumb roll
1617 0.1643989873053573 is the electric eel a true eel
1618 0.01234473828166939 are most eels predators
1619 0.01281945932506583 is eel blood toxic to humans
1620 0.4472135954999579 how many species of true eels are there
1621 0.029399051601892736 where is smoked eel considered a delicacy
1622 0.047565149415449

1739 0.4472135954999579 in ad 39 king purnawarman established sunda pura as a new capital city for the kingdom located at the northern coast of where
1740 0.1643989873053573 in 1602 the british east india companys first voyage commanded by sir who arrived in aceh and sailed on to banten where they were allowed to build a trading post
1741 0.4472135954999579 who began control of migration to the city in order to stem the overcrowding and poverty
1742 0.1643989873053573 did jayawikartas soldiers attack the dutch fortress
1743 0.31622776601683794 did suharto resign as president
1744 0.7071067811865475 was the jakarta area part of the fourth century indianized kingdom of tarumanagara
1745 0.1643989873053573 do other landmarks include the istiqlal mosque as well as jakarta cathedral
1746 0.4472135954999579 give an example of the many sukarno era monuments in the city
1747 0.1643989873053573 is jakarta the countrys economic cultural and political center
1748 0.01281945932506583 is james watt

1837 0.4472135954999579 what family is malay a member of
1838 0.1643989873053573 what is the basic word order in malay
1839 0.030289126640769135 what languages has malay borrowed words from
1840 0.04163054471218133 are there many words in malay that use natural gender
1841 0.047565149415449405 how are new words formed in malay
1842 0.4472135954999579 is malay in the austronesian family of languages
1843 0.01999600119960014 is there one negation word in malay
1844 0.4472135954999579 how many parts of speech are there in malay
1845 0.1643989873053573 what is the national language in malaysia
1846 0.029399051601892736 do linguists consider malay to be a single language
1847 0.4472135954999579 can speakers of modern malay understand old malay
1848 0.1643989873053573 what language besides malay is in the austronesia language family
1849 0.01999600119960014 wasnt malay language found in sumatra
1850 0.0024937578318730766 wasnt malay language written using pallava
1851 0.12403473458920847 was

1987 0.1643989873053573 what happened in the january 15 1882
1988 0.030289126640769135 what happened in 1887
1989 0.4472135954999579 is one of the best known impressionist works renoirs 1876 dance at le moulin de la galette
1990 0.4472135954999579 was pierreauguste renoir born in limoges hautevienne france the child of a working class family
1991 0.4472135954999579 have two of renoirs paintings sold for more than us0 million
1992 0.7071067811865475 as a celebrator of beauty and especially feminine sensuality has it been said that renoir is the final representative of a tradition which runs directly from rubens to watteau
1993 0.4472135954999579 did pierreauguste renoir die in limoges hautevienne france the child of a working class family
1994 0.08304547985373997 as a boy did he work in a porcelain factory where his drawing talents led to him being chosen to paint designs on fine china
1995 0.4472135954999579 is portuguese an official language of andorra
1996 0.31622776601683794 does th

2101 0.1643989873053573 do many players use a smaller mouthpiece on the piccolo trumpet
2102 0.4472135954999579 is the trumpet constructed of brass tubing bent twice into an oblong shape
2103 0.7071067811865475 give an example of the most influential musicians of the 20th century
2104 0.31622776601683794 were slide trumpets the first trumpets allowed in the christian church
2105 0.4472135954999579 are they constructed of brass tubing bent twice into an oblong shape and are played by blowing air through closed lips producing a buzzing sound which starts a standing wave vibration in the air column inside the trumpet
2106 0.4472135954999579 are there several types of trumpet
2107 0.4472135954999579 was the ottoman script replaced with a variant of the latin alphabet
2108 0.029399051601892736 is there a definite article in turkish language
2109 0.4472135954999579 is it possible to alter the word order to stress the importance of a certain phrase
2110 0.4472135954999579 what is the official

## TFIDF Vectorizer

#### Clean sentences and split it into words

In [95]:
sentences_tfidf=cleaned_sentence_sp
sentence_words_tfidf=[[word for word in document.split()] for document in sentences_tfidf]

In [96]:
sentence_words_tfidf

[['was',
  'abraham',
  'lincoln',
  'the',
  'sixteenth',
  'president',
  'of',
  'the',
  'united',
  'states'],
 ['did', 'lincoln', 'sign', 'the', 'national', 'banking', 'act', 'of', '1863'],
 ['did', 'his', 'mother', 'die', 'of', 'pneumonia'],
 ['how', 'many', 'long', 'was', 'lincolns', 'formal', 'education'],
 ['when', 'did', 'lincoln', 'begin', 'his', 'political', 'career'],
 ['what', 'did', 'the', 'legal', 'tender', 'act', 'of', '1862', 'establish'],
 ['who', 'suggested', 'lincoln', 'grow', 'a', 'beard'],
 ['when',
  'did',
  'the',
  'gettysburg',
  'address',
  'argue',
  'that',
  'america',
  'was',
  'born'],
 ['did',
  'lincoln',
  'beat',
  'john',
  'c',
  'breckinridge',
  'in',
  'the',
  '1860',
  'election'],
 ['was',
  'abraham',
  'lincoln',
  'the',
  'first',
  'president',
  'of',
  'the',
  'united',
  'states'],
 ['did', 'lincoln', 'start', 'his', 'political', 'career', 'in', '1832'],
 ['did', 'lincoln', 'ever', 'represent', 'alton', 'sangamon', 'railroad'],


#### Initialize TfidfVectorizer and transform each question to corresponding vector/sparse matrix.

In [97]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(tuple(data['Question']))
print(tfidf_matrix.shape)

(2185, 4078)


In [98]:
print(tfidf_vectorizer.get_feature_names())

['000', '10', '100', '1000', '101', '11', '110', '1251', '1255', '12th', '13', '130', '14', '15', '15th', '16', '1602', '1658', '1701', '1703', '1713', '1728', '1730', '1732', '1736', '1738', '1744', '1745', '1757', '1759', '1763', '1764', '1765', '1772', '1774', '1776', '1785', '1794', '1796', '17th', '1800', '1800s', '1806', '1809', '181', '1810', '1811', '1816', '1817', '1819', '1820', '1828', '1832', '1833', '1838', '1848', '185', '1850', '1854', '1855', '1860', '1862', '1863', '1865', '1867', '1869', '1872', '1874', '1876', '1880', '1881', '1882', '1884', '1887', '1890', '1892', '1894', '1896', '1898', '1899', '18th', '1903', '1905', '1907', '1908', '1910', '1913', '1914', '1917', '1918', '1919', '1924', '1928', '1930', '1937', '1942', '1946', '1947', '1948', '1949', '1950', '1951', '1965', '1968', '1970', '1971', '1973', '1974', '1977', '1979', '1980s', '1990s', '1991', '1993', '1994', '1995', '1996', '1998', '19th', '1st', '20', '2000', '2001', '20013', '2003', '2005', '2006', '

#### Method that returns question and answer.First the question converted to tfidf vector,then cosine similarity of question vector compared with each of the question vectors,that with maximum cosine similarity would be displayed and corresponding answer provided 

In [99]:
def ask_question(question):
    ques_tfidf=tfidf_vectorizer.transform([question])
    similarity = cosine_similarity(ques_tfidf, tfidf_matrix)
    max_similarity = np.argmax(similarity, axis=None)
    print("Question :",question)
    print("Retrieved Question :",data.iloc[max_similarity]['Question'])
    print("Answer :",data.iloc[max_similarity]['Answer'])

In [100]:
ask_question("When Abraham Lincoln started his political career?")

Question : When Abraham Lincoln started his political career?
Retrieved Question : When did Lincoln begin his political career?
Answer : 1832


In [101]:
ask_question('Where was Nicola Tesla born')

Question : Where was Nicola Tesla born
Retrieved Question : Where was Volta born?
Answer : Como


In [102]:
ask_question('Can whales fly')

Question : Can whales fly
Retrieved Question : Do sea otters have a layer of fat like whales?
Answer : No


In [103]:
ask_question('Who was the third president of the United States')

Question : Who was the third president of the United States
Retrieved Question : Was James Monroe President of the United States?
Answer : yes


In [104]:
ask_question('How high are crime rates in Brazil')

Question : How high are crime rates in Brazil
Retrieved Question : When did the crime level become higher?
Answer : After the October revolution.


## Word to Vector

#### Load and save gloVe model first and load the same saved model every other time

In [105]:
glove_model=None
try:
    glove_model=gensim.models.KeyedVectors.load('./glovemodel.mod')
    print("Loaded glove model")
except:
    glove_model=api.load("glove-twitter-25")
    glove_model.save('./glovemodel.mod')
    print("Saved glove model")

Loaded glove model


#### Load and save word2vector model first and load the same saved model every other time

In [106]:
w2v_model=None
try:
    w2v_model=gensim.models.KeyedVectors.load('./w2vecmodel.mod')
    print("Loaded Word2Vec model")
except:
    w2v_model=api.load("word2vec-google-news-300")
    w2v_model.save('./w2vecmodel.mod')
    print("Saved Word2Vec model")
    

Loaded Word2Vec model


#### Word to vector word embedding size is 300 and GloVe embedding size is 25

In [107]:
w2vec_embedding_size=len(w2v_model['computer'])
glove_embedding_size=len(glove_model['computer'])

In [108]:
w2vec_embedding_size,glove_embedding_size

(300, 25)

#### Get Word to Vector embedding of each word.In case it is a new word,a zero vector is updated

In [109]:
def getwordvec(word,model):
    samp=model['computer']
    vec=[0]*len(samp)
    try:
        vec=model[word]
    except:
        vec=[0]*len(samp)
    return(vec)

#### For each question,vector representation of words get added up and that would be the word to vector embedding for the sentence

In [110]:
def getphraseembedding(phrase,embeddingmodel):
    samp=getwordvec('computer',embeddingmodel)
    vec=np.array([0]*len(samp))
    den=0
    for word in phrase.split():
        den=den+1
        vec=vec+np.array(getwordvec(word,embeddingmodel))
    return(vec.reshape(1,-1))

#### For each questions,get word to vector embedding,also get word to vector embedding of the provided question,compare cosine similarities and find the most similiar result.

In [111]:
sent_embeddings=[]
for sent in cleaned_sentence:
    sent_embeddings.append(getphraseembedding(sent,w2v_model))
question_embedding=getphraseembedding(question,w2v_model)
retrieve_answer(question_embedding,sent_embeddings,data,cleaned_sentence)

0 0.4489298995863628 abraham lincoln sixteenth president united states
1 0.4774221084558571 lincoln sign national banking act 1863
2 0.20205763800656087 mother die pneumonia
3 0.3007230053421099 long lincolns formal education
4 0.8492293200336524 lincoln begin political career
5 0.207581883294478 legal tender act 1862 establish
6 0.4428369050659713 suggested lincoln grow beard
7 0.32947266254346314 gettysburg address argue america born
8 0.5142144063314268 lincoln beat john c breckinridge 1860 election
9 0.44037257556700204 abraham lincoln president united states
10 0.8794091729493567 lincoln start political career 1832
11 0.4025887206185619 lincoln represent alton sangamon railroad
12 0.4805917680332269 county lincoln born
13 0.365036732990256 lincoln serve president
14 0.4869323341255876 assassinated lincoln
15 0.5134411546768751 lincoln win election 1860
16 0.17987077791880685 general charge battle antietam
17 0.44317581593067795 lincoln issue emancipation proclamation
18 0.26136296

166 0.47815552059705096 egypt political influence middle east
167 0.3406479809639195 day war
168 0.30520592141015335 religions egypt outlawed
169 0.2667355011301853 poulation egypt
170 0.24771828820015313 egypts population live near nile
171 0.31256265020898794 large number jews living egypt today
172 0.19901363003579242 period akhenaten pharaoh
173 0.2793931963032718 egypt asia
174 0.20812073792560679 egypt receive rainfall world
175 0.23933043013211455 egypts foreign policy operates moderate lines
176 0.32818728332109187 egypt republic
177 0.24143879869829318 great sphinx pyramids giza built
178 0.2182025945823695 countries border egypt
179 0.1972188007561361 estimated population egypt
180 0.28865296393154966 snow fall egypt
181 0.2772779524818932 organized agriculture appear nile valley
182 0.15923953742560967 elephants largest land animals alive today
183 0.15467647508282467 elephant kill rhinoceros
184 0.16009226792500258 elephants good swimmers
185 0.2204038971135065 living speci

481 0.09692349619315771 difference leopards cheetahs
482 0.13665995288575544 distribution size leopard compare distribution wild cats
483 0.29231026356755746 resembles similarlysized cougar americas
484 0.21089769510306822 species described linnaeuss 18thcentury work systema naturae
485 0.2816585334536477 sort cats solitary
486 0.10291080924415225 leopards circular
487 0.21184634977489536 leopard solitary
488 0.2638722517915216 centred sierra
489 0.007442831765175169 felis pardus
490 0.24777166480158835 new zealand rugby league featured otahuhu leopards tamaki leopards
491 0.28283456266781487 leopard men west african secret society practised cannibalism
492 0.2575458384790229 leopard tank german designed tank entered service 1965
493 0.1949897823186078 black color heritable caused recessive gene locus
494 0.18505785034458228 leopard lrb panthera pardus rrb old world mammal felidae family smallest big cats genus panthera tiger lion jaguar
495 0.1320646221141231 panther species large fel

685 0.21727017788956937 created long term perspective support development qatar wider region develop local regional markets strengthen links energy based economies global financial markets
686 0.3898856063509186 happened positions english
687 0.2933268017653086 allophones occur positions english
688 0.33352203886725473 rrb officially state qatar lrb arabic transliterated dawlat qatar rrb arab emirate southwest asia
689 0.2509885254244989 romania border hungary
690 0.3410400178706147 romania secular state
691 0.19574170413567032 president elected popular vote
692 0.1889437278511528 counties romania divided
693 0.2568031331280707 highest mountain romania
694 0.23772811787173517 official language romania
695 0.2673493280816673 romanian economy
696 0.2790404879431566 roma romania
697 0.2548628014065861 territories joined form romania
698 0.2349506080602227 romania share border ukraine
699 0.3251009305689971 romania declare neutrality world war
700 0.20920078763059968 romania share language

924 0.16004325101753156 passion study
925 0.3632721446904252 true volta married daughter count ludovico peregrini
926 0.24773485359662814 true published invention voltaic pile battery
927 0.2136924970528811 amedeo avogadro italian
928 0.19618565722290324 amedeo avogadro graduate
929 0.32256714382023116 avogadro live england
930 0.18055326857033205 avogadro professor physics
931 0.11250468253779981 children avogadro
932 0.09004627317650703 avogadro noted
933 0.06155970961268278 avogadros number
934 0.39388327713580157 year avogadro stop teaching turin university
935 0.06925439743636147 subject avogadro study
936 0.17437919237035698 amedeo avogadros birthplace
937 0.28741560015420536 amedeo avogadros profession
938 0.3058358381605628 years ago professor physics university turin
939 0.0 amedeo avogadros
940 0.0 amedeo avogadros
941 0.34418255461145597 amedeo avogadro born north america
942 0.22569588291227102 king victor emmanuel iii pay homage avogadro
943 0.35969192651610726 fact avogad

1105 0.3330422101105948 local variations formal written version language limited restricted largely spelling differences british american english
1106 0.2444854892584789 flute musical instrument
1107 0.2597053106056477 possible open flutes ends
1108 0.25727421582637633 indian concert flutes available standard pitches
1109 0.23425557219692214 refer musicians play flute
1110 0.2207556233251981 threeholed flute mammoth tusk discovered
1111 0.24816923880907235 tin whistle appear
1112 0.2531535094259752 oldest known musical instruments
1113 0.2947332081113681 pan flute spread parts europe
1114 0.18715941778088407 air stream hole create
1115 0.14069716237827176 foxes wary humans
1116 0.12555412427775225 fennec foxes endangered
1117 0.11962965516007339 diet foxes include reptiles
1118 0.207812729687535 long foxes live
1119 0.24517557134219914 country fox hunting originate
1120 0.16241583812607613 smallest species fox
1121 0.19057314526261512 female foxes called
1122 0.09792133077579872 cachin

1248 0.12460579728442489 faraday effect called
1249 0.18288688181884902 meet attending church
1250 0.39087105197430977 michael faradays birthplace
1251 0.4493161127058647 michael faradays profession
1252 0.2908507944621809 years ago faraday report synthesis compound carbon chlorine c 2 h 6 c 2 h 4
1253 0.23936098783545243 years ago discovered phenomenon named diamagnetism called faraday effect
1254 0.3732411433406763 years ago faraday wrote letter times subject foul condition river thames resulted oftreprinted cartoon punch
1255 0.2200005395345173 years ago discovered optical properties gold colloids differed corresponding bulk metal
1256 0.2527546935480969 education area service faraday
1257 0.306455170699347 faraday s earliest chemical work assistant davy
1258 0.3700615483913518 died house hampton court august 25 1867
1259 0.4039020876589672 michael faraday born
1260 0.16521540741938315 serve terms
1261 0.40085380530647735 born newington butts near presentday south london england
126

1425 0.13626973281670768 avogadros number compute results chemical reactions
1426 0.10521196061664048 calculated value avogadros number
1427 0.26288915928063117 avogadros law state
1428 0.15814357532530426 showed avogadros theory held dilute solutions
1429 0.12056385753260583 language 1811 paper published
1430 0.23645466479895802 avogadros wife
1431 0.24098811045107343 avogadro lose chair university turin
1432 0.19531186541641363 amedeo avogadro born turin
1433 0.13092929781099902 avogadro hailed founder atomicmolecular theory
1434 0.09651152536457518 johann josef loschmidt calculate value avogadros number
1435 0.18055326857033205 1820 avogadro professor physics
1436 0.283929324507229 avogadro actually use word
1437 0.25175252126454006 number elementary entities 1 mole substance known
1438 0.22913125739227821 german avogadros number
1439 0.2136924970528811 amedeo avogadro italian
1440 0.2509384712318144 professor revolutionary movements king sardinia
1441 0.18660141161560828 ants belon

1587 0.3570507543918504 sound like play cymbals cello bow
1588 0.1633112929697377 cymbals
1589 0.1780493647503353 cymbals typically drum kit
1590 0.2825837437949826 kinds musical groups use cymbals
1591 0.3337251644024571 dragonfly associated late summer early autumn
1592 0.22299836003837611 exposure air cause larva begin breathing
1593 0.21724122196509077 characterized large multifaceted eyes pairs strong transparent wings elongated body
1594 0.18067771471096492 drum member percussion group
1595 0.33289297905696763 type drum head serve musical purpose
1596 0.22447428240454284 aburukuwa type drum
1597 0.21739109275542118 long basic design drum remained unchanged
1598 0.30286866932009476 second biggest factor affecting sound produced drum
1599 0.1736802547458998 drums classified percussion group
1600 0.26081054798796277 rock roll drummers like drumheads
1601 0.2150309813429933 whare drums s10set wires held drum heads
1602 0.35187922245895775 drums sri lanka christ
1603 0.204072263668065

1784 0.3075589003784369 korean names language based names korea north south korea
1785 0.27275776329715806 official languages yanbian korean autonomous prefecture china
1786 0.2708556794255728 dialect spoken jeju located fact classified different language korean linguists
1787 0.27275776329715806 official languages yanbian korean autonomous prefecture china
1788 0.29520291215487293 kuala lumpur capitol malaysia disfluent
1789 0.21452133723381198 kuala lumpur selangor state
1790 0.29826006902237534 whats population kuala lumpur
1791 0.1683506750121403 language speak kuala lumpur
1792 0.25144243968683855 kuala lumpur mean
1793 0.04990194301947461 airports kuala lumpur
1794 0.3245463784400641 long kuala lumpur occupied japanese
1795 0.307023071093368 seasons kuala lumpur experience
1796 0.19060782740567372 colonial buildings kuala lumpur constructed
1797 0.23065717768627136 host city formula world championship
1798 0.12357557241836521 central market located proximity pertama complex
1799 

1981 0.2888064098985124 tall typical studio piano
1982 0.1567511237982797 pianos keys generally
1983 0.2970473264233734 total keys typical modern piano
1984 0.16474315509011805 upright pianos compact
1985 0.19302501962596624 older pianos keys modern pianos
1986 0.18563933009154926 names pianos pedals
1987 0.44460024396510706 happened january 15 1882
1988 0.30896778519098556 happened 1887
1989 0.32895072253084634 best known impressionist works renoirs 1876 dance le moulin la galette
1990 0.3549199872748569 pierreauguste renoir born limoges hautevienne france child working class family
1991 0.07209285267759327 renoirs paintings sold us0 million
1992 0.28212566540979517 celebrator beauty especially feminine sensuality said renoir final representative tradition runs directly rubens watteau
1993 0.33506795645737125 pierreauguste renoir die limoges hautevienne france child working class family
1994 0.326116279921257 boy work porcelain factory drawing talents led chosen paint designs fine chi

2122 0.25002377176705753 patterns vowel harmony
2123 0.23306718221078115 ways join groups nouns
2124 0.26381192998687164 vietnamese comparatively large number vowels
2125 0.36671350288826526 vietnamese mother tongue vietnamese people
2126 0.3918298484035892 vietnamese official administrative language vietnam
2127 0.3077021079374948 vietnamese influenced chinese
2128 0.2728271639193189 written vietnamese official administrative language
2129 0.2671819973863708 consonant clusters lost
2130 0.23255556068995603 red river delta traditionally poor
2131 0.36410126737859566 kind words borrowed chinese
2132 0.3877865498027691 vietnam independent france
2133 0.351598664484414 vietnamese formally written chinese writing
2134 0.2718695069760801 vietnamese large number vowels
2135 0.20817871109601252 dialect regions vietnamese
2136 0.37185766216968186 reunification vietnam
2137 0.15031715915901103 tones northern varieties
2138 0.3012099498074976 vietnamese borrow latin greek
2139 0.4094098872279506

#### For each questions,get GloVe embedding,also get GloVe embedding of the provided question,compare cosine similarities and find the most similiar result.

In [113]:
sent_embeddings=[]
for sent in cleaned_sentence_sp:
    sent_embeddings.append(getphraseembedding(sent,glove_model))
question_embedding=getphraseembedding(question,glove_model)
retrieve_answer(question_embedding,sent_embeddings,data,cleaned_sentence_sp)

0 0.9580181303477824 was abraham lincoln the sixteenth president of the united states
1 0.9669049899990965 did lincoln sign the national banking act of 1863
2 0.9351333695776394 did his mother die of pneumonia
3 0.9492457775824398 how many long was lincolns formal education
4 0.9881312970352818 when did lincoln begin his political career
5 0.9494828246673204 what did the legal tender act of 1862 establish
6 0.9440700332457146 who suggested lincoln grow a beard
7 0.9612005078434415 when did the gettysburg address argue that america was born
8 0.9405430776823624 did lincoln beat john c breckinridge in the 1860 election
9 0.9623665031836004 was abraham lincoln the first president of the united states
10 0.9820805560963848 did lincoln start his political career in 1832
11 0.871313747302786 did lincoln ever represent alton  sangamon railroad
12 0.9312954391537429 which county was lincoln born in
13 0.9601102108955967 when did lincoln first serve as president
14 0.911224317491738 who assassi

160 0.9410956999298378 what are some economic uses for duck
161 0.9244505841868529 what allows a duck to filter water out of the side of their beaks and keep food inside
162 0.9129515442543589 what expression is part of a conceptual framework for testing computer systems
163 0.9317164880508704 where did the expression a sitting duck come from
164 0.9141537087005527 is egypt bordered by the gaza strip
165 0.9087291105899683 is egypt the most populated country in africa
166 0.9516594825598316 does egypt have political influence in the middle east
167 0.9362851793192143 when was the six day war
168 0.8557054797663397 what religions has egypt outlawed
169 0.9368178271512902 what is the poulation of egypt
170 0.9514546028214912 why does most of egypts population live near the nile
171 0.9269550184835502 are there a large number of jews living in egypt today
172 0.9594814961514937 during what period was akhenaten a pharaoh
173 0.867526841280679 is egypt in asia
174 0.9420213404495823 does eg

301 0.928054762449951 did grover cleveland win the 1884 election
302 0.853374736757163 did grover cleveland support womens suffrage
303 0.9249627503315406 where was grover cleveland married
304 0.9187886607242561 what did cleveland die from
305 0.9550831268315009 how many years after cleveland left office did the us win the spanishamerican war
306 0.9558434342348943 what did clevelands opponents say in 1884 to counter his innocent image
307 0.9446509720353833 who did cleveland run against in 1884
308 0.9552111160564002 why did cleveland want to hide his cancer surgery from the public
309 0.9512925299379966 was grover cleveland the twentyseventh president of the united states
310 0.9275151301107469 is grover cleveland honest
311 0.936690173215435 was grover cleveland married in the whitehouse
312 0.9561871728431357 when was he elected sheriff of erire county new york
313 0.8681776952224594 when did he die
314 0.924327250114878 which election did grover cleveland win
315 0.88892993807751

420 0.956499456575335 with what party did adams run for presidency
421 0.93225912601785 where is adams buried
422 0.9479363631497312 who were the midnight judges
423 0.9601539125741554 in what ways was adams opposed by anderw hamilton
424 0.9559499132959953 what information did he record in his diary
425 0.9629792992511647 who was defeated for reelection in the revolution of 1800 by thomas jefferson
426 0.9271213894888659 who represented the continental congress in europe
427 0.9425138841043863 what is now part of adams national historical park
428 0.9548670483706819 is it true that adams had spent some time as the ambassador
429 0.922179923334774 is it true that massachus08setts sent him in 1774
430 0.9171879555441945 who did massachus08setts send in 1774
431 0.9621409223288601 are his last words often quoted as  thomas jefferson survives   
432 0.9490036737327378 the john adams library  housed at the boston public library  contains what
433 0.9595680331070087 adams  opponents were wh

573 0.9266990102571718 how many species and genera does otter have
574 0.8387284539590343 do otters live in water
575 0.8306232282637029 where do sea otters live
576 0.9357610759867029 where does the word otter derive from
577 0.8794658334623946 does otter give birth or lay egg
578 0.9327636157360658 what drives sea otter almost to extinction
579 0.8706813323688259 why otters are considered as totem animals
580 0.873974876991655 do sea otters live along the pacific coast
581 0.7237154616040662 are otters totem animals
582 0.549089473568217 are otters herbivores
583 0.9084149839138487 what is the primary item in an otters diet
584 0.8965698013011716 what is an otters den called
585 0.9239020605901034 why is the giant otter becoming increasingly rare
586 0.852089289257755 how do otters keep themselves warm without blubber
587 0.8042886903375895 how are otters playful animals
588 0.892054893319384 what animals are related to otters
589 0.8822776708670838 what traps a layer of air and keep

847 0.9160462440872367 how many square kilometres of continental land is uruguay
848 0.9360345651646481 how much of the population is of white european descent
849 0.9496205481962751 why was the capital of uruguay founded
850 0.9402518846215748 how many times has uruguay won the world cup
851 0.9450695279200929 how much is the average income of an african woman compared to a european man
852 0.8599229600530038 does uruguay recognize samesex civil unions
853 0.8227858293107465 does uruguay border french guiana
854 0.8443157653592734 is uruguays warmest month june
855 0.9343015473590297 what does a citizen use to propose changes to the constitution
856 0.91436058312855 what religions are found in uruguay
857 0.939922281263632 what has uruguay done to be competitive in agriculture
858 0.949606161765844 what are the names of uruguays political parties
859 0.9307276065833399 is uruguay the smallest soverign nation in south america
860 0.9608125848091319 what was founded by the spanish in th

966 0.8289923421704057 was celsius born in uppsala  in sweden
967 0.9456380489681173 is the celsius  crater  on the moon named after him
968 0.8594966671783211 was anders celsius   november 27   1701  april 25   1744   a swedish  astronomer
969 0.9425426718134349 was anders celsius the first  to perform  and  publish  careful experiments  aiming at the definition  of an international  temperature  scale  on scientific grounds
970 0.8774643067771473 was he professor at uppsala university
971 0.9402080318005384 is it true that he published a collection of 316 observations
972 0.9280220105843763 he published a collection of what
973 0.8941705549346274 can syllables begin with a vowel
974 0.9309435023585777 are calligraphers held in great esteem
975 0.9205679238321475 have many european languages borrowed numerous words from it
976 0.9302883158447892 are the most active in damascus and cairo
977 0.842029412589044 do arabs consider literary arabic
978 0.9501220496707612 the sociolinguistic 

1104 0.9286491764051327 are many words describing the navy  types of ships  and other objects or activities on the water of dutch origin 
1105 0.9487611168293365 are local variations in the formal written version of the language quite limited  being restricted largely to the spelling differences between british and american english 
1106 0.902991845337413 is the flute a musical instrument
1107 0.9269545720690078 is it possible to open flutes at one or both ends
1108 0.8911219920987962 are indian concert flutes available in standard pitches
1109 0.9123408304802284 what do we refer musicians who play flute
1110 0.9212996395924643 when was a threeholed flute made from a mammoth tusk discovered
1111 0.9276318069698011 when did the tin whistle first appear
1112 0.9452509888159071 what are the oldest known musical instruments
1113 0.9401077179140456 when was the pan flute spread to other parts of europe
1114 0.9248163164467129 what does the air stream across this hole create
1115 0.857496700

1269 0.8787977838172225 what is nassau coliseum
1270 0.9559550462321096 what was the dead or alive 4 fighting arena modeled after
1271 0.9108478083691648 is nassau range the highest mountain range in the world
1272 0.922170213547214 is nassau county named after a german town
1273 0.9311054566435661 does the united states have a base near glasgow
1274 0.8041662502691226 did tesla study electrical engineering
1275 0.9392678805825689 was tesla born in the united states
1276 0.8947764810718148 was tesla hired by edison
1277 0.9173861598667814 when did tesla demonstrate wireless communication radio
1278 0.8680452780102828 what is the si unit measuring magnetic flux density or magnetic induction
1279 0.929383974986164 was tesla regarded as a mad scientist
1280 0.9327897697682809 what type of current did tesla invent
1281 0.9620340448302294 who was the victor of the war of currents
1282 0.8597260001858912 where did tesla study electrical engineering
1283 0.9214970671585506 is ottawa the capit

1441 0.8783542798141939 do ants belong to the hymenoptera order
1442 0.8654185216495949 are ants used in cuisine
1443 0.8317945684438949 does an ants head contain sensory organs
1444 0.8995334609559497 how do most ants travel
1445 0.9258356331015205 in ant colonies what are the fertile female ants called
1446 0.946540469083734 who wrote about ants in a tramp abroad
1447 0.8548852638307953 do the ants eat plants meats or both
1448 0.9279693160945889 what organs gives a bull ant its good sight
1449 0.9042665138066615 what may happen to red fire ants if we use boiling water on the queen
1450 0.8749804732446342 are ants found in antartica
1451 0.8879541279593363 do male ants take flight before females
1452 0.8741906722346872 do worker ants have wings
1453 0.9464206047014531 what are the three segments of an ant
1454 0.9518289990591035 what are the ant colonies that lack queens called
1455 0.8742160699701582 where are bullet ants located
1456 0.8879588424201116 do ants belong to the same or

1641 0.9550099975678293 why is a police whistle very wide for its pitch
1642 0.9104521863627512 what are the two main varieties of indian flutes
1643 0.9534734446823511 has the flute been dated to prehistoric times
1644 0.9150474745412086 was the pan flute used in greece
1645 0.9375869572339999 can a flute be played with several different air sources
1646 0.9279260547614434 when was the pan flute used in greece
1647 0.882191423456317 when did concert flutes begin appearing in concert ensembles
1648 0.9107438969570492 how many main varieties of indian flutes are currently used
1649 0.9219651419860095 what material is a chi flute fashioned from
1650 0.9526622012713252 how does a flute player change the pitch of the sound
1651 0.9389521837591045 what is the most basic form of the flute
1652 0.9217793655695825 has the dragon historically served as chinas national emblem
1653 0.6312155258591439 do pandas hibernate
1654 0.7924008777134663 do giant pandas attack humans
1655 0.9299833821981404

1761 0.9444579586763914 is the long term viability of the koala therefore threatened by genetic weakness
1762 0.9059028645133623 is the koala found in coastal regions of eastern and southern australia from near adelaide to the southern part of cape york peninsula
1763 0.885079737170215 do populations also extend for considerable distances inland in regions with enough moisture to support suitable woodlands
1764 0.9562038444028571 were the koalas of south australia largely exterminated during the early part of the 20th century but the state has since been repopulated with victorian stock
1765 0.9211086017308817 is korean the official language of korea
1766 0.9163724616449976 is the word korean derived from goryeo
1767 0.9243976821850631 are all dialects of korean similar to each other
1768 0.9208863425670522 about how many koreans speakers are there
1769 0.9346785322141951 older english sources used the name korean to refer to what
1770 0.9404729199788697 what is the official language o

1897 0.9581875124561743 where was much of montreals industry during the late 19th and earlytomid 20th century
1898 0.9182055809099379 does montreal contain the largest church in canada
1899 0.9386783525458946 what is the largest primarily frenchspeaking city in the western world
1900 0.9294734610076034 is nairobi the capital of kenya
1901 0.8732144164526587 was nairobi founded in 1899
1902 0.9267616580548471 is the current estimated population of nairobi about 6 milion
1903 0.9344584022134075 what is the current estimated population of nairobi
1904 0.930230020303476 in what year was nairobi founded
1905 0.8755326874511303 which embassy in nairobi was bombed in august 1998
1906 0.9193060286058247 how many civilians died in the 1998 us embassy bombing
1907 0.9083598359834455 how many trades can the nse make per day
1908 0.9329266473789676 about how many people are estimated to live in nairobi
1909 0.9387513192223964 the district is bordered to the southwest by uhuru park and where
1910 0

2043 0.9150248735610856 does the de young museum house the asian art museum
2044 0.929158728412794 what prompted the city to upgrade its building codes
2045 0.9359811511112076 like many larger where cities san francisco is a minoritymajority city as nonhispanic whites comprise less than half of the population
2046 0.9471333779799246 who laid out a street plan for the expanded s10settlement
2047 0.9133094844038578 when was a 43acre mission bay campus opened
2048 0.9151843906405608 when did it sign on the air
2049 0.9421774557672169 give an example of the largest twoyear community colleges in the country
2050 0.9277528015262856 give an example of the ten beta world cities
2051 0.9396396390846761 the only consolidated citycounty in california does it encompass a land area of on the northern end of the san francisco peninsula making it the second most densely populated city in the united states
2052 0.9227054176478735 is san francisco also the financial cultural and transportation center o

Question : when abraham lincoln started his political career
Retreived Question: When did Lincoln begin his political career?
Retreived Answer: 1832
