In [3]:
import numpy as np
import pandas as pd
import nltk
import pickle
import re
from sklearn.metrics.pairwise import cosine_similarity
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Load the tab-separated question/answer pairs; the path is relative to the
# notebook's working directory.
df = pd.read_csv('S09_question_answer_pairs.txt', sep='\t')

In [5]:
# Keep only the two columns the bot uses, retain the first row for each
# distinct question, then discard rows with a missing question or answer.
data = (
    df[['Question', 'Answer']]
    .drop_duplicates(subset='Question')
    .dropna()
)
data.head(10)

Unnamed: 0,Question,Answer
0,Was Volta an Italian physicist?,yes
2,Is Volta buried in the city of Pittsburgh?,no
4,Did Volta have a passion for the study of elec...,yes
6,What is the battery made by Volta credited to be?,the first cell
8,What important electrical unit was named in ho...,the volt
10,Where did Volta enter retirement?,Spain
12,Is it a disadvantage for something to be unsaf...,yes
14,Was Lombardy under Napoleon's rule in 1800?,yes
16,Was the Italian 10.000 lira banknote created b...,yes
18,For how many years did Alessandro Volta live?,53


In [6]:
# (rows, columns) remaining after de-duplication and NaN removal.
data.shape

(498, 2)

In [7]:
# Fixed number of token ids per encoded text: the encoding functions below
# truncate longer sequences to this length and right-pad shorter ones with 0.
sequence_length = 50

In [8]:
def clean_str(string):
    """Normalise raw text before tokenisation.

    Trims and lower-cases the input, replaces every character outside the
    allowed set (ASCII letters, digits and ()<>:;,.!?'") with a space, then
    surrounds each comma, exclamation mark, full stop and question mark with
    a space on both sides.

    Args:
        string: the text to normalise.

    Returns:
        The normalised text.
    """
    cleaned = re.sub(r"[^A-Za-z0-9()<>:;,.!?\'\"]", " ", string.strip().lower())
    # (pattern, replacement) pairs, applied in this order.
    spacing_rules = (
        (r",", " , "),
        (r"!", " ! "),
        (r"\.", " . "),
        (r"\?", " ? "),
    )
    for pattern, padded in spacing_rules:
        cleaned = re.sub(pattern, padded, cleaned)
    return cleaned

In [9]:
def answer_generate(dataset_marker="S09_question_answer_pairs"):
  """Build the vocabulary and the index-encoded question+answer matrix.

  Loads the pickled pretrained embeddings from
  ``input/<dataset_marker>_pretrained_embeddings.dat`` and assigns every word
  found there an integer id (ids 0 and 1 are reserved for "PAD" and "UNK").
  Each row of the module-level ``data`` frame is then encoded by passing
  "<Question> <Answer>" through ``question_tr``.

  Args:
    dataset_marker: file-name prefix of the embeddings file; defaults to the
      dataset used throughout this notebook.

  Returns:
    (word_dict, answer_embedding): the word -> id mapping and a NumPy array
    with one encoded row per row of ``data``, in the same order.
  """
  embeddings_path = "input/%s_pretrained_embeddings.dat" % dataset_marker
  # NOTE(security): pickle.load can execute arbitrary code from the file;
  # only load embeddings files from a trusted source.
  with open(embeddings_path, "rb") as embeddings_file:
    pretrained_embeddings = pickle.load(embeddings_file)

  word_dict = {"PAD": 0, "UNK": 1}
  for word in pretrained_embeddings:
    # setdefault keeps the reserved PAD/UNK ids intact and never hands the
    # same id to two different words.
    word_dict.setdefault(word, len(word_dict))

  # Join question and answer with a space so the last question token cannot
  # fuse with the first answer token; %s also tolerates non-string answers.
  answer_list = [
    question_tr(word_dict, "%s %s" % (question, answer))
    for question, answer in zip(data['Question'], data['Answer'])
  ]
  answer_embedding = np.array(answer_list)
  return word_dict, answer_embedding

In [10]:
def question_tr(word_dict,question):
    """Encode a piece of text as a fixed-length list of word ids.

    The text is normalised with ``clean_str`` and tokenised with NLTK; each
    token is mapped through ``word_dict`` (id 1 for tokens not in the
    mapping). The result is cut to ``sequence_length`` ids, or right-padded
    with 0 up to that length.

    Args:
        word_dict: mapping from token to integer id.
        question: the raw text to encode.

    Returns:
        A list of exactly ``sequence_length`` integer ids.
    """
    tokens = nltk.word_tokenize(clean_str(question))
    token_ids = [word_dict.get(token, 1) for token in tokens]
    if len(token_ids) >= sequence_length:
        return token_ids[:sequence_length]
    missing = sequence_length - len(token_ids)
    return token_ids + [0] * missing

In [11]:
def nearest_answer(question_embedding,answer,sentence):
    """Return the stored answer whose encoding is most similar to the question.

    Args:
        question_embedding: encoded question (sequence of numbers).
        answer: 2-D array-like with one encoded row per stored entry.
        sentence: DataFrame aligned row-for-row with ``answer``; the reply is
            read from its second column (position 1).

    Returns:
        (similarity, reply): the highest cosine similarity and the matching
        value from ``sentence``. On ties the earliest row wins. When
        ``answer`` is empty the result is ``(-1, None)``.
    """
    answer_matrix = np.asarray(answer)
    if answer_matrix.size == 0:
        # Nothing to compare against: report "no match" instead of failing.
        return -1, None

    question_vector = np.asarray(question_embedding).reshape(1, -1)
    answer_matrix = answer_matrix.reshape(len(answer_matrix), -1)

    # One vectorised call scores every stored row against the question.
    similarities = cosine_similarity(answer_matrix, question_vector)[:, 0]
    # argmax returns the first maximum, matching a strict ">" scan.
    best_index = int(np.argmax(similarities))
    return similarities[best_index], sentence.iloc[best_index, 1]

In [16]:
def chat(similarity_threshold=0.7):
    """Run an interactive question-answering loop on stdin/stdout.

    Builds the vocabulary and encoded answers once via ``answer_generate``,
    then repeatedly reads a question, encodes it with ``question_tr`` and
    looks up the closest stored entry with ``nearest_answer``. The loop ends
    when the user types "quit" (any case, surrounding whitespace ignored) or
    when stdin is closed.

    Args:
        similarity_threshold: a reply is printed only when the best
            similarity is strictly greater than this value; otherwise a
            fallback message is shown.
    """
    print("Please type in your question")
    word_dict, answers = answer_generate()
    while True:
        try:
            question = input("You: ")
        except EOFError:
            # stdin exhausted or closed: end the session without a traceback.
            break
        if question.strip().lower() == "quit":
            break
        question_embedding = question_tr(word_dict, question)
        similarity, answer = nearest_answer(question_embedding, answers, data)
        if similarity > similarity_threshold:
            print(answer)
        else:
            print("I did not understand, please try again.")

# Start the interactive session; type "quit" at the prompt to stop.
chat()

Please type in your question
You: What important electrical unit was named in honor of Volta?
the volt
You: For how many years did Alessandro Volta live?
53
You: How old was Alessandro Volta when he died?
 82
You: Where did Volta enter retirement?
Spain
You: How many children did Avogadro have?
six
You: quit
