In [1]:
import pandas as pd
import torch
from sentence_transformers.util import semantic_search
from sentence_transformers import SentenceTransformer
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def model_generator(model_id="sentence-transformers/all-MiniLM-L6-v2"):
    """Load the sentence-embedding model used by this notebook.

    Args:
        model_id: Name or path handed to ``SentenceTransformer``. The default
            is the checkpoint the notebook was written against, so existing
            ``model_generator()`` calls behave exactly as before.

    Returns:
        The ``SentenceTransformer`` instance built from ``model_id``.
    """
    return SentenceTransformer(model_id)

In [3]:
def dataset_generator(text, csv_write=True, csv_path='dataset_english.csv'):
    """Build parallel question/answer lists from the HTML tables in ``text``.

    Every table found by ``pd.read_html`` is stacked into one frame, from which
    the ``Question`` and ``Answer`` columns are extracted.

    Args:
        text: Source handed unchanged to ``pd.read_html``.
        csv_write: When truthy, also save the question/answer pairs as CSV.
        csv_path: Destination of that CSV. Defaults to the file name the
            notebook has always written.

    Returns:
        ``(questions, answers)``: two lists of equal length in which
        ``answers[i]`` belongs to ``questions[i]``.

    Raises:
        KeyError: If the combined table lacks a ``Question`` or ``Answer``
            column.
    """
    # Parse the source once; the previous version re-parsed it for every
    # table it contained (plus once more to count them).
    tables = pd.read_html(text)
    table = pd.concat(tables, ignore_index=True, sort=False)

    # Select the two columns by name so extra columns or a different column
    # order can neither break the export nor mislabel it.
    qa_table = table[['Question', 'Answer']]
    questions = qa_table['Question'].tolist()
    answers = qa_table['Answer'].tolist()

    if csv_write:
        # Header aliases keep the on-disk layout: "0" = question, "1" = answer.
        qa_table.to_csv(csv_path, index=False, header=['0', '1'])

    return questions, answers

In [4]:
def embedding_func(questions, model, csv_write=True, csv_path="embeddings_english.csv"):
    """Encode ``questions`` and return the embeddings as a float tensor.

    Args:
        questions: Items passed unchanged to ``model.encode``.
        model: Object exposing ``encode``; its result must be accepted by
            ``pd.DataFrame``.
        csv_write: When truthy, also save the embeddings as CSV.
        csv_path: Destination of that CSV. Defaults to the file name the
            notebook has always written.

    Returns:
        A ``torch.float`` tensor built from the encoded output. Its length is
        printed before returning.
    """
    output = model.encode(questions)
    embeddings = pd.DataFrame(output)
    if csv_write:
        embeddings.to_csv(csv_path, index=False)
    dataset_embeddings = torch.from_numpy(embeddings.to_numpy()).to(torch.float)
    print(f"The Length of the Dataset is: {len(dataset_embeddings)}")
    return dataset_embeddings
    

In [5]:
# Instantiate the sentence-embedding model; answer_to_question reads this global.
model = model_generator()

In [6]:
# File handed to dataset_generator, which reads its tables with pd.read_html.
table_file_name = 'dataset_tables_english.txt'

In [7]:
# Parallel lists: answers[i] is the answer paired with questions[i].
questions, answers = dataset_generator(table_file_name)

In [8]:
# Encode every question once; answer_to_question searches against this global tensor.
dataset_embeddings = embedding_func(questions, model)

The Length of the Dataset is: 153


In [11]:
def answer_to_question(question, answers, top_k=4, min_score=0.45, max_gap=0.05):
    """Print the stored answer(s) whose questions best match ``question``.

    The query is encoded with the notebook-level ``model`` and searched against
    the notebook-level ``dataset_embeddings`` via ``semantic_search``; both
    globals must be defined before this function is called.

    Walking the hits from best to worst, an answer is kept while its score is
    above ``min_score`` and less than ``max_gap`` below the previously kept
    score. The walk stops at the first hit that fails that test, which
    includes a hit whose score is exactly equal to the previous one.

    Args:
        question: Query passed unchanged to ``model.encode``.
        answers: Sequence indexed by the ``corpus_id`` of each hit.
        top_k: Number of hits requested from ``semantic_search``.
        min_score: Score a hit must exceed to be kept.
        max_gap: Upper bound (exclusive) on the drop from the previously kept
            score.

    Returns:
        None. The joined answers followed by the raw hits, or a fallback
        message when nothing qualifies, are printed.
    """
    query_embeddings = torch.as_tensor(model.encode(question), dtype=torch.float)
    hits = semantic_search(query_embeddings, dataset_embeddings, top_k=top_k)

    # Guard against an empty result; the previous version indexed hits[0][0]
    # unconditionally and would raise IndexError here.
    if not hits or not hits[0]:
        print("I don't know the answer. please modify your question.")
        return

    ranked = hits[0]
    my_answers = []
    # Seed just above the best score so the top hit passes both the
    # "not equal" check and the gap check.
    pre_score = ranked[0]['score'] + 0.001

    for answer_data in ranked:
        answer_score = answer_data['score']
        # NOTE(review): the original condition also had
        # `or (score > 0.7 and gap < 0.04)`. Any gap below 0.04 is already
        # below 0.05, so that clause never changed the outcome and is dropped.
        # If a different rule for high scores was intended, it needs restating.
        accepted = (
            answer_score != pre_score
            and answer_score > min_score
            and pre_score - answer_score < max_gap
        )
        if not accepted:
            break
        my_answers.append(answers[answer_data['corpus_id']])
        pre_score = answer_score

    if my_answers:
        # Trailing space kept so the printed text matches the earlier output.
        print(" ".join(my_answers) + " ")
        print('\n')
        print(hits)
    else:
        print("I don't know the answer. please modify your question.")


In [12]:
# Wrap the typed question in a one-element list before passing it on for encoding.
# NOTE(review): input() blocks a non-interactive "Run All".
question = [input("Please ask your question: ")]
answer_to_question(question, answers)

Please ask your question:  what should I do before purchasing td-lte modem?


Before purchasing any TD-LTE modem, customers should check Irancell network coverage in the intended area using the "coverage map" available on the Irancell website. Before purchasing a mobile modem, the Customer should check Irancell network coverage in the intended area using the "coverage map" available on the Irancell website. 


[[{'corpus_id': 89, 'score': 0.7922075390815735}, {'corpus_id': 123, 'score': 0.7424969673156738}, {'corpus_id': 108, 'score': 0.6250380873680115}, {'corpus_id': 139, 'score': 0.6050375699996948}]]


In [18]:
# Look up the answer behind corpus_id 108 (the third hit in the results printed above).
answers[108]

'Customers can request a refund by contacting Irancell Call Center, either by dialing 707 from Irancell lines or 09377070000 from other lines, or by raising their request through online chat with Call Center agents.'