<a href="https://colab.research.google.com/github/BoyuanZhang0515/Chatbot-Project/blob/main/QA_bot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://www.capitalone.com/tech/machine-learning/how-to-finetune-sbert-for-question-matching/

In [1]:
!pip install sentence-transformers
!pip install transformers
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 742 kB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 40.2 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 32.0 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 54.3 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 14.6 MB/s 
Building wheels for collected 

In [2]:
# sentence-transformers==1.0.4, torch==1.7.0.
import spacy
import random
from collections import defaultdict
from sentence_transformers import SentenceTransformer, SentencesDataset
from sentence_transformers.losses import TripletLoss
from sentence_transformers.readers import LabelSentenceReader, InputExample
from torch.utils.data import DataLoader


# for complex model
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
import torch
import numpy as np
import pandas as pd
from scipy import spatial

import warnings
warnings.simplefilter('ignore')

# **Pre-Trained the BERT Model**

In [None]:
def triplets_from_labeled_dataset(input_examples):
    # Create triplets for a [(label, sentence), (label, sentence)...] dataset
    # by using each example as an anchor and selecting randomly a positive instance with the same label and a negative instance with a different label
    """During each iteration of the fine-tuning process, we select an anchor vector v1 to focus on. We then select a positive and a negative data point for 
    comparison: v2 from the same group as v1 and v3 from a different group. We then minimize the distance between v1 and v2 (anchor and positive) while maximizing
     the distance between v1 and v3 (anchor and negative). The loss function for this optimization is known as triplet loss."""
     
    triplets = []
    label2sentence = defaultdict(list)
    for inp_example in input_examples:
        label2sentence[inp_example.label].append(inp_example)

    for inp_example in input_examples:
        anchor = inp_example

        if len(label2sentence[inp_example.label]) < 2: #We need at least 2 examples per label to create a triplet
            continue

        positive = None
        while positive is None or positive.guid == anchor.guid:
            positive = random.choice(label2sentence[inp_example.label])

        negative = None
        while negative is None or negative.label == anchor.label:
            negative = random.choice(input_examples)

        triplets.append(InputExample(texts=[anchor.texts[0], positive.texts[0], negative.texts[0]]))

    return triplets

In [None]:
# Load pre-trained model - we are using the original Sentence-BERT for this example.
sbert_model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')

# Set up data for fine-tuning 
sentence_reader = LabelSentenceReader(folder='/content')

data_list = sentence_reader.get_examples(filename='Copy of Test_QA_data_ver2_0704  - Sheet1.tsv')
triplets = triplets_from_labeled_dataset(input_examples=data_list)
finetune_data = SentencesDataset(examples=triplets, model=sbert_model)
finetune_dataloader = DataLoader(finetune_data, shuffle=True, batch_size=16)

# Initialize triplet loss
loss = TripletLoss(model=sbert_model)

# Fine-tune the model
sbert_model.fit(train_objectives=[(finetune_dataloader, loss)], epochs=4, output_path='bert-base-nli-stsb-mean-tokens-rmds')

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/630 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/409 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/44 [00:00<?, ?it/s]

Iteration:   0%|          | 0/44 [00:00<?, ?it/s]

Iteration:   0%|          | 0/44 [00:00<?, ?it/s]

Iteration:   0%|          | 0/44 [00:00<?, ?it/s]

In [None]:
#### import our default text
#### uploaded the file need to run the code
df = pd.read_csv('/content/Copy of Test_QA_data_ver2_0704  - Sheet1.tsv' , sep='\t', header=None)
colnames = ['number', 'question', 'a', 'b', 'c']
df_2 = pd.read_csv('/content/Test_QA_data_ver2_0704 (with category) - Sheet1.tsv', sep='\t', header=0, names=colnames)

# import the trained model
rmds_model = SentenceTransformer('bert-base-nli-stsb-mean-tokens-rmds')

# select the default answer
q_tem = df.groupby(0).apply(lambda x: x.iloc[:-1] if len(x)>1 else x).reset_index(drop=True)
answer_tem = df.groupby(0).apply(lambda x: x.iloc[-1] if len(x)>1 else x).reset_index(drop=True)
corresponding_answer = pd.merge(left=q_tem, right=answer_tem, how="left", left_on=0, right_on=0)['1_y'].tolist()

# same filter for question number
q_num = q_tem.iloc[:,0].tolist()

# select the default questions
question_text = df.groupby(0).apply(lambda x: x.iloc[:-1] if len(x)>1 else x).reset_index(drop=True)[1].tolist()

#question_text are encoded by calling model.encode()
question_embeddings = rmds_model.encode(question_text)

#Print the embeddings
#for sentence, embedding in zip(question_text, question_embeddings):
    #print("Sentence:", sentence)
    #print("Embedding:", embedding)
    #print("###################################")
    #print("")

In [None]:
######## auto input user questions

# input user question
new_question = """where to find rmds?"""
## embedding question
encoded_question = rmds_model.encode([new_question])

## calculate the distance
distances = spatial.distance.cdist(np.array(encoded_question), question_embeddings, 'cosine')[0]
results = zip(range(len(distances)), distances)
results = sorted(results, key=lambda x: x[1])

for idx, distance in results[0:3]: # just getting top 1
    if 1-distance>0.5:
      print(f"Your answer:")
      # print(question_texts[idx])
      # print(f"cosine_score:{1-distance}, {corresponding_answer[idx]}")
      print(corresponding_answer[idx])
      break
    else:
      print(f"Sorry I don't understand this, but do you mean to ask this: {question_text[idx]}")
      break

# **QA_bot part**

In [None]:
#### list of conversations that would pop up randomly throughout the conversation
backup_qlist = [ 
"""
Do you know we also offer a wide range of courses, 
starting from data science, business analytics, all the way to computer science and AI related. \n
No matter if you are just beginners who are interested in the filed of data analysis, 
or professionals who intend to puruse in computational finance, you should defnitely check it out at the courses offered at RMDS! \n
-----------------------------------------------------------------------------------------------\n
You can type 'what courses do you have' below to ask for more information!""",
""" 
Want to attend some fun events while enriching your understandings in data and technologies? We've got a summer refresher for you!
Let's meet with some aswesome speakers and enthusiasts at the IMDATA 2022 conference! \n
Here you could have a taste in the combo of NFT and cryoptograph, take a look at ultilizing AI in data sciences. 
You can type 'IMDATA 2022' below to learn more about this event you defnitely shouldn't miss! \n""", 
"""
Do you know, through our NFT marketplace, our community members have access to tools and 
resources that enhance their research and increase the success rate of their work. \n
Haven't heard about NFT or our marketplace? Definitely give it a look by typing 'NFT' or 'NFT Marketplace' below to ask me more!""",
"""
Do you know, our Monthly Membership offers \n
1)Premier tools for individual scientists to excel, collaborate and succeed in their fields! \n
2)Free Ticket to IM Data annual conference! \n
3)Unlimited Access to RMDS Tutorials and Recorded Videos! \n
And EVEN MORE waiting you to find out yourself!!!\n
-----------------------------------------------------------------------------------------------\n
Check out at this website to learn more! https://grmds.org/membership-pricing""",
"""
Are you a data scientist who's also interested in some finance? Keep up with the new chic! NFT!!! \n
Considered as the modern-day collectibles, the Non-fungible token (NFT) have now been applied to various fields, including science, technology, and art! \n
Have you not heard about NFT? Let us help you! 
-----------------------------------------------------------------------------------------------\n
Type down 'what is NFT Market' below to learn more!""",
"""
Want to get to learn about data science and AI related courses but don't know where to find the right resources?  
check out some of our free recordings that provides you a fun play around! Some topics include Applications of AI in Media and Entertainment,
Spatial Data Analysis and Visualization, and Data Science Ecosystem Approach and Innovative Methods​! \n
Type 'Webinar Series course' below to get access to these FREE webinars!""",
"""
Seeking for a place to discuss and share your thoughts with others who are also interested in data science? 
Come to our IMDATA conference every summer to have a tast at AI developments, computer vision and deep learning. \n
Just missed it? Hold on, the Quaterly Competition also welcomes data enthusiasts for research and analysis, which allow everyone to win up to $2000! \n
-----------------------------------------------------------------------------------------------\n
Type 'IMDATA' or "Quaterly Competition' or just 'event' below to learn more!"""]

In [None]:
#StopWords function
## add points if word appears in category b
## subtract points if words are too 'weird' for making a question
nlp = spacy.load('en_core_web_sm')

def CleanStopWords(question):
  cate_B_list = [str(x).lower().split(' /') for x in df_2['b']]
  list_B = []
  hate_word_list = ['hate','detestation','disgust','enmity','hate','hatred','horror','loath','malice','disgust','revolt']
  for i in cate_B_list:
    for j in i:
      if j not in list_B and j != 'nan':
        list_B.append(j)
  StopWords = ['is','are','be','should','does','do','can','that','to','for','on','at','from','by','and', 'in']
  line = nlp(question)
  question_tokens = [str(token).lower() for token in line if str(token) not in StopWords]
  bonus_count = 0
  hate_count = 0
  bonus_list = []
  for i in range(len(question_tokens)):
    
    if question_tokens[i] in list_B:
      bonus_list.append(question_tokens[i])
      bonus_count+=1
      
    elif i < len(question_tokens)-1:
      if question_tokens[i]+' '+question_tokens[i+1] in list_B:
        bonus_list.append(question_tokens[i]+' '+question_tokens[i+1])
        bonus_count+=1
        
    

  for token in question_tokens:
    if token in hate_word_list:
      hate_count+=1
  return ' '.join(question_tokens), bonus_count, bonus_list, hate_count 

#CleanStopWords('I like big data!')

In [None]:
# question cannot be identified and user choose from bonus_list
def QA_special_bonus(cat_b):
  #use the bot again to find all relevant question under this category
  encoded_question = rmds_model.encode([cat_b])

  ## calculate the distance
  distances = spatial.distance.cdist(np.array(encoded_question), question_embeddings, 'cosine')[0]
  results = zip(range(len(distances)), distances)
  results = sorted(results, key=lambda x: x[1])

  ## getting index 
  idx = results[0][0]
  category_b = [x for x in df_2.b.tolist() if pd.isna(x)==False][idx]

  relevant_q = df_2.loc[(df_2['b'] == category_b)].iloc[:, 0:2]
  relevant_q = relevant_q.groupby(['number']).apply(lambda x: x.iloc[0]).reset_index(drop=True).iloc[:,1].values.tolist()
  return relevant_q

In [None]:
# question cannot be identified (special case)
def QA_special(answer_text, response, relevant_q, bonus_list):
    flag_1 = True
    flag_keyword = False
    if len(bonus_list) != 0:
      flag_keyword = True 
    while flag_1:
    # input user question

      # correctly predict user's intent question + provide answers
      if flag_keyword == False and response in ["Yes", "Y", "yes", "y", "yep", "yea", "yeah", "YES"]: 
        print(f'\n{answer_text}')
        print(f"\nDo you want to know more about the following questions? You can type the question number, other questions you have or 'no' to end our conversation\n")
        
        number = 1
        for question in relevant_q:
          if number == 3:
            print("******** Here are more you might want to know ********\n")
          print(f'{number}. {question}\n')
          number += 1
        flag_1 = False
        return relevant_q
      
      # fail to predict user's intent question
      elif flag_keyword == False and response in ["No", "NO", "no", "nope", "nah", "N", "n"]: # suggest further assistance
        print("Sorry I don't understand your question, but you can email us at info@rmdslab.com or give us a call at 1-626-432-7266 for more information!\n")
        flag_1 = False
        return -1

      # user choose from bonus_list 
      elif len(bonus_list) != 0 and response in bonus_list: 
        print("Please type in number to choose from the following questions. \n")
        relevant_q = QA_special_bonus(response)

        number = 1
        for question in relevant_q:
          print(f'{number}. {question}\n')
          number += 1
        flag_1 = False
        return relevant_q
      
      else: #invalid input from user
        response = input("Please try input again. You can try type in 'yes' or 'no' or choose from the keywords provided.\n")

In [None]:
# convert it to a function
def QA_bot(input_question,question_embeddings, corresponding_answer):
  # input user question
  new_question, bonus_count, bonus_list, hate_count = CleanStopWords(input_question)
  bonus_list_flatten = ' / '.join(bonus_list)
  ## embedding question
  encoded_question = rmds_model.encode([new_question])

  ## calculate the distance
  distances = spatial.distance.cdist(np.array(encoded_question), question_embeddings, 'cosine')[0]
  results = zip(range(len(distances)), distances)
  results = sorted(results, key=lambda x: x[1])

  ## getting index 
  idx = results[0][0]

  ## getting category b
  category_b = [x for x in df_2.b.tolist() if pd.isna(x)==False][idx]

  ## getting category c (if non-null: null = -1)
  category_c = [x for x in df_2.c.tolist() if pd.isna(x)==False][idx]
  flag_cat_c = False
  if category_c.isnumeric() == False:  ## does have category c
    flag_cat_c = True

  ## getting question number
  question_num = q_num[idx]

  ## getting relevant questions 
  relevant_q = df_2.loc[(df_2['b'] == category_b) & (df_2['number'] != question_num)].iloc[:, 0:2]
  relevant_q = relevant_q.groupby(['number']).apply(lambda x: x.iloc[0]).reset_index(drop=True).iloc[:,1].values.tolist()
  if len(relevant_q) >= 3 and flag_cat_c==True: ## if has category c, cap relevant question based on category b to 2
    relevant_q = relevant_q[:3]

  ## getting question from category c (if possible)
  if flag_cat_c:
    more_q = df_2.loc[(df_2['c'] == category_c) & (df_2['b'] != category_b)].iloc[:, 0:2]
    more_q = more_q.groupby(['number']).apply(lambda x: x.iloc[0]).reset_index(drop=True).iloc[:,1].values.tolist()[:2]

    for question in more_q:
      relevant_q.append(question)

  
  ##########################################################
  for idx, distance in results[0:1]: # just getting top 1 answer
      ## identify bonus / hate keywords here
      if bonus_count > 0:
        distance -= bonus_count * 0.15
      if hate_count > 0:
        distance += hate_count * 0.5
      
      if distance < 0.2:   # pass the test and return the answer
        print(corresponding_answer[idx])
        number = 1
        print(f"\nDo you want to know more about the following questions? You can type the question number, other questions you have or 'no' to end our conversation\n")
        for question in relevant_q:
          if number == 4:
            print("******** Here are more you might want to know ********\n")
          print(f'{number}. {question}\n')
          number += 1

        ## randomly pop up topics to direct conversation to other directions at 25% probability
        if random.randint(1,4) == 4 and len(backup_qlist) != 0:
          print(backup_qlist.pop(random.randint(0, len(backup_qlist)-1)))
        return relevant_q

      else: # doesn't find a match answer
        if len(bonus_list) == 0:
          response = input(f"""\nSorry I don't understand this, but do you mean to ask this: {question_text[idx]}\n""")
        else:
          response = input(f"""Sorry I don't understand this, but did you want to know more about these keywords: {bonus_list_flatten}?\n 
You can type in the keywords to choose from the list provided. \n""")
        return QA_special(corresponding_answer[idx], response, relevant_q, bonus_list)

#QA_bot("nft awesome wowwww!", question_embeddings, corresponding_answer)

In [None]:
############## bert similar version ##############
question = input("\nPlease enter your question: \n")

while True:
  # question_answer(question)
  result = QA_bot(question, question_embeddings, corresponding_answer)

  flag = True
  flag_N = False
  while flag:
    if result == -1:  # question cannot be identified
      response = input("\nDo you have other question? You can type your question as below\n")
      while not (chr.isalpha() or chr.isspace() or chr=='?' for chr in response):
        response = input("Please try input again. You can try typing your own question here.\n")
    
    else:  # user should type in number to choose
      response = input(f"\n")
      # user choose relevant question
      if response.isnumeric():
        while not (0 < int(response) and int(response) <= len(result)): 
          response = input(f"Please try input again. You can type in numbers from 1 to {len(result)}.\n")
        question = result[int(response)-1]
        break

    # user wants to end conversation
    if response in ["No", "no","nothing", "that's it","N"]: #user want to end
        print("\nSee you next time!")
        flag = False
        flag_N = True
        break    
    
    # user correctly type in their own question
    question = response
    flag = False
  if flag_N == True:
    break


Please enter your question: 
nft  awesome woww i love it
Sorry I don't understand this, but did you want to know more about these keywords: nft?
 
You can type in the keywords to choose from the list provided. 
jdgkl
Please try input again. You can try type in 'yes' or 'no' or choose from the keywords provided.
yes
Please try input again. You can try type in 'yes' or 'no' or choose from the keywords provided.
nft
Please type in number to choose from the following questions. 

1. what is NFT?

2. why should I use NFT?

3. where can I learn more about NFT?


2
NFTs have been applied to various fields, including science, technology, and art. With NFTs becoming more popular, RMDS Lab is announcing its next data science competition - Creating a NFT Value Dashboard to Summarize the Past and Predict the Future. Contestants will be challenged with collecting datasets to create a dashboard that assesses trends in the development of NFTs and forecast their future trajectory.

Do you want to kno

KeyboardInterrupt: ignored

**

Challenges faced:
1. What is the actual boundary value for this model employed currently (only manually tested the value; no model built and train to really construct a more precise cut-off value; 

  Potential solution: need to investigate how distance (similarity score) under the current Bert model works / build a new tfidf model 
2. Hard to identify the intention of the user; currently have the bonus and hate count to distinguish if the user is really trying to ask about something / just randomly typing irrelevant things; 

  Potential solution: develop list similar to the current hate_count (e.g. a question_word list to find if there’s match between category A and every single word in the input) 
**

## BERT_QA bot from Multiple_Texts

In [None]:
#  to get answer from an array of passages
def get_answer(q, p_array):
    score_list = []
    ans_list = []
    j_list = []
    for j in range (len(p_array)):  
        #p = preprocess(p_array[j] )
        p = p_array[j] 

        start, end , start_score, end_score,  ans = bert_answering_machine (q, p)
        #print( '\nText num:', j, 'Score:', start_score, end_score, '\nBERT Answer:', ans)
        
        if (start != 0) and (start_score > 0.25)  and (ans != '[SEP]')  :
            score_list.append(str(start_score) + ' and ' + str(end_score))
            ans_list.append(ans)
            j_list.append(j)
        else:
            text_num = None
            token_scores = None
            answer = "No Answer From BERT"

            
    if len(score_list) > 0 :
        ind = np.argmax(score_list)
        #print( 'Text number:', j_list[ind], ',  Token Scores:', score_list[ind], '\nBERT Answer:', ans_list[ind])
        text_num = j_list[ind]
        token_scores = score_list[ind]
        answer = ans_list[ind]
    else:
        text_num = None
        token_scores = None
        answer = "No Answer From BERT"
    return text_num, token_scores, answer

In [None]:
# passing 3 pagges and get_answer gets the  answer from best passege
passages_array=["I am a student , I study in UC Davis. I like to play Tennis",
    "John is a 10 year old boy. He is the son of Robert Smith.  Elizabeth Davis is Robert's wife. She teaches at UC Berkeley. Sophia Smith is Elizabeth's daughter. She studies at UC Davis", 
 "My name is John. I live in San Jose, California. Rob is my friend. He lives in Seattle, Washington, My sister is Kelly. " ]

for i in range(len(passages_array)):
    print (f'Passage: {i} : {passages_array[i]}\n')
question ="Which college does John's sister attend"  

passage_num, scores, answer = get_answer(question, passages_array)

print (f'The question: {question} \n Answer: {answer} , Passage Index Where Answer Was Found: {passage_num}, Scores: {scores}')

Passage: 0 : I am a student , I study in UC Davis. I like to play Tennis

Passage: 1 : John is a 10 year old boy. He is the son of Robert Smith.  Elizabeth Davis is Robert's wife. She teaches at UC Berkeley. Sophia Smith is Elizabeth's daughter. She studies at UC Davis

Passage: 2 : My name is John. I live in San Jose, California. Rob is my friend. He lives in Seattle, Washington, My sister is Kelly. 

The question: Which college does John's sister attend 
 Answer: uc davis , Passage Index Where Answer Was Found: 1, Scores: 5.83 and 6.35


## Document Segmenting

Now that we have our document corpus, we need to make sure that each document is short enough to fit into the 512 token limit of BERT. If a document is longer than 512 tokens, we'll simply segment it into multiple smaller chunks and add them to the final corpus.


In [None]:
def segment_documents(docs, max_doc_length=450):
  # List containing full and segmented docs
  segmented_docs = []

  for doc in docs:
    # Split document by spaces to obtain a word count that roughly approximates the token count
    split_to_words = doc.split(" ")

    # If the document is longer than our maximum length, split it up into smaller segments and add them to the list 
    if len(split_to_words) > max_doc_length:
      for doc_segment in range(0, len(split_to_words), max_doc_length):
        segmented_docs.append( " ".join(split_to_words[doc_segment:doc_segment + max_doc_length]))

    # If the document is shorter than our maximum length, add it to the list
    else:
      segmented_docs.append(doc)

  return segmented_docs

In [None]:
# contents = segment_documents(df_long_text['Content'].to_list())
# contents

## Finding Relevant Documents

Next, our goal is to find within this corpus the subset of documents that are most likely to contain our answer, because running every single document through our BERT model is expensive and doesn't help us narrow down a good answer. For this example, we'll simply use the scikit-learn TF-IDF vectorizer to convert our documents and our query into vectors. 

The document vectors with the highest cosine similarity to our query vector will be the best candidates to search for our answer, and we will feed these top candidate documents into the SQUAD model to get our predicted answers.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_top_k_articles(query, docs, k=2):

  # Initialize a vectorizer that removes English stop words
  vectorizer = TfidfVectorizer(analyzer="word", stop_words='english')

  # Create a corpus of query and documents and convert to TFIDF vectors
  query_and_docs = [query] + docs
  matrix = vectorizer.fit_transform(query_and_docs)

  # Holds our cosine similarity scores
  scores = []

  # The first vector is our query text, so compute the similarity of our query against all document vectors
  for i in range(1, len(query_and_docs)):
    scores.append(cosine_similarity(matrix[0], matrix[i])[0][0])

  # Sort list of scores and return the top k highest scoring documents
  sorted_list = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
  top_doc_indices = [x[0] for x in sorted_list[:k]]
  top_docs = [docs[x] for x in top_doc_indices]
  
  return top_docs

In [None]:
##### test
# myself test based on own csv file
df_long_text = pd.read_excel("(for test)[Chatbot] Content - Copy.xlsx")

# Segment our documents (If a document is longer than 512 tokens, we'll simply segment it into multiple smaller chunks and add them to the final corpus.)
contents = segment_documents(df_long_text['Content'].to_list())

# question ="when was nba founded?"  
question ="what is nba?"



# Retrieve the top k most relevant documents to the query ()
candidate_docs = get_top_k_articles(question, contents, 2)

passage_num, scores, answer = get_answer(question, candidate_docs)
print(f'Answer: {answer}')

# print (f'Answer: {answer} , Passage Index Where Answer Was Found: {passage_num}, Scores: {scores}')

FileNotFoundError: [Errno 2] No such file or directory: '(for test)[Chatbot] Content - Copy.xlsx'

In [None]:
### convert it to function:
def bert_text_search_start_end_QA(que, documents):
  # Segment our documents (If a document is longer than 512 tokens, we'll simply segment it into multiple smaller chunks and add them to the final corpus.)
  contents = segment_documents(documents)

  question =que

  # Retrieve the top k most relevant documents to the query ()
  candidate_docs = get_top_k_articles(question, contents, 2)

  passage_num, scores, answer = get_answer(question, candidate_docs)
  return answer


# Combination version (version02)!!!!!!!!!

In [None]:
df_long_text = pd.read_excel("(for test)[Chatbot] Content - Copy.xlsx")
contents = segment_documents(df_long_text['Content'].to_list())

In [None]:
# convert it to a function
def QA_bot(input_question,question_embeddings, corresponding_answer):
  ## embedding question
  encoded_question = rmds_model.encode([input_question])

  ## calculate the distance
  distances = spatial.distance.cdist(np.array(encoded_question), question_embeddings, 'cosine')[0]
  results = zip(range(len(distances)), distances)
  results = sorted(results, key=lambda x: x[1])

  for idx, distance in results[0:1]: # just getting top 1
      print(f"Your answer:")
      print(corresponding_answer[idx])

In [None]:
###################### test version(convert it to function) ######################
def combination_model(input_question, question_embeddings, corresponding_answer):
  ## embedding question
  encoded_question = rmds_model.encode([input_question])

  ## calculate the distance
  distances = spatial.distance.cdist(np.array(encoded_question), question_embeddings, 'cosine')[0]
  results = zip(range(len(distances)), distances)
  results = sorted(results, key=lambda x: x[1])

  for idx, distance in results[0:1]: # just getting top 1
      if 1-distance>0.5:
        print(f"Your answer:")
        # print(question_texts[idx])
        print(f"cosine_score:{1-distance}, {corresponding_answer[idx]}")
      else:
        bert_text_search_start_end_QA(input_question, contents)
        print(f'Answer: {answer}')

In [None]:
############## combination QA chatbot version!!!!!!!!!!!!!!!!!!!!!!!! ##############

question = input("\nPlease enter your question: \n")

while True:
  # question_answer(question)
  combination_model(question, question_embeddings, corresponding_answer)

  flag = True
  flag_N = False
  while flag:
    response = input("\nDo you have other questions? \n ")
    if response not in ["No", "no","not thing", "that's it","N"]:
      # question = input("\nPlease enter your question: \n")
      question = response
      flag = False
    else:
      print("\nSee you next time!")
      flag = False
      flag_N = True
  if flag_N == True:
    break


Please enter your question: 
nba
Answer: the national basketball association

Do you have other questions? 
 no

See you next time!
