https://www.capitalone.com/tech/machine-learning/how-to-finetune-sbert-for-question-matching/

In [1]:
!pip install sentence-transformers
!pip install transformers
!pip install torch

Collecting sentence-transformers
  Using cached sentence_transformers-2.2.2-py3-none-any.whl
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
Collecting torchvision
  Using cached torchvision-0.13.1-cp39-cp39-win_amd64.whl (1.1 MB)
Collecting sentencepiece
  Using cached sentencepiece-0.1.97-cp39-cp39-win_amd64.whl (1.1 MB)
Collecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Using cached tokenizers-0.12.1-cp39-cp39-win_amd64.whl (3.3 MB)
Installing collected packages: tokenizers, huggingface-hub, transformers, torchvision, sentencepiece, sentence-transformers
Successfully installed huggingface-hub-0.9.1 sentence-transformers-2.2.2 sentencepiece-0.1.97 tokenizers-0.12.1 torchvision-0.13.1 transformers-4.21.2


In [4]:
# sentence-transformers==1.0.4, torch==1.7.0.
import spacy
import random
from collections import defaultdict
from sentence_transformers import SentenceTransformer, SentencesDataset
from sentence_transformers.losses import TripletLoss
from sentence_transformers.readers import LabelSentenceReader, InputExample
from torch.utils.data import DataLoader


# for complex model
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
import torch
import numpy as np
import pandas as pd
from scipy import spatial

import warnings
warnings.simplefilter('ignore')

ModuleNotFoundError: No module named 'spacy'

In [3]:
def triplets_from_labeled_dataset(input_examples):
    # Create triplets for a [(label, sentence), (label, sentence)...] dataset
    # by using each example as an anchor and selecting randomly a positive instance with the same label and a negative instance with a different label
    """During each iteration of the fine-tuning process, we select an anchor vector v1 to focus on. We then select a positive and a negative data point for 
    comparison: v2 from the same group as v1 and v3 from a different group. We then minimize the distance between v1 and v2 (anchor and positive) while maximizing
     the distance between v1 and v3 (anchor and negative). The loss function for this optimization is known as triplet loss."""
     
    triplets = []
    label2sentence = defaultdict(list)
    for inp_example in input_examples:
        label2sentence[inp_example.label].append(inp_example)

    for inp_example in input_examples:
        anchor = inp_example

        if len(label2sentence[inp_example.label]) < 2: #We need at least 2 examples per label to create a triplet
            continue

        positive = None
        while positive is None or positive.guid == anchor.guid:
            positive = random.choice(label2sentence[inp_example.label])

        negative = None
        while negative is None or negative.label == anchor.label:
            negative = random.choice(input_examples)

        triplets.append(InputExample(texts=[anchor.texts[0], positive.texts[0], negative.texts[0]]))

    return triplets

In [None]:
# Load pre-trained model - we are using the original Sentence-BERT for this example.
sbert_model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')

# Set up data for fine-tuning 
sentence_reader = LabelSentenceReader(folder='/content')

data_list = sentence_reader.get_examples(filename='Copy of Test_QA_data_ver2_0704  - Sheet1.tsv')
triplets = triplets_from_labeled_dataset(input_examples=data_list)
finetune_data = SentencesDataset(examples=triplets, model=sbert_model)
finetune_dataloader = DataLoader(finetune_data, shuffle=True, batch_size=16)

# Initialize triplet loss
loss = TripletLoss(model=sbert_model)

# Fine-tune the model
sbert_model.fit(train_objectives=[(finetune_dataloader, loss)], epochs=4, output_path='bert-base-nli-stsb-mean-tokens-rmds')

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/630 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/409 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/44 [00:00<?, ?it/s]

Iteration:   0%|          | 0/44 [00:00<?, ?it/s]

Iteration:   0%|          | 0/44 [00:00<?, ?it/s]

In [None]:
#### changes made here
#### import our default text
df = pd.read_csv('/content/Copy of Test_QA_data_ver2_0704  - Sheet1.tsv' , sep='\t', header=None)
colnames = ['number', 'question', 'a', 'b', 'c']
df_2 = pd.read_csv('/content/Test_QA_data_ver2_0704 (with category) - Sheet1.tsv', sep='\t', header=0, names=colnames)

# import the trained model
rmds_model = SentenceTransformer('bert-base-nli-stsb-mean-tokens-rmds')

# select the default answer
q_tem = df.groupby(0).apply(lambda x: x.iloc[:-1] if len(x)>1 else x).reset_index(drop=True)
answer_tem = df.groupby(0).apply(lambda x: x.iloc[-1] if len(x)>1 else x).reset_index(drop=True)
corresponding_answer = pd.merge(left=q_tem, right=answer_tem, how="left", left_on=0, right_on=0)['1_y'].tolist()

# same filter for question number
q_num = q_tem.iloc[:,0].tolist()

# select the default questions
question_text = df.groupby(0).apply(lambda x: x.iloc[:-1] if len(x)>1 else x).reset_index(drop=True)[1].tolist()

#question_text are encoded by calling model.encode()
question_embeddings = rmds_model.encode(question_text)

#Print the embeddings
#for sentence, embedding in zip(question_text, question_embeddings):
    #print("Sentence:", sentence)
    #print("Embedding:", embedding)
    #print("###################################")
    #print("")

In [None]:
 ## getting categories 
 ## grouped by question number
 ## have duplicates if different question number shares same category

cat_a = df_2.groupby(['number']).apply(lambda x: x.iloc[0]).reset_index(drop=True).iloc[:,2].values.tolist()
cat_b = df_2.groupby(['number']).apply(lambda x: x.iloc[0]).reset_index(drop=True).iloc[:,3].values.tolist()
cat_c = df_2.groupby(['number']).apply(lambda x: x.iloc[0]).reset_index(drop=True).iloc[:,4].values.tolist()

## non-dup category list as below

cat_a_no_dup = [*set(cat_a)]
cat_b_no_dup = [*set(cat_b)]
cat_c_no_dup = [*set(cat_c)]

In [None]:
######## auto input user questions

# input user question
new_question = """where to find rmds?"""
## embedding question
encoded_question = rmds_model.encode([new_question])

## calculate the distance
distances = spatial.distance.cdist(np.array(encoded_question), question_embeddings, 'cosine')[0]
results = zip(range(len(distances)), distances)
results = sorted(results, key=lambda x: x[1])

for idx, distance in results[0:3]: # just getting top 1
    if 1-distance>0.5:
      print(f"Your answer:")
      # print(question_texts[idx])
      # print(f"cosine_score:{1-distance}, {corresponding_answer[idx]}")
      print(corresponding_answer[idx])
      break
    else:
      print(f"Sorry I don't understand this, but do you mean to ask this: {question_text[idx]}")
      break

Your answer:
To learn more about us, please visit our website: https://grmds.org/


In [None]:
backup_qlist = [ 
    """
      Do you know we also offer a wide range of courses, 
      starting from data science, business analytics, all the way to computer science and AI related. \n
      No matter if you are just beginners who are interested in the filed of data analysis, 
      or professionals who intend to puruse in computational finance, you should defnitely check it out at the courses offered at RMDS! \n
      You can type 'what courses do you have' below to ask for more information!""",
    """ 
      Want to attend some fun events while enriching your understandings in data and technologies? We've got a summer refresher for you!
      Let's meet with some aswesome speakers and enthusiasts at the IMDATA 2022 conference! \n
      Here you could have a taste in the combo of NFT and cryoptograph, take a look at ultilizing AI in data sciences. 
      You can type 'IMDATA 2022' below to learn more about this event you defnitely shouldn't miss! \n""", 
    """
        Did you know, through our NFT marketplace, our community members have access to tools and 
        resources that enhance their research and increase the success rate of their work. \n
        Haven't heard about NFT or our marketplace? Definitely give it a look by typing 'NFT' or 'NFT Marketplace' below to ask me more!""",
    """
        promote membership 
     """,
     """
        introduce nft
     """,
     """
        webinars (there are free recordings available)
     """,
     """
        competition / major event (find latest)
     """,]

In [None]:
#StopWords function
## add points if word appears in category b
## subtract points if words are too 'weird' for making a question
nlp = spacy.load('en_core_web_sm')

def CleanStopWords(question):
  cate_B_list = [str(x).lower().split(' /') for x in df_2['b']]
  list_B = []
  hate_word_list = ['hate','detestation','disgust','enmity','hate','hatred','horror','loath','malice','disgust','revolt']
  for i in cate_B_list:
    for j in i:
      if j not in list_B and j != 'nan':
        list_B.append(j)
  StopWords = ['is','are','be','should','does','do','can','that','to','for','on','at','from','by','and', 'in']
  line = nlp(question)
  question_tokens = [str(token).lower() for token in line if str(token) not in StopWords]
  bonus_count = 0
  hate_count = 0
  bonus_list = []
  for i in range(len(question_tokens)):
    
    if question_tokens[i] in list_B:
      bonus_list.append(question_tokens[i])
      bonus_count+=1
      
    elif i < len(question_tokens)-1:
      if question_tokens[i]+' '+question_tokens[i+1] in list_B:
        bonus_list.append(question_tokens[i]+' '+question_tokens[i+1])
        bonus_count+=1
        
    

  for token in question_tokens:
    if token in hate_word_list:
      hate_count+=1
  return ' '.join(question_tokens), bonus_count, bonus_list, hate_count 

CleanStopWords('I like big data!')

('i like big data !', 1, ['big data'], 0)

In [None]:
# question cannot be identified (special case)
def QA_special(answer_text, response, relevant_q):
    flag_1 = True
    while flag_1:
    # input user question
      if response in ["Yes", "Y", "yes", "y", "yep", "yea", "yeah", "YES"]: #provide answers
        print(f'\n{answer_text}')
        print(f"\nDo you want to know more about the following questions? You can type the question number, other questions you have or 'no' to end our conversation\n")
        
        number = 1
        for question in relevant_q:
          if number == 3:
            print("******** Here are more you might want to know ********\n")
          print(f'{number}. {question}\n')
          number += 1
        flag_1 = False
        return relevant_q
      
      elif response in ["No", "NO", "no", "nope", "nah", "N", "n"]: # suggest further assistance
        print("Sorry I don't understand your question, but you can email us at info@rmdslab.com or give us a call at 1-626-432-7266 for more information!\n")
        flag_1 = False
        return -1
      
      else: #invalid input from user
        response = input("Please try input again. You can try type in 'yes' or 'no'.\n")

In [None]:
cate_B_list = [str(x).lower().split(' /') for x in df_2['b']]
list_B = []
for i in cate_B_list:
  for j in i:
    if j not in list_B and j != 'nan':
      list_B.append(j)
list_B

['rmds',
 ' mission',
 ' goal',
 'service',
 ' product',
 'study',
 ' course',
 ' class',
 ' training',
 'event ',
 'event',
 'project portal',
 'data science portfolio',
 'rmds exchange',
 'nft market',
 'rm4es',
 ' workflow',
 'researchmap',
 'ecosystem',
 'big data',
 ' ai',
 ' business',
 ' data science',
 'financial technology',
 ' cryptocurrency',
 'ai bias',
 ' surveillance',
 'data analysis',
 'cdi',
 'ai',
 ' ethics',
 ' decision making',
 'webinar series',
 'covid',
 ' literature review',
 ' model',
 'python',
 'knime',
 ' basic',
 'chatbot',
 ' healthcare',
 'machine learning',
 ' ml',
 'grmds',
 'imdata',
 'quaterly competition',
 'nft',
 'nft exchange',
 'upskill program',
 ' pi program',
 ' internship',
 'impact score',
 'recommendation system',
 'practical data science training',
 'pre-conference training']

In [None]:
'big data' in list_B

True

In [None]:
# convert it to a function
def QA_bot(input_question,question_embeddings, corresponding_answer):
  # input user question
  new_question, bonus_count, bonus_list, hate_count = CleanStopWords(input_question)
  bonus_list_flatten = ' '.join(bonus_list)
  ## embedding question
  encoded_question = rmds_model.encode([new_question])

  ## calculate the distance
  distances = spatial.distance.cdist(np.array(encoded_question), question_embeddings, 'cosine')[0]
  results = zip(range(len(distances)), distances)
  results = sorted(results, key=lambda x: x[1])
  print(results[0:3])

  ## getting index 
  idx = results[0][0]

  ## getting category b
  category_b = [x for x in df_2.b.tolist() if pd.isna(x)==False][idx]

  ## getting category c (if non-null: null = -1)
  category_c = [x for x in df_2.c.tolist() if pd.isna(x)==False][idx]
  flag_cat_c = False
  if category_c.isnumeric():
    flag_cat_c = True

  ## getting question number
  question_num = q_num[idx]

  ## getting relevant questions 
  relevant_q = df_2.loc[(df_2['b'] == category_b) & (df_2['number'] != question_num)].iloc[:, 0:2]
  relevant_q = relevant_q.groupby(['number']).apply(lambda x: x.iloc[0]).reset_index(drop=True).iloc[:,1].values.tolist()
  if len(relevant_q) >= 2 and flag_cat_c==True:
    relevant_q = relevant_q[:2]

  ## getting question from category c (if possible)
  if flag_cat_c:
    more_q = df_2.loc[(df_2['c'] != -1) & (df_2['c'] == category_c) & (df_2['b'] != category_b)].iloc[:, 0:2]
    more_q = more_q.groupby(['number']).apply(lambda x: x.iloc[0]).reset_index(drop=True).iloc[:,1].values.tolist()[:2]

    for question in more_q:
      relevant_q.append(question)
  
  ## pull up from the backup list
  '''
  while len(relevant_q) < 15:
    question = backup_qlist[random.randint(0,8)]
    while question in relevant_q or question == input_question:
      question = backup_qlist[random.randint(0,8)]
    relevant_q.append(question)
'''
  
  ##########################################################
  for idx, distance in results[0:1]: # just getting top 1
      if bonus_count > 0:
        distance -= bonus_count * 0.3
        print(distance)
      if hate_count > 0:
        distance += hate_count * 0.5
      
      ############### test on this boundary value
      ##### use all questions from the sheet to find max / min score it could achieve
      #### when should the chatbot reply
      
      if distance < 0.3:   # print the answer
        print(corresponding_answer[idx])
        number = 1
        print(f"\nDo you want to know more about the following questions? You can type the question number, other questions you have or 'no' to end our conversation\n")
        for question in relevant_q:
          if number == 3:
            print("******** Here are more you might want to know ********\n")
          print(f'{number}. {question}\n')
          number += 1

        ## randomly pop up topics to direct conversation to other directions 25% probability
        if random.randint(1,4) == 4 and len(backup_qlist) != 0:
          print(backup_qlist.pop(random.randint(0, len(backup_qlist))))
        return relevant_q

      else:
        response = input(f"""\nSorry I don't understand this, but do you mean to ask this : {question_text[idx]}\nOR\nDid you want to know more about: {bonus_list_flatten}\n""")
        return QA_special(corresponding_answer[idx], response, relevant_q)

QA_bot("AI!", question_embeddings, corresponding_answer)

[(371, 0.3004916874134952), (372, 0.33563935863140504), (395, 0.4289983173409876)]
0.000491687413495201
The followings are the courses we have: 1. Financial Technology, Cryptocurrency & AI  2. AI Bias & Surveillance: Recognition, Analysis, and Prediction  3. AI & Ethics: Bias, Diversity and Ethical Decision Making with AI Systems  4.  Creation of Specialized AI Assistant for Healthcare Industry. 

Do you want to know more about the following questions? You can type the question number, other questions you have or 'no' to end our conversation

1. where could I find course about AI?



['where could I find course about AI?']

In [5]:
############## bert similar version ##############
question = input("\nPlease enter your question: \n")

while True:
  # question_answer(question)
  result = QA_bot(question, question_embeddings, corresponding_answer)

  flag = True
  flag_N = False
  while flag:
    if result == -1:  # question cannot be identified
      response = input("\nDo you have other question? You can type your question as below\n")
      while not (chr.isalpha() or chr.isspace() or chr=='?' for chr in response):
        response = input("Please try input again. You can try typing your own question here.\n")
    
    else:  # user should type in number to choose
      response = input(f"\n")
      # user choose relevant question
      if response.isnumeric():
        while not (0 < int(response) and int(response) <= len(result)): 
          response = input(f"Please try input again. You can type in numbers from 1 to {len(result)}.\n")
        question = result[int(response)-1]
        break

    # user wants to end conversation
    if response in ["No", "no","nothing", "that's it","N"]: #user want to end
        print("\nSee you next time!")
        flag = False
        flag_N = True
        break    
    
    # user correctly type in their own question
    question = response
    flag = False
  if flag_N == True:
    break


Please enter your question: 
dd


NameError: name 'QA_bot' is not defined

In [None]:
##########################################################################################################

In [None]:
q_tem

Unnamed: 0,0,1,2,3,4
0,1,What is RMDS Lab?,who are we,RMDS / mission / goal,grmds
1,1,What is RMDS?,who are we,RMDS / mission / goal,grmds
2,1,RMDS Lab is what?,who are we,RMDS / mission / goal,grmds
3,1,who are you?,who are we,RMDS / mission / goal,grmds
4,1,RMDS Lab?,who are we,RMDS / mission / goal,grmds
...,...,...,...,...,...
502,188,how does Pre-Conference Training help me?,why,Pre-Conference Training,-1
503,188,what can I benefit from Pre-Conference Training?,why,Pre-Conference Training,-1
504,188,what can I benefit from Pre-Conference Training?,why,Pre-Conference Training,-1
505,189,where can I learn more about Pre-Conference Tr...,where,Pre-Conference Training,-1


In [None]:
df_2 = pd.read_csv('/content/Test_QA_data_ver2_0704 (with category) - Sheet1.tsv', sep='\t', header=None)
df_2

Unnamed: 0,0,1,2,3,4
0,1,What is RMDS Lab?,who are we,RMDS / mission / goal,grmds
1,1,What is RMDS?,who are we,RMDS / mission / goal,grmds
2,1,RMDS Lab is what?,who are we,RMDS / mission / goal,grmds
3,1,who are you?,who are we,RMDS / mission / goal,grmds
4,1,RMDS Lab?,who are we,RMDS / mission / goal,grmds
...,...,...,...,...,...
691,188,what can I benefit from Pre-Conference Training?,why,Pre-Conference Training,-1
692,188,The Pre-conferencel training offers you a bett...,,,
693,189,where can I learn more about Pre-Conference Tr...,where,Pre-Conference Training,-1
694,189,where to find out Pre-Conference Training?,where,Pre-Conference Training,-1


In [None]:
answer_tem

Unnamed: 0,0,1,2,3,4
0,1,Global Association for Research Methods and Da...,,,
1,2,We offer our user a global platform which enab...,,,
2,3,"To learn more about us, please visit our websi...",,,
3,4,"RMDS is a pioneer of impact scoring, and devel...",,,
4,5,we have a lot of services such as GRMDS Ecosys...,,,
...,...,...,...,...,...
184,185,Please check out our websites for more details...,,,
185,186,"Prior to the annually IMDATA conference, we of...",,,
186,187,The Pre-conference is for anyone in need to be...,,,
187,188,The Pre-conferencel training offers you a bett...,,,


In [None]:
pd.merge(left=q_tem, right=answer_tem, how="left", left_on=0, right_on=0)

Unnamed: 0,0,1_x,2_x,3_x,4_x,1_y,2_y,3_y,4_y
0,1,What is RMDS Lab?,who are we,RMDS / mission / goal,grmds,Global Association for Research Methods and Da...,,,
1,1,What is RMDS?,who are we,RMDS / mission / goal,grmds,Global Association for Research Methods and Da...,,,
2,1,RMDS Lab is what?,who are we,RMDS / mission / goal,grmds,Global Association for Research Methods and Da...,,,
3,1,who are you?,who are we,RMDS / mission / goal,grmds,Global Association for Research Methods and Da...,,,
4,1,RMDS Lab?,who are we,RMDS / mission / goal,grmds,Global Association for Research Methods and Da...,,,
...,...,...,...,...,...,...,...,...,...
502,188,how does Pre-Conference Training help me?,why,Pre-Conference Training,-1,The Pre-conferencel training offers you a bett...,,,
503,188,what can I benefit from Pre-Conference Training?,why,Pre-Conference Training,-1,The Pre-conferencel training offers you a bett...,,,
504,188,what can I benefit from Pre-Conference Training?,why,Pre-Conference Training,-1,The Pre-conferencel training offers you a bett...,,,
505,189,where can I learn more about Pre-Conference Tr...,where,Pre-Conference Training,-1,Please check our websites for more details: ht...,,,


In [None]:
corresponding_answer

['Global Association for Research Methods and Data Science is the leading community-centered data science research organization, creating a global platform which enables people to meet, learn, and collaborate with fellow data science professionals to solve problems. RMDS Lab manages GRMDS community.',
 'Global Association for Research Methods and Data Science is the leading community-centered data science research organization, creating a global platform which enables people to meet, learn, and collaborate with fellow data science professionals to solve problems. RMDS Lab manages GRMDS community.',
 'Global Association for Research Methods and Data Science is the leading community-centered data science research organization, creating a global platform which enables people to meet, learn, and collaborate with fellow data science professionals to solve problems. RMDS Lab manages GRMDS community.',
 'Global Association for Research Methods and Data Science is the leading community-centere

In [None]:
question_text

['What is RMDS Lab?',
 'What is RMDS?',
 'RMDS Lab is what?',
 'who are you?',
 'RMDS Lab?',
 'what is the mission of RMDS Lab?',
 'what is the goal of RMDS Lab?',
 'why RMDS Lab?',
 'what does RMDS Lab mean?',
 'what does RMDS do?',
 'why choose RMDS?',
 'what can I benefit from RMDS?',
 'where can I learn more about RMDS?',
 'where to find RMDS?',
 'what services do you have?',
 'what services does RMDS Lab have?',
 'what does RMDS Lab do?',
 'why choose RMDS lab?',
 'why is RMDS Lab useful?',
 'what services can RMDS Lab provide me?',
 'why is RMDS Lab legit?',
 'what services/products do RMDS Lab offer?',
 "where can I learn more about RMDS's services and products?",
 "where to find RMDS's services and products?",
 'what courses do you have?',
 'How can you help me?',
 'do you have any courses?',
 'Courses?',
 'do you have any classes?',
 'what courses can I take?',
 'how can I improve my data science skills?',
 'what kind of skills I have to learn?',
 'what can I learn from RMDS c

In [None]:
######## auto input user questions

# input user question
new_question = """what is nba?"""
## embedding question
encoded_question = rmds_model.encode([new_question])

## calculate the distance
distances = spatial.distance.cdist(np.array(encoded_question), question_embeddings, 'cosine')[0]
results = zip(range(len(distances)), distances)
results = sorted(results, key=lambda x: x[1])

for idx, distance in results[0:3]: # just getting top 1
    if 1-distance>0.5:
      print(f"Your answer:")
      # print(question_texts[idx])
      print(f"cosine_score:{1-distance}, {corresponding_answer[idx]}")
    else:
      print("No")

No
No
No


In [None]:
print(question_text[1])
corresponding_answer[1]


What is RMDS?


'Global Association for Research Methods and Data Science is the leading community-centered data science research organization, creating a global platform which enables people to meet, learn, and collaborate with fellow data science professionals to solve problems. RMDS Lab manages GRMDS community.'

In [None]:
results

[(3, 0.6305254458353631),
 (1, 0.646847260918596),
 (82, 0.6502782181283325),
 (417, 0.6535311109822594),
 (13, 0.6541258664879501),
 (415, 0.6555561689693297),
 (414, 0.6617331744575232),
 (418, 0.6700938102192653),
 (409, 0.6726551318984781),
 (416, 0.6728784870336614),
 (83, 0.6759569765808487),
 (400, 0.6766265510165783),
 (443, 0.6788716203218825),
 (75, 0.6811160474712155),
 (436, 0.6825916570913113),
 (447, 0.6825916570913113),
 (60, 0.6839600255606431),
 (428, 0.6844248721604869),
 (49, 0.6874340229395847),
 (14, 0.6901151387719335),
 (437, 0.6907656462455569),
 (411, 0.6942244254633491),
 (439, 0.6951912620683909),
 (86, 0.6960820876216774),
 (9, 0.6966628603686558),
 (77, 0.6978499097044712),
 (431, 0.7064240990568109),
 (91, 0.7068422860193705),
 (413, 0.7068669934693781),
 (446, 0.7070433488731607),
 (12, 0.7080705510979424),
 (55, 0.7082867133160848),
 (410, 0.709897717224957),
 (401, 0.7107894475987206),
 (81, 0.7110140381001073),
 (432, 0.7113090257270671),
 (64, 0.71145

In [None]:
# convert it to a function
def QA_bot(input_question,question_embeddings, corresponding_answer):
  # input user question
  new_question = input_question
  ## embedding question
  encoded_question = rmds_model.encode([new_question])

  ## calculate the distance
  distances = spatial.distance.cdist(np.array(encoded_question), question_embeddings, 'cosine')[0]
  results = zip(range(len(distances)), distances)
  results = sorted(results, key=lambda x: x[1])
  
  for idx, distance in results[0:1]: # just getting top 1
      return idx,results[0:1],"Your answer:\n{}\n".format(corresponding_answer[idx])

In [None]:
#Backup question list
Question_Backup = ['What is RMDS Lab?'
,'Why choose RMDS Lab?'
,'Where can I learn more about RMDS?'
,'What courses do you have?'
,'What kind of events do you have?'
,'What is NFT Market?'
,'Where can I learn more about NFT Market?'
,'What is NFT?']

In [None]:
cate_B_list = [str(x).lower().split(' /') for x in df_2[3]]
list_B = []
for i in cate_B_list:
  for j in i:
    if j not in list_B and j != 'nan':

      list_B.append(j)
print(list_B)


['rmds', ' mission', ' goal', 'service', ' product', 'study', ' course', ' class', ' training', 'event ', 'event', 'project portal', 'data science portfolio', 'rmds exchange', 'nft market', 'rm4es', ' workflow', 'researchmap', 'ecosystem', 'big data', ' ai', ' business', ' data science', 'financial technology', ' cryptocurrency', 'ai bias', ' surveillance', 'data analysis', 'cdi', 'ai', ' ethics', ' decision making', 'webinar series', 'covid', ' literature review', ' model', 'python', 'knime', ' basic', 'chatbot', ' healthcare', 'machine learning', ' ml', 'grmds', 'imdata', 'quaterly competition', 'nft', 'nft exchange', 'upskill program', ' pi program', ' internship', 'impact score', 'recommendation system', 'practical data science training', 'pre-conference training']


In [None]:
#StopWords function
nlp = spacy.load('en_core_web_sm')

def CleanStopWords(question):
  cate_B_list = [str(x).lower().split(' /') for x in df_2[3]]
  list_B = []
  hate_word_list = ['hate','detestation','disgust','enmity','hatred','horror','loath','malice']
  for i in cate_B_list:
    for j in i:
      if j not in list_B and j != 'nan':
        list_B.append(j)
  StopWords = ['is','are','be','should','does','do','can','that','to','for','on','at','from','by','and']
  line = nlp(question)
  question_tokens = [str(token).lower() for token in line if str(token) not in StopWords]
  bonus_count = 0
  hate_count = 0
  bonus_list = []
  for token in question_tokens:
    if token in list_B:
      bonus_list.append(token)
      bonus_count+=1
    if token in hate_word_list:
      hate_count+=1
  return ' '.join(question_tokens), bonus_count, bonus_list, hate_count  

CleanStopWords('what is rmds and imdata, I hate rmds ?')

('what rmds imdata , i hate rmds ?', 3, ['rmds', 'imdata', 'rmds'], 1)

In [6]:

############## bert similar version ##############
question = input("\nPlease enter your question: \n")
question = CleanStopWords(question)
import random
condition_N = True
while True:
  if question in ["No", "no","not thing", "that's it","N"]:
    print("\nSee you next time!")
    break
  # question_answer(question)
  default = ["Sorry, I don't understand what you mean.","Sorry about this. If you need further assistance, you can email us at info@rmdslab.com or give us a call 1-626-432-7266"]
  index, sim_a, ans = QA_bot(question, question_embeddings, corresponding_answer)
  sim_b = list(sim_a[0])[1]
  if sim_b > 0.3 or question.lower() in ["what","why","where","when","how"]:
    print(default[random.randint(0,1)])
    question = input("\nOR Please enter your question: \n")
    question = CleanStopWords(question)
  else:
    print(ans)
    
    answers = df_2.loc[df_2[1]==question_text[index]]
    x = answers[3].to_list()
    y = ''.join(x)

    category = df_2.groupby(0).apply(lambda x: x.iloc[0]).reset_index(drop=True)
    print(category)
    index = []
    question_list=[]
    count = 0
    num = 1
    for x in category[3]:
      if x == y:
        index.append(count)
      count+=1
    for i in index:
      print(num,category[1][i])
      question_list.append([num,category[1][i]])
      num+=1
    num_2 = num
    # Backup Questions
    if num<6:
      Question_Backup_2 = [x for x in Question_Backup if str(x) not in question_list]
      backup_list = random.choices(Question_Backup_2, k=int(6-num))
      print("\n*More you might want to know*")
      for x in backup_list:
        print(num_2,x)
        question_list.append([num_2,x])
        num_2+=1
    flag = True
    flag_N = False
    while flag:
      response = input("\nDo you want to know more about {}?(You can type other questions) \n".format(y))
      if response.isnumeric():
        if response in ["{:d}".format(x) for x in range(1,num_2)]:
          question = CleanStopWords(question_list[int(response)-1][1])
          flag = False
        else:
          print("\nPlease enter an integer in the list.")
          
      elif response not in ["No", "no","not thing", "that's it","N"]:
        # question = input("\nPlease enter your question: \n")
        question = CleanStopWords(response)
        flag = False
        condition_N = True
      else:
        print("\nSee you next time!")
        flag = False
        flag_N = True
    if flag_N == True:
      break 


Please enter your question: 
dd


NameError: name 'CleanStopWords' is not defined

In [None]:
#############################################

# Complex bert model (BertForQuestionAnswering) !!!

In [None]:
# Loading BERT model already fine-tuned on SQuAD Question Answer Dataset. This 1.3 GB download and may take sometime
# Note that we are using uncased model so all answers will be in lower case
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [None]:
# Getting bert tokenizer
tokenizer_for_bert = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

## Main function (Bert)

In [None]:
def bert_answering_machine ( question, passage, max_len =  512):
    ''' Function to provide answer from passage for question asked.
        This function takes question as well as the passage 
        It retuns answer from the passage, along with start/end token index for the answer and start/end token scores
        The scores can be used to rank answers if we are searching answers for same question in multiple passages
        Value of max_len can not exceed 512. If length of question + passage + special tokens is bigger than max_len, function will truncate extra portion.
        
    '''
  
    #Tokenize input question and passage. Keeping maximum number of tokens as specified by max_len parameter. This will also add special tokens - [CLS] and [SEP]
    input_ids = tokenizer_for_bert.encode (question, passage,  max_length= max_len, truncation=True)  
    ## ↑ [101,2029,2110,2198,1005,1055,2767,3268,102,2026,2171,2003,2198,1012,1045,2444,1999,2624,4560,1010,2662,1012,6487,2003,2026,2767,1012,2002,3268,1999,5862,1010,2899,102]
    
    #Getting number of tokens in 1st sentence (question) and 2nd sentence (passage)
    cls_index = input_ids.index(102) #Getting index of first SEP token
    len_question = cls_index + 1       # length of question (1st sentence)
    len_answer = len(input_ids)- len_question  # length of answer (2nd sentence)
    
    
    #BERT need Segment Ids to understand which tokens belong to sentence 1 and which to sentence 2
    segment_ids =  [0]*len_question + [1]*(len_answer)  #Segment ids will be 0 for question and 1 for answer
    
    #Converting token ids to tokens
    tokens = tokenizer_for_bert.convert_ids_to_tokens(input_ids) 
    ## ↑ ['[CLS]','which','state','john',"'",'s','friend','lives','[SEP]','my','name','is','john','.','i','live','in','san','jose',',','california','.','rob','is','my','friend','.','he','lives','in','seattle',',','washington','[SEP]']
    
    # getting start and end scores for answer. Converting input arrays to torch tensors before passing to the model
    start_token_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]) )[0]
    end_token_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]) )[1]
   
    #Converting scores tensors to numpy arrays so that we can use numpy functions
    start_token_scores = start_token_scores.detach().numpy().flatten()
    end_token_scores = end_token_scores.detach().numpy().flatten()
    
    #Picking start index and end index of answer based on start/end indices with highest scores
    answer_start_index = np.argmax(start_token_scores)
    answer_end_index = np.argmax(end_token_scores)
   
    #Getting scores for start token and end token of the answer. Also rounding it to 2 decimal digits
    start_token_score = np.round(start_token_scores[answer_start_index], 2)
    end_token_score = np.round(end_token_scores[answer_end_index], 2)
    
   
    #Combining subwords starting with ## so that we can see full words in output. Note tokenizer breaks words which are not in its vocab.
    answer = tokens[answer_start_index] #Answer starts with start index, we got based on highest score
    for i in range(answer_start_index + 1, answer_end_index + 1):
        if tokens[i][0:2] == '##':  # Token for a splitted word starts with ##
            answer += tokens[i][2:] # If token start with ## we remove ## and combine it with previous word so as to restore the unsplitted word
        else:
            answer += ' ' + tokens[i]  # If token does not start with ## we just put a space in between while combining tokens
            
    # Few patterns indicating that BERT does not get answer from the passage for question asked
    if ( answer_start_index == 0) or (start_token_score < 0 ) or  (answer == '[SEP]') or ( answer_end_index <  answer_start_index):
        answer = "Sorry!, I could not find  an answer in the passage."
    
    return ( answer_start_index, answer_end_index, start_token_score, end_token_score,  answer)
    # return (answer)



In [None]:
#Testing function
bert_answering_machine ("Which state john's friend lives", 'My name is John. I live in San Jose, California. Rob is my friend. He lives in Seattle, Washington')

(32, 32, 6.13, 6.94, 'washington')

## BERT_QA bot from Multiple_Texts

In [None]:
#  to get answer from an array of passages
def get_answer(q, p_array):
    score_list = []
    ans_list = []
    j_list = []
    for j in range (len(p_array)):  
        #p = preprocess(p_array[j] )
        p = p_array[j] 

        start, end , start_score, end_score,  ans = bert_answering_machine (q, p)
        #print( '\nText num:', j, 'Score:', start_score, end_score, '\nBERT Answer:', ans)
        
        if (start != 0) and (start_score > 0.25)  and (ans != '[SEP]')  :
            score_list.append(str(start_score) + ' and ' + str(end_score))
            ans_list.append(ans)
            j_list.append(j)
        else:
            text_num = None
            token_scores = None
            answer = "No Answer From BERT"

            
    if len(score_list) > 0 :
        ind = np.argmax(score_list)
        #print( 'Text number:', j_list[ind], ',  Token Scores:', score_list[ind], '\nBERT Answer:', ans_list[ind])
        text_num = j_list[ind]
        token_scores = score_list[ind]
        answer = ans_list[ind]
    else:
        text_num = None
        token_scores = None
        answer = "No Answer From BERT"
    return text_num, token_scores, answer

In [None]:
# passing 3 pagges and get_answer gets the  answer from best passege
passages_array=["I am a student , I study in UC Davis. I like to play Tennis",
    "John is a 10 year old boy. He is the son of Robert Smith.  Elizabeth Davis is Robert's wife. She teaches at UC Berkeley. Sophia Smith is Elizabeth's daughter. She studies at UC Davis", 
 "My name is John. I live in San Jose, California. Rob is my friend. He lives in Seattle, Washington, My sister is Kelly. " ]

for i in range(len(passages_array)):
    print (f'Passage: {i} : {passages_array[i]}\n')
question ="Which college does John's sister attend"  

passage_num, scores, answer = get_answer(question, passages_array)

print (f'The question: {question} \n Answer: {answer} , Passage Index Where Answer Was Found: {passage_num}, Scores: {scores}')

Passage: 0 : I am a student , I study in UC Davis. I like to play Tennis

Passage: 1 : John is a 10 year old boy. He is the son of Robert Smith.  Elizabeth Davis is Robert's wife. She teaches at UC Berkeley. Sophia Smith is Elizabeth's daughter. She studies at UC Davis

Passage: 2 : My name is John. I live in San Jose, California. Rob is my friend. He lives in Seattle, Washington, My sister is Kelly. 

The question: Which college does John's sister attend 
 Answer: uc davis , Passage Index Where Answer Was Found: 1, Scores: 5.83 and 6.35


## Document Segmenting

Now that we have our document corpus, we need to make sure that each document is short enough to fit into the 512 token limit of BERT. If a document is longer than 512 tokens, we'll simply segment it into multiple smaller chunks and add them to the final corpus.


In [None]:
def segment_documents(docs, max_doc_length=450):
  # List containing full and segmented docs
  segmented_docs = []

  for doc in docs:
    # Split document by spaces to obtain a word count that roughly approximates the token count
    split_to_words = doc.split(" ")

    # If the document is longer than our maximum length, split it up into smaller segments and add them to the list 
    if len(split_to_words) > max_doc_length:
      for doc_segment in range(0, len(split_to_words), max_doc_length):
        segmented_docs.append( " ".join(split_to_words[doc_segment:doc_segment + max_doc_length]))

    # If the document is shorter than our maximum length, add it to the list
    else:
      segmented_docs.append(doc)

  return segmented_docs

In [None]:
# contents = segment_documents(df_long_text['Content'].to_list())
# contents

## Finding Relevant Documents

Next, our goal is to find within this corpus the subset of documents that are most likely to contain our answer, because running every single document through our BERT model is expensive and doesn't help us narrow down a good answer. For this example, we'll simply use the scikit-learn TF-IDF vectorizer to convert our documents and our query into vectors. 

The document vectors with the highest cosine similarity to our query vector will be the best candidates to search for our answer, and we will feed these top candidate documents into the SQUAD model to get our predicted answers.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_top_k_articles(query, docs, k=2):

  # Initialize a vectorizer that removes English stop words
  vectorizer = TfidfVectorizer(analyzer="word", stop_words='english')

  # Create a corpus of query and documents and convert to TFIDF vectors
  query_and_docs = [query] + docs
  matrix = vectorizer.fit_transform(query_and_docs)

  # Holds our cosine similarity scores
  scores = []

  # The first vector is our query text, so compute the similarity of our query against all document vectors
  for i in range(1, len(query_and_docs)):
    scores.append(cosine_similarity(matrix[0], matrix[i])[0][0])

  # Sort list of scores and return the top k highest scoring documents
  sorted_list = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
  top_doc_indices = [x[0] for x in sorted_list[:k]]
  top_docs = [docs[x] for x in top_doc_indices]
  
  return top_docs

In [None]:
##### test
# myself test based on own csv file
df_long_text = pd.read_excel("(for test)[Chatbot] Content - Copy.xlsx")

# Segment our documents (If a document is longer than 512 tokens, we'll simply segment it into multiple smaller chunks and add them to the final corpus.)
contents = segment_documents(df_long_text['Content'].to_list())

# question ="when was nba founded?"  
question ="what is nba?"



# Retrieve the top k most relevant documents to the query ()
candidate_docs = get_top_k_articles(question, contents, 2)

passage_num, scores, answer = get_answer(question, candidate_docs)
print(f'Answer: {answer}')

# print (f'Answer: {answer} , Passage Index Where Answer Was Found: {passage_num}, Scores: {scores}')

FileNotFoundError: [Errno 2] No such file or directory: '(for test)[Chatbot] Content - Copy.xlsx'

In [None]:
### convert it to function:
def bert_text_search_start_end_QA(que, documents):
  # Segment our documents (If a document is longer than 512 tokens, we'll simply segment it into multiple smaller chunks and add them to the final corpus.)
  contents = segment_documents(documents)

  question =que

  # Retrieve the top k most relevant documents to the query ()
  candidate_docs = get_top_k_articles(question, contents, 2)

  passage_num, scores, answer = get_answer(question, candidate_docs)
  return answer


# Combination version (version02)!!!!!!!!!

In [None]:
df_long_text = pd.read_excel("(for test)[Chatbot] Content - Copy.xlsx")
contents = segment_documents(df_long_text['Content'].to_list())

In [None]:
# convert it to a function
def QA_bot(input_question,question_embeddings, corresponding_answer):
  ## embedding question
  encoded_question = rmds_model.encode([input_question])

  ## calculate the distance
  distances = spatial.distance.cdist(np.array(encoded_question), question_embeddings, 'cosine')[0]
  results = zip(range(len(distances)), distances)
  results = sorted(results, key=lambda x: x[1])

  for idx, distance in results[0:1]: # just getting top 1
      print(f"Your answer:")
      print(corresponding_answer[idx])

In [None]:
###################### test version(convert it to function) ######################
def combination_model(input_question, question_embeddings, corresponding_answer):
  ## embedding question
  encoded_question = rmds_model.encode([input_question])

  ## calculate the distance
  distances = spatial.distance.cdist(np.array(encoded_question), question_embeddings, 'cosine')[0]
  results = zip(range(len(distances)), distances)
  results = sorted(results, key=lambda x: x[1])

  for idx, distance in results[0:1]: # just getting top 1
      if 1-distance>0.5:
        print(f"Your answer:")
        # print(question_texts[idx])
        print(f"cosine_score:{1-distance}, {corresponding_answer[idx]}")
      else:
        bert_text_search_start_end_QA(input_question, contents)
        print(f'Answer: {answer}')

In [None]:
############## combination QA chatbot version!!!!!!!!!!!!!!!!!!!!!!!! ##############

question = input("\nPlease enter your question: \n")

while True:
  # question_answer(question)
  combination_model(question, question_embeddings, corresponding_answer)

  flag = True
  flag_N = False
  while flag:
    response = input("\nDo you have other questions? \n ")
    if response not in ["No", "no","not thing", "that's it","N"]:
      # question = input("\nPlease enter your question: \n")
      question = response
      flag = False
    else:
      print("\nSee you next time!")
      flag = False
      flag_N = True
  if flag_N == True:
    break


Please enter your question: 
nba
Answer: the national basketball association

Do you have other questions? 
 no

See you next time!
