<a href="https://colab.research.google.com/github/BenDoschGit/holbertonschool-machine_learning/blob/main/supervised_learning/0x13-qa_bot/QA_Bot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [10]:
#@title Install modules
!pip install tensorflow==2.3  # --force-reinstall
!pip install numpy==1.18  # --force-reinstall
!pip install tensorflow-hub
!pip install transformers



In [11]:
#@title Imports & version checks
import os
import sys

import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from transformers import BertTokenizer
# Report the versions the kernel is actually running. sys.version describes
# this interpreter, whereas a shell `python` may resolve to a different one.
print("TensorFlow", tf.__version__)
print("Numpy", np.__version__)
print("Python", sys.version.split()[0])

TensorFlow 2.3.0
Numpy 1.18.0
Python 3.7.12


In [12]:
#@title Mount drive
from google.colab import drive

# Google Drive is attached under this directory of the Colab VM.
DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


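## Download the dataset zip file from [this pre-signed S3 link](https://holbertonintranet.s3.amazonaws.com/uploads/misc/2020/11/c15a067b44a328c7d5a03c79070b7865f444d1e3.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARDDGGGOU5BHMTQX4%2F20220119%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20220119T223958Z&X-Amz-Expires=345600&X-Amz-SignedHeaders=host&X-Amz-Signature=c6fc92cc2f1063ba24faab28477a5c0b46c362b514d51ee005aa540412cef32c)

Note: the link is a pre-signed URL (`X-Amz-Date=20220119T223958Z`, `X-Amz-Expires=345600` seconds, i.e. 4 days), so it stops working after that window; request a fresh link if the download is denied. The unzip cell below expects the archive at `Datasets/ZendeskArticles.zip` in your Drive.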



In [None]:
#@title Unzip files 
# add > /dev/null to supress output
!unzip gdrive/My\ Drive/Datasets/ZendeskArticles.zip -d gdrive/My\ Drive/Datasets/

# 0-qa.py

In [13]:
# Tokenizer/model pair, filled on first use so that repeated questions do not
# reload the weights on every call.
_QA_RESOURCES = {}


def _load_qa_resources():
    """Return the (tokenizer, model) pair, loading both only on the first call."""
    if 'model' not in _QA_RESOURCES:
        # Load into locals first so a failed load never leaves a half-filled cache.
        tokenizer = BertTokenizer.from_pretrained(
            'bert-large-uncased-whole-word-masking-finetuned-squad')
        model = hub.load("https://tfhub.dev/see--/bert-uncased-tf2-qa/1")
        _QA_RESOURCES['tokenizer'] = tokenizer
        _QA_RESOURCES['model'] = model
    return _QA_RESOURCES['tokenizer'], _QA_RESOURCES['model']


def question_answer(question, reference):
    """Function that finds a snippet of text within a reference document to
    answer a question.

    Args:
        question (str): String containing the question to answer.
        reference (str): String containing the reference document from which to
            find the answer.

    Returns:
        answer (str or None): The answer found by the function. Returns None
            when the question or the reference is empty/blank, or when the
            selected token span is empty.
    """
    # Nothing to ask or nothing to search: skip tokenizing and the model call.
    if not (question and question.strip() and reference and reference.strip()):
        return None

    tokenizer, model = _load_qa_resources()

    question_tokens = tokenizer.tokenize(question)
    reference_tokens = tokenizer.tokenize(reference)
    tokens = (['[CLS]'] + question_tokens + ['[SEP]'] +
              reference_tokens + ['[SEP]'])
    input_word_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_word_ids)
    # Segment ids: 0 for '[CLS]' + question + '[SEP]', 1 for reference + '[SEP]'.
    input_type_ids = ([0] * (1 + len(question_tokens) + 1) +
                      [1] * (len(reference_tokens) + 1))
    # Each list becomes an int32 tensor with a leading batch dimension of 1.
    input_word_ids, input_mask, input_type_ids = map(
        lambda t: tf.expand_dims(tf.convert_to_tensor(t, dtype=tf.int32), 0),
        (input_word_ids, input_mask, input_type_ids))
    outputs = model([input_word_ids, input_mask, input_type_ids])

    # outputs[0] / outputs[1] are used as the start / end position scores.
    # Using `[1:]` skips `outputs[0][0][0]`, the ignored '[CLS]' token logit,
    # so a position after '[CLS]' is always selected; `+ 1` restores the offset.
    short_start = int(tf.argmax(outputs[0][0][1:])) + 1
    short_end = int(tf.argmax(outputs[1][0][1:])) + 1

    # When the end position precedes the start position the slice is empty.
    answer_tokens = tokens[short_start: short_end + 1]
    answer = tokenizer.convert_tokens_to_string(answer_tokens)
    # Honour the documented contract: an empty answer is reported as None.
    return answer or None

In [14]:
# Read the article as UTF-8 explicitly (as semantic_search does) instead of
# relying on the platform's default encoding.
with open('gdrive/MyDrive/Datasets/ZendeskArticles/PeerLearningDays.md',
          encoding='utf-8') as f:
    reference = f.read()

print(question_answer('When are PLDs?', reference))

INFO:absl:Using /tmp/tfhub_modules to cache modules.


on - site days from 9 : 00 am to 3 : 00 pm


### Expected output
~~~
on - site days from 9 : 00 am to 3 : 00 pm
~~~

# 1-loop.py

In [1]:
# Prompt loop: keeps asking for input until an exit keyword is typed.
exits = ('exit', 'quit', 'goodbye', 'bye')
while True:
    Q = input("Q: ")
    if Q.lower() not in exits:
        # No answering logic at this stage, so the reply is left blank.
        print('A: ')
        continue
    # Any exit keyword (case-insensitive) ends the session.
    print('A: Goodbye')
    break

Q: 
A: 
Q: Hello
A: 
Q: Bye
A: Goodbye


### Example output
~~~
Q: Hello
A:
Q: How are you?
A:
Q: BYE
A: Goodbye
~~~

# 2-qa.py

In [15]:
def answer_loop(reference):
    """Run an interactive loop that answers questions from one document.

    Each question typed at the prompt is answered with a snippet of text
    found in the reference document.

    Args:
        reference (str): String containing the reference document from which
            to find the answers.

    Type 'exit', 'quit', 'goodbye', or 'bye' (any letter case) to exit.
    """
    farewell_words = ('exit', 'quit', 'goodbye', 'bye')

    while True:
        query = input("Q: ")

        # Leave the loop as soon as an exit keyword is entered.
        if query.lower() in farewell_words:
            print('A: Goodbye')
            return

        reply = question_answer(query, reference)
        if not reply:
            # A falsy result means no usable answer was produced.
            print('A: Sorry, I do not understand your question.')
        else:
            print('A:', reply)

In [16]:
# Read the article as UTF-8 explicitly (as semantic_search does) instead of
# relying on the platform's default encoding.
with open('gdrive/MyDrive/Datasets/ZendeskArticles/PeerLearningDays.md',
          encoding='utf-8') as f:
    reference = f.read()

answer_loop(reference)

Q: When are PLDs?
A: on - site days from 9 : 00 am to 3 : 00 pm
Q: What are Mock Interviews?
A: Sorry, I do not understand your question.
Q: What does PLD stand for?
A: peer learning days
Q: EXIT
A: Goodbye


### Example output
~~~
Q: When are PLDs?
A: from 9 : 00 am to 3 : 00 pm
Q: What are Mock Interviews?
A: Sorry, I do not understand your question.
Q: What does PLD stand for?
A: peer learning days
Q: EXIT
A: Goodbye
~~~

# 3-semantic_search.py

In [17]:
# Sentence encoder, filled on first use so that repeated searches do not
# reload the model on every call.
_SENTENCE_ENCODER = {}


def _load_sentence_encoder():
    """Return the sentence-encoder model, loading it only on the first call."""
    if 'model' not in _SENTENCE_ENCODER:
        model_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
        _SENTENCE_ENCODER['model'] = hub.load(model_url)
    return _SENTENCE_ENCODER['model']


def semantic_search(corpus_path, sentence):
    """Function that performs semantic search on a corpus of documents.

    Args:
        corpus_path (str): The path to the corpus of reference documents on
            which to perform semantic search. Only files ending in '.md' are
            read.
        sentence (str): The sentence from which to perform semantic search.

    Returns:
        text (str): The reference text of the document whose embedding has the
            largest inner product with the embedding of sentence.

    Raises:
        ValueError: If corpus_path contains no '.md' documents.
    """
    documents = []
    # Sorted listing keeps the document order (and therefore tie-breaking)
    # the same from run to run.
    for file_name in sorted(os.listdir(corpus_path)):
        if file_name.endswith('.md'):
            file_path = os.path.join(corpus_path, file_name)
            with open(file_path, encoding="utf-8") as f:
                documents.append(f.read())

    # Fail early with a clear message, before the model is loaded.
    if not documents:
        raise ValueError(
            "No '.md' documents found in corpus path: {}".format(corpus_path))

    model = _load_sentence_encoder()

    # Row 0 is the query sentence; rows 1..N are the documents, in order.
    embedded = np.asarray(model([sentence] + documents))

    # Only the query-versus-document scores are needed, not the full
    # document-versus-document matrix.
    similarities = np.inner(embedded[0], embedded[1:])

    return documents[int(np.argmax(similarities))]


In [18]:
# Fetch the article that best matches the question, then show its full text.
best_match = semantic_search('gdrive/MyDrive/Datasets/ZendeskArticles',
                             'When are PLDs?')
print(best_match)

INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder-large/5'.
INFO:absl:Downloaded https://tfhub.dev/google/universal-sentence-encoder-large/5, Total size: 577.10MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder-large/5'.














PLD Overview
Peer Learning Days (PLDs) are a time for you and your peers to ensure that each of you understands the concepts you've encountered in your projects, as well as a time for everyone to collectively grow in technical, professional, and soft skills. During PLD, you will collaboratively review prior projects with a group of cohort peers.
PLD Basics
PLDs are mandatory on-site days from 9:00 AM to 3:00 PM. If you cannot be present or on time, you must use a PTO. 
No laptops, tablets, or screens are allowed until all tasks have been whiteboarded and understood by the entirety of your group. This time is for whiteboarding, dialogue, and active peer collaboration. After this, you may return to computers with each other to pair or group program. 
Peer Learning Days are not about sharing solutions. This doesn't empower peers with the ability to solve problems themselves! Peer learning is when you share your thought process, whether through conversation, whiteboarding, debugging, or li

### Expected output
~~~
PLD Overview
Peer Learning Days (PLDs) are a time for you and your peers to ensure that each of you understands the concepts you've encountered in your projects, as well as a time for everyone to collectively grow in technical, professional, and soft skills. During PLD, you will collaboratively review prior projects with a group of cohort peers.
PLD Basics
PLDs are mandatory on-site days from 9:00 AM to 3:00 PM. If you cannot be present or on time, you must use a PTO. 
No laptops, tablets, or screens are allowed until all tasks have been whiteboarded and understood by the entirety of your group. This time is for whiteboarding, dialogue, and active peer collaboration. After this, you may return to computers with each other to pair or group program. 
Peer Learning Days are not about sharing solutions. This doesn't empower peers with the ability to solve problems themselves! Peer learning is when you share your thought process, whether through conversation, whiteboarding, debugging, or live coding. 
When a peer has a question, rather than offering the solution, ask the following:
"How did you come to that conclusion?"
"What have you tried?"
"Did the man page give you a lead?"
"Did you think about this concept?"
Modeling this form of thinking for one another is invaluable and will strengthen your entire cohort.
Your ability to articulate your knowledge is a crucial skill and will be required to succeed during technical interviews and through your career.
~~~




# 4-qa.py

In [19]:
def question_answer_improved(coprus_path):
    """Run an interactive loop that answers questions from a document corpus.

    For every question, the most relevant reference document is selected with
    semantic_search and the answer is then extracted from it with
    question_answer.

    Args:
        coprus_path (str): The path to the corpus of reference documents (the
            parameter name's spelling is kept so existing callers still work).

    Type 'exit', 'quit', 'goodbye', or 'bye' (any letter case) to exit.
    """
    farewell_words = ('exit', 'quit', 'goodbye', 'bye')

    while True:
        query = input("Q: ")

        # Leave the loop as soon as an exit keyword is entered.
        if query.lower() in farewell_words:
            print('A: Goodbye')
            return

        # Pick the best-matching document first, then answer from it.
        best_document = semantic_search(coprus_path, query)
        reply = question_answer(query, best_document)
        if not reply:
            # A falsy result means no usable answer was produced.
            print('A: Sorry, I do not understand your question.')
        else:
            print('A:', reply)

In [None]:
# Start the interactive Q&A session over the whole article corpus;
# type 'exit', 'quit', 'goodbye' or 'bye' at the prompt to stop.
question_answer_improved('gdrive/MyDrive/Datasets/ZendeskArticles')

Q: When are PLDs?
















A: on - site days from 9 : 00 am to 3 : 00 pm
Q: What are Mock Interviews?
















A: help you train for technical interviews
Q: goodbye
A: Goodbye


### Example output
~~~
Q: When are PLDs?
A: on - site days from 9 : 00 am to 3 : 00 pm
Q: What are Mock Interviews?
A: help you train for technical interviews
Q: What does PLD stand for?
A: peer learning days
Q: goodbye
A: Goodbye
~~~