# Dev Notebook for DM R&D
Maintainer: Alberto Chierici

alberto.chierici@nyu.edu

## set environment

In [278]:
# ! pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_paraphrase_distilroberta_base_v1-0.1.2.tar.gz#en_paraphrase_distilroberta_base_v1-0.1.2

In [2]:
import sqlalchemy as db
from sqlalchemy.sql import text
import pandas as pd
import numpy as np
import numpy
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.snowball import SnowballStemmer
import spacy
import spacy_sentence_bert

**Upload language models**

In [2]:
NLP = spacy.load("en_core_web_lg")
# NLP_BERT = spacy.load("en_paraphrase_distilroberta_base_v1")
# NLP_BERT = spacy_sentence_bert.load_model('multi-qa-mpnet-base-dot-v1')
NLP_BERT = spacy.blank('en')
NLP_BERT.add_pipe('sentence_bert', config={'model_name': 'multi-qa-MiniLM-L6-cos-v1'})
NLP_BERT.pipe_names

['sentence_bert']

### get data

In [3]:
# stream_ids = '(5, 7, 8, 9, 12, 17, 18, 19, 20, 21)'
stream_id = 2
TOIA_ID = 2

In [5]:
import os

# Connection URL of the TOIA MySQL database. Set the TOIA_DB_URL environment
# variable to point somewhere else (and to keep real credentials out of the
# notebook); the fallback is the local development instance.
sql_url = os.environ.get(
    "TOIA_DB_URL",
    "mysql+mysqlconnector://root:anypasswords@localhost:3307/toia",
)

engine = db.create_engine(sql_url)
connection = engine.connect()
metadata = db.MetaData()

# Multi-stream variant, kept for reference. It also selects q.id, so it needs
# the six-column DataFrame layout that is commented out further down.
# statement = text(f"""
#     SELECT vqs.id_stream as stream_id_stream, 
#         vqs.type, q.question, v.answer, v.id_video, q.id
#     FROM video v
#     JOIN videos_questions_streams vqs ON vqs.id_video = v.id_video
#     JOIN questions q ON q.id = vqs.id_question
#     WHERE vqs.id_stream in {stream_ids}
#     AND v.private = 0 
#     AND vqs.type NOT IN ('filler');""")

# All public, non-filler question/answer videos of one stream. The stream id
# is passed as a bound parameter instead of being formatted into the SQL text.
statement = text("""
    SELECT vqs.id_stream as stream_id_stream,
        vqs.type, q.question, v.answer, v.id_video
    FROM video v
    JOIN videos_questions_streams vqs ON vqs.id_video = v.id_video
    JOIN questions q ON q.id = vqs.id_question
    WHERE vqs.id_stream = :stream_id
    AND v.private = 0
    AND vqs.type NOT IN ('filler');""").bindparams(stream_id=stream_id)

# ResultProxy = connection.execute(avatar_kb)
ResultProxy = connection.execute(statement)
ResultSet = ResultProxy.fetchall()

# df_avatar = pd.DataFrame(ResultSet, 
#              columns=[
#                  'stream_id_stream',
#                  'type',
#                  'question',
#                  'answer',
#                  'id_video',
#                  'id_question',
#              ])

# Column names follow the order of the SELECT list above.
df_avatar = pd.DataFrame(ResultSet,
             columns=[
                 'stream_id_stream',
                 'type',
                 'question',
                 'answer',
                 'id_video',
             ])

In [6]:
df_avatar

Unnamed: 0,stream_id_stream,type,question,answer,id_video
0,2,no-answer,Say: I don't have an answer to that right now.,.,toia7_2_11_e51a1eac.mp4
1,2,answer,tell me something nice to do in Abu Dhabi,I love going to the desert.,toia7_2_1212_7df57a33.mp4
2,2,answer,What advice would you give to someone who is c...,I recommend you start coding as soon as possible.,toia7_2_1226_fe0f2cea.mp4
3,2,no-answer,"Say: Sorry, I didn't record answers to that qu...","Sorry, I didn't record answers to that question.",toia7_2_12_d7c9f2a0.mp4
4,2,answer,Tell me your favorite food,My favorite food is steak and fries.,toia7_2_1354_2bbe3613.mp4
...,...,...,...,...,...
94,2,y/n-answer,Do you like Yoga?,No.,toia7_2_747_86218594.mp4
95,2,y/n-answer,Do you like cold weather?,No.,toia7_2_747_86218594.mp4
96,2,answer,What is your name?,My name is Alberto.,toia7_2_749_ab32149e.mp4
97,2,answer,How do I call you?,"You can call me, Alberto.",toia7_2_817_9727d0b6.mp4


In [541]:
# NOTE(review): the DataFrame built by the active query has no 'id_question'
# column (only the commented-out multi-stream query / column list provides
# it), so this sort raises KeyError unless that variant was run - TODO confirm.
# NOTE(review): nth(range(10, 20)) keeps positions 10-19 within each stream,
# i.e. the second block of ten rows, despite the 'first_10' name - TODO confirm
# the intended slice.
df_first_10 = df_avatar.sort_values(by=['stream_id_stream', 'id_question']).groupby('stream_id_stream').nth(range(10, 20))
# Overwrites df_avatar with the subset, so re-running this cell subsets the
# already-reduced frame again.
df_avatar = df_first_10.reset_index().copy()

## NLP analysis with spaCy

In [534]:
docs = NLP.pipe(df_avatar['question'].values)
# for doc in docs:
#     print("--- next doc ---")
#     print(doc)
#     for token in doc:
#         print(token.text, token.pos_, token.tag_)
#     for ent in doc.ents:
#         print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [367]:
doc = NLP("hey, hi!")

In [368]:
['INTJ', 'UH'] in [[token.pos_, token.tag_] for token in doc]

True

In [369]:
set([token.tag_ for token in doc])

{',', '.', 'UH'}

In [370]:
df_greetings = df_avatar[df_avatar['type'] == "greeting"]
if df_greetings.shape[0] > 0:
    df_greetings.sample(n=1)['answer'].values[0]
else:
    print("204 No Content: you haven't recorded greetings")
        

## helper functions

In [371]:
def process_text(text):
    """Lower-case ``text``, run it through the module-level spaCy pipeline
    ``NLP`` and return the surviving lemmas joined by single spaces.

    A token is dropped when its text is in ``NLP.Defaults.stop_words``, when
    it is punctuation, or when its lemma is the ``'-PRON-'`` placeholder.
    """
    stop_words = NLP.Defaults.stop_words
    kept_lemmas = [
        token.lemma_
        for token in NLP(text.lower())
        if token.text not in stop_words
        and not token.is_punct
        and token.lemma_ != '-PRON-'
    ]
    return " ".join(kept_lemmas)

In [372]:
def calculate_similarity(text1, text2):
    """Return the spaCy ``Doc.similarity`` score of the two texts, each first
    cleaned with ``process_text`` and re-parsed with ``NLP``."""
    cleaned_first, cleaned_second = process_text(text1), process_text(text2)
    return NLP(cleaned_first).similarity(NLP(cleaned_second))
# def calculate_similarity(text1, text2):
#     base = NLP_BERT(text1)
#     compare = NLP_BERT(text2)
#     return base.similarity(compare)

In [373]:
ps = SnowballStemmer('english')

def preprocess(text):
    """Normalise ``text`` for the bag-of-words matcher.

    Every character that is not an ASCII letter becomes a space, the result is
    lower-cased and split on whitespace, each word is stemmed with the
    module-level stemmer ``ps`` and the stems are joined with single spaces.
    Stop words are not removed.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text).lower()
    return ' '.join(ps.stem(word) for word in letters_only.split())


def toia_answer(query, dataset, k=1):
    """Return the recorded answer that best matches ``query``.

    The stages are tried in this order:

    1. Greeting - if spaCy tags any query token as INTJ/UH, a random row of
       type "greeting" from ``dataset`` is returned (a random "no-answer" row
       when no greeting exists).
    2. TF-IDF - cosine similarity between the stemmed query and every stemmed
       question; the best row is returned when its score exceeds 0.7.
    3. spaCy vectors - ``calculate_similarity`` against every question; the
       best row is returned when its score exceeds 0.5.

    If nothing matches, a random "no-answer" row is returned.

    Parameters
    ----------
    query : str
        The user's question.
    dataset : pandas.DataFrame
        Needs the columns 'type', 'question', 'answer' and 'id_video'. Rows are
        addressed by position, so any index is accepted.
    k : int, default 1
        Number of top TF-IDF scores shown in the returned note. A single
        answer is returned whatever the value.

    Returns
    -------
    tuple
        ``(answer, id_video)`` for a greeting hit and
        ``(answer, id_video, note)`` in every other case. When the required
        fallback rows are missing the tuple is ``(message, "204", "No Content")``.
        NOTE(review): the greeting hit is one element shorter than the other
        results; callers must not unpack a fixed length.
    """
    doc = NLP(query)
    # if Greeting, greet
    if ['INTJ', 'UH'] in [[token.pos_, token.tag_] for token in doc]:
        # Look for greetings in the dataset that was passed in, not in the
        # notebook-level df_greetings, which may describe another frame.
        greetings = dataset[dataset['type'] == "greeting"]
        if greetings.shape[0] > 0:
            answers = greetings.sample(n=1)
            return answers['answer'].values[0], answers['id_video'].values[0]
        else:
            df_noanswers = dataset[dataset['type'] == "no-answer"]
            if df_noanswers.shape[0] > 0:
                answers = df_noanswers.sample(n=1)
                return answers['answer'].values[0], answers['id_video'].values[0], "Record some reetings"
            else:
                return "You haven't recorded greetings nor no-answers", "204", "No Content"

    # Read the questions by position so a filtered or re-ordered index works.
    questions = dataset['question'].tolist()
    querycorpus = [preprocess(question) for question in questions]

    # Creating the Bag of Words model with TFIDF and calc cosine_similarity
    vectorizer = CountVectorizer(decode_error="replace")
    # fitting is only needed to get the attribute vocabulary_
    vectorizer.fit(querycorpus)
    training_vocabulary = vectorizer.vocabulary_
    transformer = TfidfTransformer()
    trainingvoc_vectorizer = CountVectorizer(
        decode_error="replace", vocabulary=training_vocabulary)
    tfidf_querycorpus = TfidfVectorizer().fit_transform(querycorpus)

    # NOTE(review): the query is weighted by a TfidfTransformer fitted on the
    # query alone, while the corpus matrix comes from a TfidfVectorizer fitted
    # on the corpus, so the two sides do not share IDF weights. The 0.7
    # threshold was chosen with this behaviour; re-tune it if this is unified.
    tfidf_userquestion = transformer.fit_transform(
        trainingvoc_vectorizer.fit_transform(
            numpy.array([
                preprocess(query)
            ])))
    cosine_similarities = cosine_similarity(tfidf_userquestion, tfidf_querycorpus)
    related_docs_indices = (-cosine_similarities[0]).argsort()
    sorted_freq = cosine_similarities[0][related_docs_indices]

    if sum(sorted_freq) == 0:
        # The query shares no term with any recorded question.
        df_noanswers = dataset[dataset['type'] == "no-answer"]
        if df_noanswers.shape[0] > 0:
            answers = df_noanswers.sample(n=1)
            return answers['answer'].values[0], answers['id_video'].values[0], "tfidf all sim 0"
        else:
            return "You haven't recorded no-answers", "204", "No Content"
    elif sorted_freq[0] > 0.7:  #(the top sorted freq is the max)
        # Both branches of the former tie handling returned the top-ranked
        # row, so it is taken directly. This also avoids indexing
        # sorted_freq[k], which failed for datasets with k rows or fewer.
        best = related_docs_indices[0]
        return dataset.iloc[best]['answer'], dataset.iloc[best]['id_video'], f"tfidf sim: {sorted_freq[:k]}"

    else:
        # Score the raw question strings directly; calculate_similarity does
        # its own parsing, so a separate NLP.pipe pass adds nothing.
        cosine_similarities = [calculate_similarity(query, question) for question in questions]
        if max(cosine_similarities) > 0.5:
            related_docs_indices = np.argsort(cosine_similarities)[::-1]
            selected = related_docs_indices[0]
            return dataset.iloc[selected]['answer'], dataset.iloc[selected]['id_video'], f"spaCy sim: {cosine_similarities[selected]}"
        else:
            df_noanswers = dataset[dataset['type'] == "no-answer"]
            if df_noanswers.shape[0] > 0:
                answers = df_noanswers.sample(n=1)
                return answers['answer'].values[0], answers['id_video'].values[0], "spaCy sim below thr"
            else:
                return "You haven't recorded no-answers", "204", "No Content"

In [375]:
# toia_answer("Do you like swimming?", df_avatar)

***
## use transformers models

### testing update helper functions

In [428]:
from transformers import AutoTokenizer, AutoModel
import torch
# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
model = AutoModel.from_pretrained("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")


# #CLS Pooling - Take output from first token
# def cls_pooling(model_output):
#     return model_output.last_hidden_state[:,0]

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence over its unmasked tokens.

    ``model_output.last_hidden_state`` holds the token embeddings.
    ``attention_mask`` is broadcast over the embedding dimension, so positions
    with mask 0 add nothing to the sum. The divisor is clamped to 1e-9 so a
    fully masked row yields zeros rather than a division by zero.
    """
    token_embeddings = model_output.last_hidden_state
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = (token_embeddings * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / token_counts

#Encode text
def encode(texts, normalize=False):
    """Embed ``texts`` with the module-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    texts : str or list of str
        Input passed straight to the tokenizer, which pads and truncates it.
    normalize : bool, default False
        When True, every embedding is scaled to unit L2 norm, so a dot product
        between two embeddings is their cosine similarity. The default keeps
        the raw mean-pooled vectors this notebook has used so far.
        NOTE(review): the checkpoint name ends in "cos-v1", so it is presumably
        meant to be compared by cosine similarity - confirm against the model
        card, and re-tune the score thresholds if this is switched on.

    Returns
    -------
    torch.Tensor
        One mean-pooled embedding per input text (one row each).
    """
    # Tokenize sentences
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings; no gradients are needed for inference
    with torch.no_grad():
        model_output = model(**encoded_input, return_dict=True)

    # Perform pooling
    embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    if normalize:
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

    return embeddings

Downloading:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [543]:
#Encode docs
doc_emb = encode(df_avatar['answer'].values.tolist())

In [544]:
# query = "what's your name"
# dataset = df_avatar.copy()
# k = 1
def toia_answer_new(query, dataset, doc_emb, k=5):
    """Return the recorded question/answer pair that best matches ``query``.

    The stages are tried in this order:

    1. Greeting - if spaCy tags any query token as INTJ/UH, a random row of
       type "greeting" is returned (the "no-answer" fallback when none exists).
    2. TF-IDF - cosine similarity between the stemmed query and every stemmed
       question; the best row is returned when its score exceeds 0.85.
    3. Transformer - dot product between ``encode(query)`` and ``doc_emb``;
       the best row is returned when its score exceeds 0.4.

    If nothing matches, the "no-answer" fallback is returned.

    Parameters
    ----------
    query : str
        The user's question.
    dataset : pandas.DataFrame
        Needs the columns 'type', 'question', 'answer' and 'id_video'. Rows are
        addressed by position, so any index is accepted.
    doc_emb : torch.Tensor
        One embedding per row of ``dataset``, in the same row order. The
        alignment is not checked here.
    k : int, default 5
        Number of top TF-IDF scores shown in the returned note. A single
        answer is returned whatever the value.

    Returns
    -------
    tuple
        ``(question, answer, id_video)`` for a greeting hit and
        ``(question, answer, id_video, note)`` otherwise.
        NOTE(review): the greeting hit is one element shorter than the other
        results; callers must not unpack a fixed length.
    """
    doc = NLP(query)
    # if Greeting, greet
    if ['INTJ', 'UH'] in [[token.pos_, token.tag_] for token in doc]:
        greetings = dataset[dataset['type'] == "greeting"]
        if greetings.shape[0] > 0:
            answers = greetings.sample(n=1)
            return answers['question'].values[0], answers['answer'].values[0], answers['id_video'].values[0]
        return _no_answer_reply(dataset, "No greetings recorded")

    # Read the questions by position so a filtered or re-ordered index works.
    querycorpus = [preprocess(question) for question in dataset['question'].tolist()]

    # Creating the Bag of Words model with TFIDF and calc cosine_similarity
    vectorizer = CountVectorizer(decode_error="replace")
    # fitting is only needed to get the attribute vocabulary_
    vectorizer.fit(querycorpus)
    training_vocabulary = vectorizer.vocabulary_
    transformer = TfidfTransformer()
    trainingvoc_vectorizer = CountVectorizer(
        decode_error="replace", vocabulary=training_vocabulary)
    tfidf_querycorpus = TfidfVectorizer().fit_transform(querycorpus)

    # NOTE(review): the query is weighted by a TfidfTransformer fitted on the
    # query alone, while the corpus matrix comes from a TfidfVectorizer fitted
    # on the corpus, so the two sides do not share IDF weights. The 0.85
    # threshold was chosen with this behaviour; re-tune it if this is unified.
    tfidf_userquestion = transformer.fit_transform(
        trainingvoc_vectorizer.fit_transform(
            numpy.array([
                preprocess(query)
            ])))
    cosine_similarities = cosine_similarity(tfidf_userquestion, tfidf_querycorpus)
    related_docs_indices = (-cosine_similarities[0]).argsort()
    sorted_freq = cosine_similarities[0][related_docs_indices]

    if sum(sorted_freq) == 0:
        # The query shares no term with any recorded question.
        return _no_answer_reply(dataset, "tfidf sim all 0")

    elif sorted_freq[0] > 0.85:  #(the top sorted freq is the max)
        # Both branches of the former tie handling returned the top-ranked
        # row, so it is taken directly. This also avoids indexing
        # sorted_freq[k], which failed for datasets with k rows or fewer.
        best = related_docs_indices[0]
        return dataset.iloc[best]['question'], dataset.iloc[best]['answer'], dataset.iloc[best]['id_video'], f"tfidf sim: {sorted_freq[:k]}"

    else:
        #Encode query
        query_emb = encode(query)
        #Compute dot score between query and all document embeddings
        scores = torch.mm(query_emb, doc_emb.transpose(0, 1))[0].cpu().tolist()
        if max(scores) > 0.4:
            related_docs_indices = np.argsort(scores)[::-1]
            selected = related_docs_indices[0]
            return dataset.iloc[selected]['question'], dataset.iloc[selected]['answer'], dataset.iloc[selected]['id_video'], f"Trsf sim: {scores[selected]}"
        return _no_answer_reply(dataset, f"Trsf sim: {max(scores)}")


def _no_answer_reply(dataset, note):
    """Build the fallback reply from a random "no-answer" row of ``dataset``.

    Returns ``(question, answer, id_video, note)``. When no "no-answer" row has
    been recorded, returns
    ``(None, "You haven't recorded no-answers", "204", "No Content")`` instead
    of letting ``sample`` raise on an empty frame.
    """
    df_noanswers = dataset[dataset['type'] == "no-answer"]
    if df_noanswers.shape[0] == 0:
        return None, "You haven't recorded no-answers", "204", "No Content"
    answers = df_noanswers.sample(n=1)
    return answers['question'].values[0], answers['answer'].values[0], answers['id_video'].values[0], note

            
# For testing function, convert all 'return' to 'return_a ='
# return_a  

In [545]:
toia_answer_new("Do you have a hobby?", df_avatar, doc_emb)

('Possible questions:\n\n What are your hobbies?',
 'I like to travel and cook.',
 'toia159_8_510_89d3de0d.mp4',
 'Trsf sim: 0.4560806155204773')

In [546]:
def run_toia(dataset, doc_emb):
    """Interactive console loop around ``toia_answer_new``.

    Reads queries from standard input and prints the tuple returned for each
    one. The loop ends when the user types exactly ``stop`` or when
    ``toia_answer_new`` returns ``None``.
    """
    print("TOIA starts")

    # iter(callable, sentinel) keeps prompting until the input equals "stop".
    for query in iter(lambda: input('> '), "stop"):
        output = toia_answer_new(query, dataset, doc_emb)
        if output is None:
            break

        print(output)

In [547]:
run_toia(df_avatar, doc_emb)

TOIA starts


>  what do you do


('What do you do?', "I'm student here at New York University Abu Dhabi.", 'toia130_5_1013_64d9c7b8.mp4', 'tfidf sim: [0.98675727 0.98675727 0.98675727 0.66467081 0.56994491]')


>  what do you study


('What do you study?', 'I am a first year student at college and I study Economics.', 'toia161_20_893_a3d33b2d.mp4', 'tfidf sim: [0.90108486 0.7636529  0.7636529  0.7636529  0.6742295 ]')


>  where do you study


('What do you do?', 'I study computer science.', 'toia155_12_196_2818df47.mp4', 'Trsf sim: 0.5651909112930298')


>  what do you want to do after college


('What are your plans for after you graduate?', "As of now, I don't have any plans after graduation. Either it's going to be Masters. Or probably working in consulting or research. Cuz right now I don't have any specific.", 'toia189_21_872_0dbd05fb.mp4', 'Trsf sim: 0.5842505693435669')


>  do you have any pastime?


('What is your favorite thing to do on the weekends?', "Sometime, I love to cook something, mostly I love to prepare some cakes or something sweet, but I don't have a lot of time to do so.", 'toia201_17_589_613b98e9.mp4', 'Trsf sim: 0.4128788709640503')


>  what do you like cooking


('What is your favorite thing to do on the weekends?', "Sometime, I love to cook something, mostly I love to prepare some cakes or something sweet, but I don't have a lot of time to do so.", 'toia201_17_589_613b98e9.mp4', 'Trsf sim: 0.6654713153839111')


>  are you having fun


('What do you like to do in your spare time?', 'In my free time, I enjoy, creating new networks. Going out for a cup of coffee or dinner with new friends is my idea of fun.', 'toia201_17_1122_11bf90c0.mp4', 'Trsf sim: 0.5132184028625488')


>  stop


***
## using OpenAI GPT-3

In [1]:
from dotenv import dotenv_values
import openai
# NOTE: this import rebinds the name `cosine_similarity`. The sklearn function
# of the same name imported in the notebook's first import cell is shadowed
# from here on, and toia_answer / toia_answer_new call that name on 2-D TF-IDF
# matrices. NOTE(review): presumably the two functions are not
# interchangeable - verify before calling those helpers after this cell in
# the same kernel.
from openai.embeddings_utils import get_embedding, cosine_similarity
import pandas as pd
import numpy as np

# Credentials are read through dotenv rather than written in the notebook.
config = dotenv_values()
openai.organization = config['YOUR_ORG_ID']
openai.api_key = config['OPENAI_API_KEY']
# Last expression of the cell: the model listing is shown as the cell output.
openai.Model.list()

<OpenAIObject list at 0x13a7e0e50> JSON: {
  "data": [
    {
      "created": 1649880484,
      "id": "text-davinci-002",
      "object": "model",
      "owned_by": "openai",
      "parent": null,
      "permission": [
        {
          "allow_create_engine": false,
          "allow_fine_tuning": false,
          "allow_logprobs": true,
          "allow_sampling": true,
          "allow_search_indices": false,
          "allow_view": true,
          "created": 1657647746,
          "group": null,
          "id": "modelperm-vOEb19FFOLfIZy8y1GZipaze",
          "is_blocking": false,
          "object": "model_permission",
          "organization": "*"
        }
      ],
      "root": "text-davinci-002"
    },
    {
      "created": 1649364042,
      "id": "text-ada-001",
      "object": "model",
      "owned_by": "openai",
      "parent": null,
      "permission": [
        {
          "allow_create_engine": false,
          "allow_fine_tuning": false,
          "allow_logprobs": true,

### embed

In [84]:
df_avatar['combined'] = "Question: " + df_avatar.question.str.strip() + "; Answer: " + df_avatar.answer.str.strip()
df_avatar.head(2)

Unnamed: 0,stream_id_stream,type,question,answer,id_video,combined,ada_similarity,ada_search
0,2,no-answer,Say: I don't have an answer to that right now.,.,toia7_2_11_e51a1eac.mp4,Question: Say: I don't have an answer to that ...,"[0.02549443207681179, -0.0048491512425243855, ...","[-0.0008500543772242963, 0.008533979766070843,..."
1,2,answer,tell me something nice to do in Abu Dhabi,I love going to the desert.,toia7_2_1212_7df57a33.mp4,Question: tell me something nice to do in Abu ...,"[-0.0030072664376348257, -0.000457754096714779...","[-0.0050692250952124596, 0.019424758851528168,..."


**Run the below only the first time**

Uncomment if needed.

In [None]:
# This will take just under 2 minutes
# df_avatar['ada_similarity'] = df_avatar.combined.apply(lambda x: get_embedding(x, engine='text-similarity-ada-001'))
# df_avatar['ada_search'] = df_avatar.combined.apply(lambda x: get_embedding(x, engine='text-search-ada-doc-001'))

In [None]:
# df_avatar.to_csv(f"""output/embedded_1k_toia_id_{TOIA_ID}.csv""")

Once the embeddings have been computed and saved, comment the two cells above out again.

In [85]:
import ast

# Reload the embeddings that were saved to CSV for this TOIA id.
df = pd.read_csv(f"""output/embedded_1k_toia_id_{TOIA_ID}.csv""")
# Each embedding is stored in the CSV as the text of a Python list.
# ast.literal_eval parses literals only, whereas the eval() used before would
# execute any expression found in the file.
df['ada_search'] = df.ada_search.apply(ast.literal_eval).apply(np.array)

In [86]:
def answer_retrieval(df, search_query, n=3, pprint=True):
    """Return the ``n`` rows of ``df`` most similar to ``search_query``.

    The query is embedded with the 'text-search-ada-query-001' engine and
    compared with every vector in ``df.ada_search`` by cosine similarity.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an 'ada_search' column of numeric vectors and a 'combined'
        column of "Question: ...; Answer: ..." strings. It is not modified.
    search_query : str
        Text to search for.
    n : int, default 3
        Number of rows to return.
    pprint : bool, default True
        When True, print the first 200 characters of each hit followed by a
        blank line.

    Returns
    -------
    pandas.DataFrame
        The top ``n`` rows, sorted by a new 'similarities' column, highest first.
    """
    embedding = np.asarray(get_embedding(search_query, engine='text-search-ada-query-001'))
    query_norm = np.linalg.norm(embedding)

    def _cosine(vector):
        # Computed with numpy so the result does not depend on which
        # `cosine_similarity` (sklearn's or openai's) the notebook has bound.
        return np.dot(vector, embedding) / (np.linalg.norm(vector) * query_norm)

    # assign() scores a copy, leaving the caller's frame untouched.
    scored = df.assign(similarities=df.ada_search.apply(_cosine))
    res = scored.sort_values('similarities', ascending=False).head(n)
    if pprint:
        res_print = (
            res.combined
            .str.replace('Question: ', '', regex=False)
            .str.replace('; Answer:', ': ', regex=False)
        )
        for r in res_print:
            print(r[:200])
            print()
    return res

In [87]:
res = answer_retrieval(df, "Something healthier?", n=3)

What are some of your favorite healthy foods?:  I like aubergines. There are so many dishes you can make with aubergines.

What are your favorite dishes that include aubergines?:  Melanzane alla parmigiana. Although that is not a very healthy dish!

What other hobbies do you have?:  I like to reading books, and going to the gym, especially doing high intensity interval training.



In [65]:
res

Unnamed: 0.1,Unnamed: 0,stream_id_stream,type,question,answer,id_video,combined,ada_similarity,ada_search,similarities
69,69,2,answer,What are some of your favorite healthy foods?,I like aubergines. There are so many dishes yo...,toia7_2_53_4882ce08.mp4,Question: What are some of your favorite healt...,"[-0.015448587015271187, 0.0037080717738717794,...","[-0.0013567984569817781, 0.03635244444012642, ...",0.32352
72,72,2,answer,What are your favorite dishes that include aub...,Melanzane alla parmigiana. Although that is no...,toia7_2_56_65a714e7.mp4,Question: What are your favorite dishes that i...,"[0.008402660489082336, 0.0036533307284116745, ...","[-0.003916587680578232, 0.01414294820278883, -...",0.279511
47,47,2,answer,What other hobbies do you have?,"I like to reading books, and going to the gym,...",toia7_2_24_8fcbbb75.mp4,Question: What other hobbies do you have?; Ans...,"[-0.0050799911841750145, -0.012672553770244122...","[-0.0015914089744910598, 0.0058600991033017635...",0.261735


### Test create_embeddings.py

In [10]:
sql_url = "mysql+mysqlconnector://root:anypasswords@localhost:3307/toia"

ENGINE = db.create_engine(sql_url)

toiaID = 2

In [12]:
# Public videos of one TOIA owner with their questions, excluding 'filler' and
# 'exit' entries. The owner id is supplied through the :toiaID bound parameter
# rather than by string formatting.
retrieve_statement = text("""
        SELECT v.toia_id, q.question, v.answer, v.id_video, q.id as question_id FROM video v
        INNER JOIN videos_questions_streams vqs ON vqs.id_video = v.id_video
        INNER JOIN questions q ON q.id = vqs.id_question
        WHERE v.toia_id = :toiaID AND v.private = 0 AND vqs.type NOT IN ('filler', 'exit');""")

CONNECTION = ENGINE.connect()
# NOTE(review): bind values are passed to execute() as keyword arguments;
# verify that the installed SQLAlchemy version still accepts this call style.
result_proxy = CONNECTION.execute(retrieve_statement, toiaID=toiaID)
result_set = result_proxy.fetchall()

# Overwrites df_avatar. This frame has no 'type' or 'stream_id_stream' column,
# so helpers that filter on dataset['type'] (e.g. run_toia_type) need the
# frame from the earlier stream query - TODO confirm which one is intended.
df_avatar = pd.DataFrame(result_set,
                            columns=[
                                'toia_id',
                                'question',
                                'answer',
                                'id_video',
                                'question_id'
                            ])

# Text that gets embedded: question and answer in a single string.
df_avatar['combined'] = "Question: " + df_avatar.question.str.strip() + "; Answer: " + df_avatar.answer.str.strip()

In [18]:
def Sorting(lst):
    """Return a new list with the items of ``lst`` ordered from shortest to
    longest.

    Items of equal length keep their original relative order, and ``lst``
    itself is left unchanged.
    """
    by_length = list(lst)
    by_length.sort(key=len)
    return by_length

for comb in Sorting(df_avatar['combined'])[:30]:
    print(len(comb), comb, "\n***")

40 Question: Do you like Yoga?; Answer: No. 
***
41 Question: Do you like math?; Answer: Yes. 
***
42 Question: Hello!; Answer: Hi, how are you? 
***
42 Question: do you like Elon?; Answer: Yeah. 
***
42 Question: Do you like sushi?; Answer: Yes. 
***
43 Question: do you like Trump?; Answer: Yeah. 
***
44 Question: Do you like swimming?; Answer: No. 
***
44 Question: Do you like reading?; Answer: Yes. 
***
44 Question: Do you like swimming?; Answer: No. 
***
48 Question: Do you like cold weather?; Answer: No. 
***
54 Question: Do you like coffee?; Answer: Yes, of course. 
***
54 Question: Do you like gelato?; Answer: Yes, of course. 
***
56 Question: Record a greeting (e.g., hello, hi); Answer: . 
***
56 Question: Do you like italian restaurants?; Answer: Yes. 
***
57 Question: What is your name?; Answer: My name is Alberto. 
***
60 Question: Do you drink coffee?; Answer: Oh, yes, absolutely. 
***
63 Question: Do you like Blacksmith?; Answer: Oh, yes, absolutely. 
***
63 Question: How 

In [None]:
# This will take just under 2 minutes
df_avatar['ada_similarity'] = df_avatar.combined.apply(lambda x: get_embedding(x, engine='text-similarity-ada-001'))
df_avatar['ada_search'] = df_avatar.combined.apply(lambda x: get_embedding(x, engine='text-search-ada-doc-001'))
# The embeddings arrive here as Python sequences, not as text, so they are
# converted straight to arrays. eval() only accepts strings and is needed
# solely when the vectors are read back from the CSV cache.
df_avatar['ada_search'] = df_avatar.ada_search.apply(np.array)

## Using speech

In [73]:
import speech_recognition as sr
r = sr.Recognizer()
mic = sr.Microphone()
sr.Microphone.list_microphone_names()
# with mic as source:
#     # r.adjust_for_ambient_noise(source)
#     audio = r.listen(source)

['Alberto Maria Chierici’s AirPods Max',
 'Alberto Maria Chierici’s AirPods Max',
 'MacBook Pro Microphone',
 'MacBook Pro Speakers',
 'ZoomAudioDevice',
 'Multi-Output Device']

In [75]:
# r.recognize_google(audio)

In [79]:
def run_toia_speak(dataset, doc_emb):
    """Voice-driven console loop around ``answer_retrieval``.

    Listens on the module-level microphone ``mic``, transcribes the audio with
    ``r.recognize_google`` and prints the best-matching recorded answer with
    its similarity. Saying "stop" ends the loop.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Not used by this function.
    doc_emb : pandas.DataFrame
        Despite the name, this is the embedded frame handed to
        ``answer_retrieval`` (it needs the 'ada_search', 'combined' and
        'answer' columns).
    """
    print("TOIA starts")

    while True:
        # query = input('> ')
        with mic as source:
            print("you may speak ...")
            # r.adjust_for_ambient_noise(source)
            audio = r.listen(source)

        # Transcribe after the microphone is released. Unintelligible audio
        # asks the user to repeat instead of crashing the loop.
        try:
            query = r.recognize_google(audio)
        except sr.UnknownValueError:
            print("(could not understand that, please repeat)")
            continue
        except sr.RequestError as error:
            print(f"(speech recognition service unavailable: {error})")
            break

        print(f"(you said {query})")
        if query == "stop":
            break

        output = answer_retrieval(doc_emb, query, n=1, pprint=False)
        if output is None or output.empty:
            break

        # Print the scalar values rather than the one-row Series reprs.
        print(f"avatar: {output.answer.values[0]}, sim={output.similarities.values[0]}")

In [109]:
def run_toia_type(dataset, doc_emb, threshold=0.29):
    """Typed console loop around ``answer_retrieval``.

    For each typed query the best hit is printed when its similarity exceeds
    ``threshold``; otherwise a random "no-answer" row of ``dataset`` is
    printed. Typing "stop" ends the loop.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Source of the fallback rows; needs 'type', 'answer' and 'id_video'.
    doc_emb : pandas.DataFrame
        Despite the name, this is the embedded frame handed to
        ``answer_retrieval`` (it needs the 'ada_search', 'combined', 'answer'
        and 'id_video' columns).
    threshold : float, default 0.29
        Minimum similarity for a hit to be accepted.
    """
    print("TOIA starts")

    while True:
        query = input('> ')
        if query == "stop":
            break

        output = answer_retrieval(doc_emb, query, n=1, pprint=False)
        # Check before reading the result: nothing can be shown when the
        # retrieval returns no row.
        if output is None or output.empty:
            break

        similarity = output.similarities.values[0]
        if similarity > threshold:
            print(output.answer.values[0], output.id_video.values[0], f"""ada_search_sim: {similarity}""")
        else:
            df_noanswers = dataset[dataset['type'] == "no-answer"]
            if df_noanswers.shape[0] > 0:
                answers = df_noanswers.sample(n=1)
                print(answers['answer'].values[0], answers['id_video'].values[0], f"""ada_search_sim: {similarity}<={threshold}""")
            else:
                print("You haven't recorded no-answers", "204", "No Content")

In [110]:
run_toia_type(df_avatar, df)

TOIA starts


>  what's up


Sorry. I don't believe I recorded an answer for that. toia7_2_2383_6413225f.mp4 ada_search_sim: 0.2710058519302774<=0.29


>  how are you


Hi, how are you? toia7_2_2381_d04ae133.mp4 ada_search_sim: 0.33942525691322323


>  can I call you...


You can call me, Alberto. toia7_2_817_9727d0b6.mp4 ada_search_sim: 0.37583155240997373


>  nice to meet you, Alberto


Likewise, it's nice to meet you. toia7_2_1975_2197353f.mp4 ada_search_sim: 0.4499518431837456


>  what do you do


I work at Tesla in data science. toia7_2_18_4929188a.mp4 ada_search_sim: 0.3450081613046201


>  do you like Tesla


I thought it was an innovative company, plenty of very intelligent people. toia7_2_50_8eb05eca.mp4 ada_search_sim: 0.38385999470416393


>  instead Tesla is


I thought it was an innovative company, plenty of very intelligent people. toia7_2_50_8eb05eca.mp4 ada_search_sim: 0.36271147399700027


>  How did you find the job


Yes, true. I thank you for your question. I started my career working as an actuary in insurance companies in London. I am mainly working preisinger. I did a picture of reserving and the captain Melody. What's the menu pricing in the claims only text. I was very excited about the programming and the appliance tomatoes, to make sense of a real-world other rooms and use them to make sound business decision. So I quickly turned my career into the science of myself for a machine-learning. I took the famous Crusader class b. S o n g in Stamford. I learned. Cold better. Will you buy something or are you joined the Deloitte Consulting? The we're at the time they were dating ideas and projects in the cross section between a trailer on it, except for financial institutions that especially in the truck companies. So I joined the team. I you worked on that machine learning platform, used it for cinematic, send the application, the motor insurance. It was extremely interesting. How you learned a l

>  Wow


Sorry, I didn't record answers to that question. toia7_2_12_d7c9f2a0.mp4 ada_search_sim: 0.257491265205159<=0.29


>  what else can I ask you


I can talk about my favorite food, my career. Things I like to read. toia7_2_2380_5f419c00.mp4 ada_search_sim: 0.3390231219708599


>  what do you like to eat


I like eating, I'm a foodie and I love exploring new restaurants, New Cuisines and hanging out with friends and very much people oriented person. toia7_2_67_3231544b.mp4 ada_search_sim: 0.3592123848340238


>  favorite food?


My favorite food is steak and fries. toia7_2_1354_2bbe3613.mp4 ada_search_sim: 0.4039955076081348


>  favorite italian restaurant


Yes. toia7_2_469_59a0a90a.mp4 ada_search_sim: 0.37186446969308784


>  how old are you


Sorry. I don't believe I recorded an answer for that. toia7_2_2383_6413225f.mp4 ada_search_sim: 0.2763933110704215<=0.29


>  How old are you?


Can you try rephrasing the question? toia7_2_47_694c7c7d.mp4 ada_search_sim: 0.2844862006389171<=0.29


>  Where are you from?


I live in Abu Dhabi in the United Arab Emirates. toia7_2_69_1d0e76d3.mp4 ada_search_sim: 0.31694524485783543


>  Where you born in Abu Dhabi


I live in Abu Dhabi in the United Arab Emirates. toia7_2_69_1d0e76d3.mp4 ada_search_sim: 0.4317242113723543


>  Where were you born


I was born in Milan in 1985. toia7_2_818_8d17409f.mp4 ada_search_sim: 0.3715776840215832


>  Ho, you're Italian


Sorry. I don't believe I recorded an answer for that. toia7_2_2383_6413225f.mp4 ada_search_sim: 0.2865998144814632<=0.29


>  Are you Italian


Yes. toia7_2_469_59a0a90a.mp4 ada_search_sim: 0.3524511932374348


>  HAve you traveled to other countries


Abu Dhabi. I love exploring local emiratis restaurant. And there's some of the high-end, Italian ones. toia7_2_450_716d2a2c.mp4 ada_search_sim: 0.2971948343563883


>  Tell me a good ITalian restaurand in Abu DHabi


I love Antonia, which is an Italian restaurant in Saadiyat Island. toia7_2_249_c8ea9aeb.mp4 ada_search_sim: 0.387465368312768


>  Do you like music


I like a bit of everything, from rock music, music from the 80s, classical music, pop music. One of my favorite artists is Lady Gaga. toia7_2_26_14fcad54.mp4 ada_search_sim: 0.371780991269171


>  stop


### Notes

* Threshold seems to be 0.3187 or just 0.31.
    * Need to try more, or manually annotate with different toias.
* Need some NER (can use spaCy): e.g., "what do you think of Donald Trump" vs. "... of Elon Musk". Both retrieve an answer that makes sense, but I have very different opinions about the two characters, so we need to make sure we recognize the right one!