# Title

## Setup

In [1]:
# from flask import Flask, request, render_template, url_for
# import os
import argparse
import random
import re
import json 
import linecache
from transformers import pipeline, set_seed
from transformers import BertTokenizer, BertForNextSentencePrediction
import nltk 
from nltk import tokenize
import ssl
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased', return_dict=True)

# #Local storage of the conversation data - will be deprecated once the database is in place
# storage = []

# starters = ["What topics would you like to talk about?", "What are your hobbies?", "Where did you study?"]

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[nltk_data] Downloading package punkt to /Users/amc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Q Generator

In [2]:
generator = pipeline('text-generation', model='gpt2')

Some weights of GPT2Model were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The function below isn't used for now...

## Try it out

In [50]:
import sqlalchemy as db
from sqlalchemy.sql import text as QueryText

SQL_URL = "mysql+pymysql://root:ReallyComplicatedPassword@localhost:3307/toia"

ENGINE = db.create_engine(SQL_URL)

CONNECTION = ENGINE.connect()

In [30]:
statement = QueryText(
    "SELECT question FROM questions \
        WHERE trigger_suggester = 1 \
    ;"
)

result_proxy = CONNECTION.execute(statement)

result_set = result_proxy.fetchall()

In [31]:
starters = [qs[0] for qs in result_set]
print(starters)

['What is your name?', 'Where and when were you born?', 'What do you do for a living?']


In [55]:
CONNECTION = ENGINE.connect()  #Need to refresh connection

statement = QueryText("""
    SELECT CONCAT(questions.question, " ", video.answer) AS latest_question_answer 
    FROM video
    INNER JOIN videos_questions_streams
    ON videos_questions_streams.id_video = video.id_video
    INNER JOIN questions
    ON questions.id = videos_questions_streams.id_question
    WHERE toia_id=1 
    AND questions.trigger_suggester = 1
    ORDER BY video.idx DESC LIMIT 1;
    """)

result_proxy = CONNECTION.execute(statement)

result_proxy.fetchall()

[('What do you do for a living? I work as a data scientist.',)]

In [52]:
def nb_trial_generateNextQ(text, generated_seqs=5):
# text = "What's your name? My name is Alberto."
    
    statement = QueryText("""
        SELECT CONCAT(questions.question, " ", video.answer) AS latest_question_answer 
        FROM video
        INNER JOIN videos_questions_streams
        ON videos_questions_streams.id_video = video.id_video
        INNER JOIN questions
        ON questions.id = videos_questions_streams.id_question
        WHERE toia_id=1 
        AND questions.trigger_suggester = 1
        ORDER BY video.idx DESC LIMIT 1;
        """)

    CONNECTION = ENGINE.connect()  #Need to refresh connection
    result_proxy = CONNECTION.execute(statement)

    latest_qa = result_proxy.fetchall()[0][0]

    # storage.append(text)

    # if len(starters) > 0: 
    #     print("SENDING STARTER")
    #     return {"q":starters.pop()}

    # else: 

    # text = " ".join(storage[-2:])
    if len(latest_qa) > 0:
        text = latest_qa + text
        
    q = generator(text, 
                  num_return_sequences=generated_seqs, 
                  max_length=50 + len(text))

    #all generated examples 
    allGenerations = ""
    for i in range(generated_seqs):
        allGenerations = allGenerations + " " + q[i]['generated_text'][len(text) - 4:]

    #Separating all the sentences... 
    sentenceList = nltk.tokenize.sent_tokenize(allGenerations)

    #Filter out questions 
    questionsList = []
    for sentence in sentenceList :
        if "?" in sentence:
            questionsList.append(sentence.strip("\n").strip("\\").strip('"'))

    #Bert evaluation
    bert_filtered_qs = []
    for sentence in questionsList:
        encoding = tokenizer(" ".join(storage[-3:]), sentence, return_tensors='pt')
        outputs = model(**encoding)
        logits = outputs.logits
        bert_filtered_qs.append((logits[0,0].item(), sentence))


    bert_filtered_qs.sort(key=lambda tup: tup[0], reverse=True)

    # print(bert_filtered_qs[:5])
    # print({"q":questionsList})

    no_suggestions = min(len(bert_filtered_qs) - 1, 5)

    suggestions = [bert_filtered_qs[i][1] for i in range(no_suggestions) if bert_filtered_qs[i][1] != bert_filtered_qs[i + 1][1]]

    # return {"q":bert_filtered_qs[-1][1]}
    return suggestions


In [56]:
nb_trial_generateNextQ("Where did you take your degrees from? I've got a Phd at NYU.")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


["In your past work with the cryptocurrency scene, you've had a NYU.What are some ways your students study about a specific topic?",
 "I've studied in a university department.What was your most important job decision?I was looking to work for a job.What was your most recent experience as a data scientist?",
 "I worked on the last 10 years in the data sciences.What's your biggest challenge here?",
 'Where do you keep track?',
 "Then I asked if he was willing to work at this company, since we owned several insurance companies now and I'd give NYU.Did you see Dr. Oz?I am going to spend a lot of time at home.I want to be productive, I want to have fun, but all those things we're not able to do is live with our lives and our job."]

In [19]:
print(storage)
text = "Where did you graduate? At NYUAD"
storage.append(text)
text = " ".join(storage[-2:])
print(text)

['What is your name? My name is Alberto.', 'Where and when were you born? I was born in Italy the eighties.', 'What do you do? I work as a data scientist.']
What do you do? I work as a data scientist. Where did you graduate? At NYUAD


In [25]:
generator("What is your name? My name is Alberto.", num_return_sequences=3, max_length=50 + len(text))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello! Hi. What is your name? My name is Alberto. I was born on July 26th and I want to talk with your mother. My name is Irene. I want the name of my father, and his name is Alberto. I was born on July 18th and I want to talk with your sister. My name is Roseie. I'm not sure who Rose will be, but to tell you the truth, we didn't get along until about a year ago. We had a mutual crush on each other, but we all grew up together. Then, suddenly in 2001, it happened that we divorced"},
 {'generated_text': "Hello! Hi. What is your name? My name is Alberto. I'm from Brazil. Hello, everyone. My name is Pablo Cava. I am from Argentina. And I'm from Paraguay. Welcome to our office here in Brazil. I want to ask you a couple of questions.\n\nYou said to us just after this was announced, before we had the final episode, that you would be coming back and taking a break. So what's your name today?\n\nThis is the name of my first name, José Cava. This is what I did back when we 

In [21]:
generator(text, num_return_sequences=4, max_length=50 + len(text))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'What do you do? I work as a data scientist. Where did you graduate? At NYUADU?\n\n\nBryan: I got my BA from New York City Law School, and I worked with [David] Sutter at the University of California, Berkeley. It was a perfect semester. I applied to law school in a few years. When I applied to law class after class, I went to law school as an undergrad, doing a lot of work. But when I was in grad school, I did some more law. I had some experience in litigation and litigation for a company called Glynn (Hicks'},
 {'generated_text': "What do you do? I work as a data scientist. Where did you graduate? At NYUAD?\n\nI moved back to NYC where I still worked with John Zetterbeck as VP of Marketing. He didn't have an MBA from NYU, but had started out as a data scientist (as they say). I'm a scientist with a passion to solve problems. I had a master's degree last year in analytics from NYUAD, which means I'm working from that class of NYU students, and doing an internship. 