Based on the blog post "Python for NLP: Creating a Rule-Based Chatbot" (https://stackabuse.com/python-for-nlp-creating-a-rule-based-chatbot/)

## scan variables for possible things to plot

In [None]:
import pandas as pd
import numpy as np

In [None]:
def is_number(x):
    """Return True when x is an instance of float or int, False otherwise."""
    return isinstance(x, (float, int))

In [None]:
def all_numbers(my_list):
    """Return True when is_number() holds for every element of my_list.

    An empty list yields True.
    """
    return all(is_number(item) for item in my_list)

In [None]:
def get_plotting_candidates():
    """Collect expressions for module-level variables that could be plotted.

    Scans globals() and returns a list of strings containing:

    * the name of every pandas DataFrame, followed by one indexing
      expression per column (string labels in both quote styles, any other
      label via its repr, e.g. ``df[0]``);
    * the name of every non-empty list for which all_numbers() is true;
    * the name of every numpy array with one or two dimensions.
    """
    candidates = []
    for n, v in globals().items():
        if isinstance(v, pd.DataFrame):
            candidates.append(n)
            labels = list(v.columns)
            # Column labels are not necessarily strings (integer labels are
            # common); concatenating those to a str would raise TypeError,
            # so non-string labels are rendered with repr() instead.
            candidates.extend(
                n + "['" + field + "']" if isinstance(field, str)
                else n + "[" + repr(field) + "]"
                for field in labels
            )
            # The double-quoted spelling only makes sense for string labels.
            candidates.extend(
                n + '["' + field + '"]'
                for field in labels
                if isinstance(field, str)
            )
        elif isinstance(v, list):
            if v and all_numbers(v):
                candidates.append(n)
        elif isinstance(v, np.ndarray):
            # Arrays with more than two dimensions are skipped.
            if v.ndim in (1, 2):
                candidates.append(n)
    return candidates

In [None]:
# Planned: check a single variable expression by name. Once written, this
# could also support slices of numpy arrays like nn[1], which is impractical
# to enumerate in get_plotting_candidates().
def is_plotting_candidate(var_name):
    """Placeholder that is not implemented yet; currently returns None."""
    return None  # do later

### scan for variable names using regex

In [None]:
import re
# One "variable-like" token: letters, digits, underscores, square brackets and
# quotes, so that expressions such as my_list, x[3] or df['col'] are captured.
_VAR_TOKEN = r"""([a-z_\[\]'"0-9]+)"""
_AFTER_OF = re.compile("of +" + _VAR_TOKEN, re.IGNORECASE)
_AFTER_PLOT = re.compile("plot +" + _VAR_TOKEN, re.IGNORECASE)


def var_names_by_regex(in_string):
    """Extract potential variable expressions from a plotting request.

    Returns a list with every token that follows "of", followed by every
    token that follows "plot" (case-insensitive). The word "of" itself is
    dropped from the second group so that "plot of x" yields only "x".
    """
    after_of = _AFTER_OF.findall(in_string)
    after_plot = [
        token for token in _AFTER_PLOT.findall(in_string)
        if token.lower() != 'of'
    ]
    return after_of + after_plot
    

# Input data for the state machine

In [None]:
# Transition table of the state machine in a small text format: every
# non-blank line reads "state: next_state_1, next_state_2, ...".
# It is parsed by states_string_to_data_structure().
# 'end' has no line of its own; the input loop stops once it is selected.
allowed_states = """
entry: plot, bar, add_legend, add_legend_top_right, add_legend_top_left

plot: entry
bar: entry
add_legend: add_legend_top_right, add_legend_top_left, entry
add_legend_top_right: entry
add_legend_top_left: end

"""

In [None]:
# Preconditions per state: (set of states of which at least one must have
# been visited before, message printed when none of them was).
state_requires = {
    legend_state: ({'plot', 'bar'}, "please plot something first")
    for legend_state in (
        'add_legend',
        'add_legend_top_right',
        'add_legend_top_left',
    )
}

In [None]:
def plot_command(in_string):
    """Translate a plotting request into a ``plt.plot(...)`` command string.

    Exactly one variable expression must be found in in_string by
    var_names_by_regex(), and it must be listed by
    get_plotting_candidates(). Because the generated command is executed
    with eval() by the input loop, this whitelist check also limits what can
    end up being evaluated.

    Returns:
        (commands, success): a one-element list with the command and True,
        or an empty list and False after printing the reason.
    """
    names = var_names_by_regex(in_string)
    if len(names) != 1:
        print("Found either too few or too many potential variables", names)
        return [], False

    name = names[0]
    if name not in get_plotting_candidates():
        print(name, "does not seem to be a plottable variable")
        return [], False

    return ["plt.plot(" + name + ")"], True

In [None]:
def _fixed_commands(*lines):
    """Build a handler that ignores its input and returns (list(lines), True)."""
    def handler(x):
        # A fresh list per call, so callers never share a mutable result.
        return list(lines), True
    return handler


# Maps each state to a handler taking the raw user input and returning
# (list of command strings, success flag).
state_to_command = {
    'entry': _fixed_commands(),
    'plot': plot_command,
    'bar': _fixed_commands("plt.bar(x, height)"),
    'add_legend': _fixed_commands(),
    'add_legend_top_right': _fixed_commands("plt.legend(['test'], loc='upper right')"),
    'add_legend_top_left': _fixed_commands("plt.legend(['test'], loc='upper left')"),
}

In [None]:
# Message shown to the user when a state is entered; states without an entry
# fall back to their own name in the input loop.
state_user_message = dict(
    add_legend="Would you like to place the legend to the left or the right?",
)

In [None]:
# Example phrases per state; user input is matched against these phrases.
state_phrases = {
    'plot': [
        "make a line plot",
        "draw a line plot",
        "create a line plot",
        "Plot x",
    ],
    'bar': [
        "make a bar chart",
        "create a bar plot",
        "bar plot",
    ],
    'add_legend': [
        "add legend",
        "add description",
    ],
    'add_legend_top_right': [
        "add legend top right",
        "add description top right",
    ],
    'add_legend_top_left': [
        "add legend top left",
        "add description top left",
    ],
}

# Parse input

In [None]:
def states_string_to_data_structure(input_string):
    """Parse the "state: next, next, ..." text format into a dict.

    Blank lines are ignored. Each remaining line is split at ':'; the part
    before the first colon is the state name and the part after it is a
    comma-separated list of successor states (empty items are dropped).
    Returns a dict mapping state name to its list of successors.
    """
    states = {}
    for raw_line in input_string.split('\n'):
        line = raw_line.strip()
        if line == "":
            continue
        pieces = line.split(':')
        successors = []
        for item in pieces[1].split(','):
            item = item.strip()
            if item != "":
                successors.append(item)
        states[pieces[0].strip()] = successors
    return states

In [None]:
# Parsed transition table: state name -> list of allowed next states.
states = states_string_to_data_structure(allowed_states)

In [None]:
# Flatten state_phrases into (phrase, state) pairs, then keep the phrases alone.
phrase_to_state = [
    (phrase, state_name)
    for state_name, phrases in state_phrases.items()
    for phrase in phrases
]
all_phrases = [pair[0] for pair in phrase_to_state]

## Transform to sentence vectors

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
import nltk
import string
# this cell is 1:1 from the blog
# Shared WordNet lemmatizer instance used by perform_lemmatization().
wnlemmatizer = nltk.stem.WordNetLemmatizer()

def perform_lemmatization(tokens):
    """Return a list with the lemmatized form of every token in tokens."""
    return list(map(wnlemmatizer.lemmatize, tokens))

# Translation table for str.translate() that deletes every ASCII punctuation character.
punctuation_removal = {ord(mark): None for mark in string.punctuation}

def get_processed_text(document):
    """Lower-case document, strip punctuation, tokenize and lemmatize it."""
    cleaned = document.lower().translate(punctuation_removal)
    tokens = nltk.word_tokenize(cleaned)
    return perform_lemmatization(tokens)

In [None]:
# Disabled variant using the custom tokenizer and English stop words:
#word_vectorizer = TfidfVectorizer(tokenizer=get_processed_text, stop_words='english')
# Active: a TfidfVectorizer with its default settings.
word_vectorizer = TfidfVectorizer()
#all_word_vectors = word_vectorizer.fit_transform(article_sentences)

In [None]:
# Fit the vectorizer on the known example phrases only.
word_vectorizer.fit(all_phrases)

In [None]:
# Pre-compute one TF-IDF vector per example phrase, paired with its state.
phrase_vector_to_state = [
    (word_vectorizer.transform([phrase])[0], state_name)
    for phrase, state_name in phrase_to_state
]

In [None]:
def get_closest_command(input_string):
    """Find the example phrase most similar to input_string.

    The input is vectorized with word_vectorizer and compared by cosine
    similarity against every entry of phrase_vector_to_state.

    Returns:
        (similarity, state_name) of the best match; on ties the first
        entry in phrase_vector_to_state wins.
    """
    query_vector = word_vectorizer.transform([input_string])
    scored = []
    for phrase_vector, state_name in phrase_vector_to_state:
        similarity = cosine_similarity(query_vector, phrase_vector)[0][0]
        scored.append((similarity, state_name))
    return max(scored, key=lambda pair: pair[0])

# Data Structures for plotting -- to be replaced

In [None]:
# Sample data referenced by the generated "plt.bar(x, height)" command.
x = [1, 2, 4, 5, 6]
height = [1, 1, 1, 2, 2]

In [None]:
# Example DataFrame with two integer columns.
df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [4, 5, 6],
})

In [None]:
# Example 2-D numpy array (2 rows, 3 columns) of normally distributed random values.
nn = np.random.normal(size=(2,3))

# Input loop

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt

In [None]:
# Interactive driver of the plotting state machine.
#
# Each round determines the next state (asking the user when more than one
# successor is allowed), checks the state's preconditions, collects the
# command strings produced for it and re-draws the figure from all commands
# gathered so far. The loop stops when the user types 'end' or the chosen
# next state is 'end'; afterwards the collected commands are printed.
flag = True
curr_state = 'entry'
all_states = ['entry']   # states that were entered successfully
all_raw_inputs = []      # raw user input recorded per accepted step
all_commands = []        # command strings replayed to build the figure
# Defined up front so the 'end' check below never sees an unbound name or a
# stale value from an earlier run of this cell.
my_in_raw = ""

while flag:
    # curr_state is from last round
    possible_next_steps = states[curr_state]
    if len(possible_next_steps) > 1:
        print("please select from", possible_next_steps)
        last_in_raw = all_raw_inputs[-1] if all_raw_inputs else ""
        my_in_raw = input()

        # get the best answer for the current command
        rating_pure, my_in_pure = get_closest_command(my_in_raw)
        # as well as a concatenation with the last command, so that a
        # follow-up such as "top right" can complete "add legend"
        rating_conc, my_in_conc = get_closest_command(last_in_raw + " " + my_in_raw)
        # take the one with the higher rating
        if rating_pure > rating_conc:
            rating = rating_pure
            my_in = my_in_pure
        else:
            rating = rating_conc
            my_in = my_in_conc

        # too dissimilar to every known phrase: treat as not understood
        if rating < 0.6:
            my_in = 'UNK'
    else:
        # only one way to go: take it without asking
        my_in = possible_next_steps[0]

    if my_in_raw.lower() == 'end' or my_in.lower() == 'end':
        flag = False
    elif my_in not in possible_next_steps:
        print("Sry, couldn't understand you!")
        continue
    else:  # so my_in is now in possible_next_steps
        required_states = state_requires.get(my_in, [None])[0]
        if (required_states is None) or (required_states & set(all_states)):
            print(state_user_message.get(my_in, my_in))
            all_raw_inputs.append(my_in_raw)

            # get the new commands
            handler = state_to_command.get(my_in, lambda x: ([], True))
            new_commands, success_flag = handler(my_in_raw)
            if not success_flag:
                print("something went wrong")
                curr_state = 'entry'
                continue
            # Record the state only after its command succeeded; otherwise a
            # failed 'plot' would satisfy the "plot something first"
            # precondition of the legend states.
            all_states.append(my_in)
            all_commands.extend(new_commands)
            # NOTE: eval() runs generated command strings. They come from the
            # fixed templates in state_to_command and from plot_command,
            # which only accepts names listed by get_plotting_candidates().
            for command in all_commands:
                eval(command)
            plt.show()
            curr_state = my_in
        else:
            print(state_requires.get(my_in)[1])
            curr_state = 'entry'  # was: required_state. Now there is more than one. What to do?
print('bye')
print('\n'.join(all_commands))

# To be incorporated in the future

# Testing

In [None]:
get_closest_command("make a line plot")

In [None]:
get_closest_command("draw me al ine")

In [None]:
get_closest_command("make me a nice bar cha")

## potential ways to scan for variable names

In [None]:
# Sample requests for trying out var_names_by_regex().
testsentences = [
    "make a line plot of df['hello']",
    "draw a line plot Xy",
    "create a line plot of x",
    "Plot x",
    "plot x[3]",
]

In [None]:
list(map(var_names_by_regex, testsentences))

# Graveyard

## NLTK start

In [None]:
import nltk
import numpy as np
import random
import string

In [None]:
# Tiny sample text for experimenting with the NLTK tokenizers.
article_text = "Hello, hello. How are you?"

# Tokenize the same text once into sentences and once into words.
article_sentences = nltk.sent_tokenize(article_text)
article_words = nltk.word_tokenize(article_text)


In [None]:
article_sentences

In [None]:
article_words

In [None]:
# WordNet lemmatizer instance used by perform_lemmatization() below.
wnlemmatizer = nltk.stem.WordNetLemmatizer()

def perform_lemmatization(tokens):
    """Lemmatize each token and return the results as a list."""
    lemmas = []
    for token in tokens:
        lemmas.append(wnlemmatizer.lemmatize(token))
    return lemmas

# Maps the code point of every ASCII punctuation character to None, so that
# str.translate() removes those characters.
punctuation_removal = dict.fromkeys(map(ord, string.punctuation))

def get_processed_text(document):
    """Return the lemmatized word tokens of document.

    The text is lower-cased and stripped of punctuation before tokenizing.
    """
    normalized = document.lower()
    without_punctuation = normalized.translate(punctuation_removal)
    return perform_lemmatization(nltk.word_tokenize(without_punctuation))

In [None]:
# Tokens recognised as a greeting, and the replies to choose from.
greeting_inputs = (
    "hey",
    "good morning",
    "good evening",
    "morning",
    "evening",
    "hi",
    "whatsup",
)
greeting_responses = [
    "hey",
    "hey hows you?",
    "*nods*",
    "hello, how you doing",
    "hello",
    "Welcome, I am good and you",
]

def generate_greeting_response(greeting):
    """Return a random greeting reply if greeting contains a greeting token.

    The input is split on whitespace and each token is compared in lower
    case against greeting_inputs. Returns None when no token matches.
    """
    lowered_tokens = (token.lower() for token in greeting.split())
    if any(token in greeting_inputs for token in lowered_tokens):
        return random.choice(greeting_responses)
    return None

In [None]:
def generate_response(user_input):
    """Answer user_input with the most similar sentence of article_sentences.

    NOTE: user_input is appended to the module-level article_sentences list
    and is not removed again by this function.

    All sentences are TF-IDF vectorized and compared to the last one (the
    user input) by cosine similarity. The input is its own best match, so
    the second-highest similarity decides: if it is 0 an apology is
    returned, otherwise the sentence at that rank.
    """
    article_sentences.append(user_input)

    vectorizer = TfidfVectorizer(tokenizer=get_processed_text, stop_words='english')
    sentence_vectors = vectorizer.fit_transform(article_sentences)
    similarities = cosine_similarity(sentence_vectors[-1], sentence_vectors)

    # Index of the runner-up (the top entry is the input itself).
    runner_up_index = similarities.argsort()[0][-2]

    ranked_scores = similarities.flatten()
    ranked_scores.sort()
    if ranked_scores[-2] == 0:
        return "I am sorry, I could not understand you"
    return article_sentences[runner_up_index]

### above is a nice function, but I'll first do it quick and dirty

## try spacy

In [None]:
# Quick spaCy experiment: parse two sentences and print the named entities
# found in the second one.
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
# Alternative loader, currently disabled:
#nlp = spacy.load('en_core_web_sm')
doc = nlp("my friend Mary has worked at Google since 2009")
doc2 = nlp("make the markers blue")
print(doc2)
# Only doc2 is inspected; doc is parsed but not used further in this cell.
for ent in doc2.ents:
    print(ent.text, ent.label_)

In [None]:
import spacy
spacy.__version__

### from https://gist.github.com/vu3jej/a46eb3d18aa7d8c808af8b8ca4df06a4

In [None]:
class ColourExtractorStrict:
    """Extract colour names together with adjacent adjectives and nouns."""

    def __init__(self, colours):
        """Store the known colour words and load the spaCy English model."""
        self.colours = colours
        # Part-of-speech tags that may be attached to a colour word.
        self.pos_ok = ['ADJ', 'NOUN']
        self.tagger = spacy.load('en_core_web_sm')

    def get(self, string):
        """Return the set of colour phrases found in string, or False if none.

        The text is lower-cased and tagged. For each token that is a known
        colour, the run of ADJ/NOUN tokens directly before and directly
        after it is joined with the colour into one phrase.
        """
        doc = self.tagger(string.lower())
        pairs = [(word.text, word.pos_) for word in doc]

        extracted = set()
        for index, (text, pos) in enumerate(pairs):
            if text not in self.colours:
                continue
            text_ahead = self.look_ahead(pairs=pairs, index=index)
            text_behind = self.look_behind(pairs=pairs, index=index,
                                           colour_pos=pos)
            # Both helpers return False when they find nothing; treat that as
            # an empty list while assembling the phrase.
            phrase = (text_behind or []) + [text] + (text_ahead or [])
            extracted.add(' '.join(phrase))

        return extracted or False

    def look_ahead(self, pairs, index):
        """Return the texts of the ADJ/NOUN run right after index, or False."""
        ahead = []
        for text, pos in pairs[index + 1:]:
            if pos not in self.pos_ok:
                break
            ahead.append(text)

        return ahead or False

    def look_behind(self, pairs, index, colour_pos):
        """Return the texts of the ADJ/NOUN run right before index, or False.

        The texts are returned in sentence order. colour_pos is accepted but
        not used.
        """
        behind = []
        for text, pos in reversed(pairs[:index]):
            if pos not in self.pos_ok:
                break
            behind.append(text)

        # Collected back-to-front; restore sentence order.
        behind.reverse()
        return behind or False

In [None]:
# Try the extractor on sample sentences.
colours = ['blue', 'pink', 'lavender', 'heather']
extractor = ColourExtractorStrict(colours=colours)
# The sample texts deliberately do not use the name `string`: that would
# shadow the imported `string` module, which other cells rely on
# (string.punctuation).
long_sample_text = 'Available in a variety of colors, including bold blue heather, ebony, jazzberry pink heather, light steel, navy heather, new frosty lavender, plum port or slate heather'
sample_text = "make the markers blue"
extractor.get(string=sample_text)

## try individual functions

In [None]:
import re
def tokenize_sentence(s):
    """Remove punctuation from s and split it into whitespace-separated tokens.

    Every character that is neither a word character nor whitespace is
    dropped. Runs of whitespace count as a single separator and surrounding
    whitespace is ignored. Input with no tokens yields an empty list.
    """
    # Raw string: '\w' and '\s' are regex classes, not Python string escapes.
    without_punctuation = re.sub(r'[^\w\s]', '', s)
    # split() without arguments collapses whitespace runs and never produces
    # empty tokens.
    return without_punctuation.split()

In [None]:
tokenize_sentence("Hello, hello! it is a me, Mario")