In [1]:
import pandas as pd
import nltk
import numpy as np
import re
import random
from nltk.stem import wordnet #lemmatization
from nltk import pos_tag
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer #bow
from sklearn.feature_extraction.text import TfidfVectorizer #tfidf
from sklearn.metrics import pairwise_distances #cosine sim

In [2]:
lema = wordnet.WordNetLemmatizer()
def text_lemmatize(text):
    """Normalise free text into a space-joined string of lemmas.

    Steps: lower-case the input, drop every character other than ``a-z``,
    ``0-9`` and space, apply the ``dict_replacement`` substitutions, tokenize,
    POS-tag, and lemmatize each token with the WordNet lemmatizer.

    Parameters
    ----------
    text : any
        Input utterance; it is converted with ``str()`` first.

    Returns
    -------
    str
        The lemmas joined by single spaces.
    """
    text_lower = str(text).lower() #to lower
    text_clean = re.sub(r'[^ a-z0-9]', '', text_lower) #cleaning
    # Strings are immutable, so the substitution only takes effect when the
    # helper's return value is captured. Keep the cleaned text unchanged if
    # the helper hands back nothing.
    text_replaced = replacement(text_clean, dict_replacement) #simplification
    if text_replaced is not None:
        text_clean = text_replaced
    tokens = nltk.wordpunct_tokenize(text_clean) #tokenizing
    tokens_and_tags = pos_tag(tokens, tagset = None) #pairs word-pos
    lemas_of_words = []

    for token, tag in tokens_and_tags:
        # Map the tagger's tag prefix onto the single-letter part of speech
        # passed to the lemmatizer; anything unrecognised is treated as a noun.
        if tag.startswith('V'): #verb
            new_tag = 'v'
        elif tag.startswith('J'): #adjective
            new_tag = 'a'
        elif tag.startswith('R'): #adverb
            new_tag = 'r'
        else:
            new_tag = 'n' #noun
        lemas_of_words.append(lema.lemmatize(token, new_tag)) #lemmatization
    return ' '.join(lemas_of_words)

In [3]:
# Pronoun normalisation table passed to replacement(): each key (an object,
# possessive or reflexive pronoun form) maps to the subject form it should be
# replaced with, e.g. 'my' -> 'i', 'them' -> 'they'.
dict_replacement = {'me':'i','myself':'i','my':'i','mine':'i',
           'us':'we','ourselves':'we','our':'we','ours':'we',
           'your':'you','yours':'you','yourself':'you','yourselves':'you',
           'her':'she','hers':'she','herself':'she',
           'him':'he','his':'he','himself':'he',
           'its':'it','itself':'it',
           'them':'they','their':'they','theirs':'they','themselves':'they'
}
def replacement(text, my_dict):
    """Return ``text`` with every whole word found in ``my_dict`` substituted.

    Matching is done on word boundaries in a single pass, so a key never
    rewrites part of a longer word ('me' leaves 'name' alone) and the result
    does not depend on the order of the dictionary entries.

    Parameters
    ----------
    text : str
        Text to process.
    my_dict : dict[str, str]
        Maps each word to replace (key) to its substitute (value). Keys are
        expected to be plain words; empty keys are ignored.

    Returns
    -------
    str
        The text after substitution; ``text`` unchanged when there is nothing
        to replace.
    """
    keys = [key for key in my_dict if key]
    if not keys:
        return text
    # Longest alternatives first so a shorter key can never win over a longer
    # one that starts with the same letters.
    keys.sort(key=len, reverse=True)
    pattern = r'\b(?:' + '|'.join(re.escape(key) for key in keys) + r')\b'
    # A function replacement avoids backslash/group interpretation of values.
    return re.sub(pattern, lambda match: my_dict[match.group(0)], text)

In [4]:
def find_max_val_index(my_list, threshold):
    """Locate the largest value that is at least ``threshold``.

    Parameters
    ----------
    my_list : indexable sequence
        Values to scan; read as ``my_list[0] .. my_list[len(my_list) - 1]``.
    threshold : number
        Lowest value that counts as a hit.

    Returns
    -------
    tuple
        ``(best_value, position)`` where ``position`` is drawn at random among
        all positions holding ``best_value``; ``(0, 0)`` when no value reaches
        the threshold.
    """
    best = threshold
    candidates = []
    for position in range(len(my_list)):
        value = my_list[position]
        if value == best:
            # another position tied with the current best
            candidates.append(position)
        elif value >= best:
            # new best value: forget the earlier candidates
            best = value
            candidates = [position]
    if not candidates:
        return 0, 0
    return best, random.choice(candidates)

In [5]:
def add_record(my_df, context_new, lemmitized_new, response_new):
    """Return a copy of ``my_df`` with one extra row appended.

    The new row fills the 'context', 'lemmatized' and 'response' columns with
    the three given values; the result is re-indexed from 0. ``my_df`` itself
    is not modified.
    """
    columns = ('context', 'lemmatized', 'response')
    values = (context_new, lemmitized_new, response_new)
    single_row = pd.DataFrame({column: [value] for column, value in zip(columns, values)})
    return pd.concat([my_df, single_row], ignore_index=True)

<br>

# import data

In [6]:
# Load the conversation table. The chat cell reads its 'lemmatized' and
# 'response' columns and writes the table back to this same file when it ends.
df = pd.read_csv('data.csv')

<br>

# Set the similarity threshold (a value between 0 and 1)
# and the vectorization method: 'tfidf' for TF-IDF or 'cv' for Bag of Words

In [7]:
# Minimum cosine similarity a stored context must reach to be used as a match.
threshold = 0.55
# Vectorizer selector read by the chat cell: 'tfidf' -> TfidfVectorizer, 'cv' -> CountVectorizer.
method_flag = 'tfidf'

In [15]:
# Interactive chat loop.
#   - empty input at any prompt ends the session and saves the table;
#   - '!' as the next input means the last answer was wrong: ask for the right one;
#   - '+' as the next input means "add another answer" for the last utterance.
# The vectorizer is chosen once and re-fitted on every fuzzy lookup because
# `df` gains rows while the conversation runs.
if method_flag == 'tfidf':
    method = TfidfVectorizer() # initializing TF-IDF
elif method_flag == 'cv':
    method = CountVectorizer() # initializing the count vectorizer (Bag of Words)
else:
    # Fail here rather than with a NameError (or a stale `method`) inside the loop.
    raise ValueError("method_flag must be 'tfidf' or 'cv', got %r" % (method_flag,))

my_input = input('Talk(Enter to exit): ')
while True:
    if not my_input:
        break

    input_lemmatized = text_lemmatize(my_input)
    # An empty lemma is written to CSV as an empty field and read back as NaN,
    # which the vectorizers reject; normalise every stored context to str.
    context_all_lemmatized = df['lemmatized'].fillna('').astype(str).tolist()

    # 1) Exact match on the lemmatized text: answer with one stored response at random.
    response_indices = [i for i, x in enumerate(context_all_lemmatized) if x == input_lemmatized]
    if len(response_indices):
        response_index = random.choice(response_indices)
        response = df.at[response_index, 'response']
        print('Tama: ', response)
    else:
        # 2) Fuzzy match: vectorize all stored contexts plus the input and take
        #    the most similar context, provided it reaches `threshold`.
        if context_all_lemmatized:
            context_all_transformed = method.fit_transform(context_all_lemmatized) # contexts -> vectors
            input_transformed = method.transform([input_lemmatized]) # input in the same vocabulary
            # similarity of each stored context to the input, flattened to 1-D
            cosine_value = 1 - pairwise_distances(context_all_transformed, input_transformed, metric = 'cosine').ravel()
            value_max, index_max = find_max_val_index(cosine_value, threshold)
        else:
            # Nothing learned yet: there is nothing to fit the vectorizer on.
            value_max, index_max = 0, 0

        if value_max:
            response = df.at[index_max, 'response']
            print('Tama: ', response)
        else:
            # 3) No usable match: ask the user for an answer and store it.
            print('Tama: I do not know what to say')
            new_response = input('Teach me(Enter to exit): ')
            if not new_response:
                break
            df = add_record(df, my_input, input_lemmatized, new_response)
            my_input = input('Talk(Enter to exit): ')
            continue

    # The next input is read before bookkeeping because it may be a command
    # ('!' or '+') that refers to the utterance just answered.
    current_input = my_input
    my_input = input('Talk(Enter to exit): ')

    if my_input == '!':
        # The answer was rejected: store the corrected one and do NOT record
        # the rejected response.
        print('Tama: Sorry, what is the correct response?')
        new_response = input('Teach me(Enter to exit): ')
        if not new_response:
            break
        df = add_record(df, current_input, input_lemmatized, new_response)
        my_input = input('Talk(Enter to exit): ')
        continue

    # The answer was not rejected: if it came from a fuzzy match, remember the
    # new wording together with the response that was given.
    if input_lemmatized not in context_all_lemmatized:
        df = add_record(df, current_input, input_lemmatized, response)

    if my_input == '+':
        print('Tama: What else can you tell me about it?')
        new_response = input('Teach me(Enter to exit): ')
        if not new_response:
            break
        df = add_record(df, current_input, input_lemmatized, new_response)
        my_input = input('Talk(Enter to exit): ')

# Persist everything learned during the session.
df.to_csv('data.csv', index = False)

Talk(Enter to exit):  I just woke up.


Smart Robot:  How did you sleep?


Talk(Enter to exit):  Good. But do not remember my dream.


Smart Robot:  nice :)


Talk(Enter to exit):  $


Smart Robot: Sorry, what is the correct response?


Teach me(Enter to exit):  If you don't memorize your dream right after you woke up, later it will be gone.
Talk(Enter to exit):  


# Maintenance

In [8]:
# Display the conversation table (context / lemmatized / response).
df

Unnamed: 0,context,lemmatized,response
0,Hi,hi,Hi
1,How are you?,how be you,"I am fine, thank you. How about you?"
2,Bad,bad,Sorry
3,What is your name?,what be your name,Tama Gotchi
4,Where we are?,where we be,We are on the planet Earth.
5,Tell me the joke.,tell me the joke,a radioactive cat has 18 half-lives
6,Thank you.,thank you,My pleasure.
7,Good morning,good morning,Good morning.
8,I just woke up.,i just wake up,How did you sleep?
9,I slept well.,i sleep well,That's good.


In [10]:
# One-off maintenance: remove the row labelled 33, then renumber the index from 0.
# NOTE(review): the label is hard-coded; the cell is not repeatable — running it
# again removes whichever row is labelled 33 after the renumbering.
df.drop(33, inplace = True)
df.reset_index(drop = True, inplace = True)

In [19]:
# One-off maintenance: remove the column named 'lemmitized'.
# NOTE(review): this spelling differs from the 'lemmatized' column the chat cell
# uses — presumably a cleanup of a misnamed column; confirm before re-running.
df.drop('lemmitized', axis = 1, inplace = True)

In [13]:
# One-off maintenance: overwrite the response stored in the row labelled 30.
# NOTE(review): hard-coded row label — verify it still points at the intended row.
df.at[30, 'response'] = 'I am from Asgard'

In [9]:
# One-off maintenance: rename the column 'prepared' to 'lemmatized', the name the chat cell reads.
df.rename(columns = {'prepared':'lemmatized'}, inplace = True)

In [13]:
# Show the first stored response whose lemmatized context is exactly 'nice'.
df.loc[df['lemmatized'] == 'nice', 'response'].iloc[0]

'Thank you.'

In [15]:
# Write the edited table back to data.csv without the index column.
df.to_csv('data.csv', index = False)

# Work

In [16]:
# Scratch: inspect the dense vector of the last input.
# Relies on `method` and `input_lemmatized` left in the kernel by the chat cell.
method.transform([input_lemmatized]).toarray()

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.62183515,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.78314816,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0. 

In [21]:
# Scratch: show the lemmatized form of the last input.
input_lemmatized

'how be you'

In [24]:
# Scratch: position of the first stored context equal to 'bye'.
context_all_lemmatized.index('bye')

10

In [35]:
import time

# Scratch micro-benchmark: time 10000 repetitions of the exact-match lookup
# used by the chat cell. Relies on `context_all_lemmatized` and
# `input_lemmatized` already existing in the kernel.
start_time = time.time()
for _ in range(10000):
    # `_` keeps the repetition counter distinct from the comprehension's own `i`
    indices = [i for i, x in enumerate(context_all_lemmatized) if x == input_lemmatized]
    #indi = np.where(np.array(context_all_lemmatized) == input_lemmatized)[0]
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.02567291259765625 seconds ---


In [14]:
# Scratch: rebuild the list of stored lemmatized contexts from the current table.
context_all_lemmatized = df['lemmatized'].tolist()

In [20]:
# Scratch: lemmatize a sample utterance for the lookup in the next cell.
input_lemmatized = text_lemmatize('what do you think about human')

In [21]:
# Scratch: first stored response whose lemmatized context equals the sample input.
df.loc[df['lemmatized'] == input_lemmatized, 'response'].iloc[0]

'Robots are the future.'

In [39]:
# Scratch: exact-match lookup for a string assumed not to be stored, to see what
# "no match" looks like. The chained assignment binds both names to the same list.
my_tt = indices = [i for i, x in enumerate(context_all_lemmatized) if x == 'asoeu']

In [40]:
# Scratch: number of exact matches found by the previous cell.
len(my_tt)

0