In [33]:
!pip install pandas



In [34]:
import pandas as pd
import nltk 
import numpy as np
import re
from nltk.stem import wordnet # to perform lemmitization
from sklearn.feature_extraction.text import CountVectorizer # to perform bow
from sklearn.feature_extraction.text import TfidfVectorizer # to perform tfidf
from nltk import pos_tag # for parts of speech
from sklearn.metrics import pairwise_distances # to perfrom cosine similarity
from nltk import word_tokenize # to create tokens
from nltk.corpus import stopwords # for stop words

In [35]:
# useful downloads
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
nltk.download('stopwords')

nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wangx\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\wangx\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\wangx\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\wangx\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wangx\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\wangx\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already

True

# Small Talk

In [36]:
#load the dataframe
#some basic responses on small talks
df = pd.read_excel('dialog_talk_agent.xlsx')
df.ffill(axis = 0,inplace=True) # fills the null value with the previous value.

# function that performs text normalization steps
# lower case + remove special characters + tokenization + lemmatization
def text_normalization(text):
    text=str(text).lower() # text to lower case
    spl_char_text=re.sub(r'[^ a-z]','',text) # removing special characters
    tokens=nltk.word_tokenize(spl_char_text) # word tokenizing
    lema=wordnet.WordNetLemmatizer() # intializing lemmatization
    tags_list=pos_tag(tokens,tagset=None) # parts of speech
    lema_words=[]   # empty list 
    for token,pos_token in tags_list:
        if pos_token.startswith('V'):  # Verb
            pos_val='v'
        elif pos_token.startswith('J'): # Adjective
            pos_val='a'
        elif pos_token.startswith('R'): # Adverb
            pos_val='r'
        else:
            pos_val='n' # Noun
        lema_token=lema.lemmatize(token,pos_val) # performing lemmatization
        lema_words.append(lema_token) # appending the lemmatized token into a list
    
    return " ".join(lema_words) # returns the lemmatized tokens as a sentence 
df['lemmatized_text']=df['Context'].apply(text_normalization) # applying the fuction to the dataset to get clean text


# defining a function that returns response to query using tf-idf
tfidf=TfidfVectorizer() # intializing tf-id 
x_tfidf=tfidf.fit_transform(df['lemmatized_text']).toarray() # transforming the data into array

df_tfidf=pd.DataFrame(x_tfidf,columns=tfidf.get_feature_names_out()) 

def chat_tfidf(text):
    lemma=text_normalization(text) # calling the function to perform text normalization
    tf=tfidf.transform([lemma]).toarray() # applying tf-idf
    cos=1-pairwise_distances(df_tfidf,tf,metric='cosine') # applying cosine similarity
    index_value=cos.argmax() # getting index number of highest value 
    return df['Text Response'].loc[index_value]

# Question Answering

In [66]:
#load data

import json
import matplotlib.pyplot as plt
import numpy as np
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

def read_corpus():
    """
    read given corpus，write the question list and answer list into qlist, alist 
    qlist = ["Q1"， “Q2”， “Q3” ....]
    alist = ["A1", "A2", "A3" ....]
    answer matches questions
    """
    qlist = []
    alist = []
   
    with open("squad2.0/train-v2.0.json") as f:
        all_data = json.load(f)['data']
        for data in all_data:
            paragraphs = data['paragraphs']
            for paragraph in paragraphs:
                for qa in paragraph['qas']:
                    # print(qa['id'])
                    if qa['answers']:
                        qlist.append(qa['question'])
                        alist.append(qa['answers'][0]['text'])
    assert len(qlist) == len(alist)  # same length
    print("Load question and answer success. The length :{}".format(len(qlist)))
    return qlist, alist

original_qlist, alist = read_corpus()

Load question and answer success. The length :86821


In [67]:
word_freq = {}#word frequency
for question in original_qlist:
    question = question.replace('?', ' ?')
    line = question.strip().split()
    for word in line:
        if word in word_freq:
            word_freq[word] += 1
        else:
            word_freq[word] = 1
sort_word_freq = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)

In [68]:
new_qlist = []
new_alist = []
stopwords =[]
porter_stemmer = PorterStemmer()
##load stop words
with open("NLTK's20of20stopwords.txt") as f1:
    lines = f1.readlines()
    for line in lines:
        stopwords.append(line.strip())
stopwords = set(stopwords)

for question in original_qlist:
    tmp = ''
    for sign in ['.', '?', '/', '#', '$', '@', '^', '*', '!', '(', ')']:
        question = question.replace(sign, '')
    # question = question.replace('?', ' ?')
    line = question.strip().split()
    for word in line:
        try:
            if word_freq[word] <= 20:#frequency larger than 20
                continue
            word = word.lower()
        except:
            pass
        if word in stopwords:
            continue
        for num in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:#contains numbers
            if str(num) in word:
                word = '1'
        word = porter_stemmer.stem(word)# stemming 
        tmp = tmp + word + " "
    new_qlist.append(tmp[:-1])
    
qlist = new_qlist# upadated list
print("Preprocessing completed！")

Preprocessing completed！


In [59]:
vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, norm='l2') # define one vectorizer of tf-idf

tf_idf_model = vectorizer.fit(qlist)
X = tf_idf_model.transform(qlist)
X = X.toarray()

In [69]:
def topresults_invidx(input_q):

    #Use the inverted table to filter out the index of candidate issues 
    quest = input_q.replace('?', ' ?')
    line = quest.strip().split()
    doc_lst = range(len(qlist))

	#Get the candidates containing the first k characters entered by the user
    # for j in range(k):
    #     #try:
    #     word = line[j]
    #     doc = inverted_idx[word]
    #     doc_lst = list(set(doc_lst) & set(doc))
       

    # preprocessing
    tmp = ''
    for sign in ['.', '?', '/', '#', '$', '@', '^', '*', '!', '(', ')']:
        question = input_q.replace(sign, '')
    # question = question.replace('?', ' ?')
    line = question.strip().split()
    for word in line:
        try:
            if word_freq[word] <= 20:
                continue
            word = word.lower()
        except:
            pass
        
        for num in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
            if str(num) in word:
                word = '1'
        if word in stopwords:
            continue
        word = porter_stemmer.stem(word)
        tmp = tmp + word + " "
    input_str = [tmp[:-1]]
    
	#representing user input
    input_str = tf_idf_model.transform(input_str).toarray()[0]

    simlarity = {}
    for index in doc_lst:
        cos_sim = np.dot(input_str, X[index]) / (np.linalg.norm(input_str) * np.linalg.norm(X[index]) + 1)
        simlarity[index] = cos_sim

    top_idxs = []  # top_idxs store highest similarity

    simlarity = list(sorted(simlarity.items(), key=lambda x: x[1], reverse=True))

    for _ in range(1):
        
        index, cos = simlarity[_]
        top_idxs.append(index)

        

    return [alist[indx] for indx in top_idxs]  # highest answers

# TODO: test and print results
# print (topresults_invidx("which president won all of NYC in 1924?"))
# print(topresults_invidx("what is Harper Lee's  hometown?"))
# print(topresults_invidx("How many people lived in Kathmandu in 2011?"))
# print(topresults_invidx("Which area did Beyonce compete for when she was young?"))
# print(topresults_invidx("Which area did Beyonce compete for when she was young"))



# Intent Routing

In [42]:
qlist = original_qlist[0:1591]

import pandas as pd
from sklearn.model_selection import train_test_split

small_talk_context = list(df['Context'])[0:1591]
intent_data = small_talk_context
intent_labels = []

for _ in range(1591):
    intent_labels.append('small talk') 
len(intent_labels)

1591

In [43]:
for q in qlist:
    intent_data.append(q)
for _ in range(len(qlist)):
    intent_labels.append('question answering') 

#print(len(intent_data),len(intent_labels))
print(len(intent_data))
print(len(intent_labels))

3182
3182


In [44]:
X_intent_train, X_intent_test, Y_intent_train, Y_intent_test = train_test_split(intent_data, intent_labels, stratify=intent_labels, test_size=0.2, random_state=42)

In [45]:
from nltk.corpus import stopwords
from nltk.stem.snowball import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

p_stemmer = PorterStemmer()
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(doc):
    return (p_stemmer.stem(w) for w in analyzer(doc))

intent_count_vect = CountVectorizer(lowercase=True, stop_words=stopwords.words('english'), analyzer=stemmed_words)
X_intent_train_counts = intent_count_vect.fit_transform(X_intent_train)

print(X_intent_train_counts)
## Weighting
from sklearn.feature_extraction.text import TfidfTransformer

intent_tfidf_transformer = TfidfTransformer(use_idf=True, sublinear_tf=True).fit(X_intent_train_counts)
X_intent_train_tf = intent_tfidf_transformer.transform(X_intent_train_counts)

print(X_intent_train_tf)
## Training a classifier
from sklearn.linear_model import LogisticRegression

intent_clf = LogisticRegression(random_state=0).fit(X_intent_train_tf, Y_intent_train)
## Evaluating a classifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Preprocessing documents and creating term-document matrix
X_intent_new_counts = intent_count_vect.transform(X_intent_test)

# Weighting
X_new_tfidf = intent_tfidf_transformer.transform(X_intent_new_counts)

# Predict on the test set
predicted = intent_clf.predict(X_new_tfidf)

# Print metrics
print(confusion_matrix(Y_intent_test, predicted))
print(accuracy_score(Y_intent_test, predicted))
print(f1_score(Y_intent_test, predicted, pos_label='small talk'))

# Using the classifier on new data
#Preprocessing and creating term-document matrix
intent_new_data = ["who are you?"]
intent_processed_newdata = intent_count_vect.transform(intent_new_data)

#Weighting
processed_newdata = intent_tfidf_transformer.transform(intent_processed_newdata)

# Predict
print(intent_clf.predict(processed_newdata))
print(''.join(intent_clf.predict(processed_newdata)))

  (0, 139)	1
  (0, 1764)	1
  (0, 146)	1
  (0, 1798)	1
  (0, 291)	1
  (0, 180)	1
  (0, 1809)	1
  (0, 1234)	1
  (0, 346)	1
  (0, 1882)	1
  (0, 353)	1
  (0, 1909)	1
  (1, 882)	1
  (1, 1770)	1
  (2, 1882)	1
  (2, 1909)	1
  (2, 130)	1
  (2, 292)	1
  (2, 1616)	1
  (2, 1722)	1
  (2, 1090)	1
  (2, 945)	1
  (3, 924)	1
  (4, 1082)	1
  (4, 1158)	1
  :	:
  (2540, 139)	1
  (2540, 557)	1
  (2540, 266)	1
  (2540, 402)	1
  (2540, 889)	1
  (2540, 1913)	1
  (2540, 1291)	1
  (2540, 516)	1
  (2540, 1508)	1
  (2540, 500)	1
  (2541, 1969)	1
  (2541, 1863)	1
  (2541, 1464)	1
  (2541, 1625)	1
  (2542, 1969)	1
  (2542, 866)	1
  (2542, 210)	1
  (2542, 921)	1
  (2543, 949)	1
  (2544, 921)	1
  (2544, 273)	1
  (2544, 1972)	1
  (2544, 850)	1
  (2544, 511)	1
  (2544, 811)	1
  (0, 1909)	0.12483228708897758
  (0, 1882)	0.15958046626974076
  (0, 1809)	0.3762460056189036
  (0, 1798)	0.3401891003449353
  (0, 1764)	0.2848037864251699
  (0, 1234)	0.32268611122021496
  (0, 353)	0.30413219507096706
  (0, 346)	0.3068004270263

# Name

In [46]:
# stores user information
class User:
    def __init__(self) :
        self.storage = {}

    def add_name(self,userName):
        self.storage['name'] = userName
    
    def add_hometown(self,home):
        self.storage['hometown'] = home

    def add_company(self,company):
        self.storage['company'] = company
    
    def get_name(self):
        return self.storage['name']
    
    def get_hometown(self):
        return self.storage['hometown'] 
    
    def get_company(self):
        return self.storage['company']

        # self.name = userName
        # self.company = company
        # self.hometown = hometown
        # self.storage = {"username":self.name,"company":self.company,"hometwon":self.company}

In [47]:
from nltk import word_tokenize, pos_tag
import re
def identification(text):
    results = pos_tag(word_tokenize(text))
    info = []
    for x in results:
        if 'NNP' in x:    # Ends with 'NNP'
            info.append(x[0])
    info_text = " ".join(info)
    return info_text

In [19]:
user = User()
name_triggered = ["my name is","call me"]
hometown_triggered = ["my hometown is","I'm from"]
org_triggered = ["my company is","I'm working for"]
name_required = "What's my name?"
hometown_required = "Where am I from?"
org_required = "Which company am I with?"
GREETING_INPUTS=["Hello","hello","hi","greetings","sup","what's up","hey","hey!","How are you?","How are you doing?"]

def user_profile(text):

    for i in range(2):

        if name_triggered[i] in text or name_triggered[i] in text.lower():
            
            name = identification(text)
            #print(name)
            user.add_name(str(name))
            return "Nice to meet you. I'm your chatbot and you  can call me Teresa!"
        if hometown_triggered[i] in text or hometown_triggered[i] in text.lower():
            hometown = identification(text)
            user.add_hometown(hometown)
            return "Thank you for telling me."
        if  org_triggered[i] in text or org_triggered[i] in text.lower():
            company = identification(text)
            user.add_company(company)
            return "Glad to know about that."
        else:
            pass
    
    if text in name_required or text in name_required.lower():
        try:
            return user.get_name()
        except:
            return "Sorry, could you pleas tell me your name?"
    if text in hometown_required or text in hometown_required.lower():
        try:
            return user.get_hometown()
        except:
            return "Sorry, could you pleas tell me your hometown?"
    if (text in org_required or text ==  org_required.lower()) and text not in GREETING_INPUTS:
        try:
            return user.get_company()
        except:
            return "Sorry, could you pleas tell me your company?"
    else:
        pass

#user_profile("what's my name")
# user_profile("I'm working for GOOGLE")
# print(user_profile("Which company am I with?"))


'Sorry, could you pleas tell me your name?'

# Emotion

In [48]:
import os
from sklearn.model_selection import train_test_split

label_dir = { # 500//500
    "positive": "data/positive",
    "negative": "data/negative"
}

data = []
labels = []

for label in label_dir.keys():
    for file_name in os.listdir(label_dir[label]):
        filepath = f"{label_dir[label]}/{file_name}"
        with open(filepath, mode='r', encoding='utf8', errors='ignore') as f:
            content = f.read()
            data.append(content)
            labels.append(label)
            
X_train, X_test, y_train, y_test = train_test_split(data, labels, stratify=labels, test_size=0.25, random_state=42)

In [49]:
count_vect = CountVectorizer(lowercase=True, stop_words=stopwords.words('english'), analyzer=stemmed_words)
X_train_counts = count_vect.fit_transform(X_train)

#print(X_train_counts)

In [50]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer(use_idf=True, sublinear_tf=True).fit(X_train_counts)
X_train_tf = tfidf_transformer.transform(X_train_counts)


In [51]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0).fit(X_train_tf, y_train)
X_new_counts = count_vect.transform(X_test)

# Weighting
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [52]:
# offering a joke
import json
import random

def shuffle_jokes():
    """

    Retrieve the given json file containing the joke and write the joke to the joke_List. 
    joke_list = ["joke1"， “joke2”， “joke3” ....]
    
    """
    joke_list = []
    
   
    with open("reddit_jokes.json") as f:
        all_jokes = json.load(f)
        for joke in all_jokes:
            tell_joke = joke['title'] + "\n" +joke["body"]
            joke_list.append(tell_joke)

    randnum = random.randint(0,194999)
    return joke_list[randnum]

#print(shuffle_jokes())

So a young man comes to his first ever Karate lesson
He steps through the doors of the dojo and sees three groups being taught moves by an instructor

He is directed to the first line where one of the Sensei's is teaching them how to block a hit

The man quickly learns the move and advances to the second group, proud of his achievement

The second line is taught one by one to perform a simple throw, but the man struggles as he has always lacked upper body strength

After many tries he finally succeeds but he decides karate is just not for him.

The young man turns around and walks towards the door, however on his way out the Sensei calls out his name and says:

"Hey, didn't you forget the punch line?"


In [53]:
def predicted_emotions(new_data):
    
    processed_newdata = count_vect.transform(new_data)

    # Weighting
    processed_newdata = tfidf_transformer.transform(processed_newdata)
    emotion_state = ''.join(clf.predict(processed_newdata))
    return emotion_state


# Bot

In [70]:
GREETING_INPUTS=["Hello","hello","hi","greetings","sup","what's up","hey","hey!","How are you?","How are you doing?"]

while True:
    userInput = input(">>>")
    botOutput = user_profile(userInput)
    if chat_tfidf(userInput) == 'Bye.':
        print('Chatbot: Bye!')
        break
    #first phase:check the intent
    elif userInput in GREETING_INPUTS or userInput.lower() in GREETING_INPUTS or botOutput != None:
        intent = 'small talk'
    else:
        intent_input = intent_count_vect.transform([userInput])
        intent_input_proceed= intent_tfidf_transformer.transform(intent_input)
        intent = ''.join(intent_clf.predict(intent_input_proceed))
    #print(intent)

    #small talk or question answring
    if intent == 'small talk':
        
        if botOutput != None:
            print("Chatbot: " +  user_profile(userInput))
        else:
            print("Chatbot: "+chat_tfidf(userInput))  #just small talks
            
            emo = predicted_emotions([userInput])
            if  emo == 'negative':
                print("Chatbot:I think you are unhappy.Do you need me to tell you a joke?")
                while True:
                    instant_input = [input(">>>")]
                    if 'yes' in instant_input:
                        print(shuffle_jokes())
                        break
                    else:
                        print("Chatbot:Okay,hope you have a good day!")
                        break
    else:
        try:
            answer = topresults_invidx(userInput)
            if len(answer) == 0:
                print("Chatbot: I can't do anything for now but my developer will implement more NLP in the future")
            else:
                print("Chatbot: " + ''.join(answer))
        except:
            print("Sorry, I cannot totally understand. Could you please change your expression?")

        

>>>hye!
Chatbot: Just think of me as the ace up your sleeve.
>>>hello
Chatbot: Howdy.
>>>hope you have a good day
Chatbot: Lovely, thanks.
>>>call me Wang
Chatbot: Nice to meet you. I'm your chatbot and you  can call me Teresa!
>>>I'm working for GOOGLE
Chatbot: Glad to know about that.
>>>My hometown is BeiJing
Chatbot: Thank you for telling me.
>>>what's my name
Chatbot: Wang
>>>which company am I with
Chatbot: Because medical research and development of drugs to treat such diseases is financially disadvantageous
>>>which company am I with
Chatbot: Because medical research and development of drugs to treat such diseases is financially disadvantageous
>>>Which company am I with
Chatbot: GOOGLE
>>>where am I from
Chatbot: I wish I knew where.
>>>Where am I from
Chatbot: BeiJing
>>>python is a good language
Chatbot: Great! Glad to hear it.
>>>today is awful
Chatbot: I'm sorry. Please let me know if I can help in some way.
Chatbot:I think you are unhappy.Do you need me to tell you a joke