In [None]:
import pandas as pd
import nltk 
import numpy as np
import re
from nltk.stem import wordnet # to perform lemmitization
from sklearn.feature_extraction.text import CountVectorizer # to perform bow
from sklearn.feature_extraction.text import TfidfVectorizer # to perform tfidf
from nltk import pos_tag # for parts of speech
from sklearn.metrics import pairwise_distances # to perfrom cosine similarity
from nltk import word_tokenize # to create tokens
from nltk.corpus import stopwords # for stop words

In [None]:
df=pd.read_excel('dialog_talk_agent.xlsx')
df.head(20)

In [None]:
df.shape[0] # returns the number of rows in dataset

In [None]:
df.ffill(axis = 0,inplace=True) # fills the null value with the previous value.
df.head()

In [None]:
df1=df.head(10) # copy of first ten rows of dataset

In [None]:
# function that converts text into lower case and removes special characters

def step1(x):
    """Print every item of ``x`` lower-cased, with each character outside
    a-z / 0-9 replaced by a single space.

    Items are converted with ``str()`` first, so non-string values are
    accepted. Nothing is returned; the cleaned text is only printed.
    """
    non_alnum = re.compile(r'[^a-z0-9]')
    for entry in x:
        print(non_alnum.sub(' ', str(entry).lower()))

In [None]:
step1(df1['Context'])

In [None]:
 # word tokenizing
    
s='tell me about your personality'
words=word_tokenize(s)
print(words)

In [None]:
lemma = wordnet.WordNetLemmatizer() # intializing lemmatizer
lemma.lemmatize('absorbed', pos = 'v')

In [None]:
pos_tag(nltk.word_tokenize(s),tagset = None) # returns the parts of speech of every word

In [None]:
# function that performs text normalization steps

def text_normalization(text):
    """Normalize ``text`` into a space-joined string of lemmas.

    Steps: convert to ``str`` and lower-case, delete every character that is
    not a-z or a space, tokenize, POS-tag, then lemmatize each token with the
    WordNet part of speech picked from the first letter of its tag
    (V -> verb, J -> adjective, R -> adverb, anything else -> noun).
    """
    # first letter of the POS tag -> WordNet POS code; nouns are the fallback
    wordnet_pos = {'V': 'v', 'J': 'a', 'R': 'r'}

    cleaned = re.sub(r'[^ a-z]', '', str(text).lower())
    tagged = pos_tag(nltk.word_tokenize(cleaned), tagset=None)
    lemmatizer = wordnet.WordNetLemmatizer()

    return " ".join(
        lemmatizer.lemmatize(word, wordnet_pos.get(tag[:1], 'n'))
        for word, tag in tagged
    )

In [None]:
text_normalization('telling you some stuff about me')

In [None]:
df['lemmatized_text']=df['Context'].apply(text_normalization) # applying the fuction to the dataset to get clean text
df.tail(15)

In [None]:
# all the stop words we have 

stop = stopwords.words('english')
print(stop)

# bag of words

In [None]:
cv = CountVectorizer() # intializing the count vectorizer
X = cv.fit_transform(df['lemmatized_text']).toarray()

In [None]:
# returns all the unique words from the data

# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# and fall back only on releases that predate `get_feature_names_out()`.
if hasattr(cv, 'get_feature_names_out'):
    features = list(cv.get_feature_names_out())
else:
    features = cv.get_feature_names()
df_bow = pd.DataFrame(X, columns = features)
df_bow.head()

In [None]:
Question ='Will you help me and tell me about yourself more' # considering an example query

In [None]:
# removing stop words from the example question

stop_words = set(stop)  # set membership instead of scanning the list per word
a = Question.split()
# The stop-word list is lower-case, so compare case-insensitively;
# otherwise a capitalised word such as "Will" slips through.
Q = [word for word in a if word.lower() not in stop_words]
# Joined once after filtering, so `b` exists even when nothing is left.
b = " ".join(Q)

In [None]:
Question_lemma = text_normalization(b) # applying the function that we created for text normalizing
Question_bow = cv.transform([Question_lemma]).toarray() # applying bow

In [None]:
text_normalization

In [None]:
Question_bow

# similarity

In [None]:
# cosine similarity for the above question we considered.

cosine_value = 1- pairwise_distances(df_bow, Question_bow, metric = 'cosine' )
(cosine_value)

In [None]:
df['similarity_bow']=cosine_value # creating a new column 

In [None]:
df_simi = pd.DataFrame(df, columns=['Text Response','similarity_bow']) # taking similarity value of responses for the question we took
df_simi.head()

In [None]:
df_simi_sort = df_simi.sort_values(by='similarity_bow', ascending=False) # sorting the values
df_simi_sort.head()

In [None]:
threshold = 0.2 # considering the value of p=smiliarity to be greater than 0.2
df_threshold = df_simi_sort[df_simi_sort['similarity_bow'] > threshold] 
df_threshold.head()

- Finally, using bag of words for the question 'Will you help me and tell me about yourself more', the table above lists the candidate responses together with their similarity values; we take the response with the highest similarity as the answer.

In [None]:
index_value = cosine_value.argmax() # returns the index number of highest value
index_value 

In [None]:
(Question)

In [None]:
df['Text Response'].iloc[index_value] # argmax gives a row position, so index positionally; the text in that row becomes the response for the question

# tf-idf

In [None]:
Question1 ='Tell me about yourself.'

In [None]:
# using tf-idf

tfidf=TfidfVectorizer() # initializing tf-idf
x_tfidf=tfidf.fit_transform(df['lemmatized_text']).toarray() # transforming the data into array

# The query is vectorized only after `tfidf` has been fitted: calling
# `tfidf.transform` before the vectorizer exists raises NameError on a fresh kernel.
Question_lemma1 = text_normalization(Question1)
Question_tfidf = tfidf.transform([Question_lemma1]).toarray() # applying tf-idf

In [None]:
# returns all the unique words from the data with the tf-idf score of each word

# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# and fall back only on releases that predate `get_feature_names_out()`.
if hasattr(tfidf, 'get_feature_names_out'):
    tfidf_features = list(tfidf.get_feature_names_out())
else:
    tfidf_features = tfidf.get_feature_names()
df_tfidf=pd.DataFrame(x_tfidf,columns=tfidf_features)
df_tfidf.head()

# similarity

In [None]:
cos=1-pairwise_distances(df_tfidf,Question_tfidf,metric='cosine')  # applying cosine similarity
cos

In [None]:
df['similarity_tfidf']=cos # creating a new column 
df_simi_tfidf = pd.DataFrame(df, columns=['Text Response','similarity_tfidf']) # taking similarity value of responses for the question we took
df_simi_tfidf.head()

In [None]:
df_simi_tfidf_sort = df_simi_tfidf.sort_values(by='similarity_tfidf', ascending=False) # sorting the values
df_simi_tfidf_sort.head(10)

In [None]:
threshold = 0.2 # considering the value of p=smiliarity to be greater than 0.2
df_threshold = df_simi_tfidf_sort[df_simi_tfidf_sort['similarity_tfidf'] > threshold] 
df_threshold.head()

- Using tf-idf for the question 'Tell me about yourself.', the table above lists the candidate responses together with their similarity values; we take the response with the highest similarity as the answer.

In [None]:
index_value1 = cos.argmax() # returns the index number of highest value
index_value1

In [None]:
Question1

In [None]:
df['Text Response'].iloc[index_value1]  # argmax gives a row position, so index positionally; returns the text in that row

# Model Using Bag of Words

In [None]:
# Function that removes stop words and process the text

def stopword_(text):
    """Remove English stop words from ``text`` and lemmatize the remaining tokens.

    The text is tokenized and POS-tagged first; each surviving token is then
    lemmatized with the WordNet part of speech chosen from its tag
    (V -> verb, J -> adjective, R -> adverb, anything else -> noun).

    Parameters
    ----------
    text : str
        Raw query text.

    Returns
    -------
    str
        The lemmas of the non-stop-word tokens joined by single spaces
        (an empty string when every token is a stop word).
    """
    tag_list=pos_tag(nltk.word_tokenize(text),tagset=None)
    # A set gives constant-time membership tests instead of a list scan per token.
    stop=set(stopwords.words('english'))
    lema=wordnet.WordNetLemmatizer()
    lema_word=[]
    for token,pos_token in tag_list:
        # The stop-word list is lower-case while `text` has not been lower-cased
        # yet, so compare case-insensitively ("Your", "I", ... are dropped too).
        if token.lower() in stop:
            continue
        if pos_token.startswith('V'):
            pos_val='v'
        elif pos_token.startswith('J'):
            pos_val='a'
        elif pos_token.startswith('R'):
            pos_val='r'
        else:
            pos_val='n'
        lema_token=lema.lemmatize(token,pos_val)
        lema_word.append(lema_token)
    return " ".join(lema_word)

In [None]:
# defining a function that returns response to query using bow

def chat_bow(text):
    """Return the stored response whose context best matches ``text`` (bag of words).

    The query has its stop words removed, is normalized, and is vectorized with
    the fitted ``cv``; the row of ``df_bow`` with the highest cosine similarity
    to it selects the entry of ``df['Text Response']`` that is returned.
    """
    s=stopword_(text)
    lemma=text_normalization(s) # calling the function to perform text normalization
    bow=cv.transform([lemma]).toarray() # applying bow
    cosine_value = 1- pairwise_distances(df_bow,bow, metric = 'cosine' )
    index_value=cosine_value.argmax() # row position of the best match
    # argmax yields a position, not an index label, so look it up with .iloc
    return df['Text Response'].iloc[index_value]

In [None]:
chat_bow('hi there')

In [None]:
chat_bow('Your are amazing')

In [None]:
chat_bow('i miss you')

# Model Using tf-idf

In [None]:
# defining a function that returns response to query using tf-idf

def chat_tfidf(text):
    """Return the stored response whose context best matches ``text`` (tf-idf).

    The query is normalized and vectorized with the fitted ``tfidf``; the row of
    ``df_tfidf`` with the highest cosine similarity to it selects the entry of
    ``df['Text Response']`` that is returned.
    """
    lemma=text_normalization(text) # calling the function to perform text normalization
    tf=tfidf.transform([lemma]).toarray() # applying tf-idf
    cos=1-pairwise_distances(df_tfidf,tf,metric='cosine') # applying cosine similarity
    index_value=cos.argmax() # row position of the best match
    # argmax yields a position, not an index label, so look it up with .iloc
    return df['Text Response'].iloc[index_value]

In [None]:
chat_tfidf('hi')

In [None]:
chat_tfidf('how are you')

In [None]:
chat_tfidf('i love you')

In [None]:
chat_tfidf('thanks for your support!')

In [None]:
chat_tfidf('will you reply accurately?')

In [None]:
chat_tfidf('will you marry me?')

In [None]:
chat_tfidf('i miss you and i love you')

In [None]:
chat_tfidf('ask sravya to read')

In [None]:
chat_tfidf('you are amazing and hope to see u soon.')

# Conclusion
- Our chatbot worked well with both bag of words and tf-idf. The tf-idf model worked well even when stop words were left in the query, whereas for bag of words we had to remove the stop words first.

In [55]:
from nltk import bigrams

my_list = ['Diamond', 'is hard', 'has high strength']

def convert_to_bigrams(my_list):
    """Prefix every item after the first with the first item of ``my_list``.

    Example: ``['Diamond', 'is hard', 'has high strength']`` becomes
    ``['Diamond is hard', 'Diamond has high strength']``.

    Parameters
    ----------
    my_list : sequence of str
        The base word followed by the phrases to attach to it.

    Returns
    -------
    list of str
        One ``"<base> <phrase>"`` string per phrase, in the original order.
        Empty when ``my_list`` has fewer than two items.
    """
    # Nothing to combine; also avoids an IndexError on an empty list.
    if len(my_list) == 0:
        return []

    # save 1st word as a base
    base_word = my_list[0]

    # Pair the base with each following phrase directly. Collecting the
    # consecutive pairs in a dict would silently overwrite entries whenever
    # an item occurs more than once.
    return [base_word + " " + value for value in my_list[1:]]

# invoke function
convert_to_bigrams(my_list)

['Diamond is hard', 'Diamond has high strength']

['Diamond is hard', 'Diamond has high strength']

In [57]:
my_list_2 = ["Dog", "is a mammal", "has 4 legs", "is barking"]

convert_to_bigrams(my_list_2)

['Dog is a mammal', 'Dog has 4 legs', 'Dog is barking']

['Dog is a mammal', 'Dog has 4 legs', 'Dog is barking']