In [None]:
import pandas as pd
import nltk
import numpy as np
import re
# lemmatization
from nltk.stem import wordnet
# bag of words(BoW)
from sklearn.feature_extraction.text import CountVectorizer
#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
# PoS(Part of Speech)
from nltk import pos_tag
# similarity using pairwise distances
from sklearn.metrics import pairwise_distances
# Tokenization
from nltk import word_tokenize
from nltk.corpus import stopwords

In [None]:
# Download NLTK's 'popular' data collection so the tokenizer, POS tagger,
# WordNet lemmatizer and stop-word list used below have their data available.
nltk.download('popular')

In [None]:
# Source workbook for the dialog data; it is fetched over HTTP from GitHub,
# so this cell needs network access.
DATASET_URL = "https://github.com/ammishra08/MachineLearning/raw/master/Datasets/dialog_agent.xlsx"
df = pd.read_excel(DATASET_URL)

In [None]:
# Preview the first five rows to check which columns were loaded.
df.head()

Unnamed: 0,Context,Text Response
0,Tell me about your personality,Just think of me as the ace up your sleeve.
1,I want to know you better,I can help you work smarter instead of harder
2,Define yourself,
3,Describe yourself,
4,tell me about yourself,


In [None]:
# Forward-fill down the rows: every null cell (in any column) takes the value
# from the nearest non-null row above it.
df = df.ffill(axis=0)

In [None]:
# Look at the first 20 rows again to confirm the empty responses were filled.
df.head(20)

Unnamed: 0,Context,Text Response
0,Tell me about your personality,Just think of me as the ace up your sleeve.
1,I want to know you better,I can help you work smarter instead of harder
2,Define yourself,I can help you work smarter instead of harder
3,Describe yourself,I can help you work smarter instead of harder
4,tell me about yourself,I can help you work smarter instead of harder
5,all about you,I can help you work smarter instead of harder
6,tell me some stuff about you,I can help you work smarter instead of harder
7,talk some stuff about you,I can help you work smarter instead of harder
8,talk about yourself,I can help you work smarter instead of harder
9,about yourself,I can help you work smarter instead of harder


In [None]:
# Function to convert texts into lowercase & replace special characters
def step1(x):
  """Print a cleaned-up version of every item in ``x``, one per line.

  Each item is converted with ``str()``, lower-cased, and every character
  that is not ``a``-``z`` or ``0``-``9`` is replaced by a single space.
  Nothing is returned; the cleaned text is only printed.
  """
  non_alnum = re.compile(r'[^a-z0-9]')
  for item in x:
    print(non_alnum.sub(' ', str(item).lower()))

In [None]:
# Try the cleaning step on the first ten questions.
step1(df['Context'].head(10))

tell me about your personality
i want to know you better
define yourself
describe yourself
tell me about yourself
all about you
tell me some stuff about you
talk some stuff about you
talk about yourself
about yourself


In [None]:
# Text Normalization
def text_normalization(text):
  """Lower-case, strip non-letters, tokenize and lemmatize ``text``.

  Every character other than a space or ``a``-``z`` is deleted, so digits
  and apostrophes disappear ("I'll" becomes "ill"). The remaining tokens are
  POS-tagged, and each one is lemmatized with the WordNet part of speech
  chosen from the first letter of its tag.

  Args:
    text: Value to normalize; it is passed through ``str()`` first.

  Returns:
    The lemmatized tokens joined by single spaces.
  """
  # First letter of the tagger's tag -> WordNet POS code
  # (verb, adjective, adverb); anything else is treated as a noun.
  wordnet_pos_by_prefix = {'V': 'v', 'J': 'a', 'R': 'r'}

  letters_only = re.sub(r'[^ a-z]', '', str(text).lower())
  lemmatizer = wordnet.WordNetLemmatizer()
  tagged_tokens = pos_tag(word_tokenize(letters_only), tagset=None)

  return " ".join(
      lemmatizer.lemmatize(word, wordnet_pos_by_prefix.get(tag[:1], 'n'))
      for word, tag in tagged_tokens
  )

In [None]:
# Normalize every question once and keep the result in a new column next to
# the original text.
df['Lemmatized Text'] = df['Context'].apply(text_normalization)
df

Unnamed: 0,Context,Text Response,Lemmatized Text
0,Tell me about your personality,Just think of me as the ace up your sleeve.,tell me about your personality
1,I want to know you better,I can help you work smarter instead of harder,i want to know you good
2,Define yourself,I can help you work smarter instead of harder,define yourself
3,Describe yourself,I can help you work smarter instead of harder,describe yourself
4,tell me about yourself,I can help you work smarter instead of harder,tell me about yourself
...,...,...,...
1587,can we chat,Talking is what I do best.,can we chat
1588,I'll be back in a few minutes,I'll be waiting.,ill be back in a few minute
1589,I'll be back,All right. I'll be here.,ill be back
1590,I'll get back to you in a moment,Till next time.,ill get back to you in a moment


##### Bag of Words

In [None]:
# Learn the vocabulary from the lemmatized questions and build a dense
# count matrix: one row per question, one column per vocabulary word.
cv = CountVectorizer()
X = cv.fit_transform(df['Lemmatized Text']).toarray()

In [None]:
# Inspect the raw count matrix.
X

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Label the count matrix columns with the vocabulary words they stand for.
features = cv.get_feature_names_out()
df_bow = pd.DataFrame(X, columns = features)
df_bow

Unnamed: 0,abort,about,absolutely,abysmal,actually,adore,advice,advise,affirmative,afraid,...,yeh,yep,yes,yet,you,your,youre,yours,yourself,yup
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1587,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1588,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1589,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1590,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


##### Similarity

In [None]:
# Example user query to match against the stored questions.
Question = "Will you help me and tell me about yourself more"

In [None]:
# Remove English stop words from the question before it is normalized.
# NLTK's stop-word list is lower-case, so each token is lower-cased for the
# comparison; otherwise a capitalized stop word such as "Will" slips through.
stop = stopwords.words('english')
stop_lookup = set(stop)  # set membership avoids scanning the list per token
a = Question.split()
Q = [i for i in a if i.lower() not in stop_lookup]
# Join once, after filtering, so `b` is always defined (even for an empty
# question) and never left over from an earlier run.
b = " ".join(Q)

In [None]:
# Normalize the filtered question with the same function used on the dataset,
# then encode it with the already-fitted CountVectorizer so its columns line
# up with those of X / df_bow.
Question_lemma = text_normalization(b) # applying the function that we created for text normalizing
Question_bow = cv.transform([Question_lemma]).toarray() # applying bow

In [None]:
# Bag-of-words vector for the question: a dense array (after .toarray())
# that is mostly zeros, with counts only at the words the question contains.
Question_bow

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
# Cosine similarity between every stored question and the question above.
# pairwise_distances returns cosine *distance*, so it is subtracted from 1.
# For these non-negative count vectors the score lies between 0 and 1;
# a higher value means the two texts are more similar.
cosine_value = 1- pairwise_distances(df_bow, Question_bow, metric = 'cosine' )
(cosine_value)

array([[0.25819889],
       [0.        ],
       [0.        ],
       ...,
       [0.        ],
       [0.        ],
       [0.        ]])

In [None]:
# Store each row's similarity to the question next to its response.
df['similarity_bow'] = cosine_value

In [None]:
# Keep only the response text and its similarity score for ranking.
df_simi = pd.DataFrame(df, columns=['Text Response', 'similarity_bow'])
df_simi

Unnamed: 0,Text Response,similarity_bow
0,Just think of me as the ace up your sleeve.,0.258199
1,I can help you work smarter instead of harder,0.000000
2,I can help you work smarter instead of harder,0.000000
3,I can help you work smarter instead of harder,0.000000
4,I can help you work smarter instead of harder,0.288675
...,...,...
1587,Talking is what I do best.,0.000000
1588,I'll be waiting.,0.000000
1589,All right. I'll be here.,0.000000
1590,Till next time.,0.000000


In [None]:
# Rank the responses from most to least similar and show the top five.
df_simi_sort = df_simi.sort_values(by = 'similarity_bow', ascending = False)
df_simi_sort.head()

Unnamed: 0,Text Response,similarity_bow
211,I'm glad to help. What can I do for you?,0.57735
194,I'm glad to help. What can I do for you?,0.57735
184,I'm glad to help. What can I do for you?,0.408248
186,I'm glad to help. What can I do for you?,0.408248
200,I'm glad to help. What can I do for you?,0.408248


In [None]:
# Show only the matches scoring above 0.2.
# NOTE(review): the 0.2 cut-off looks ad hoc — confirm or name it as a constant.
df_simi_sort[df_simi_sort['similarity_bow'] > 0.2]

Unnamed: 0,Text Response,similarity_bow
211,I'm glad to help. What can I do for you?,0.57735
194,I'm glad to help. What can I do for you?,0.57735
184,I'm glad to help. What can I do for you?,0.408248
186,I'm glad to help. What can I do for you?,0.408248
200,I'm glad to help. What can I do for you?,0.408248
219,I'm glad to help. What can I do for you?,0.333333
728,It's my pleasure to help.,0.333333
188,I'm glad to help. What can I do for you?,0.333333
190,I'm glad to help. What can I do for you?,0.333333
191,I'm glad to help. What can I do for you?,0.333333


In [None]:
# Fit a TF-IDF vectorizer on the lemmatized questions and build the dense
# TF-IDF matrix (one row per question, one column per vocabulary word).
tfidf = TfidfVectorizer()
x_tfidf = tfidf.fit_transform(df['Lemmatized Text']).toarray()

In [None]:
# Label the TF-IDF matrix columns with the vocabulary words and preview it.
df_tfidf = pd.DataFrame(x_tfidf, columns=tfidf.get_feature_names_out())
df_tfidf.head()

Unnamed: 0,abort,about,absolutely,abysmal,actually,adore,advice,advise,affirmative,afraid,...,yeh,yep,yes,yet,you,your,youre,yours,yourself,yup
0,0.0,0.407572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.330555,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.218768,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.64179,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.64179,0.0
4,0.0,0.45379,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.608937,0.0


In [None]:
# defining a function that returns response to query using tf-idf
def chat_tfidf(text):
    """Return the stored response whose question best matches ``text``.

    The query is normalized with ``text_normalization``, encoded with the
    fitted ``tfidf`` vectorizer, and compared with every row of ``df_tfidf``
    by cosine similarity. The response on the best-scoring row is returned.

    Args:
        text: The user's query.

    Returns:
        The ``'Text Response'`` value from the most similar row of ``df``.
    """
    lemma = text_normalization(text)  # same normalization as the dataset
    tf = tfidf.transform([lemma]).toarray()  # query in the fitted TF-IDF space
    # pairwise_distances yields cosine distance; 1 - distance is the similarity.
    cos = 1 - pairwise_distances(df_tfidf, tf, metric='cosine')
    # argmax() is a 0-based row *position*, not an index label, so the row is
    # looked up with .iloc; this stays correct if df's index is ever not 0..n-1.
    # NOTE(review): when the query shares no word with the vocabulary the
    # scores presumably all tie and the first row wins — confirm whether a
    # fallback reply is wanted.
    index_value = cos.argmax()
    return df['Text Response'].iloc[index_value]

In [None]:
# Try a plain greeting.
chat_tfidf("Hi")

'Hey!'

In [None]:
# Try a small-talk question.
chat_tfidf("how are you?")

'Lovely, thanks.'

In [None]:
# Try a compliment.
chat_tfidf("I really like you")

'Thanks! The feeling is mutual.'

In [None]:
# Try a question about the agent itself.
chat_tfidf("Who are you?")

'I can help you work smarter instead of harder'

In [None]:
# Try a request for an action; the closest stored question can be off-topic.
chat_tfidf("play a song for me")

'Very funny, boss.'