In [None]:
import pandas as pd
import nltk 
import re
import numpy as np
from tkinter import *

from nltk.stem import wordnet                                  # to perform lemmitization
from sklearn.feature_extraction.text import CountVectorizer    # to perform bow
from sklearn.feature_extraction.text import TfidfVectorizer    # to perform tfidf
from nltk import pos_tag                                       # for parts of speech
from sklearn.metrics import pairwise_distances                 # to perfrom cosine similarity
from nltk import word_tokenize                                 # to create tokens
from nltk.corpus import stopwords                              # for stop words

In [None]:
df = pd.read_csv(
    "MHData.csv",
    nrows=20,  # only the first 20 rows of the file are read
)
df.head()  # preview the loaded frame

In [None]:
df.isna().sum()  # number of missing values in each column

In [None]:
nltk.download('punkt')  # tokenizer data required by word_tokenize

s = 'please tell me about your personality'
words = word_tokenize(s)  # break the sample sentence into word tokens
print(words)

In [None]:
# Fetch the corpora the WordNet lemmatizer reads from.
for resource in ('wordnet', 'omw-1.4'):
    nltk.download(resource)

lemma = wordnet.WordNetLemmatizer()
lemma.lemmatize('absorbed', pos='v')  # lemmatize a sample word as a verb

In [None]:
 nltk.download('averaged_perceptron_tagger')     
pos_tag(nltk.word_tokenize(s),tagset = None)       # returns the parts of speech of every word

In [None]:
 nltk.download('stopwords')           

stop = stopwords.words('english')
print(stop)

In [None]:
# NOTE(review): 'wordnet' is also downloaded in an earlier cell of this notebook; this repeat call looks redundant.
nltk.download('wordnet')

In [None]:
# function that performs text normalization steps and returns the lemmatized tokens as a sentence

def text_normalization(text):
    """Normalize raw text into a space-joined string of lemmatized content words.

    Steps: lower-case, drop every character other than ``a-z`` and space,
    tokenize, POS-tag, lemmatize each token with the WordNet part of speech
    matching its tag, and discard English stop words.

    Parameters
    ----------
    text : object
        Input text. It is passed through ``str()`` first, so non-string
        values are accepted.

    Returns
    -------
    str
        The kept lemmas joined by single spaces; an empty string when no
        token survives the filtering.
    """
    text = str(text).lower()
    spl_char_text = re.sub(r'[^ a-z]', '', text)    # strip digits, punctuation and other symbols
    tokens = nltk.word_tokenize(spl_char_text)
    lema = wordnet.WordNetLemmatizer()
    tags_list = pos_tag(tokens, tagset=None)
    stop_words = set(stop)                          # `stop` is the module-level English stop-word list

    lema_words = []
    for token, pos_token in tags_list:
        # Map the tagger's tag prefix onto the POS letter the lemmatizer expects.
        if pos_token.startswith('V'):               # verb
            pos_val = 'v'
        elif pos_token.startswith('J'):             # adjective
            pos_val = 'a'
        elif pos_token.startswith('R'):             # adverb
            pos_val = 'r'
        else:                                       # everything else is treated as a noun
            pos_val = 'n'
        lema_token = lema.lemmatize(token, pos_val)

        # Keep content words only. The earlier `in stop` test was inverted:
        # it kept the stop words and threw every meaningful token away.
        if lema_token not in stop_words:
            lema_words.append(lema_token)

    return " ".join(lema_words)

In [None]:
# NOTE(review): 'omw-1.4' is also downloaded in an earlier cell of this notebook; this repeat call looks redundant.
nltk.download('omw-1.4')

In [None]:
# Example: run the normalizer on a sample sentence and display the returned string.
text_normalization('telling you some stuffs about me')  # example

In [None]:
# Normalize every stored question and keep the result in a new column.
df['lemmatized_text'] = df['Questions'].map(text_normalization)
df.head()

In [None]:
cv = CountVectorizer()
# Learn the vocabulary from the normalized questions and count its terms per question.
bow_sparse = cv.fit_transform(df['lemmatized_text'])
X = bow_sparse.toarray()  # dense document-term count matrix

In [None]:
# returns all the unique word from data 

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version provides it and fall back to the old name otherwise.
features = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
df_bow = pd.DataFrame(X, columns=features)  # one row per question, one column per vocabulary term
df_bow.head()

In [None]:
# Sample user query used by the bag-of-words cells that follow.
Question = 'What treatment options are available'                           # example

In [None]:
# Normalize the sample query with the same function that was applied to df['Questions'].
Question_lemma = text_normalization(Question)                               # clean text
# Vectorize it with the already-fitted CountVectorizer (transform only, no refit).
Question_bow = cv.transform([Question_lemma]).toarray()                     # applying bow

In [None]:
# cosine similarity for the above question we considered.

# Cosine distance between every stored question and the query, turned into a similarity.
bow_distances = pairwise_distances(df_bow, Question_bow, metric='cosine')
cosine_value = 1 - bow_distances
cosine_value

In [None]:
# Store the bag-of-words similarity scores on df as column 'similarity_bow' (adds or overwrites that column).
df['similarity_bow'] = cosine_value                                         # create cosine value as a new column

In [None]:
# New frame holding only the answers and their bag-of-words similarity to the query.
simiscores = df.reindex(columns=['Answers', 'similarity_bow'])
simiscores

In [None]:
# Rank the answers from most to least similar.
simscoresDescending = simiscores.sort_values('similarity_bow', ascending=False)
simscoresDescending.head(5)

In [None]:
threshold = 0.1  # keep only rows whose similarity is strictly greater than this
above_threshold = simscoresDescending['similarity_bow'].gt(threshold)
df_threshold = simscoresDescending.loc[above_threshold]
df_threshold

In [None]:
index_value = np.argmax(cosine_value)  # position of the highest bag-of-words similarity
index_value

In [None]:
df.loc[index_value, 'Answers']  # answer stored at the best-matching row

In [None]:
# Sample user query for the tf-idf cells that follow (same text as `Question`).
Question1 = 'What treatment options are available'

In [None]:
# using tf-idf

tfidf = TfidfVectorizer()
# Learn the vocabulary and idf weights from the normalized questions.
tfidf_sparse = tfidf.fit_transform(df['lemmatized_text'])
x_tfidf = tfidf_sparse.toarray()  # dense document-term tf-idf matrix

In [None]:
# Normalize the sample query with the same function that was applied to df['Questions'].
Question_lemma1 = text_normalization(Question1)
# Vectorize it with the already-fitted TfidfVectorizer (transform only, no refit).
Question_tfidf = tfidf.transform([Question_lemma1]).toarray()         # applying tf-idf

In [None]:
# returns all the unique word from data with a score of that word

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version provides it and fall back to the old name otherwise.
tfidf_features = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)
df_tfidf = pd.DataFrame(x_tfidf, columns=tfidf_features)  # one row per question, one column per term
df_tfidf.head()

In [None]:
# Cosine distance between every stored question and the query, turned into a similarity.
tfidf_distances = pairwise_distances(df_tfidf, Question_tfidf, metric='cosine')
cos = 1 - tfidf_distances
cos

In [None]:
# Store the tf-idf similarity scores on df as a new column.
df['similarity_tfidf'] = cos
# New frame holding only the answers and their tf-idf similarity to the query.
df_simi_tfidf = df.reindex(columns=['Answers', 'similarity_tfidf'])
df_simi_tfidf

In [None]:
# Rank the answers from most to least similar.
df_simi_tfidf_sort = df_simi_tfidf.sort_values('similarity_tfidf', ascending=False)
df_simi_tfidf_sort.head(10)

In [None]:
threshold = 0.1  # keep only rows whose similarity is strictly greater than this
above_threshold = df_simi_tfidf_sort['similarity_tfidf'].gt(threshold)
df_threshold = df_simi_tfidf_sort.loc[above_threshold]
df_threshold

In [None]:
# NOTE(review): this cell applies the same tf-idf threshold filter as the cell before it and looks redundant.
threshold = 0.1                                                                                   # considering the value of smiliarity to be greater than 0.1
df_threshold = df_simi_tfidf_sort[df_simi_tfidf_sort['similarity_tfidf'] > threshold] 
df_threshold

In [None]:
# Answer at the best tf-idf match. The previous version reused `index_value`, which was
# computed from the bag-of-words scores, so it did not reflect the tf-idf ranking.
# argmax yields a row position, hence iloc.
df['Answers'].iloc[cos.argmax()]

In [None]:
# defining a function that returns response to query using bow

def chat_bow(text):
    """Return the stored answer whose question best matches ``text`` under bag-of-words.

    The query is normalized with ``text_normalization``, vectorized with the
    fitted ``cv``, and compared with every row of ``df_bow`` by cosine
    similarity. The answer from the highest-scoring row of ``df`` is returned.

    Parameters
    ----------
    text : str
        The user's query.

    Returns
    -------
    The value of ``df['Answers']`` at the best-matching row.
    """
    lemma = text_normalization(text)
    bow = cv.transform([lemma]).toarray()
    cosine_value = 1 - pairwise_distances(df_bow, bow, metric='cosine')
    # NOTE(review): argmax returns the first maximum, so when all scores tie
    # (presumably a query sharing no vocabulary with the data) row 0 is returned.
    index_value = cosine_value.argmax()
    # argmax is a row position, not an index label, so look it up with iloc.
    return df['Answers'].iloc[index_value]

In [None]:
# Sample query answered through the bag-of-words matcher.
chat_bow('can you prevent mental health problems')

In [None]:
# Sample query answered through the bag-of-words matcher.
chat_bow('what is mental health')

In [None]:
# Sample query answered through the bag-of-words matcher.
chat_bow('are there cures for mental health problems')

In [None]:
# Sample query answered through the bag-of-words matcher.
chat_bow('how do I know if i am unwell')

In [None]:
# Sample query answered through the bag-of-words matcher.
chat_bow('what do you mean by mental health')

In [None]:
# defining a function that returns response to query using tf-idf

def chatbot(text):
    """Return the stored answer whose question best matches ``text`` under tf-idf.

    The query is normalized with ``text_normalization``, vectorized with the
    fitted ``tfidf``, and compared with every row of ``df_tfidf`` by cosine
    similarity. The answer from the highest-scoring row of ``df`` is returned.

    Parameters
    ----------
    text : str
        The user's query.

    Returns
    -------
    The value of ``df['Answers']`` at the best-matching row.
    """
    lemma = text_normalization(text)
    tf = tfidf.transform([lemma]).toarray()
    cos = 1 - pairwise_distances(df_tfidf, tf, metric='cosine')
    # NOTE(review): argmax returns the first maximum, so when all scores tie
    # (presumably a query sharing no vocabulary with the data) row 0 is returned.
    index_value = cos.argmax()
    # argmax is a row position, not an index label, so look it up with iloc.
    return df['Answers'].iloc[index_value]

In [None]:
# Sample query answered through the tf-idf matcher.
chatbot('i am feeling sad')

In [None]:
# Sample query answered through the tf-idf matcher.
chatbot('how to know if i am depresed')

In [None]:
# NOTE(review): a cell displays only its last expression, so the result of this first call is not shown.
chatbot('i am feeling unwell')


# Same query as the preceding cell.
chatbot('how to know if i am depresed')