 # **ChatBot From Scratch**

## Import necessary libraries

In [1]:
import io
import random
import string
import warnings
warnings.filterwarnings("ignore")
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


## Our workflow

*   Read Chatbot corpus (Wikipedia About Breast Cancer ) + Lowering
*   Tokenization
*   Remove Stop words
*   Preprocessing (Lemmatization)
    * Lemmatization
    



## **Read Corpus**

We will use a corpus from Wikipedia page about Breast Cancer

In [2]:
f = open('/content/Breast_cancer_2nd_corpus.txt', 'r', errors = 'ignore')
raw = f.read()
raw = raw.lower()

In [3]:
import re

raw = re.sub(r'\[\d+\]', '', raw)


# Print the cleaned corpus
print(raw[:500])

breast cancer is a cancer that develops from breast tissue. signs of breast cancer may include a lump in the breast, a change in breast shape, dimpling of the skin, milk rejection, fluid coming from the nipple, a newly inverted nipple, or a red or scaly patch of skin. in those with distant spread of the disease, there may be bone pain, swollen lymph nodes, shortness of breath, or yellow skin.

risk factors for developing breast cancer include obesity, a lack of physical exercise, alcohol consump


In [4]:
def save_text_to_file(text, filename="output.txt"):
    with open(filename, "w") as file:
        file.write(text)

save_text_to_file(raw, filename="output.txt")

In [5]:
!pip install nltk



In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

#-----------------------------

nltk.download('popular', quiet=True) # for downloading packages
nltk.download('punkt_tab', quiet=True) # first-time use only
nltk.download('wordnet', quiet=True) # first-time use only
nltk.download('stopwords', quiet=True)



True

## **Tokenization**

In [7]:
sent_tokens = nltk.sent_tokenize(raw)# converts to list of sentences
word_tokens = nltk.word_tokenize(raw)# converts to list of words

In [8]:
sent_tokens[:2]

['breast cancer is a cancer that develops from breast tissue.',
 'signs of breast cancer may include a lump in the breast, a change in breast shape, dimpling of the skin, milk rejection, fluid coming from the nipple, a newly inverted nipple, or a red or scaly patch of skin.']

In [9]:
word_tokens[:2]


['breast', 'cancer']

## **Preprocessing**

In [10]:
lemmer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens if token not in stop_words]

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

def LemNormalize(text):
    return LemTokens(word_tokenize(text.lower().translate(remove_punct_dict)))


## **Keyword Matching**

In [11]:
GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "what's up","hey",)
GREETING_RESPONSES = ["hi", "hey", "hi there", "hello", "I am glad! You are talking to me"]
def greeting(sentence):

    for word in sentence.split():
        if word.lower() in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES)

## **Word2Vec Model Training**

In [12]:
# Step 1: Train Word2Vec
processed_corpus = [LemNormalize(sent) for sent in sent_tokens]  # Preprocess each sentence
word2vec_model = Word2Vec(sentences=processed_corpus, vector_size=200, window=25 , min_count=1, workers=4)

In [13]:
print(processed_corpus)

[['breast', 'cancer', 'cancer', 'develops', 'breast', 'tissue'], ['sign', 'breast', 'cancer', 'may', 'include', 'lump', 'breast', 'change', 'breast', 'shape', 'dimpling', 'skin', 'milk', 'rejection', 'fluid', 'coming', 'nipple', 'newly', 'inverted', 'nipple', 'red', 'scaly', 'patch', 'skin'], ['distant', 'spread', 'disease', 'may', 'bone', 'pain', 'swollen', 'lymph', 'node', 'shortness', 'breath', 'yellow', 'skin'], ['risk', 'factor', 'developing', 'breast', 'cancer', 'include', 'obesity', 'lack', 'physical', 'exercise', 'alcohol', 'consumption', 'hormone', 'replacement', 'therapy', 'menopause', 'ionizing', 'radiation', 'early', 'age', 'first', 'menstruation', 'child', 'late', 'life', 'older', 'age', 'prior', 'history', 'breast', 'cancer', 'family', 'history', 'breast', 'cancer'], ['five', 'ten', 'percent', 'case', 'result', 'inherited', 'genetic', 'predisposition', 'including', 'brca', 'mutation', 'among', 'others'], ['breast', 'cancer', 'commonly', 'develops', 'cell', 'lining', 'milk

In [16]:
# Helper to get the embedding of a sentence
def get_sentence_embedding(sentence, model):
    words = LemNormalize(sentence)
    embedding = np.mean([model.wv[word] for word in words if word in model.wv], axis=0)
    return embedding if isinstance(embedding, np.ndarray) else np.zeros(model.vector_size)

# Step 2: Compute embeddings for all sentences in the corpus
corpus_embeddings = [get_sentence_embedding(sent, word2vec_model) for sent in sent_tokens]


## **Generate Responses**

In [17]:
## First Approuch


# Re-tokenize the corpus by paragraphs
para_tokens = raw.split('\n\n')  # Assuming paragraphs are separated by double newlines
corpus_embeddings_1st = [get_sentence_embedding(para, word2vec_model) for para in para_tokens]

def response(user_response):
    chatbot_response = ''
    user_embedding = get_sentence_embedding(user_response, word2vec_model)

    # Calculate cosine similarity with corpus embeddings
    similarities = cosine_similarity([user_embedding], corpus_embeddings_1st)
    idx = np.argmax(similarities)

    # Check similarity threshold
    if similarities[0][idx] < 0.2:  # Adjust this threshold if needed
        chatbot_response = "I am sorry! I don't understand you."
    else:
        chatbot_response = para_tokens[idx]  # Return the most similar paragraph

    return chatbot_response


In [19]:
## Second approatch


def response(user_response):
    chatbot_response = ''
    user_embedding = get_sentence_embedding(user_response, word2vec_model)

    # Calculate cosine similarity with corpus embeddings
    similarities = cosine_similarity([user_embedding], corpus_embeddings)
    idx = np.argmax(similarities)

    # Check similarity threshold
    if similarities[0][idx] < 0.2:  # Adjust this threshold if needed
        chatbot_response = "I am sorry! I don't understand you."
    else:
        # Include the neighboring sentences for context
        start_idx = max(0, idx - 1)
        end_idx = min(len(sent_tokens), idx + 2)  # Adjust to include more or fewer sentences
        chatbot_response = " ".join(sent_tokens[start_idx:end_idx])

    return chatbot_response


In [21]:
## Third approach
def response(user_response, top_k=3):
    chatbot_response = ''
    user_embedding = get_sentence_embedding(user_response, word2vec_model)

    # Calculate cosine similarity with corpus embeddings
    similarities = cosine_similarity([user_embedding], corpus_embeddings)[0]

    # Get the indices of the top-k similar sentences
    top_k_indices = similarities.argsort()[-top_k:][::-1]

    # Construct the response from top-k similar sentences
    chatbot_response = " ".join([sent_tokens[idx] for idx in top_k_indices if similarities[idx] > 0.2])  # Adjust threshold if needed

    if not chatbot_response:
        chatbot_response = "I am sorry! I don't understand you."

    return chatbot_response

In [22]:
# Chat Loop
if __name__ == "__main__":
    flag = True
    print("Hello, there my name is docbot !. I will answer your queries. If you want to exit, type Bye!")
    while flag:
        user_response = input("You: ").lower()
        if user_response != 'bye':
            if user_response == 'thanks' or user_response == 'thank you':
                flag = False
                print("Docbot: You're welcome!")
            else:
                if greeting(user_response) is not None:
                    print("Docbot:", greeting(user_response))
                else:
                    print("Docbot:", response(user_response))
        else:
            flag = False
            print("Docbot: Bye! Have a great time!")

Hello, there my name is Aneka. I will answer your queries. If you want to exit, type Bye!
You: hey
Aneka: hi
You: hii
Aneka: I am sorry! I don't understand you.
You: what is breast cancer ?
Aneka: breast cancer is a cancer that develops from breast tissue. health disparities in breast cancer
there are ethnic disparities in the mortality rates for breast cancer as well as in breast cancer treatment. up to 5% of people with breast cancer have inflammatory breast cancer, where cancer cells block the lymph vessels of one breast, causing the breast to substantially swell and redden over three to six months.
You: what is pink ribbon ?
Aneka: pink ribbon
a pink ribbon is the most prominent symbol of breast cancer awareness. : 366–368  critics say that the feel-good nature of pink ribbons and pink consumption distracts society from the lack of progress on preventing and curing breast cancer. it has also been criticized as hypocrisy, because some people wear the pink ribbon to show good will to

KeyboardInterrupt: Interrupted by user

# First
Aneka: breast cancer is a disease in which cells in the breast grow out of control. there are different types of breast cancer, and the type depends on which cells in the breast turn into cancer. breast cancer can begin in different parts of the breast, including the ducts, lobules, or in some cases, the tissue in between. while it primarily affects women, men can also develop breast cancer.


# Second
Aneka: breast cancer is a disease in which cells in the breast grow out of control. there are different types of breast cancer, and the type depends on which cells in the breast turn into cancer.



# Third
Aneka: breast cancer is a disease in which cells in the breast grow out of control. there are different types of breast cancer, and the type depends on which cells in the breast turn into cancer. breast cancer most commonly develops in cells from the lining of milk ducts and the lobules that supply these ducts with milk.


 # Using TF/IDF

In [23]:
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import nltk
import string

# Load and preprocess document
#raw_document = """Your single document text goes here..."""  # Load your document text here
sent_tokens = nltk.sent_tokenize(raw)  # Split document into sentences or paragraphs

# Train Word2Vec on the document sentences
processed_corpus = [LemNormalize(sent) for sent in sent_tokens]
word2vec_model = Word2Vec(sentences=processed_corpus, vector_size=100, window=5, min_count=1, workers=4)

# Generate TF-IDF weights
tfidf_vectorizer = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
tfidf_vectorizer.fit(sent_tokens)  # Treats each sentence or paragraph as a "document" for weighting
tfidf_weights = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_vectorizer.idf_))

# Helper to get weighted embeddings
def get_weighted_embedding(sentence, model, tfidf_weights):
    words = LemNormalize(sentence)
    embeddings = [
        model.wv[word] * tfidf_weights.get(word, 1.0) for word in words if word in model.wv
    ]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

# Compute embeddings for each sentence/paragraph in the document
corpus_embeddings = [get_weighted_embedding(sent, word2vec_model, tfidf_weights) for sent in sent_tokens]

# Generate a response using weighted similarity
def response(user_response):
    chatbot_response = ''
    user_embedding = get_weighted_embedding(user_response, word2vec_model, tfidf_weights)

    # Calculate cosine similarity between user embedding and corpus embeddings
    similarities = cosine_similarity([user_embedding], corpus_embeddings)
    idx = np.argmax(similarities)

    # Check similarity threshold to filter low-confidence responses
    if similarities[0][idx] < 0.3:  # Adjust threshold if needed
        chatbot_response = "I’m sorry! I don’t understand you."
    else:
        chatbot_response = sent_tokens[idx]  # Return the most similar sentence/paragraph

    return chatbot_response

# Example interaction loop
if __name__ == "__main__":
    print("Hello! I’m here to help. Type 'bye' to exit.")
    while True:
        user_input = input("You: ")
        if user_input.lower() == 'bye':
            print("Chatbot: Goodbye!")
            break
        print("Chatbot:", response(user_input))


Hello! I’m here to help. Type 'bye' to exit.
You: hi
Chatbot: I’m sorry! I don’t understand you.
You: what is breast cancer ?
Chatbot: those with lobular carcinoma in situ also have an increased risk of developing breast cancer – around 1% develop breast cancer each year.
You: pink ribbon 
Chatbot: pink ribbon
a pink ribbon is the most prominent symbol of breast cancer awareness.


KeyboardInterrupt: Interrupted by user