 # **ChatBot From Scratch**

## Import necessary libraries

In [None]:
import io
import random
import string
import warnings
warnings.filterwarnings("ignore")
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
#pip install nltk

## Our workflow

*   Read the chatbot corpus (Wikipedia text about breast cancer) and lowercase it
*   Tokenization
*   Remove stop words
*   Preprocessing
    * Lemmatization
    



## **Read Corpus**

We will use a corpus taken from the Wikipedia page about breast cancer.

In [None]:
# Read the whole corpus, then lowercase it. The context manager closes the
# file handle as soon as the read finishes (a bare open() left it open for the
# lifetime of the kernel). Undecodable bytes are dropped via errors='ignore'.
with open('/content/Breast Cancer QnA.txt', 'r', errors = 'ignore') as f:
    raw = f.read()
raw = raw.lower()

In [None]:
import re

# Delete bracketed numeric citation markers such as "[12]" from the corpus
raw = re.sub(pattern=r'\[\d+\]', repl='', string=raw)


# Preview the first 500 characters of the cleaned corpus
print(raw[:500])

q1: what is breast cancer?
a1: breast cancer is a disease in which cells in the breast grow uncontrollably. it can occur in the ducts, lobules, or
other parts of the breast tissue.
q2: what are the main types of breast cancer?
a2: the main types include invasive ductal carcinoma, invasive lobular carcinoma, ductal carcinoma in situ (dcis),
and triple-negative breast cancer.
q3: what causes breast cancer?
a3: the exact cause is unknown, but factors like genetic mutations, hormonal changes, and en


In [None]:
def save_text_to_file(text, filename="output.txt"):
    """Write ``text`` to ``filename``, replacing any existing contents.

    Args:
        text: String to write.
        filename: Destination path; defaults to "output.txt".

    Returns:
        None.
    """
    with open(filename, mode="w") as out_handle:
        out_handle.write(text)

# Persist the lowercased, citation-free corpus to output.txt
save_text_to_file(raw, "output.txt")

In [None]:
!pip install nltk



In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

#-----------------------------

nltk.download('popular', quiet=True) # NLTK's "popular" bundle; presumably overlaps the packages below - TODO confirm
nltk.download('punkt', quiet=True) # Punkt tokenizer models, needed by sent_tokenize/word_tokenize
nltk.download('punkt_tab', quiet=True) # NOTE(review): appears to be required by newer NLTK releases - confirm for the installed version
nltk.download('wordnet', quiet=True) # WordNet data used by WordNetLemmatizer
nltk.download('stopwords', quiet=True) # stop-word lists read via stopwords.words('english'); this call's return value is the cell output



True

In [None]:
import nltk
# force=True asks NLTK to fetch 'punkt' again even though it was requested above.
# NOTE(review): presumably a workaround for a stale or corrupt local copy - confirm it is still needed.
nltk.download('punkt', force=True)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## **Tokenization**

In [None]:
# Split the cleaned corpus two ways: into sentences and into word tokens
sent_tokens = sent_tokenize(raw)  # list of sentence strings
word_tokens = word_tokenize(raw)  # list of word-level tokens

In [None]:
# Display the first two sentences as a quick check of the sentence tokenization
sent_tokens[:2]

In [None]:
# Display the first two word tokens as a quick check of the word tokenization
word_tokens[:2]


## **Preprocessing**

In [None]:
# Single lemmatizer instance shared by LemTokens
lemmer = WordNetLemmatizer()
# English stop words kept in a set for the membership test in LemTokens
stop_words = {word for word in stopwords.words('english')}

def LemTokens(tokens):
    """Lemmatize ``tokens``, skipping every token found in ``stop_words``.

    The stop-word check is applied to the raw token, before lemmatization.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A new list of lemmas in the original token order.
    """
    lemmas = []
    for tok in tokens:
        if tok in stop_words:
            continue
        lemmas.append(lemmer.lemmatize(tok))
    return lemmas

# Table for str.translate: each character in string.punctuation maps to None, i.e. is deleted
remove_punct_dict = {ord(symbol): None for symbol in string.punctuation}

def LemNormalize(text):
    """Normalize ``text`` into a list of lemmas.

    Steps, in order: lowercase, delete punctuation via ``remove_punct_dict``,
    word-tokenize, then drop stop words and lemmatize through ``LemTokens``.

    Args:
        text: Input string.

    Returns:
        List of lemma strings.
    """
    lowered = text.lower()
    without_punct = lowered.translate(remove_punct_dict)
    return LemTokens(word_tokenize(without_punct))


## **Keyword Matching**

In [None]:
GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "what's up","hey",)
GREETING_RESPONSES = ["hi", "hey", "*nods*", "hi there", "hello", "I am glad! You are talking to me"]
def greeting(sentence):
    """Return a random greeting if ``sentence`` contains a known greeting.

    Matching is case-insensitive and done on whole words, after stripping
    leading/trailing punctuation from each word, so "Hello!" and "hi," match.
    Multi-word entries of GREETING_INPUTS such as "what's up" are matched as
    phrases; a per-word comparison could never match them.

    Args:
        sentence: The user's message.

    Returns:
        One of GREETING_RESPONSES chosen at random, or None when no greeting
        is found.
    """
    cleaned_words = [word.strip(string.punctuation) for word in sentence.lower().split()]
    # Pad with spaces so that phrases only match on whole-word boundaries
    # (e.g. "hi" must not match inside "this").
    padded = " " + " ".join(word for word in cleaned_words if word) + " "
    for phrase in GREETING_INPUTS:
        if " " + phrase + " " in padded:
            return random.choice(GREETING_RESPONSES)
    return None

## **Word2Vec Model Training**

In [None]:
# Preprocess each corpus sentence into a list of lemmas for Word2Vec
processed_corpus = [LemNormalize(sent) for sent in sent_tokens]
# A fixed seed plus a single worker thread makes training repeatable, so the
# embeddings (and therefore the chatbot's answers) do not change between runs.
# Full reproducibility across interpreter restarts also needs PYTHONHASHSEED fixed.
word2vec_model = Word2Vec(sentences=processed_corpus, vector_size=200, window=25, min_count=1, workers=1, seed=42)

In [None]:
# Helper to get the embedding of a sentence
def get_sentence_embedding(sentence, model):
    """Embed ``sentence`` as the mean of its in-vocabulary word vectors.

    The sentence is normalized with ``LemNormalize``; words missing from
    ``model.wv`` are skipped.

    Args:
        sentence: Text to embed.
        model: Trained model exposing ``wv`` (word lookup) and ``vector_size``.

    Returns:
        A 1-D numpy array of length ``model.vector_size``. When no word of the
        sentence is in the vocabulary, a zero vector is returned.
    """
    words = LemNormalize(sentence)
    vectors = [model.wv[word] for word in words if word in model.wv]
    # Guard explicitly: np.mean over an empty list yields NaN and a
    # RuntimeWarning, which the old code only survived by type-checking the result.
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

# Compute embeddings for all sentences in the corpus (index-aligned with sent_tokens)
corpus_embeddings = [get_sentence_embedding(sent, word2vec_model) for sent in sent_tokens]


## **Generate Responses**

In [None]:
## First approach: return the single most similar paragraph


# Re-tokenize the corpus by paragraphs
para_tokens = raw.split('\n\n')  # Assuming paragraphs are separated by double newlines
# Paragraph embeddings get their own name. Rebinding `corpus_embeddings` here
# would leave it index-aligned with `para_tokens`, while the sentence-based
# response functions in this notebook use its indices to look up `sent_tokens`.
para_embeddings = [get_sentence_embedding(para, word2vec_model) for para in para_tokens]

def response(user_response):
    """Answer ``user_response`` with the most similar corpus paragraph.

    Args:
        user_response: The user's question.

    Returns:
        The paragraph from ``para_tokens`` with the highest cosine similarity
        to the question, or a fixed apology when that similarity is below 0.2.
    """
    chatbot_response = ''
    user_embedding = get_sentence_embedding(user_response, word2vec_model)

    # Calculate cosine similarity with the paragraph embeddings
    similarities = cosine_similarity([user_embedding], para_embeddings)
    idx = np.argmax(similarities)

    # Check similarity threshold
    if similarities[0][idx] < 0.2:  # Adjust this threshold if needed
        chatbot_response = "I am sorry! I don't understand you."
    else:
        chatbot_response = para_tokens[idx]  # Return the most similar paragraph

    return chatbot_response


In [None]:
## Second approach: best-matching sentence plus its immediate neighbours


def response(user_response):
    """Answer ``user_response`` with the best sentence and its neighbours.

    NOTE(review): assumes ``corpus_embeddings`` is index-aligned with
    ``sent_tokens`` - verify against the cell that last assigned it.

    Args:
        user_response: The user's question.

    Returns:
        The best-matching sentence joined with the sentence before and after
        it (where they exist), or a fixed apology when the best cosine
        similarity is below 0.2.
    """
    query_vec = get_sentence_embedding(user_response, word2vec_model)

    # Cosine similarity of the query against every corpus embedding
    scores = cosine_similarity([query_vec], corpus_embeddings)
    best = np.argmax(scores)

    # Low-confidence match: bail out early with the apology
    if scores[0][best] < 0.2:  # Adjust this threshold if needed
        return "I am sorry! I don't understand you."

    # Window of one sentence on each side, clamped to the corpus bounds
    first = max(0, best - 1)
    last = min(len(sent_tokens), best + 2)  # Adjust to include more or fewer sentences
    return " ".join(sent_tokens[first:last])


In [None]:
## Third approach
def response(user_response, top_k=3):
    """Answer ``user_response`` with up to ``top_k`` most similar sentences.

    NOTE(review): assumes ``corpus_embeddings`` is index-aligned with
    ``sent_tokens`` - verify against the cell that last assigned it.

    Args:
        user_response: The user's question.
        top_k: Maximum number of sentences to return. Values <= 0 select no
            sentence and therefore produce the apology.

    Returns:
        The selected sentences (similarity > 0.2), most similar first, joined
        by spaces; or a fixed apology when none qualifies.
    """
    user_embedding = get_sentence_embedding(user_response, word2vec_model)

    # Calculate cosine similarity with corpus embeddings
    similarities = cosine_similarity([user_embedding], corpus_embeddings)[0]

    # Rank best-first, then keep at most top_k entries. Slicing after the
    # reversal matters: the previous `[-top_k:]` form returned *every*
    # sentence for top_k=0, because `[-0:]` is the full slice.
    top_k_indices = similarities.argsort()[::-1][:max(top_k, 0)]

    # Construct the response from top-k similar sentences
    chatbot_response = " ".join([sent_tokens[idx] for idx in top_k_indices if similarities[idx] > 0.2])  # Adjust threshold if needed

    if not chatbot_response:
        chatbot_response = "I am sorry! I don't understand you."

    return chatbot_response

In [None]:
# Chat Loop
# Reads user input until the user says bye/thanks or interrupts the cell.
if __name__ == "__main__":
    flag = True
    print("Hello, there my name is Aneka. I will answer your queries. If you want to exit, type Bye!")
    while flag:
        try:
            # strip() so that "bye " or " thanks" are still recognised
            user_response = input("You: ").lower().strip()
        except (EOFError, KeyboardInterrupt):
            # Interrupting the cell (or a closed stdin) ends the chat without a traceback
            print("\nAneka: Bye! Have a great time!")
            break
        if user_response == 'bye':
            flag = False
            print("Aneka: Bye! Have a great time!")
        elif user_response in ('thanks', 'thank you'):
            flag = False
            print("Aneka: You're welcome!")
        else:
            # Call greeting() once and reuse the result instead of evaluating it twice
            greeting_reply = greeting(user_response)
            if greeting_reply is not None:
                print("Aneka:", greeting_reply)
            else:
                print("Aneka:", response(user_response))

# First
Aneka: breast cancer is a disease in which cells in the breast grow out of control. there are different types of breast cancer, and the type depends on which cells in the breast turn into cancer. breast cancer can begin in different parts of the breast, including the ducts, lobules, or in some cases, the tissue in between. while it primarily affects women, men can also develop breast cancer.


# Second
Aneka: breast cancer is a disease in which cells in the breast grow out of control. there are different types of breast cancer, and the type depends on which cells in the breast turn into cancer.



# Third
Aneka: breast cancer is a disease in which cells in the breast grow out of control. there are different types of breast cancer, and the type depends on which cells in the breast turn into cancer. breast cancer most commonly develops in cells from the lining of milk ducts and the lobules that supply these ducts with milk.


In [None]:
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import nltk
import string

# Load and preprocess document
# `raw` is the cleaned, lowercased corpus produced earlier in this notebook.
sent_tokens = nltk.sent_tokenize(raw)  # Split document into sentences

# Train Word2Vec on the document sentences
processed_corpus = [LemNormalize(sent) for sent in sent_tokens]
# A fixed seed plus a single worker thread makes training repeatable, so the
# embeddings (and therefore the chatbot's answers) do not change between runs.
# Full reproducibility across interpreter restarts also needs PYTHONHASHSEED fixed.
word2vec_model = Word2Vec(sentences=processed_corpus, vector_size=100, window=5, min_count=1, workers=1, seed=42)

# Fit a TF-IDF vectorizer with each sentence treated as one "document", then
# keep every vocabulary term's IDF value as its weight
tfidf_vectorizer = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english').fit(sent_tokens)
tfidf_weights = {
    term: idf
    for term, idf in zip(tfidf_vectorizer.get_feature_names_out(), tfidf_vectorizer.idf_)
}

# Helper to get weighted embeddings
def get_weighted_embedding(sentence, model, tfidf_weights):
    """Embed ``sentence`` as the mean of its weighted word vectors.

    Each in-vocabulary word vector is multiplied by its entry in
    ``tfidf_weights`` (1.0 when the word has no entry) before averaging.

    Args:
        sentence: Text to embed; normalized with ``LemNormalize``.
        model: Trained model exposing ``wv`` and ``vector_size``.
        tfidf_weights: Mapping from word to numeric weight.

    Returns:
        A 1-D numpy array of length ``model.vector_size``; all zeros when no
        word of the sentence is in the model vocabulary.
    """
    weighted_vectors = []
    for token in LemNormalize(sentence):
        if token in model.wv:
            weighted_vectors.append(model.wv[token] * tfidf_weights.get(token, 1.0))
    if not weighted_vectors:
        return np.zeros(model.vector_size)
    return np.mean(weighted_vectors, axis=0)

# One weighted embedding per corpus sentence, index-aligned with sent_tokens
corpus_embeddings = [
    get_weighted_embedding(sentence, word2vec_model, tfidf_weights)
    for sentence in sent_tokens
]

# Generate a response using weighted similarity
def response(user_response):
    """Answer ``user_response`` with the most similar corpus sentence.

    Uses weighted embeddings from ``get_weighted_embedding`` for both the
    question and the corpus.

    Args:
        user_response: The user's question.

    Returns:
        The sentence from ``sent_tokens`` with the highest cosine similarity,
        or a fixed apology when that similarity is below 0.3.
    """
    query_vec = get_weighted_embedding(user_response, word2vec_model, tfidf_weights)

    # Cosine similarity between the query and every corpus embedding
    scores = cosine_similarity([query_vec], corpus_embeddings)
    best = np.argmax(scores)

    # Filter out low-confidence matches
    if scores[0][best] < 0.3:  # Adjust threshold if needed
        return "I’m sorry! I don’t understand you."

    # Otherwise hand back the most similar sentence
    return sent_tokens[best]

# Example interaction loop
# Runs until the user types 'bye' or interrupts the cell.
if __name__ == "__main__":
    print("Hello! I’m here to help. Type 'bye' to exit.")
    while True:
        try:
            user_input = input("You: ")
        except (EOFError, KeyboardInterrupt):
            # Interrupting the cell (or a closed stdin) ends the chat without a traceback
            print("\nChatbot: Goodbye!")
            break
        # strip() so that "bye " with stray whitespace still exits
        if user_input.strip().lower() == 'bye':
            print("Chatbot: Goodbye!")
            break
        # Answer greetings with the notebook's greeting() helper, as the first
        # chat loop does; otherwise "hello" falls through to the retrieval
        # model and gets the "don't understand" reply.
        greeting_reply = greeting(user_input)
        if greeting_reply is not None:
            print("Chatbot:", greeting_reply)
        else:
            print("Chatbot:", response(user_input))


Hello! I’m here to help. Type 'bye' to exit.
You: hello
Chatbot: I’m sorry! I don’t understand you.
You: what are types of breast cancer?
Chatbot: there are different types of breast cancer, and the type depends on which cells in the breast turn into cancer.
You: what are they?
Chatbot: I’m sorry! I don’t understand you.
You: what is the pink ribbon?
Chatbot: pink ribbon
a pink ribbon is the most prominent symbol of breast cancer awareness.


KeyboardInterrupt: Interrupted by user