In [None]:
import os
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.datasets import fetch_20newsgroups
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer
import nltk

# Path of the pickle file that build_index() writes and load_index() reads.
INDEX_FILE = "tfidf_index.pkl"

def build_index():
    """Build a TF-IDF index over 20 Newsgroups plus two support articles.

    Fetches the full 20 Newsgroups corpus with headers, footers and quotes
    removed, appends the hand-written support documents, fits a
    ``TfidfVectorizer`` on the combined texts and pickles the resulting
    index dictionary to ``INDEX_FILE``.
    """
    corpus = fetch_20newsgroups(
        subset='all',
        remove=('headers', 'footers', 'quotes'),
    )

    # Hand-written support articles that are indexed next to the newsgroup posts.
    support_docs = {
        "router_issues": (
            "Error: 502.B\n"
            "The blue cable is not connected properly. Try:\n"
            "- Checking if the cable is securely plugged in.\n"
            "- Restarting your router.\n"
            "- Using a different cable."
        ),
        "internet_slow": (
            "Problem: Slow Internet\n"
            "Possible solutions:\n"
            "- Check for high bandwidth usage in your home.\n"
            "- Restart your modem and router.\n"
            "- Contact your ISP if the issue persists."
        ),
    }

    # Newsgroup posts are named by position; support articles keep their keys.
    names = [f"Doc {pos}" for pos, _ in enumerate(corpus.data)]
    names += list(support_docs)
    texts = [*corpus.data, *support_docs.values()]

    tfidf = TfidfVectorizer()
    matrix = tfidf.fit_transform(texts)

    # Insertion order of these keys is kept stable for readers of the pickle.
    payload = {
        "vectorizer": tfidf,
        "tfidf_matrix": matrix,
        "doc_names": names,
        "documents": dict(zip(names, texts)),
    }

    with open(INDEX_FILE, "wb") as out:
        pickle.dump(payload, out)
    print("TF-IDF index built and saved.")

def load_index():
    """Return the pickled TF-IDF index, building it first if the file is absent.

    Returns:
        The object unpickled from ``INDEX_FILE``.
    """
    index_missing = not os.path.exists(INDEX_FILE)
    if index_missing:
        print("No index found. Building new index...")
        build_index()

    with open(INDEX_FILE, "rb") as handle:
        loaded = pickle.load(handle)
    return loaded

def query_chatbot(query):
    """Find the indexed document most similar to *query* using TF-IDF.

    The index is loaded from disk via ``load_index()`` on every call. The
    query is vectorized with the stored vectorizer and compared against all
    documents by cosine similarity; the highest-scoring document wins.

    Args:
        query: The user's question as a plain string.

    Returns:
        A string of the form ``"Relevant Document: <name>\\n<text>..."``
        where ``<text>`` is the first 500 characters of the best match.
    """
    index = load_index()
    # Look entries up by key: unpacking ``index.values()`` positionally would
    # silently bind the wrong objects if the index dict gained or reordered keys.
    vectorizer = index["vectorizer"]
    tfidf_matrix = index["tfidf_matrix"]
    doc_names = index["doc_names"]
    documents = index["documents"]

    query_vector = vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    best_match_idx = np.argmax(similarities)

    best_doc_name = doc_names[best_match_idx]
    best_doc_text = documents[best_doc_name]

    return f"Relevant Document: {best_doc_name}\n{best_doc_text[:500]}..."

In [None]:
# Single-turn driver: read one question and print the best-matching document.
print("Welcome to the Support Chatbot!")
user_input = input("You: ")
# Example query:
# "My router is not working, it is giving me error code 502.B, what can i do?"
if user_input.strip().lower() in ("exit", "quit"):
    # Leave without querying; previously the lookup still ran after "Goodbye!".
    print("Goodbye!")
else:
    response = query_chatbot(user_input)
    print("Chatbot:", response)

Welcome to the Support Chatbot!
Chatbot: Relevant Document: Doc 7736


Your machine will run at whatever the bus is jumpered to/CMOS is set to
(usually wait states) regardless of what speed RAM is installed.  No
motherboard can sense the speed of the RAM installed, unless you call
failing as a sort of auto-sense.  This is how you can sometimes use
"slower" RAM in a machine.  You either set the number of wait states to
accomodate the slow RAM (in which case, all memory will run at that
slower rate) or you reduce the wait states and take the chance that the
slower ...


In [None]:
# Download the NLTK 'punkt' and 'punkt_tab' resources before tokenizing.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

# Path of the pickle file that holds the BM25 index for this cell.
INDEX_FILE = "bm25_index.pkl"

def build_index():
    """Build a BM25 index over 20 Newsgroups plus two support articles.

    Fetches the full 20 Newsgroups corpus with headers, footers and quotes
    removed, appends the hand-written support documents, tokenizes each
    lower-cased text with ``word_tokenize``, builds a ``BM25Okapi`` model
    and pickles the resulting index dictionary to ``INDEX_FILE``.
    """
    corpus = fetch_20newsgroups(
        subset='all',
        remove=('headers', 'footers', 'quotes'),
    )

    # Hand-written support articles that are indexed next to the newsgroup posts.
    support_docs = {
        "router_issues": (
            "Error: 502.B\n"
            "The blue cable is not connected properly. Try:\n"
            "- Checking if the cable is securely plugged in.\n"
            "- Restarting your router.\n"
            "- Using a different cable."
        ),
        "internet_slow": (
            "Problem: Slow Internet\n"
            "Possible solutions:\n"
            "- Check for high bandwidth usage in your home.\n"
            "- Restart your modem and router.\n"
            "- Contact your ISP if the issue persists."
        ),
    }

    # Newsgroup posts are named by position; support articles keep their keys.
    names = [f"Doc {pos}" for pos, _ in enumerate(corpus.data)]
    names += list(support_docs)
    texts = [*corpus.data, *support_docs.values()]

    token_lists = []
    for text in texts:
        token_lists.append(word_tokenize(text.lower()))
    ranker = BM25Okapi(token_lists)

    # Insertion order of these keys is kept stable for readers of the pickle.
    payload = {
        "bm25": ranker,
        "doc_names": names,
        "tokenized_docs": token_lists,
        "documents": dict(zip(names, texts)),
    }

    with open(INDEX_FILE, "wb") as out:
        pickle.dump(payload, out)
    print("BM25 index built and saved.")

def load_index():
    """Return the pickled BM25 index, building it first if the file is absent.

    Returns:
        The object unpickled from ``INDEX_FILE``.
    """
    index_missing = not os.path.exists(INDEX_FILE)
    if index_missing:
        print("No index found. Building new index...")
        build_index()

    with open(INDEX_FILE, "rb") as handle:
        loaded = pickle.load(handle)
    return loaded

def query_chatbot(query):
    """Find the indexed document that scores highest for *query* under BM25.

    The index is loaded from disk via ``load_index()`` on every call. The
    query is lower-cased and tokenized with ``word_tokenize`` (the same
    preprocessing ``build_index`` applies to documents) before scoring.

    Args:
        query: The user's question as a plain string.

    Returns:
        A string of the form ``"Relevant Document: <name>\\n<text>..."``
        where ``<text>`` is the first 500 characters of the best match.
    """
    index = load_index()
    # Look entries up by key: unpacking ``index.values()`` positionally would
    # silently bind the wrong objects if the index dict gained or reordered keys.
    bm25 = index["bm25"]
    doc_names = index["doc_names"]
    documents = index["documents"]

    query_tokens = word_tokenize(query.lower())
    scores = bm25.get_scores(query_tokens)
    best_match_idx = np.argmax(scores)

    best_doc_name = doc_names[best_match_idx]
    best_doc_text = documents[best_doc_name]

    return f"Relevant Document: {best_doc_name}\n{best_doc_text[:500]}..."

# Single-turn driver: read one question and print the best-matching document.
user_input = input("You: ")
if user_input.strip().lower() in ("exit", "quit"):
    # Leave without querying; previously the lookup still ran after "Goodbye!".
    print("Goodbye!")
else:
    response = query_chatbot(user_input)
    print("Chatbot:", response)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bartw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\bartw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Chatbot: Relevant Document: Doc 8492
I am wanting to upgrade from a 386SX-25, to a 486DX-33, and are looking at a
cheap quote from someone offering me a 486DX-33 motherboard, with no ram in
it.  (I will probably sell my old m-board off somewhere)
Now, I have 4 meg of RAM in my 386, which consists of

4 x 9 module 1024KB simms, running at 70 nanoseconds.
    ^^^^^^^^                          ^^^^^^^^^^^^^^
Would I encounter problems with the pointed out areas, by throwing these from
one computer to the other?...


In [None]:
# WordPiece tokenizer used by tokenize_text() for both documents and queries.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Use a dedicated file for the BERT-tokenized index. Reusing "bm25_index.pkl"
# would make load_index() pick up an index built with a different tokenizer
# and skip rebuilding, so queries would be scored against mismatched tokens.
INDEX_FILE = "bm25_bert_index.pkl"

def tokenize_text(text):
    """Lower-case *text* and split it with the module-level BERT tokenizer.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``tokenizer.tokenize`` returns for the lower-cased text.
    """
    lowered = text.lower()
    return tokenizer.tokenize(lowered)

def build_index():
    """Build a BM25 index over BERT-tokenized 20 Newsgroups and support docs.

    Fetches the full 20 Newsgroups corpus with headers, footers and quotes
    removed, appends the hand-written support documents, tokenizes every
    text with ``tokenize_text``, builds a ``BM25Okapi`` model and pickles
    the resulting index dictionary to ``INDEX_FILE``.
    """
    corpus = fetch_20newsgroups(
        subset='all',
        remove=('headers', 'footers', 'quotes'),
    )

    # Hand-written support articles that are indexed next to the newsgroup posts.
    support_docs = {
        "router_issues": (
            "Error: 502.B\n"
            "The blue cable is not connected properly. Try:\n"
            "- Checking if the cable is securely plugged in.\n"
            "- Restarting your router.\n"
            "- Using a different cable."
        ),
        "internet_slow": (
            "Problem: Slow Internet\n"
            "Possible solutions:\n"
            "- Check for high bandwidth usage in your home.\n"
            "- Restart your modem and router.\n"
            "- Contact your ISP if the issue persists."
        ),
    }

    # Newsgroup posts are named by position; support articles keep their keys.
    names = [f"Doc {pos}" for pos, _ in enumerate(corpus.data)]
    names += list(support_docs)
    texts = [*corpus.data, *support_docs.values()]

    # Documents go through the same tokenizer that queries use.
    token_lists = list(map(tokenize_text, texts))
    ranker = BM25Okapi(token_lists)

    # Insertion order of these keys is kept stable for readers of the pickle.
    payload = {
        "bm25": ranker,
        "doc_names": names,
        "tokenized_docs": token_lists,
        "documents": dict(zip(names, texts)),
    }

    with open(INDEX_FILE, "wb") as out:
        pickle.dump(payload, out)
    print("BM25 index built and saved with BERT tokenization.")

def load_index():
    """Return the pickled BM25 index, building it first if the file is absent.

    Returns:
        The object unpickled from ``INDEX_FILE``.
    """
    index_missing = not os.path.exists(INDEX_FILE)
    if index_missing:
        print("No index found. Building new index...")
        build_index()

    with open(INDEX_FILE, "rb") as handle:
        loaded = pickle.load(handle)
    return loaded

def query_chatbot(query):
    """Find the indexed document that scores highest for *query* under BM25.

    The index is loaded from disk via ``load_index()`` on every call. The
    query is tokenized with ``tokenize_text``, the same function
    ``build_index`` applies to documents, before scoring.

    Args:
        query: The user's question as a plain string.

    Returns:
        A string of the form ``"Relevant Document: <name>\\n<text>..."``
        where ``<text>`` is the first 500 characters of the best match.
    """
    index = load_index()
    # Look entries up by key: unpacking ``index.values()`` positionally would
    # silently bind the wrong objects if the index dict gained or reordered keys.
    bm25 = index["bm25"]
    doc_names = index["doc_names"]
    documents = index["documents"]

    query_tokens = tokenize_text(query)
    scores = bm25.get_scores(query_tokens)
    best_match_idx = np.argmax(scores)

    best_doc_name = doc_names[best_match_idx]
    best_doc_text = documents[best_doc_name]

    return f"Relevant Document: {best_doc_name}\n{best_doc_text[:500]}..."


# Single-turn driver: read one question and print the best-matching document.
print("Welcome to the Support Chatbot!")
user_input = input("You: ")
if user_input.strip().lower() in ("exit", "quit"):
    # Leave without querying; previously the lookup still ran after "Goodbye!".
    print("Goodbye!")
else:
    response = query_chatbot(user_input)
    print("Chatbot:", response)


Welcome to the Support Chatbot!
Chatbot: Relevant Document: Doc 8492
I am wanting to upgrade from a 386SX-25, to a 486DX-33, and are looking at a
cheap quote from someone offering me a 486DX-33 motherboard, with no ram in
it.  (I will probably sell my old m-board off somewhere)
Now, I have 4 meg of RAM in my 386, which consists of

4 x 9 module 1024KB simms, running at 70 nanoseconds.
    ^^^^^^^^                          ^^^^^^^^^^^^^^
Would I encounter problems with the pointed out areas, by throwing these from
one computer to the other?...
