In [None]:

!pip install langchain requests flask flask-restful faiss-cpu transformers unstructured torch sentence-transformers

Collecting langchain
  Downloading langchain-0.2.6-py3-none-any.whl (975 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m975.5/975.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting flask-restful
  Downloading Flask_RESTful-0.3.10-py2.py3-none-any.whl (26 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
Collecting unstructured
  Downloading unstructured-0.14.9-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting flask-ngrok
  Downloa

# Scraper

In [None]:
!pip install langchain-community
from langchain.document_loaders import UnstructuredURLLoader

def load_course_data(urls=None):
    """Scrape course pages and return them as loaded documents.

    Args:
        urls: Optional URL or iterable of URLs to scrape. When omitted, the
            Brainlox technical-courses listing page is used, which matches
            the previous hard-coded behaviour.

    Returns:
        Whatever ``UnstructuredURLLoader(...).load()`` returns for the URLs.
    """
    if urls is None:
        urls = ["https://brainlox.com/courses/category/technical"]
    elif isinstance(urls, str):
        # A bare string would otherwise be iterated character by character.
        urls = [urls]
    else:
        urls = list(urls)

    loader = UnstructuredURLLoader(urls=urls)
    return loader.load()

Collecting langchain-community
  Downloading langchain_community-0.2.6-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: langchain-community
Successfully installed langchain-community-0.2.6


In [None]:
def dump_documents_to_file(documents, filename="scraped_data.txt"):
    """Write a human-readable dump of ``documents`` to ``filename``.

    Each document is written as a numbered entry (starting at 1) with its
    ``metadata['source']`` (or ``Unknown`` when absent), its ``page_content``
    and a 50-character ``=`` divider. The file is overwritten and encoded as
    UTF-8. A confirmation line is printed when done.
    """
    divider = "=" * 50
    with open(filename, "w", encoding="utf-8") as out:
        for number, document in enumerate(documents, start=1):
            origin = document.metadata.get('source', 'Unknown')
            out.write(
                f"Document {number}:\n"
                f"Source: {origin}\n"
                f"Content:\n{document.page_content}\n"
                f"\n{divider}\n\n"
            )

    print(f"Scraped data has been written to {filename}")

In [None]:
def verify_scraping():
    """Scrape the course data, dump it to the default file and report the count."""
    scraped = load_course_data()
    dump_documents_to_file(scraped)
    total = len(scraped)
    print(f"Number of documents scraped: {total}")

In [None]:
# Run the scraping smoke test only when this code executes as the main module.
# The recorded cell output shows it ran here and reported one scraped document.
if __name__ == "__main__":
    verify_scraping()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Scraped data has been written to scraped_data.txt
Number of documents scraped: 1


# Embedding

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter

def create_vector_store():
    """Scrape the course pages, chunk them and index the chunks in FAISS.

    The scraped documents are split with ``CharacterTextSplitter``
    (``chunk_size=500``, ``chunk_overlap=50``), the chunks are dumped to
    ``split_data_for_vectorstore.txt`` for inspection, and the same chunks
    are embedded with ``HuggingFaceEmbeddings`` and stored in FAISS.

    Returns:
        The vector store produced by ``FAISS.from_documents``.

    Raises:
        ValueError: If scraping/splitting produced no chunks to index.
    """
    documents = load_course_data()
    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    split_docs = text_splitter.split_documents(documents)
    dump_documents_to_file(split_docs, "split_data_for_vectorstore.txt")

    if not split_docs:
        raise ValueError("No document chunks to index; scraping returned no content")

    embeddings = HuggingFaceEmbeddings()
    # Index the chunks, not the unsplit pages: indexing `documents` made the
    # splitter a no-op and put whole pages into the retriever.
    vector_store = FAISS.from_documents(split_docs, embeddings)
    return vector_store

# App

In [None]:
from flask import Flask, request, jsonify, render_template_string
from flask_restful import Api, Resource
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

In [None]:
# Hugging Face checkpoint used for both the tokenizer and the seq2seq model.
model_name = "google/flan-t5-base"
# Tuple elements evaluate left to right: tokenizer first, then the model weights.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForSeq2SeqLM.from_pretrained(model_name),
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
from langchain.llms import HuggingFacePipeline
# Text-to-text generation pipeline around the already-loaded model/tokenizer,
# sampling at temperature 0.7 with outputs capped at max_length=512.
pipe = pipeline(
    task="text2text-generation",
    tokenizer=tokenizer,
    model=model,
    do_sample=True,
    temperature=0.7,
    max_length=512,
)

# Wrap the transformers pipeline so LangChain chains can use it as an LLM.
llm = HuggingFacePipeline(pipeline=pipe)

In [None]:

# Build the FAISS index, then wire it to the LLM as a conversational
# retrieval chain that also returns the documents it retrieved.
vector_store = create_vector_store()
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=vector_store.as_retriever(),
    return_source_documents=True,
)

Scraped data has been written to split_data_for_vectorstore.txt




In [None]:
class ChatbotAPI(Resource):
    """REST resource that answers chat queries through ``qa_chain``."""

    def post(self):
        """Answer one chat turn.

        Expects a JSON object with:
            query: the user's question (required, non-empty).
            chat_history: optional list of ``[question, answer]`` pairs.

        Returns:
            On success, a JSON response with ``answer`` and
            ``source_documents`` (the page content of each retrieved
            document). On invalid input, ``({"error": ...}, 400)``.
        """
        # silent=True yields None for a missing or malformed JSON body instead
        # of aborting, so the caller gets the explicit error below.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}

        query = data.get('query')
        # `or []` also covers an explicit JSON null.
        chat_history = data.get('chat_history') or []

        if not query:
            # Return a plain dict: flask-restful serialises the payload of a
            # (data, status) tuple itself and cannot JSON-encode the Response
            # object that jsonify() produces.
            return {"error": "Missing 'query' in request"}, 400

        try:
            formatted_history = [(h[0], h[1]) for h in chat_history]
        except (TypeError, IndexError, KeyError):
            return {"error": "'chat_history' must be a list of [question, answer] pairs"}, 400

        result = qa_chain.invoke({"question": query, "chat_history": formatted_history})

        response = {
            "answer": result['answer'],
            "source_documents": [doc.page_content for doc in result['source_documents']]
        }

        # A bare Response object (no status tuple) is returned as-is.
        return jsonify(response)


In [None]:
#from google.colab.output import eval_js
#print(eval_js("google.colab.kernel.proxyPort(5000)"))

https://bbgpigmfwd-496ff2e9c6d22116-5000-colab.googleusercontent.com/


In [None]:
# Single-page chat UI. It is rendered with render_template_string, so Jinja
# delimiters inside this string would be interpreted as template syntax.
# Messages are inserted as text nodes (never parsed as HTML), and non-2xx
# responses from /chat are treated as errors.
HTML_TEMPLATE = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Brainlox Technical Courses Chatbot</title>
    <style>
        body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }
        #chatbox { height: 300px; border: 1px solid #ddd; overflow-y: scroll; padding: 10px; margin-bottom: 20px; }
        #userInput { width: 70%; padding: 5px; }
        #sendButton { padding: 5px 15px; }
    </style>
</head>
<body>
    <h1>Brainlox Technical Courses Chatbot</h1>
    <div id="chatbox"></div>
    <input type="text" id="userInput" placeholder="Ask about Brainlox technical courses...">
    <button id="sendButton">Send</button>

    <script>
        const chatbox = document.getElementById('chatbox');
        const userInput = document.getElementById('userInput');
        const sendButton = document.getElementById('sendButton');
        let chatHistory = [];

        sendButton.addEventListener('click', sendMessage);
        userInput.addEventListener('keypress', function(e) {
            if (e.key === 'Enter') {
                sendMessage();
            }
        });

        function sendMessage() {
            const message = userInput.value.trim();
            if (message) {
                appendMessage('User', message);
                fetchResponse(message);
                userInput.value = '';
            }
        }

        function appendMessage(sender, message) {
            // Build the nodes from text so that neither user input nor model
            // output is ever parsed as markup.
            const messageElement = document.createElement('p');
            const senderElement = document.createElement('strong');
            senderElement.textContent = sender + ':';
            messageElement.appendChild(senderElement);
            messageElement.appendChild(document.createTextNode(' ' + message));
            chatbox.appendChild(messageElement);
            chatbox.scrollTop = chatbox.scrollHeight;
        }

        function fetchResponse(query) {
            fetch('/chat', {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json',
                },
                body: JSON.stringify({
                    query: query,
                    chat_history: chatHistory
                }),
            })
            .then(response => {
                // Error payloads carry no answer; route them to the catch handler
                // instead of showing "undefined" and storing it in the history.
                if (!response.ok) {
                    throw new Error('Request failed with status ' + response.status);
                }
                return response.json();
            })
            .then(data => {
                appendMessage('Bot', data.answer);
                chatHistory.push([query, data.answer]);
            })
            .catch((error) => {
                console.error('Error:', error);
                appendMessage('Bot', 'Sorry, I encountered an error.');
            });
        }
    </script>
</body>
</html>
'''

In [None]:
#from flask_ngrok import run_with_ngrok
# Flask application plus the flask-restful API layer bound to it.
app = Flask(__name__)
api = Api(app=app)

# POST /chat is handled by the ChatbotAPI resource.
api.add_resource(ChatbotAPI, '/chat')


@app.route('/')
def index():
    """Serve the chat page rendered from ``HTML_TEMPLATE``."""
    page = render_template_string(HTML_TEMPLATE)
    return page


In [None]:
#!pip install pyngrok
#import os
#from pyngrok import ngrok
#ngrok.set_auth_token("k")

Collecting pyngrok
  Downloading pyngrok-7.1.6-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.1.6


In [None]:
from google.colab import output
if __name__ == '__main__':
    # Ask Colab to serve kernel port 5000 in a window before starting Flask;
    # this has to come first because the recorded log shows app.run() then
    # keeps serving requests in this cell.
    output.serve_kernel_port_as_window(5000)
    # No port is passed here; the recorded log shows the server listening on
    # http://127.0.0.1:5000, which matches the port exposed above.
    app.run()

<IPython.core.display.Javascript object>

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [02/Jul/2024 13:51:38] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [02/Jul/2024 13:51:38] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [02/Jul/2024 13:52:29] "POST /chat HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [02/Jul/2024 13:53:35] "POST /chat HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [02/Jul/2024 13:54:49] "POST /chat HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [02/Jul/2024 13:55:56] "POST /chat HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [02/Jul/2024 13:57:26] "POST /chat HTTP/1.1" 200 -
