In [4]:
pip install flask flask-cors

Collecting flask
  Downloading flask-3.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting flask-cors
  Downloading flask_cors-5.0.1-py3-none-any.whl.metadata (961 bytes)
Collecting itsdangerous>=2.2 (from flask)
  Downloading itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Collecting blinker>=1.9 (from flask)
  Downloading blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Downloading flask-3.1.0-py3-none-any.whl (102 kB)
Downloading flask_cors-5.0.1-py3-none-any.whl (11 kB)
Downloading blinker-1.9.0-py3-none-any.whl (8.5 kB)
Downloading itsdangerous-2.2.0-py3-none-any.whl (16 kB)
Installing collected packages: itsdangerous, blinker, flask, flask-cors
Successfully installed blinker-1.9.0 flask-3.1.0 flask-cors-5.0.1 itsdangerous-2.2.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import re
import json
import os
import faiss
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel

# Artifacts produced by preprocessing/indexing and read back by the search step
DATA_FILE = "cleaned_data.json"
FAISS_INDEX_FILE = "index.faiss"

# Tokenizer and encoder are loaded from the same checkpoint
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

def clean_text(text):
    """Normalize text for indexing and querying.

    Removes every character that is neither a word character nor whitespace,
    collapses whitespace runs into single spaces, trims both ends and
    lower-cases the result.
    """
    without_punctuation = re.sub(r"[^\w\s]", "", text)
    single_spaced = re.sub(r"\s+", " ", without_punctuation)
    return single_spaced.strip().lower()

def get_embedding(text):
    """Embed text with the module-level tokenizer and model.

    Token embeddings are averaged using the attention mask, so padded
    positions do not dilute the result when several texts are encoded in one
    call. For a single text the mask is all ones and this is a plain mean
    over tokens.

    Args:
        text: A string (or list of strings) accepted by the tokenizer.

    Returns:
        A NumPy array with one embedding row per input text.
    """
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        output = model(**tokens)
    hidden = output.last_hidden_state
    # Broadcast the mask over the hidden dimension; padded tokens contribute 0.
    mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    # clamp guards against division by zero for an all-padding row.
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / counts).numpy()
input_file = "scrape11.txt"


def chunk_text(text_list, chunk_size=200):
    """Group cleaned lines into chunks of at most ``chunk_size`` characters.

    Each line is normalized with ``clean_text`` and appended to the current
    chunk (joined by a single space) while the result still fits. Lines that
    are empty after cleaning are skipped, and no empty chunk is ever emitted.

    Args:
        text_list: Iterable of raw text lines.
        chunk_size: Maximum chunk length in characters. A single line longer
            than this is kept whole as its own chunk.

    Returns:
        List of non-empty chunk strings, in input order.
    """
    chunks = []
    chunk = ""
    for line in text_list:
        line = clean_text(line)
        if not line:
            # Nothing left after cleaning (e.g. a punctuation-only line).
            continue
        if not chunk:
            chunk = line
        elif len(chunk) + 1 + len(line) <= chunk_size:
            # +1 accounts for the joining space.
            chunk += " " + line
        else:
            chunks.append(chunk)
            chunk = line
    if chunk:
        chunks.append(chunk)
    return chunks


if not os.path.exists(input_file):
    print(f"❌ Error: '{input_file}' not found.")
else:
    with open(input_file, "r", encoding="utf-8") as file:
        data = file.readlines()

    # dict.fromkeys drops duplicate lines while keeping first-seen order, so
    # neighbouring lines stay together in a chunk and chunk ids are stable
    # from run to run (a set would shuffle them).
    cleaned_data = list(dict.fromkeys(line.strip() for line in data if line.strip()))
    final_chunks = chunk_text(cleaned_data)

    json_data = [{"id": i, "text": chunk} for i, chunk in enumerate(final_chunks)]
    with open(DATA_FILE, "w", encoding="utf-8") as file:
        json.dump(json_data, file, indent=4)

    print(f"✅ Preprocessed data saved as '{DATA_FILE}'")
# Read the chunk records back from disk so the index is built from exactly
# what was persisted.
with open(DATA_FILE, "r", encoding="utf-8") as file:
    json_data = json.load(file)

text_chunks = [entry["text"] for entry in json_data]
if not text_chunks:
    # Without this guard an empty corpus fails later with an opaque
    # IndexError on embeddings.shape[1].
    raise ValueError(f"'{DATA_FILE}' contains no text chunks; nothing to index.")

# get_embedding returns one row per input text; take that row for each chunk.
embeddings = np.array([get_embedding(text)[0] for text in text_chunks], dtype="float32")

embedding_size = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_size)
index.add(embeddings)
faiss.write_index(index, FAISS_INDEX_FILE)

print(f"✅ FAISS index saved as '{FAISS_INDEX_FILE}'")
# Bring the persisted index and its chunk records back into memory;
# text_chunks[i] is the text of the i-th record in the JSON file.
index = faiss.read_index(FAISS_INDEX_FILE)
with open(DATA_FILE, "r", encoding="utf-8") as file:
    documents = json.loads(file.read())
text_chunks = [record["text"] for record in documents]

def extract_short_answer(full_text, query):
    """Return the sentence of ``full_text`` sharing the most words with ``query``.

    Words are compared case-insensitively with punctuation ignored, so
    "Paris." in the text matches "paris" in the query. Sentences are split
    after ``.``, ``!`` or ``?`` followed by any whitespace.

    Args:
        full_text: Text to pick a sentence from.
        query: The user's question.

    Returns:
        The first sentence with the highest word overlap, or ``full_text``
        unchanged when no sentence shares a word with the query.
    """
    def _words(text):
        # \w+ keeps only word characters, dropping attached punctuation.
        return set(re.findall(r"\w+", text.lower()))

    sentences = re.split(r'(?<=[.!?])\s+', full_text)
    query_words = _words(query)
    best_sentence = ""
    max_overlap = 0
    for sentence in sentences:
        overlap = len(query_words & _words(sentence))
        # Strict comparison keeps the earliest sentence on ties.
        if overlap > max_overlap:
            max_overlap = overlap
            best_sentence = sentence
    return best_sentence if best_sentence else full_text

def search_faiss(query, top_k=3):
    """Answer ``query`` from the closest indexed text chunk.

    The query is cleaned with ``clean_text``, embedded, and looked up in the
    module-level FAISS ``index``. The best-matching chunk is then reduced to
    a single sentence by ``extract_short_answer``.

    Args:
        query: Raw user question.
        top_k: Number of neighbours to request from the index.

    Returns:
        The extracted answer text, or a fallback apology string when the
        index yields no usable hit.
    """
    query = clean_text(query)
    query_embedding = get_embedding(query).astype('float32')
    _, indices = index.search(query_embedding, top_k)
    # FAISS pads the result with -1 when fewer than top_k vectors exist; a raw
    # -1 would silently select the last chunk, so drop out-of-range ids.
    retrieved_docs = [text_chunks[i] for i in indices[0] if 0 <= i < len(text_chunks)]
    if not retrieved_docs:
        return "Sorry, I couldn't find a relevant response."
    return extract_short_answer(retrieved_docs[0], query)
from flask import Flask, request, jsonify
from flask_cors import CORS
import threading

# Flask application object that the /chat route is registered on.
app = Flask(__name__)
# Apply flask-cors to the whole app with its default settings (no origin
# restrictions are configured here) so a browser front end on another origin
# can call the API.
CORS(app)

@app.route('/chat', methods=['POST'])
def chat():
    """Handle POST /chat: answer the ``message`` field of a JSON body.

    Expects a JSON object such as ``{"message": "..."}`` and replies with
    ``{"response": "..."}``. A missing, non-JSON or non-object body, or a
    non-string message, gets a 400 with a prompt instead of an unhandled
    server error. A blank message gets the same prompt with status 200.
    """
    # silent=True yields None instead of aborting on a bad or missing body.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({'response': "Please enter a message."}), 400
    user_input = payload.get('message', '')
    if not isinstance(user_input, str):
        return jsonify({'response': "Please enter a message."}), 400
    if not user_input.strip():
        return jsonify({'response': "Please enter a message."})
    result = search_faiss(user_input)
    return jsonify({'response': result})

def run_flask():
    """Run the Flask server on all interfaces, port 5000 (blocks the caller)."""
    bind_options = {"host": '0.0.0.0', "port": 5000}
    app.run(**bind_options)

# app.run() blocks, so serve from a background thread and let the cell return.
server_thread = threading.Thread(target=run_flask)
server_thread.start()
print("🚀 Chatbot API running at http://localhost:5000/chat")


✅ Preprocessed data saved as 'cleaned_data.json'
✅ FAISS index saved as 'index.faiss'
🚀 Chatbot API running at http://localhost:5000/chat
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://192.168.0.103:5000
Press CTRL+C to quit
192.168.0.103 - - [05/Apr/2025 23:23:48] "GET / HTTP/1.1" 404 -
127.0.0.1 - - [05/Apr/2025 23:24:45] "OPTIONS /chat HTTP/1.1" 200 -
127.0.0.1 - - [05/Apr/2025 23:24:45] "POST /chat HTTP/1.1" 200 -
127.0.0.1 - - [05/Apr/2025 23:24:58] "OPTIONS /chat HTTP/1.1" 200 -
127.0.0.1 - - [05/Apr/2025 23:24:59] "POST /chat HTTP/1.1" 200 -
