In [1]:
import re
import json
import os
import faiss
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel

### Step 1: Load and Preprocess Data ###

# Load the Scraped Data
input_file = "scrape11.txt"
output_json = "cleaned_data.json"
faiss_index_file = "index.faiss"

if not os.path.exists(input_file):
    print(f"❌ Error: '{input_file}' not found. Please ensure the file exists.")
    exit()

with open(input_file, "r", encoding="utf-8") as file:
    data = file.readlines()

# Clean the Text
def clean_text(text):
    text = re.sub(r"\s+", " ", text)  # Remove extra spaces and newlines
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
    return text.strip().lower()  # Convert to lowercase

# Apply cleaning and remove duplicates
cleaned_data = list(set([clean_text(line) for line in data if line.strip() != ""]))

# Chunking (Split Large Text into Small Sections)
def chunk_text(text_list, chunk_size=200):
    chunks = []
    chunk = ""
    for line in text_list:
        if len(chunk) + len(line) <= chunk_size:
            chunk += " " + line
        else:
            chunks.append(chunk.strip())
            chunk = line
    if chunk:
        chunks.append(chunk.strip())  # Add last chunk if needed
    return chunks

final_chunks = chunk_text(cleaned_data, chunk_size=200)

# Save Processed Data as JSON
json_data = [{"id": i, "text": chunk} for i, chunk in enumerate(final_chunks)]
with open(output_json, "w", encoding="utf-8") as file:
    json.dump(json_data, file, indent=4)

print("✅ Step 1 Completed: Preprocessed Data Saved as 'cleaned_data.json'.")

### Step 2: Generate Embeddings and Create FAISS Index ###

# Load Model & Tokenizer
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

def get_embedding(text):
    """Convert text into an embedding"""
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        output = model(**tokens)
    return output.last_hidden_state.mean(dim=1).numpy()

# Generate Embeddings for Each Chunk
text_chunks = [entry["text"] for entry in json_data]
embeddings = np.array([get_embedding(text)[0] for text in text_chunks], dtype="float32")

# Create FAISS Index
embedding_size = embeddings.shape[1]  # Get embedding size dynamically
index = faiss.IndexFlatL2(embedding_size)
index.add(embeddings)

# Save FAISS Index
faiss.write_index(index, faiss_index_file)
print("✅ Step 2 Completed: FAISS Index Created & Saved as 'index.faiss'.")

### Step 3: Search in FAISS ###

# Load FAISS Index
if not os.path.exists(faiss_index_file):
    print(f"❌ Error: '{faiss_index_file}' not found. Please ensure the index was created properly.")
    exit()

index = faiss.read_index(faiss_index_file)

# Example Query
query_text = "What is the process for applying?"
query_embedding = get_embedding(query_text).astype('float32')

# Search in FAISS
k = 3  # Retrieve top 3 results
distances, indices = index.search(query_embedding, k)

# Retrieve the actual text documents
retrieved_docs = [text_chunks[i] for i in indices[0]]
print("\n🔍 Top Matches:", retrieved_docs)


✅ Step 1 Completed: Preprocessed Data Saved as 'cleaned_data.json'.
✅ Step 2 Completed: FAISS Index Created & Saved as 'index.faiss'.

🔍 Top Matches: ['btech programs the eligibility criteria for admission to btech program admissions to the btech program are made along with the other engineering colleges in the state through a common entrance test eapcet conducted by the govt of telangana state the admission pattern to b tech is as follows the minimum qualification for admission to first year of the b tech course is a pass in the intermediate 10 2 conducted by the board of intermediate education govt of telangana state or any other examination recognized as equivalent thereto with mathematics physics and chemistry as optional subjects admission 70 of the seats are allotted based on the merit in the eapcet 30 of the seats are earmarked for managementnri candidates in addition to the above diploma holders are admitted in second year of b tech to the extent of 20 of intake based on the me

In [3]:
from flask import Flask, request, jsonify
from flask_cors import CORS
import faiss
import json
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel

app = Flask(__name__)
CORS(app)  # Allows frontend (React) to call the API

# Load Preprocessed Data
with open("cleaned_data.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Load FAISS Index
faiss_index_file = "index.faiss"
index = faiss.read_index(faiss_index_file)

# Load Model & Tokenizer for Embeddings
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Function to generate embeddings
def get_embedding(text):
    """Converts text into an embedding"""
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        output = model(**tokens)
    return output.last_hidden_state.mean(dim=1).numpy()

@app.route("/search", methods=["POST"])
def search():
    try:
        # Get user query
        user_query = request.json.get("query", "")
        if not user_query:
            return jsonify({"error": "Query cannot be empty"}), 400

        # Convert query to embedding
        query_embedding = get_embedding(user_query).astype('float32')

        # Search FAISS Index
        k = 3  # Number of top results
        distances, indices = index.search(query_embedding, k)

        # Retrieve matched text chunks
        retrieved_docs = [data[i]["text"] for i in indices[0]]

        return jsonify({"query": user_query, "results": retrieved_docs})

    except Exception as e:
        return jsonify({"error": str(e)}), 500

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000, debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://192.168.56.49:5000
Press CTRL+C to quit
 * Restarting with stat


SystemExit: 1