In [35]:
import re
import json
import os
import faiss
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel


In [36]:

# Paths used by the whole pipeline: raw scrape in, cleaned JSON and FAISS index out.
input_file = "scrape11.txt"
output_json = "cleaned_data.json"
faiss_index_file = "index.faiss"

# Fail loudly with an exception rather than exit(): inside a Jupyter kernel
# exit() does not halt the cell at that line, so the open() below would still
# run and fail with a less helpful traceback.
if not os.path.exists(input_file):
    raise FileNotFoundError(
        f"'{input_file}' not found. Please ensure the file exists."
    )

# Read the scrape as a list of raw lines (newline characters included).
with open(input_file, "r", encoding="utf-8") as fh:
    data = fh.readlines()

# Clean the Text
def clean_text(text):
    """Collapse every whitespace run in ``text`` to one space and trim the ends.

    Case and punctuation are left untouched so the text keeps its meaning.
    """
    return re.sub(r"\s+", " ", text).strip()


# Apply cleaning and remove duplicates while keeping first-seen order.
# dict.fromkeys de-duplicates like a set but preserves insertion order, so
# neighbouring lines of the scrape stay neighbours when they are chunked and
# the result is identical from run to run (a plain set() is not, because
# string hashing is randomised per process).
cleaned_data = list(
    dict.fromkeys(clean_text(line) for line in data if line.strip())
)

# Chunking (Split Large Text into Small Sections)
def chunk_text(text_list, chunk_size=200):
    """Greedily pack consecutive lines into space-joined chunks.

    Lines are appended to the current chunk while the joined text (including
    the single separating space) stays within ``chunk_size`` characters. A
    line that is longer than ``chunk_size`` on its own is kept whole as its
    own chunk rather than being split.

    Args:
        text_list: Iterable of strings to pack, in order.
        chunk_size: Maximum length in characters of a multi-line chunk.

    Returns:
        List of non-empty, stripped chunk strings. Empty input gives ``[]``.
    """
    chunks = []
    current = ""

    def flush(pending):
        # Never emit empty chunks (e.g. when the very first line is oversized
        # or a line consists only of whitespace).
        stripped = pending.strip()
        if stripped:
            chunks.append(stripped)

    for line in text_list:
        if not current:
            # Start a fresh chunk; nothing to flush yet.
            current = line
        elif len(current) + 1 + len(line) <= chunk_size:
            # +1 accounts for the joining space so chunks never overshoot.
            current = f"{current} {line}"
        else:
            flush(current)
            current = line

    flush(current)  # Add last chunk if needed
    return chunks

# Pack the cleaned lines into chunks of roughly 200 characters.
final_chunks = chunk_text(cleaned_data, chunk_size=200)

# Give every chunk a sequential integer id and persist the records as JSON.
json_data = []
for chunk_id, chunk in enumerate(final_chunks):
    json_data.append({"id": chunk_id, "text": chunk})

with open(output_json, "w", encoding="utf-8") as out_fh:
    json.dump(json_data, out_fh, indent=4)

print("Preprocessed Data Saved as 'cleaned_data.json'.")




Preprocessed Data Saved as 'cleaned_data.json'.


In [37]:
# Load the tokenizer and the encoder from the same pretrained checkpoint.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

def get_embedding(text):
    """Convert text into an embedding via attention-masked mean pooling.

    The text is tokenized (with truncation), run through the module-level
    ``model`` without gradient tracking, and the token vectors in
    ``last_hidden_state`` are averaged over the token dimension. Positions
    the tokenizer marks as padding in ``attention_mask`` are excluded from
    the average, so a batched (list) input is not skewed by padding; for a
    single string the mask is all ones and the result is the plain mean.

    Args:
        text: A string (or a list of strings) to embed.

    Returns:
        NumPy array with one pooled row per input text; callers index ``[0]``
        for a single string.
    """
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        output = model(**tokens)

    hidden = output.last_hidden_state
    # Broadcast the mask over the hidden dimension so padded positions
    # contribute zero to the sum.
    mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    # clamp guards against division by zero for a fully masked row.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / token_counts).numpy()

# Generate Embeddings for Each Chunk
# Embed every chunk; get_embedding returns a batch, so take row 0 of each.
text_chunks = [record["text"] for record in json_data]
chunk_vectors = [get_embedding(chunk)[0] for chunk in text_chunks]
embeddings = np.array(chunk_vectors, dtype="float32")

# Build a flat L2 index sized from the embeddings themselves.
embedding_size = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_size)
index.add(embeddings)

# Persist the index to disk.
faiss.write_index(index, faiss_index_file)
print(" FAISS Index Created & Saved as 'index.faiss'.")

### Step 3: Search in FAISS ###

# Load FAISS Index
# Raise instead of calling exit(): inside a Jupyter kernel exit() does not
# stop the cell at that line, so read_index() would still be attempted.
if not os.path.exists(faiss_index_file):
    raise FileNotFoundError(
        f"'{faiss_index_file}' not found. Please ensure the index was created properly."
    )

index = faiss.read_index(faiss_index_file)


 FAISS Index Created & Saved as 'index.faiss'.


In [38]:
 # Free-text search query; a later cell embeds it with get_embedding() and
 # looks it up in the FAISS index.
 query_text = "director academic"

In [43]:
# Embed the query; FAISS expects a float32 matrix with one row per query.
query_embedding = get_embedding(query_text).astype("float32")

k = 5  # Retrieve more results for better accuracy
distances, indices = index.search(query_embedding, k)

# When the index holds fewer than k vectors FAISS fills the missing slots
# with label -1. Skip those: text_chunks[-1] would otherwise silently return
# the last chunk as a bogus match.
retrieved_docs = [text_chunks[i] for i in indices[0] if i >= 0]

print("\n Top Matches:\n")
for rank, doc in enumerate(retrieved_docs, 1):
    print(f" Match {rank}: {doc}\n")


 Top Matches:

 Match 1: contact us timings from 0930am to 0400pm only on working days sno name of the faculty designation role email id 1 drtvg sridevi assistant professor hod aimlhodkmitin

 Match 2: director academic mrs deepa ganu director academic about deepa ganu deepa ganu the charismatic academic director of keshav memorial institute of technology has always been a proactive person and she has constructively involved herself in all the activities of the college she graduated b tech in ece from pune university and mtech from jntu hyderabad she consistently displays a fervent zeal in the academic progress of the students and the staff she strongly believes in herself and has immense trust in her mentors her peers and her team she took up a mission to ignite the young minds and nurture them with the potential to reach greater horizons she is a studentfriendly person and her forte is to identify their technical edge and channelize their skills in the right direction with her valua