In [3]:
import json

# Parse the cleaned JSON file from disk and keep the result in `data`.
with open("ccleaned_data.json", mode="r", encoding="utf-8") as json_file:
    data = json.load(json_file)




In [4]:

from difflib import get_close_matches

# Parse the cleaned JSON file; the result is bound to `data`, which is what
# the example call to `search_query` in this cell passes in.
with open("ccleaned_data.json", mode="r", encoding="utf-8") as json_file:
    data = json.load(json_file)

# Search function using fuzzy matching
def search_query(query, data, n_results=3):
    """Return the records whose text is most similar to ``query``.

    Similarity is computed with ``difflib.get_close_matches`` using a cutoff
    of 0.3.

    Each record is matched on its ``"question"`` value when it has one and on
    its ``"text"`` value otherwise, so both question/answer records and plain
    ``{"id", "text"}`` chunks can be searched. The same value is used both to
    build the candidate list and to map matches back to records; building the
    candidates from one key and looking them up by another raises ``KeyError``
    as soon as a record lacks either key.

    Args:
        query: Free-text search string.
        data: Iterable of dict records.
        n_results: Maximum number of distinct candidate strings to match.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts, best match
        first. ``"question"`` is the matched string; ``"answer"`` is the
        record's ``"answer"`` value, or the matched string itself when the
        record has no ``"answer"`` key. Records that share the same matched
        string are all returned, in their original order.
    """
    # Group records by their searchable string so that mapping a match back to
    # its records is a dict lookup rather than a scan over all of `data`.
    records_by_key = {}
    for item in data:
        key = item.get("question", item.get("text"))
        if not isinstance(key, str):
            # Nothing searchable in this record (neither key present).
            continue
        records_by_key.setdefault(key, []).append(item)

    # Candidates are the unique keys, so duplicated strings do not use up
    # several of the `n_results` slots.
    matches = get_close_matches(query, list(records_by_key), n=n_results, cutoff=0.3)

    return [
        {"question": key, "answer": item.get("answer", key)}
        for key in matches
        for item in records_by_key[key]
    ]

# Example usage
query = "how do I reset my password?"
results = search_query(query, data)

# Display results: one numbered question/answer block per hit, each followed
# by a blank line.
for rank, hit in enumerate(results, start=1):
    print(
        f"Result {rank}:",
        f"Q: {hit['question']}",
        f"A: {hit['answer']}",
        "",
        sep="\n",
    )


In [2]:
import re
import json
import os
import faiss
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel

In [7]:
# Single checkpoint name shared by the tokenizer and the encoder so the two
# cannot drift apart.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

def get_embedding(text):
    """Convert text into an embedding.

    The text is tokenized (with padding and truncation enabled), passed
    through the module-level ``model`` without gradient tracking, and the
    token vectors of the last hidden state are averaged into one vector per
    input sequence.

    The average is weighted by the tokenizer's attention mask, so padding
    tokens do not contribute. For a single string nothing is padded and the
    result is the plain mean over tokens; for a list of strings of different
    lengths an unmasked mean would be skewed by the padded positions.

    Args:
        text: A string, or a list of strings, accepted by ``tokenizer``.

    Returns:
        A NumPy array with one row per input sequence.
    """
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        output = model(**tokens)

    hidden = output.last_hidden_state
    # Broadcast the mask over the hidden dimension: 1 for real tokens, 0 for padding.
    mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    # Clamp so that an all-padding row cannot cause a division by zero.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / token_counts).numpy()

# Generate Embeddings for Each Chunk
text_chunks = [entry["text"] for entry in data]

# get_embedding returns one row per input, so take row 0 for each chunk.
chunk_vectors = []
for chunk in text_chunks:
    chunk_vectors.append(get_embedding(chunk)[0])
embeddings = np.array(chunk_vectors, dtype="float32")

# Create FAISS Index (flat L2 index sized from the embedding matrix itself)
embedding_size = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_size)
index.add(embeddings)

# Save FAISS Index
faiss_index_file = "index.faiss"
faiss.write_index(index, faiss_index_file)

print(" FAISS Index Created & Saved as 'index.faiss'.")

### Step 3: Search in FAISS ###

# Load FAISS Index
# Raise instead of calling exit(): inside a notebook exit() shuts the kernel
# down and discards all state, whereas an exception stops this cell with a
# clear message and leaves the session usable.
if not os.path.exists(faiss_index_file):
    raise FileNotFoundError(
        f"'{faiss_index_file}' not found. Please ensure the index was created properly."
    )

index = faiss.read_index(faiss_index_file)


 FAISS Index Created & Saved as 'index.faiss'.


In [8]:

input_file = "scrape11.txt"
output_json = "cleaned_data.json"
faiss_index_file = "index.faiss"

# Raise instead of calling exit(): inside a notebook exit() shuts the kernel
# down and discards all state, whereas an exception stops this cell with a
# clear message and leaves the session usable.
if not os.path.exists(input_file):
    raise FileNotFoundError(f"'{input_file}' not found. Please ensure the file exists.")

# NOTE: this rebinds the notebook-level name `data` to the raw lines of the
# scraped text file (a list of strings).
with open(input_file, "r", encoding="utf-8") as file:
    data = file.readlines()

# Clean the Text
def clean_text(text):
    """Collapse each run of whitespace in ``text`` to a single space and trim both ends.

    Letter case and punctuation are left exactly as they are.
    """
    return re.sub(r"\s+", " ", text).strip()


# Apply cleaning and remove duplicates.
# dict.fromkeys keeps the first occurrence of each line in its original
# position. A set would return the lines in arbitrary order (and a different
# order from run to run), so the chunking step below would glue unrelated
# lines together and produce different chunks on every run.
cleaned_data = list(dict.fromkeys(clean_text(line) for line in data if line.strip() != ""))

# Chunking (Split Large Text into Small Sections)
def chunk_text(text_list, chunk_size=200):
    """Greedily pack consecutive lines into space-joined chunks.

    Lines are appended to the current chunk while the joined text, including
    the single-space separators, stays within ``chunk_size`` characters. When
    the next line would not fit, the current chunk is closed and the line
    starts a new one.

    A line that is longer than ``chunk_size`` on its own is never split; it
    becomes a chunk by itself. Lines that are empty after stripping are
    skipped, so no empty chunk is ever produced (including when the very
    first line is longer than ``chunk_size``).

    Args:
        text_list: Iterable of strings, in the order they should be packed.
        chunk_size: Maximum length of a chunk built from more than one line.

    Returns:
        A list of non-empty chunk strings, in input order.
    """
    chunks = []
    parts = []   # lines that make up the chunk currently being built
    length = 0   # len(" ".join(parts)), maintained incrementally

    for line in text_list:
        line = line.strip()
        if not line:
            continue

        # Length of the chunk if this line were added (+1 for the separator).
        projected = len(line) if not parts else length + 1 + len(line)

        if parts and projected > chunk_size:
            chunks.append(" ".join(parts))
            parts = [line]
            length = len(line)
        else:
            parts.append(line)
            length = projected

    if parts:
        chunks.append(" ".join(parts))  # Add last chunk if needed
    return chunks

final_chunks = chunk_text(cleaned_data, chunk_size=200)

# Save Processed Data as JSON: one {"id", "text"} record per chunk, with ids
# assigned in chunk order starting at 0.
json_data = []
for chunk_id, chunk in enumerate(final_chunks):
    json_data.append({"id": chunk_id, "text": chunk})

with open(output_json, mode="w", encoding="utf-8") as out_file:
    json.dump(json_data, out_file, indent=4)

print("Preprocessed Data Saved as 'cleaned_data.json'.")




Preprocessed Data Saved as 'cleaned_data.json'.


In [9]:
query_text = "Director Academic"
query_embedding = get_embedding(query_text).astype("float32")

k = 5  # Retrieve more results for better accuracy
distances, indices = index.search(query_embedding, k)

# FAISS fills missing neighbours with -1 when the index holds fewer than k
# vectors. Without the filter, text_chunks[-1] would silently report the last
# chunk as a match.
retrieved_docs = [text_chunks[i] for i in indices[0] if i >= 0]

print("\n Top Matches:\n")
for i, doc in enumerate(retrieved_docs, 1):
    print(f" Match {i}: {doc}\n")


 Top Matches:

 Match 1: Director Academic - Mrs. Deepa Ganu: Mrs. Deepa Ganu is a proactive and student-friendly academic director at KMIT. With a background in B.Tech (ECE) from Pune University and M.Tech from JNTU Hyderabad, she has been instrumental in mentoring students for various national and international platforms like Microsoft Imagine Cup, IBM's Great Mind Challenge, Google Summer of Code, and Amazon Campus Mentorship Series. She has also conducted corporate training for reputed companies like Verizon, HP, and CA Global, as well as government agencies such as DRDL and RCI. Mrs. Ganu is the author of the book 'Java Spoken Tutorials' under IIT Bombay's initiative and has received prestigious titles like 'Best Mentor' and 'Acharya' from major corporate companies. She has also been honored with the Drona Award for her contribution to pedagogy.

 Match 2: About the Humanities & Science (HS) Department: Established in 2007, the department offers a current intake of 840 students, 