<a href="https://colab.research.google.com/github/Ankitha2003/AI-Powered-Virtual-Analyst/blob/main/Ai_Powered_Virtual_analyst.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install numpy pandas faiss-cpu sentence-transformers gradio python-docx




In [4]:
from docx import Document

# Function to extract text from a .docx file
def extract_text_from_docx(file_path):
    doc = Document(file_path)
    text = [para.text for para in doc.paragraphs if para.text.strip()]
    return " ".join(text)

# Extract text from all the uploaded reports
doc1_text = extract_text_from_docx("Annual Report 2020-21 - FINAL.docx")
doc2_text = extract_text_from_docx("Annual Report 2021-22 - FINAL.docx")
doc3_text = extract_text_from_docx("Annual Report 2022-23 - FINAL.docx")

# Combine all documents into one text corpus
corpus = [doc1_text, doc2_text, doc3_text]


In [5]:
from langchain.text_splitter import CharacterTextSplitter

# Split the text into manageable chunks
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = []
for doc in corpus:
    chunks.extend(splitter.split_text(doc))

print(f"Number of chunks created: {len(chunks)}")




Number of chunks created: 6


In [6]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Load a local embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for all chunks
chunk_embeddings = embedding_model.encode(chunks, convert_to_numpy=True)

# Create a FAISS vector store
dimension = chunk_embeddings.shape[1]  # Embedding size
index = faiss.IndexFlatL2(dimension)
index.add(chunk_embeddings)

print("FAISS index created with embeddings!")


FAISS index created with embeddings!


In [7]:
def search_documents(query, top_k=3):
    # Encode the query
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)

    # Search the FAISS index for the most relevant chunks
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve the matching chunks
    results = [chunks[idx] for idx in indices[0]]
    return results

def answer_query(query):
    # Search for relevant chunks
    relevant_chunks = search_documents(query)

    # Combine the relevant chunks into one text
    context = " ".join(relevant_chunks)

    # Answer the query using the context (for simplicity, return the context)
    return context


In [9]:
import gradio as gr

# Define a function for the Gradio interface
def query_assistant(input_query):
    try:
        # Get the answer from the QA system
        answer = answer_query(input_query)
        return answer if answer.strip() else "No relevant information found."
    except Exception as e:
        return f"Error: {str(e)}"

# Create the Gradio interface
interface = gr.Interface(
    fn=query_assistant,
    inputs="text",
    outputs="text",
    title="AI Virtual Assistant for NABARD Reports",
    description="Ask any questions related to the NABARD annual reports."
)
interface.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1aa5f1d9a98eba1097.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


