In [89]:
# Cell 1: Install required packages
!pip install PyMuPDF nltk sentence-transformers faiss-cpu numpy streamlit transformers torch tqdm python-dotenv




[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [90]:
# Cell 2: now Importing all necessary libraries
import fitz  # PyMuPDF
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import pickle
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
import os

# Download NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [91]:
# Cell 3: Define the TextProcessor class
class TextProcessor:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
    
    def extract_text_from_pdf(self, pdf_path):
        """Extract text from PDF document"""
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text()
        return text
    
    def extract_qa_from_txt(self, txt_path):
        """Extract Q&A pairs from text file"""
        with open(txt_path, 'r', encoding='utf-8') as file:
            content = file.read()
        
        # Split by Q/A pattern
        qa_pairs = re.split(r'Q\.\s*', content)[1:]
        qa_list = []
        
        for pair in qa_pairs:
            if 'A.' in pair:
                question, answer = pair.split('A.', 1)
                qa_list.append({
                    'question': question.strip(),
                    'answer': answer.strip()
                })
        return qa_list
    
    def preprocess_text(self, text):
        """Clean and preprocess text"""
        # Remove special characters and extra whitespace
        text = re.sub(r'[^a-zA-Z0-9\s.,;:?!-]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        
        # Tokenize sentences
        sentences = sent_tokenize(text)
        
        # Tokenize and lemmatize words in each sentence
        processed_sentences = []
        for sent in sentences:
            words = word_tokenize(sent)
            lemmatized_words = [self.lemmatizer.lemmatize(word.lower()) for word in words]
            processed_sentences.append(' '.join(lemmatized_words))
            
        return processed_sentences
    
    def chunk_text(self, text, chunk_size=200):
        """Split text into chunks of approximately chunk_size words"""
        words = text.split()
        chunks = []
        current_chunk = []
        current_count = 0
        
        for word in words:
            current_chunk.append(word)
            current_count += 1
            
            if current_count >= chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_count = 0
                
        if current_chunk:
            chunks.append(' '.join(current_chunk))
            
        return chunks

In [92]:
# Lets now Define the EmbeddingModel class
class EmbeddingModel:
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
    
    def generate_embeddings(self, texts):
        """Generate embeddings for list of texts"""
        return self.model.encode(texts, convert_to_tensor=False)
    
    def create_faiss_index(self, embeddings, index_path='faiss_index.idx'):
        """Create and save FAISS index"""
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        
        # Convert to float32 numpy array if needed
        if not isinstance(embeddings, np.ndarray):
            embeddings = np.array(embeddings)
        if embeddings.dtype != np.float32:
            embeddings = embeddings.astype('float32')
            
        index.add(embeddings)
        faiss.write_index(index, index_path)
        return index
    
    def load_faiss_index(self, index_path='faiss_index.idx'):
        """Load existing FAISS index"""
        if os.path.exists(index_path):
            return faiss.read_index(index_path)
        return None

In [93]:
# lets divide it into two part 

# ==================== Part 1: Model Loader ====================
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

print("Initializing T5 model...")

# Try loading progressively larger models
MODEL_LOADED = None
for model_name in ['t5-small', 't5-base']:
    try:
        print(f"Attempting to load {model_name}...")
        test_tokenizer = T5Tokenizer.from_pretrained(model_name)
        test_model = T5ForConditionalGeneration.from_pretrained(model_name)
        print(f"✓ Successfully loaded {model_name}")
        MODEL_LOADED = model_name
        break
    except Exception as e:
        print(f"✗ Failed to load {model_name}: {str(e)[:100]}...")

if not MODEL_LOADED:
    print(" Could not load any T5 models. Answer refinement will be disabled.")

# ==================== Part 2: Answer Generator ====================    
class AnswerGenerator:
    def __init__(self, model_name=MODEL_LOADED):
        self.available = bool(model_name)
        if not self.available:
            return
            
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        try:
            self.tokenizer = T5Tokenizer.from_pretrained(model_name)
            self.model = T5ForConditionalGeneration.from_pretrained(model_name).to(self.device)
            print(f"AnswerGenerator ready with {model_name}")
        except Exception as e:
            print(f"Failed to initialize AnswerGenerator: {e}")
            self.available = False
    
    def generate_answer(self, question, context):
        """Generate refined answer or return original context if unavailable"""
        if not self.available:
            return context
            
        try:
            input_text = f"question: {question} context: {context}"
            inputs = self.tokenizer.encode_plus(
                input_text,
                max_length=512,
                truncation=True,
                padding='max_length',
                return_tensors='pt'
            ).to(self.device)
            
            outputs = self.model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=150,
                num_beams=4,
                early_stopping=True
            )
            return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        except Exception as e:
            print(f"Answer generation failed: {e}")
            return context

# Initialize (will self-configure based on what loaded)
answer_generator = AnswerGenerator()

Initializing T5 model...
Attempting to load t5-small...


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

✓ Successfully loaded t5-small
AnswerGenerator ready with t5-small


In [95]:
# ==================== FINAL WORKFLOW INTEGRATION ====================

# 1. First, verify the loaded model
print("\n=== MODEL VERIFICATION ===")
test_question = "What is stress?"
test_context = "Stress is the body's natural response to challenges. It can be positive (eustress) or negative (distress)."
print(f"Test Question: {test_question}")
print(f"Test Context: {test_context}")

if answer_generator.available:
    generated = answer_generator.generate_answer(test_question, test_context)
    print(f"Generated Answer: {generated}")
else:
    print("⚠️ Using simple mode - returning raw context")
    print(test_context)

# 2. Improved Answer Generation with Quality Control
class EnhancedAnswerGenerator(AnswerGenerator):
    def postprocess_answer(self, raw_answer):
        """Clean and format the generated answer"""
        # Remove repetitive phrases
        if "question:" in raw_answer.lower():
            raw_answer = raw_answer.split("question:")[0]
        
        # Capitalize first letter
        raw_answer = raw_answer.strip()
        if len(raw_answer) > 1:
            raw_answer = raw_answer[0].upper() + raw_answer[1:]
        
        # Ensure proper punctuation
        if raw_answer and raw_answer[-1] not in {'.', '!', '?'}:
            raw_answer += '.'
            
        return raw_answer

    def generate_answer(self, question, context):
        if not self.available:
            return context
            
        try:
            # Enhanced prompt engineering
            input_text = (
                f"Generate a concise answer to this university-related question. "
                f"Question: {question}\n"
                f"Context: {context[:2000]}\n"
                f"Answer:"
            )
            
            inputs = self.tokenizer(
                input_text,
                max_length=512,
                truncation=True,
                padding="max_length",
                return_tensors="pt"
            ).to(self.device)
            
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=150,
                temperature=0.7,
                num_beams=3,
                early_stopping=True,
                repetition_penalty=1.5
            )
            
            raw_answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            return self.postprocess_answer(raw_answer)
            
        except Exception as e:
            print(f"Generation error: {str(e)[:200]}")
            return context[:500] + ("..." if len(context) > 500 else "")

# 3. Upgrade the answer generator
enhanced_answer_generator = EnhancedAnswerGenerator(model_name=MODEL_LOADED)

# 4. Test with your actual Q&A pairs
print("\n=== ENHANCED GENERATION TEST ===")
test_cases = [
    ("Who teaches CSS101?", 
     "Faculty for CSS101: Ms. Garima Sharma, Dr. Sandeep Singh"),
     
    ("What is Dr. Singh's email?", 
     "Contact: sandeepsingh@ncuindia.edu"),
     
    ("What is stress?", 
     "Definition: Stress is the body's response to demands")
]

for question, context in test_cases:
    print(f"\nQ: {question}")
    print(f"Context: {context}")
    print(f"Answer: {enhanced_answer_generator.generate_answer(question, context)}")


=== MODEL VERIFICATION ===
Test Question: What is stress?
Test Context: Stress is the body's natural response to challenges. It can be positive (eustress) or negative (distress).
Generated Answer: Stress is the body's natural response to challenges. It can be positive (eustress) or negative (distress).


TypeError: __init__() got an unexpected keyword argument 'model_name'

In [94]:
# Cell 5 - ULTRA-ROBUST VERSION (works without internet/model downloads)

from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

class AnswerGenerator:
    def __init__(self):
        self.available = False
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        # Try to load any available model
        for model_name in ['t5-small', 't5-base']:
            try:
                self.tokenizer = T5Tokenizer.from_pretrained(model_name)
                self.model = T5ForConditionalGeneration.from_pretrained(model_name).to(self.device)
                self.available = True
                print(f"✓ Loaded {model_name} for answer refinement")
                break
            except:
                continue
        
        if not self.available:
            print("⚠️ Running in simple mode (using direct text snippets)")
    
    def generate_answer(self, question, context):
        """Returns either refined answer or original context"""
        if not self.available:
            # Fallback: return the most relevant text chunk
            return context
            
        try:
            input_text = f"question: {question} context: {context}"
            inputs = self.tokenizer.encode_plus(
                input_text,
                max_length=512,
                truncation=True,
                padding='max_length',
                return_tensors='pt'
            ).to(self.device)
            
            outputs = self.model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=150,
                num_beams=4,
                early_stopping=True
            )
            return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        except:
            return context

# Initialize (will automatically configure based on what's available)
answer_generator = AnswerGenerator()

✓ Loaded t5-small for answer refinement


In [96]:
import requests
try:
    response = requests.get("https://huggingface.co", timeout=5)
    print("✓ Can access Hugging Face" if response.status_code == 200 else "✗ Blocked from Hugging Face")
except Exception as e:
    print(f"✗ Connection failed: {e}")

✓ Can access Hugging Face


In [31]:
# Cell 5 - 100% WORKING VERSION (online or offline)

from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

class AnswerGenerator:
    def __init__(self):
        self.available = False
        self.device = torch.device('cpu')  # Force CPU to avoid CUDA issues
        
        # Try loading with multiple approaches
        try:
            # Attempt direct load (will work if previously downloaded)
            self.tokenizer = T5Tokenizer.from_pretrained('t5-small')
            self.model = T5ForConditionalGeneration.from_pretrained('t5-small').to(self.device)
            self.available = True
            print("✓ Answer refinement enabled")
        except:
            print("⚠️ Using simple mode (direct text answers)")
    
    def generate_answer(self, question, context):
        if not self.available:
            # Return the 3 most relevant sentences
            sentences = context.split('. ')
            return '. '.join(sentences[:3]) + ('.' if len(sentences) >=3 else '')
        
        try:
            input_text = f"question: {question} context: {context[:2000]}"  # Limit context size
            inputs = self.tokenizer(input_text, return_tensors="pt", truncation=True).to(self.device)
            outputs = self.model.generate(**inputs, max_length=150)
            return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        except:
            return context[:500]  # Fallback to first 500 chars

answer_generator = AnswerGenerator()

⚠️ Using simple mode (direct text answers)


In [32]:
!pip install transformers accelerate sentence-transformers
!pip install torch --upgrade




[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip



Collecting torch
  Downloading torch-2.7.0-cp39-cp39-win_amd64.whl.metadata (29 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Downloading torch-2.7.0-cp39-cp39-win_amd64.whl (212.4 MB)
   ---------------------------------------- 212.4/212.4 MB 8.3 MB/s eta 0:00:00
Downloading sympy-1.14.0-py3-none-any.whl (6.3 MB)
   ---------------------------------------- 6.3/6.3 MB 6.5 MB/s eta 0:00:00
Installing collected packages: sympy, torch
  Attempting uninstall: sympy
    Found existing installation: sympy 1.13.1
    Uninstalling sympy-1.13.1:
      Successfully uninstalled sympy-1.13.1
  Attempting uninstall: torch
    Found existing installation: torch 2.1.2
    Uninstalling torch-2.1.2:

  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.1.2 requires torch==2.1.2, but you have torch 2.7.0 which is incompatible.
torchvision 0.16.2 requires torch==2.1.2, but you have torch 2.7.0 which is incompatible.

[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip



      Successfully uninstalled torch-2.1.2
Successfully installed sympy-1.14.0 torch-2.7.0


In [33]:
# Cell 5 - Llama 2 Version (with fallback)
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

class AnswerGenerator:
    def __init__(self):
        self.available = False
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        
        # Try loading Llama 2 (7B Chat) - smaller and better than T5
        try:
            print("🔄 Attempting to load Llama 2...")
            model_name = "meta-llama/Llama-2-7b-chat-hf"
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16,
                device_map="auto"
            )
            self.available = True
            print("✅ Llama 2 loaded successfully!")
        except Exception as e:
            print(f"⚠️ Could not load Llama 2: {str(e)[:200]}...")
            print("🔶 Falling back to simple mode (direct text answers).")

    def generate_answer(self, question, context):
        if not self.available:
            # Fallback: return the most relevant part of the context
            sentences = context.split('. ')
            return '. '.join(sentences[:3]) + ('.' if len(sentences) >= 3 else '')
        
        try:
            prompt = f"""
            [INST] <<SYS>>
            You are a helpful university assistant. Answer the student's question using ONLY the provided context.
            <</SYS>>
            
            Question: {question}
            Context: {context[:3000]}  # Limit to avoid memory issues
            [/INST]
            """
            
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=150,
                temperature=0.7,
                do_sample=True
            )
            answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            
            # Clean up the output
            return answer.split("[/INST]")[-1].strip()
        except Exception as e:
            print(f"⚠️ Answer generation failed: {e}")
            return context[:500]  # Return first 500 chars as fallback

# Initialize
answer_generator = AnswerGenerator()

🔄 Attempting to load Llama 2...
⚠️ Could not load Llama 2: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files.
Checkout your internet connection or see how to run the library in offline mode at 'https...
🔶 Falling back to simple mode (direct text answers).


In [34]:
from huggingface_hub import notebook_login

# Run this in a new cell
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [73]:
# Cell 5 - OFFLINE MODE (No Hugging Face models needed)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

class AnswerGenerator:
    def __init__(self):
        print("✅ Using optimized offline mode (TF-IDF based)")
        self.vectorizer = TfidfVectorizer(stop_words='english')
        
    def generate_answer(self, question, context):
        """Returns the most relevant part of the context using TF-IDF"""
        try:
            # Combine question and context for analysis
            texts = [question] + context.split('. ')
            
            # Create TF-IDF matrix
            tfidf_matrix = self.vectorizer.fit_transform(texts)
            
            # Calculate similarity between question and context sentences
            similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
            
            # Get top 3 most relevant sentences
            top_indices = np.argsort(similarities)[-3:][::-1]
            best_answers = [texts[i+1] for i in top_indices if similarities[i] > 0.1]
            
            return '. '.join(best_answers) if best_answers else context[:500]
        except:
            return context[:500]  # Fallback

# Initialize
answer_generator = AnswerGenerator()

✅ Using optimized offline mode (TF-IDF based)


In [74]:
test_question = "Who teaches Community Service?"
test_context = "The Community Service course (CSS101) is taught by: Ms. Garima Sharma, Dr. Sandeep Singh, and Dr. Rita Chhikara. Classes are held on Mondays."

print(answer_generator.generate_answer(test_question, test_context))

The Community Service course (CSS101) is taught by: Ms


In [75]:
# Cell 5 - IMPROVED OFFLINE VERSION (complete answers)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

class AnswerGenerator:
    def __init__(self):
        print("✅ Using enhanced offline mode (complete answers)")
        self.vectorizer = TfidfVectorizer(stop_words='english')
        
    def generate_answer(self, question, context):
        """Returns complete sentences with proper formatting"""
        try:
            # Split context into proper sentences (handling abbreviations)
            sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', context) if s.strip()]
            
            # Add question to the text pool
            texts = [question] + sentences
            
            # Create TF-IDF matrix
            tfidf_matrix = self.vectorizer.fit_transform(texts)
            
            # Calculate similarities
            similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
            
            # Get top 3 relevant sentences (minimum similarity threshold)
            top_indices = np.argsort(similarities)[-3:][::-1]
            best_answers = [sentences[i] for i in top_indices if similarities[i] > 0.1]
            
            # Formatting fixes
            answer = ' '.join(best_answers)
            answer = re.sub(r'\s([,.?])', r'\1', answer)  # Fix spaces before punctuation
            answer = answer[0].upper() + answer[1:]        # Capitalize first letter
            return answer if answer.endswith(('.','!','?')) else answer + '.'
            
        except Exception as e:
            print(f"Debug: {str(e)}")  # Only visible during development
            return context[:500]  # Fallback

# Initialize
answer_generator = AnswerGenerator()

✅ Using enhanced offline mode (complete answers)


In [76]:
test_context = "The Community Service course (CSS101) is taught by: Ms. Garima Sharma, Dr. Sandeep Singh, and Dr. Rita Chhikara. Classes are held on Mondays."
print(answer_generator.generate_answer("Who teaches CSS101?", test_context))

The Community Service course (CSS101) is taught by: Ms.


In [97]:
# Cell 5 - FINAL BULLETPROOF VERSION
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

class AnswerGenerator:
    def __init__(self):
        print("✅ Using ultra-reliable answer generation")
        self.vectorizer = TfidfVectorizer(stop_words='english')
        
    def generate_answer(self, question, context):
        """Returns complete, well-formatted answers every time"""
        try:
            # Step 1: Better sentence splitting that handles titles (Ms., Dr., etc.)
            sentences = [s.strip() for s in re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', context)]
            
            # Step 2: Find the most relevant sentence
            if len(sentences) == 0:
                return context[:500]
                
            question_vec = self.vectorizer.fit_transform([question])
            sentence_vecs = self.vectorizer.transform(sentences)
            
            similarities = cosine_similarity(question_vec, sentence_vecs).flatten()
            best_idx = np.argmax(similarities)
            best_answer = sentences[best_idx]
            
            # Step 3: Ensure we return complete information
            if ":" in best_answer:  # If answer contains a list (like faculty names)
                for s in sentences[best_idx+1:]:
                    if not s[0].isupper():  # Continuation of the list
                        best_answer += " " + s
                    else:
                        break
                        
            # Step 4: Final formatting
            best_answer = best_answer.replace("  ", " ")
            if not best_answer.endswith(('.','!','?')):
                best_answer += '.'
                
            return best_answer[0].upper() + best_answer[1:]
            
        except Exception as e:
            print(f"Debug: {str(e)}")
            return context[:500]

answer_generator = AnswerGenerator()

✅ Using ultra-reliable answer generation


In [98]:
test_context = "The Community Service course (CSS101) is taught by: Ms. Garima Sharma, Dr. Sandeep Singh, and Dr. Rita Chhikara. Classes are held on Mondays."
print(answer_generator.generate_answer("Who teaches CSS101?", test_context))

The Community Service course (CSS101) is taught by: Ms. Garima Sharma, Dr. Sandeep Singh, and Dr. Rita Chhikara.


In [99]:
def get_answer(question, faiss_index, pdf_chunks, qa_embeddings):
    # 1. First i am trying Q&A database
    qa_answer = get_qa_answer(question, qa_embeddings)
    if qa_answer:
        return qa_answer
    
    # 2. Searching for the  PDF chunks if no Q&A match
    similar_chunks = search_similar_text(question, faiss_index, pdf_chunks)
    if similar_chunks:
        context = ' '.join(similar_chunks[:3])  # Use top 3 chunks
        return answer_generator.generate_answer(question, context)
    
    return "I couldn't find information about this topic."

In [100]:
# Example usage
question = "Where is the CSS101 class held?"
context = "CSS101 classes are held in Block B, Room 205 every Monday and Wednesday."

print(answer_generator.generate_answer(question, context))

CSS101 classes are held in Block B, Room 205 every Monday and Wednesday.


In [81]:
# Very short context
print(answer_generator.generate_answer("Who is Dr. Singh?", "Dr. Sandeep Singh (sandeepsingh@ncuindia.edu)"))

# Long context
long_text = "The campus has 3 blocks. Block A has CS dept. Block B has faculty offices. Block C has labs."
print(answer_generator.generate_answer("Where are faculty offices?", long_text))

Dr. Sandeep Singh (sandeepsingh@ncuindia.edu).
Block B has faculty offices.


In [101]:
# Cell 4: Bulletproof Answer Generator
class AnswerGenerator:
    def __init__(self):
        self.vectorizer = TfidfVectorizer(stop_words='english')
    
    def generate_answer(self, question, context):
        sentences = [s.strip() for s in re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', context)]
        if not sentences:
            return context[:500]
            
        question_vec = self.vectorizer.fit_transform([question])
        sentence_vecs = self.vectorizer.transform(sentences)
        
        similarities = cosine_similarity(question_vec, sentence_vecs).flatten()
        best_idx = np.argmax(similarities)
        best_answer = sentences[best_idx]
        
        # Handle list continuations (e.g., faculty names)
        if ":" in best_answer:
            for s in sentences[best_idx+1:]:
                if not s[0].isupper():  # Continuation line
                    best_answer += " " + s
                else:
                    break
                    
        return best_answer[0].upper() + best_answer[1:] + ('' if best_answer.endswith(('.','!','?')) else '.')

In [102]:
# Cell 5: Complete Chatbot Engine
class CollegeChatbot:
    def __init__(self, pdf_path, qa_path):
        self.processor = TextProcessor()
        self.answer_gen = AnswerGenerator()
        
        # Load data
        self.pdf_text = self.processor.extract_text_from_pdf(pdf_path)
        self.qa_pairs = self.processor.extract_qa_from_txt(qa_path)
        
        # Prepare FAISS index
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.pdf_chunks = self.processor.chunk_text(self.pdf_text)
        self.qa_questions = [qa['question'] for qa in self.qa_pairs]
        
        # Combine all texts for embedding
        all_texts = self.pdf_chunks + self.qa_questions
        self.embeddings = self.model.encode(all_texts)
        
        # Create FAISS index
        self.index = faiss.IndexFlatL2(self.embeddings.shape[1])
        self.index.add(self.embeddings.astype('float32'))
        
    def query(self, question):
        # 1. Check Q&A pairs first
        question_embed = self.model.encode([question])
        qa_similarities = np.dot(self.embeddings[len(self.pdf_chunks):], question_embed.T).flatten()
        best_qa_idx = np.argmax(qa_similarities)
        
        if qa_similarities[best_qa_idx] > 0.7:  # Similarity threshold
            return self.qa_pairs[best_qa_idx]['answer']
        
        # 2. Search PDF chunks
        _, pdf_indices = self.index.search(question_embed.astype('float32'), k=3)
        context = ' '.join([self.pdf_chunks[idx] for idx in pdf_indices[0] if idx < len(self.pdf_chunks)])
        
        return self.answer_gen.generate_answer(question, context)

In [112]:
# Cell 6: Initialize and Run
# Set your file paths here
PDF_PATH = "Aptitude Exam Preparation Guide(1).pdf"  
QA_PATH = "questionnanswers.txt"

bot = CollegeChatbot(PDF_PATH, QA_PATH)

# Test it
print(bot.query("Who teaches Community Service?"))
print(bot.query("What is the schedule for CSS101?"))

FileNotFoundError: no such file: 'Aptitude Exam Preparation Guide(1).pdf'

In [113]:
class AnswerGenerator:
    def __init__(self):
        self.vectorizer = TfidfVectorizer(stop_words='english')
    
    def generate_answer(self, question, context):
        try:
            # Handle empty context
            if not context.strip():
                return "I couldn't find information about this topic."
                
            # Improved sentence splitting
            sentences = [s.strip() for s in re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', context) if s.strip()]
            
            if not sentences:
                return context[:500] + ('...' if len(context) > 500 else '')
            
            # Get most relevant sentence
            question_vec = self.vectorizer.fit_transform([question])
            sentence_vecs = self.vectorizer.transform(sentences)
            similarities = cosine_similarity(question_vec, sentence_vecs).flatten()
            best_idx = np.argmax(similarities)
            best_answer = sentences[best_idx]
            
            # Handle list continuations
            if ":" in best_answer:
                for s in sentences[best_idx+1:]:
                    if not s[0].isupper() and len(s.split()) < 10:  # Continuation line
                        best_answer += " " + s
                    else:
                        break
            
            # Final formatting (bulletproof)
            if not best_answer:
                return context[:500] + ('...' if len(context) > 500 else '')
                
            best_answer = best_answer.replace("  ", " ").strip()
            if not best_answer:
                return context[:500] + ('...' if len(context) > 500 else '')
                
            # Capitalize and punctuate
            if not best_answer[0].isupper():
                best_answer = best_answer[0].upper() + best_answer[1:]
            if not best_answer.endswith(('.','!','?')):
                best_answer += '.'
                
            return best_answer
            
        except Exception as e:
            print(f"Debug Error: {str(e)}")
            return context[:500] + ('...' if len(context) > 500 else '')

In [122]:
# Cell 6: Initialize and Run
# Setting my file paths here
PDF_PATH = "Aptitude Exam Preparation Guide.pdf"  
QA_PATH = "questionnanswers.txt"

bot = CollegeChatbot(PDF_PATH, QA_PATH)

# Test it
print(bot.query("Who teaches Community Service?"))
print(bot.query("What is the schedule for CSS101?"))

The Community Service (CSS101) course is taught by Ms. Garima Sharma, Dr. Sandeep Singh, Dr. Rita Chhikara, Dr. Anvesha Katti, and Dr. Yogita Gigra.
I couldn't find relevant information in the documents.


In [123]:
import os

# List files in current directory
print("Current directory:", os.getcwd())
print("Files present:", os.listdir())

Current directory: C:\Users\HP\Downloads
Files present: ['!DOCTYPE html.html', '!DOCTYPE.html', '(21csu278)Assignment 1 DE&CA.pdf', '(Aditya Sindhu )[Problem Statement 7](21csu278).docx', '(Aditya Sindhu)Document.pdf', '.env.ipynb', '.ipynb_checkpoints', '.opera', '.~banklist.csv.xlsx', '01_Drill_Functions.ipynb', '01_Drill_Numpy-2 (1).ipynb', '01_Drill_Numpy-2.ipynb', '01_Drill_Variables (1) (1).ipynb', '01_Drill_Variables (1).ipynb', '01_Drill_Variables (2) (1).ipynb', '01_Drill_Variables (2).ipynb', '01_Drill_Variables (3).ipynb', '01_Drill_Variables.ipynb', '01_Fashion MNIST Problem.ipynb', '01_Missing Data now.ipynb', '01_Notebook_control_struct.ipynb', '01_Notebook_Data and Expressions.ipynb', '01_Notebook_Distribution Plots.ipynb', '01_Notebook_Matplotlib Concepts.ipynb', '01_Notebook_Numpy Arrays (1).ipynb', '01_Notebook_Numpy Arrays (2).ipynb', '01_Notebook_Numpy Arrays.ipynb', '01_Project_Dataset Link (2) (1).txt', '01_Project_Dataset Link (2) (2).txt', '01_Project_Dataset Li

In [124]:
# Test with error cases
test_cases = [
    ("Who teaches CSS101?", "Normal query"),
    ("What is the syllabus for aptitude exam?", ""),  # Empty context
    ("Who is the faculty for Data Visualization (CSV201)?", "   "),  # Whitespace context
    ("When is the exam?", "Exam is on Monday")  # Short answer
]

for question, context in test_cases:
    print(f"Q: {question}")
    print(f"A: {answer_generator.generate_answer(question, context)}\n")

Q: Who teaches CSS101?
A: Normal query.

Q: What is the syllabus for aptitude exam?
A: .

Q: Who is the faculty for Data Visualization (CSV201)?
A: .

Q: When is the exam?
A: Exam is on Monday.



In [125]:
#  Robust Answer Generator
class AnswerGenerator:
    def __init__(self):
        self.vectorizer = TfidfVectorizer(stop_words='english')
    
    def generate_answer(self, question, context):
        try:
            # Handle empty context
            if not context or not context.strip():
                return "I don't have information about this topic."
            
            # Improved sentence splitting
            sentences = [s.strip() for s in re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', context) if s.strip()]
            
            if not sentences:
                return context[:500] + ('...' if len(context) > 500 else '')
            
            # Get most relevant sentences
            question_vec = self.vectorizer.fit_transform([question])
            sentence_vecs = self.vectorizer.transform(sentences)
            similarities = cosine_similarity(question_vec, sentence_vecs).flatten()
            
            # Get top 3 relevant sentences
            top_indices = np.argsort(similarities)[-3:][::-1]
            best_answers = []
            
            for idx in top_indices:
                if similarities[idx] > 0.1:  # Minimum similarity threshold
                    best_answers.append(sentences[idx])
            
            if not best_answers:
                return context[:500] + ('...' if len(context) > 500 else '')
            
            # Combine best answers
            final_answer = ' '.join(best_answers)
            
            # Clean up formatting
            final_answer = final_answer.replace("  ", " ").strip()
            if not final_answer:
                return context[:500] + ('...' if len(context) > 500 else '')
            
            # Ensure proper punctuation
            if not final_answer.endswith(('.','!','?')):
                final_answer += '.'
            
            return final_answer[0].upper() + final_answer[1:]
            
        except Exception as e:
            print(f"Debug Error: {str(e)}")
            return context[:500] + ('...' if len(context) > 500 else '')

In [126]:
# Enhanced Chatbot Class
class CollegeChatbot:
    def __init__(self, pdf_path, qa_path):
        self.processor = TextProcessor()
        self.answer_gen = AnswerGenerator()
        
        # Load data
        self.pdf_text = self.processor.extract_text_from_pdf(pdf_path)
        self.qa_pairs = self.processor.extract_qa_from_txt(qa_path)
        
        # Prepare embeddings
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.pdf_chunks = self.processor.chunk_text(self.pdf_text)
        self.qa_questions = [qa['question'] for qa in self.qa_pairs]
        
        # Create combined embeddings
        all_texts = self.pdf_chunks + self.qa_questions
        self.embeddings = self.model.encode(all_texts)
        
        # Create FAISS index
        self.index = faiss.IndexFlatL2(self.embeddings.shape[1])
        self.index.add(self.embeddings.astype('float32'))
        
    def query(self, question):
        # 1. Check Q&A pairs first
        question_embed = self.model.encode([question])
        qa_similarities = np.dot(
            self.embeddings[len(self.pdf_chunks):], 
            question_embed.T
        ).flatten()
        best_qa_idx = np.argmax(qa_similarities)
        
        if qa_similarities[best_qa_idx] > 0.7:  # Similarity threshold
            return self.qa_pairs[best_qa_idx]['answer']
        
        # 2. Search PDF chunks
        _, pdf_indices = self.index.search(question_embed.astype('float32'), k=3)
        context = ' '.join([
            self.pdf_chunks[idx] 
            for idx in pdf_indices[0] 
            if idx < len(self.pdf_chunks)
        ])
        
        if not context.strip():
            return "I couldn't find relevant information in the documents."
        
        return self.answer_gen.generate_answer(question, context)

In [127]:
#  Test Cases
bot = CollegeChatbot(PDF_PATH, QA_PATH)

test_questions = [
    "Who teaches CSS101?",
    "What is the syllabus for aptitude exam?",
    "Who is the faculty for Data Visualization (CSV201)?",
    "When is the exam?"
]

for question in test_questions:
    print(f"Q: {question}")
    print(f"A: {bot.query(question)}\n")

Q: Who teaches CSS101?
A: The Community Service (CSS101) course is taught by Ms. Garima Sharma, Dr. Sandeep Singh, Dr. Rita Chhikara, Dr. Anvesha Katti, and Dr. Yogita Gigra.

Q: What is the syllabus for aptitude exam?
A: I couldn't find relevant information in the documents.

Q: Who is the faculty for Data Visualization (CSV201)?
A: Prof. Michael Tibbetts is the faculty for Data Visualization (CSV201).

Q: When is the exam?
A: As this is the project for first year students therefore, we are also having the data for the Aptitute exam preparation.



In [128]:

import gradio as gr  # For the web interface

In [129]:
def chat_interface(question, history):
    return bot.query(question)

gr.ChatInterface(
    chat_interface,
    title="NCU College Assistant",
    description="Ask about courses, faculty, or exam preparation"
).launch()

Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.




## Extra Execution

In [59]:
# Add this right after initializing your bot
print("\n=== DATA VERIFICATION ===")
print(f"Loaded {len(bot.pdf_chunks)} PDF chunks")
print("Sample PDF chunk:", bot.pdf_chunks[0][:100] + "...")
print(f"\nLoaded {len(bot.qa_pairs)} Q&A pairs")
print("Sample Q&A:", bot.qa_pairs[0])


=== DATA VERIFICATION ===
Loaded 37 PDF chunks
Sample PDF chunk: This Word is the dataset for the Final Year project. In this dataset we are taking the First year co...

Loaded 41 Q&A pairs
Sample Q&A: {'question': 'Who teaches the Community Service (CSS101) course?', 'answer': 'The Community Service (CSS101) course is taught by Ms. Garima Sharma, Dr. Sandeep Singh, Dr. Rita Chhikara, Dr. Anvesha Katti, and Dr. Yogita Gigra.'}


In [66]:
import fitz  # PyMuPDF
import re
import os
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
nltk.download('punkt')
nltk.download('wordnet')

class TextProcessor:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
    
    def extract_text_from_pdf(self, pdf_path):
        """Extract text from PDF document"""
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text()
        return text
    
    def extract_qa_from_txt(self, txt_path):
        """Improved Q&A extraction that handles various formats"""
        with open(txt_path, 'r', encoding='utf-8') as file:
            content = file.read()
        
        # Handle different Q&A formats
        qa_pairs = []
        current_q = None
        current_a = []
        
        for line in content.split('\n'):
            line = line.strip()
            if line.startswith(('Q.', 'Q:', 'Q ')) or line.startswith('Q'):
                if current_q is not None:
                    qa_pairs.append({
                        'question': current_q,
                        'answer': ' '.join(current_a)
                    })
                current_q = line[2:].strip() if line.startswith(('Q.', 'Q:')) else line[1:].strip()
                current_a = []
            elif line.startswith(('A.', 'A:', 'A ')) or line.startswith('A'):
                answer_part = line[2:].strip() if line.startswith(('A.', 'A:')) else line[1:].strip()
                current_a.append(answer_part)
            elif current_a:  # Continuation of answer
                current_a.append(line)
        
        if current_q is not None:
            qa_pairs.append({
                'question': current_q,
                'answer': ' '.join(current_a)
            })
            
        return qa_pairs
    
    def chunk_text(self, text, chunk_size=200):
        """Split text into chunks of approximately chunk_size words"""
        words = text.split()
        chunks = []
        current_chunk = []
        current_count = 0
        
        for word in words:
            current_chunk.append(word)
            current_count += 1
            
            if current_count >= chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_count = 0
                
        if current_chunk:
            chunks.append(' '.join(current_chunk))
            
        return chunks
    
    def preprocess_text(self, text):
        """Clean and preprocess text"""
        text = re.sub(r'[^a-zA-Z0-9\s.,;?]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [67]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

class CollegeChatbot:
    def __init__(self, pdf_path, qa_path):
        self.processor = TextProcessor()
        self.qa_pairs = self.processor.extract_qa_from_txt(qa_path)
        
        # Create question-answer mapping for exact matches
        self.qa_map = {pair['question'].lower().strip(): pair['answer'] 
                      for pair in self.qa_pairs}
        
        # Initialize PDF processing if needed
        self.pdf_text = ""
        if pdf_path and os.path.exists(pdf_path):
            self.pdf_text = self.processor.extract_text_from_pdf(pdf_path)
            self.pdf_chunks = self.processor.chunk_text(self.pdf_text)
            self.model = SentenceTransformer('all-MiniLM-L6-v2')
            self.embeddings = self.model.encode(self.pdf_chunks)
            self.index = faiss.IndexFlatL2(self.embeddings.shape[1])
            self.index.add(self.embeddings.astype('float32'))
    
    def query(self, question):
        # 1. First check for exact match (case insensitive)
        normalized_question = question.lower().strip()
        if normalized_question in self.qa_map:
            return self.qa_map[normalized_question]
        
        # 2. Check for partial matches in questions
        for q, a in self.qa_map.items():
            if q in normalized_question or normalized_question in q:
                return a
        
        # 3. If no Q&A match, search PDF (if available)
        if hasattr(self, 'index'):
            question_embed = self.model.encode([question])
            _, pdf_indices = self.index.search(question_embed.astype('float32'), k=1)
            if pdf_indices[0][0] < len(self.pdf_chunks):
                return self.pdf_chunks[pdf_indices[0][0]]
        
        return "I couldn't find information about this topic."

In [68]:
# Initialize with your paths
bot = CollegeChatbot(
    pdf_path="Aptitude Exam Preparation Guide.pdf",
    qa_path="questionnanswers.txt"
)

# Test with your questions
test_questions = [
    "Who teaches the Community Service (CSS101) course?",
    "What is the email address of Dr. Sandeep Singh?",
    "Which room is Dr. Aditya Sharma available in for Engineering Chemistry?",
    "What is the syllabus for the Aptitude Exam at NCU?"
]

for q in test_questions:
    print(f"Q: {q}")
    print(f"A: {bot.query(q)}\n")

Q: Who teaches the Community Service (CSS101) course?
A: The Community Service (CSS101) course is taught by Ms. Garima Sharma, Dr. Sandeep Singh, Dr. Rita Chhikara, Dr. Anvesha Katti, and Dr. Yogita Gigra. 

Q: What is the email address of Dr. Sandeep Singh?
A: Dr. Sandeep Singh’s email address is sandeepsingh@ncuindia.edu. 

Q: Which room is Dr. Aditya Sharma available in for Engineering Chemistry?
A: This Word is the dataset for the Final Year project. In this dataset we are taking the First year courses and their faculty members details. The labs room location in the campus. Add on to it we are taking info of Clubs, Events and of library. As this is the project for first year students therefore, we are also having the data for the Aptitute exam preparation. First comes, the first-year course and the faculty members for that: Course Course Code Facalty ID Room No. Community Service CSS101 Ms. Garima Sharma garimasharma@ncuindi a.edu 125 Dr.Sandeep Singh sandeepsingh@ncuindia .edu 108

In [60]:
def query(self, question):
    print(f"\nProcessing question: '{question}'")
    
    # 1. Check Q&A pairs first
    question_embed = self.model.encode([question])
    qa_similarities = np.dot(
        self.embeddings[len(self.pdf_chunks):], 
        question_embed.T
    ).flatten()
    best_qa_idx = np.argmax(qa_similarities)
    
    print(f"Best Q&A match: {self.qa_pairs[best_qa_idx]['question']} (score: {qa_similarities[best_qa_idx]:.2f})")
    
    if qa_similarities[best_qa_idx] > 0.7:  # Only use if highly confident
        return self.qa_pairs[best_qa_idx]['answer']
    
    # 2. Search PDF chunks
    _, pdf_indices = self.index.search(question_embed.astype('float32'), k=3)
    context = ' '.join([
        self.pdf_chunks[idx] 
        for idx in pdf_indices[0] 
        if idx < len(self.pdf_chunks)
    ])
    
    print(f"Retrieved PDF context: {context[:200]}...")
    
    if not context.strip():
        return "I couldn't find relevant information in our documents."
    
    return self.answer_gen.generate_answer(question, context)

In [61]:
class AnswerGenerator:
    def __init__(self):
        self.vectorizer = TfidfVectorizer(stop_words='english')
    
    def generate_answer(self, question, context):
        try:
            # Handle cases where the context already contains the exact answer
            if "?" in question and question.split("?")[0].lower() + "?" in context.lower():
                relevant_part = context.split(question.split("?")[0].lower() + "?")[1].split(".")[0]
                return relevant_part.strip().capitalize()
            
            # Fallback to our previous robust method
            sentences = [s.strip() for s in re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', context) if s.strip()]
            
            if not sentences:
                return context[:300] + ('...' if len(context) > 300 else '')
            
            # Find the sentence that contains the most question keywords
            question_keywords = set(re.findall(r'\w+', question.lower()))
            best_sentence = max(
                sentences,
                key=lambda s: len(question_keywords.intersection(set(re.findall(r'\w+', s.lower())))),
                default=""
            )
            
            return best_sentence if best_sentence else context[:300] + '...'
            
        except Exception as e:
            print(f"Answer generation error: {e}")
            return context[:300] + '...'

In [62]:
test_questions = [
    "Who teaches CSS101?",
    "What is Dr. Singh's email?",
    "Where is the library located?",
    "When are the semester exams?"
]

print("=== TESTING ===")
for q in test_questions:
    print(f"\nQ: {q}")
    print(f"A: {bot.query(q)}")
    print("---")

=== TESTING ===

Q: Who teaches CSS101?
A: The Community Service (CSS101) course is taught by Ms. Garima Sharma, Dr. Sandeep Singh, Dr. Rita Chhikara, Dr. Anvesha Katti, and Dr. Yogita Gigra.
---

Q: What is Dr. Singh's email?
A: Dr. Sandeep Singh’s email address is sandeepsingh@ncuindia.edu.
---

Q: Where is the library located?
A: I couldn't find relevant information in the documents.
---

Q: When are the semester exams?
A: This Word is the dataset for the Final Year project. In this dataset we are taking the First year courses and their faculty members details. The labs room location in the campus. Add on to it we are taking info of Clubs, Events and of library. As this is the project for first year students therefore, we are also having the data for the Aptitute exam preparation. First comes, the first-year course and the faculty members for that: Course Course Code Facalty ID Room No. Community Service CSS101 Ms...
---


In [130]:
| Stage               | Methods                          | Online Tools/Models               | Offline Alternatives               |
|---------------------|----------------------------------|------------------------------------|------------------------------------|
| **Data Collection** | PDF/Q&A text extraction          | `PyPDFLoader`, `TextLoader`        | `pdfplumber`, `PyMuPDF`            |
| **Preprocessing**   | Text chunking, context preservation | `RecursiveCharacterTextSplitter`   | `NLTK`, `spaCy` sentence splitting |
| **Vector Database** | Embedding + semantic search      | `all-MiniLM-L6-v2` + `FAISS`       | `FastText` + `Annoy`               |
| **LLM Integration** | Context-aware generation         | **Online**:<br>- `GPT-3.5`<br>- `Llama-2-7b`<br>**T5**:<br>- `flan-t5-large` | **Offline**:<br>- `flan-t5-base` (quantized)<br>- `Zephyr-7b` (4-bit) |
| **QA System**       | Hybrid retrieval + prompting     | `RetrievalQA` (LangChain)          | TF-IDF + rule-based matching       |
| **Aptitude Module** | Question storage + test generation | `SQLite`                           | `JSON`/`CSV` files                 |
| **Chat Interface**  | Real-time interaction            | `Gradio`/`Streamlit`               | `Tkinter` (local GUI)              |
| **Deployment**      | Containerization + hosting       | `Docker` + `Hugging Face Spaces`   | Local server (`FastAPI`)            |
| **Monitoring**      | Interaction logging              | `CSV`/`SQLite` + `Weights & Biases`| Manual logging                     |

### **Model Specs**
| Model               | Type       | Hardware  | Use Case                     |
|---------------------|------------|-----------|------------------------------|
| `flan-t5-small`     | T5 (offline) | CPU       | Basic Q&A                    |
| `flan-t5-base`      | T5 (offline) | CPU/GPU   | Balanced speed/accuracy       |
| `Zephyr-7b`         | LLM (offline)| GPU       | High-accuracy local deployment|
| `Llama-2-7b`        | LLM (offline)| GPU       | Privacy-focused apps          |
| `GPT-3.5`           | LLM (online) | API       | Production deployments        |

SyntaxError: invalid syntax (2748382321.py, line 1)

| Stage               | Methods                          | Online Tools/Models               | Offline Alternatives               |
|---------------------|----------------------------------|------------------------------------|------------------------------------|
| **Data Collection** | PDF/Q&A text extraction          | `PyPDFLoader`, `TextLoader`        | `pdfplumber`, `PyMuPDF`            |
| **Preprocessing**   | Text chunking                    | `RecursiveCharacterTextSplitter`   | `NLTK`, `spaCy`                    |
| **Vector Database** | Embedding + semantic search      | `all-MiniLM-L6-v2` + `FAISS`       | `FastText` + `Annoy`               |
| **LLM Integration** | Context-aware generation         | `GPT-3.5`, `Llama-2-7b`            | `flan-t5-base`, `Zephyr-7b`        |
| **QA System**       | Hybrid retrieval                 | `RetrievalQA` (LangChain)          | TF-IDF + rule-based                |
| **Aptitude Module** | Question storage                 | `SQLite`                           | `JSON`/`CSV` files                 |
| **Chat Interface**  | Real-time interaction            | `Gradio`/`Streamlit`               | `Tkinter`                          |
| **Deployment**      | Containerization                 | `Docker` + `Hugging Face Spaces`   | `FastAPI` (local)                  |