In [None]:
!pip install huggingface_hub

In [None]:
# Interactive Hugging Face login via the CLI.
# NOTE(review): presumably required so the model download in the chat cell
# below is authorised — confirm whether the model actually needs a token.
!huggingface-cli login

In [None]:
# Mount Google Drive at /content/drive (Colab only: this import is not guarded,
# so the cell raises ImportError outside Colab).
from google.colab import drive
drive.mount('/content/drive')

In [8]:
# Optimized RAG Ingestion Script (Batch Embedding + Chunk Truncation)

import subprocess
import sys

def install_if_missing(package):
    """Probe ``package`` with an import and pip-install it if the probe fails.

    The same string serves as the module name for the import probe and as the
    requirement passed to ``pip install``. A failing pip run propagates as
    ``subprocess.CalledProcessError``.
    """
    try:
        __import__(package)
        already_available = True
    except ImportError:
        already_available = False

    if already_available:
        return

    # Run pip through the current interpreter so the install lands in the
    # environment this process is using.
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# Packages this cell needs. Each string is used by install_if_missing() both as
# the module name for the import probe and as the argument to `pip install`.
required_packages = [
    "sentence_transformers",
    "chromadb",
    "pandas",
    "tqdm"
]

# Install whatever cannot be imported before the real imports below run.
for pkg in required_packages:
    install_if_missing(pkg)

from sentence_transformers import SentenceTransformer
from chromadb import PersistentClient
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import gc

# Mount Google Drive if in Colab
# (outside Colab the google.colab import fails and the mount is skipped).
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
except ImportError:
    pass

# Directory in which the Chroma store is persisted; created if it is missing.
# NOTE(review): the chat cell later in this notebook opens its store at
# "/content/drive/MyDrive/Glassdoor Chroma Store", not at this path —
# presumably this directory is copied to Drive by hand after ingestion; confirm.
CHROMA_PATH = "/content/chroma_local_tmp"
Path(CHROMA_PATH).mkdir(parents=True, exist_ok=True)

# Module-level handles used by ingest_csv_optimized(): the sentence embedder
# and the Chroma collection that receives the chunks.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
client = PersistentClient(path=CHROMA_PATH)
collection = client.get_or_create_collection("csv_documents")

def ingest_csv_optimized(filepath, chunk_size=1000, truncate_len=3000, write_batch_size=1000):
    """Embed a CSV file in row chunks and store the chunks in ``collection``.

    The file is read with pandas and split into consecutive groups of
    ``chunk_size`` rows. Each group is rendered with
    ``DataFrame.to_string(index=False)`` and only the first ``truncate_len``
    characters of that text are kept, embedded with ``embed_model`` and written
    to the module-level ``collection``.

    NOTE(review): with the defaults a chunk renders 1000 rows but keeps only
    3000 characters, so most rows of a chunk likely never reach the embedding —
    confirm this is intended.

    Args:
        filepath: CSV path (``str`` or ``pathlib.Path``).
        chunk_size: Number of rows rendered into one document (must be >= 1).
        truncate_len: Maximum number of characters kept per document.
        write_batch_size: Number of documents embedded and written per
            ``collection.upsert`` call (must be >= 1). Keeps each write bounded
            and avoids holding every embedding in memory at once.

    Returns:
        The number of chunks written. Returns 0 when the file has no data rows
        or when any step fails; failures are printed, not raised. If a failure
        happens after some batches were written, those batches stay in the
        collection; because writes are upserts keyed by ``<stem>_<start_row>``,
        re-running the ingestion overwrites them instead of colliding.
    """
    try:
        # Accept plain strings as well as Path objects (.name/.stem are used below).
        filepath = Path(filepath)
        if chunk_size < 1 or write_batch_size < 1:
            raise ValueError("chunk_size and write_batch_size must be >= 1")

        df = pd.read_csv(filepath, low_memory=False)
        total_rows = len(df)
        print(f"\n📄 Processing: {filepath.name} ({total_rows} rows)")

        # A header-only CSV yields no chunks; skip it rather than sending
        # empty lists to the embedder and the vector store.
        if total_rows == 0:
            print(f"⚠️ Skipping {filepath.name}: no data rows")
            return 0

        texts, metadatas, ids = [], [], []
        for start in tqdm(range(0, total_rows, chunk_size), desc=f"Chunks in {filepath.name}"):
            chunk = df.iloc[start:start + chunk_size]
            # Slicing already handles text shorter than truncate_len.
            texts.append(chunk.to_string(index=False)[:truncate_len])
            metadatas.append({"filename": str(filepath), "start_row": int(start)})
            # NOTE: ids are derived from the file stem only, so two files with
            # the same stem in different folders map to the same ids.
            ids.append(f"{filepath.stem}_{start}")

        # Embed and write in bounded batches.
        for lo in range(0, len(texts), write_batch_size):
            hi = lo + write_batch_size
            batch_texts = texts[lo:hi]
            embeddings = embed_model.encode(batch_texts, batch_size=8)
            collection.upsert(
                documents=batch_texts,
                embeddings=[e.tolist() for e in embeddings],
                metadatas=metadatas[lo:hi],
                ids=ids[lo:hi]
            )

        gc.collect()
        print(f"✅ Ingested {len(texts)} chunks from {filepath.name}")
        return len(texts)

    except Exception as e:
        # Best-effort per file: report and let the caller continue with the next file.
        print(f"❌ Error processing {filepath}: {e}")
        return 0

def process_csv_files(folder_path, chunk_size=1000):
    """Ingest every ``*.csv`` file found recursively under ``folder_path``.

    Args:
        folder_path: Directory to search (``str`` or ``pathlib.Path``).
        chunk_size: Rows per chunk, forwarded to ``ingest_csv_optimized``.

    Returns:
        Total number of chunks reported by ``ingest_csv_optimized`` across all
        files; 0 when the folder does not exist or contains no CSV files.
    """
    folder = Path(folder_path)
    # A missing folder (e.g. Drive not mounted) would otherwise look like a
    # successful run over zero files.
    if not folder.is_dir():
        print(f"⚠️ Folder not found: {folder}")
        return 0

    # Sort so files are processed in a reproducible order.
    csv_files = sorted(folder.glob("**/*.csv"))
    if not csv_files:
        print(f"⚠️ No CSV files found under {folder}")

    total_chunks = sum(
        ingest_csv_optimized(filepath, chunk_size=chunk_size)
        for filepath in csv_files
    )

    print(f"\n✅ Done: Ingested {total_chunks} chunks total from {len(csv_files)} CSV file(s).")
    return total_chunks

# Entry point: ingest every CSV found under the Drive data folder,
# 1000 rows per chunk.
if __name__ == "__main__":
    process_csv_files("/content/drive/MyDrive/AI Chatbot Data", chunk_size=1000)


Mounted at /content/drive

📄 Processing: cleaned_glassdoor_reviews_large.new.csv (2012978 rows)


Chunks in cleaned_glassdoor_reviews_large.new.csv: 100%|██████████| 2013/2013 [04:56<00:00,  6.80it/s]


✅ Ingested 2013 chunks from cleaned_glassdoor_reviews_large.new.csv

✅ Done: Ingested 2013 chunks total from 1 CSV file(s).


In [None]:
# ai_recruiter_with_rag.py (hardcoded base prompt)

import subprocess
import sys
import importlib.util
import os

# When a /content directory exists (used here as the signal for a Colab-style
# runtime), install the libmagic1 system package via apt. A failure is printed
# and otherwise ignored.
# NOTE(review): presumably this is the native library behind the `magic`
# import further down — confirm.
if os.path.exists("/content"):
    try:
        subprocess.check_call(["apt-get", "install", "-y", "libmagic1"])
    except Exception as e:
        print(f"⚠️ Failed to install apt dependency: {e}")

# Maps the module name probed by install_missing() to the arguments handed to
# `pip install` when that module cannot be found (the value is split on
# whitespace, so flags may follow the requirement).
# Because installation only happens for missing modules, the `--upgrade` flags
# below do not upgrade a package that is already importable.
required = {
    "torch": "torch",
    "pandas": "pandas",
    "mammoth": "mammoth",
    "docx": "python-docx",
    "fitz": "PyMuPDF",
    "xlrd": "xlrd",
    "sentence_transformers": "sentence-transformers",
    "transformers": "transformers --upgrade",
    "gradio": "gradio",
    # Installed from the Git repository rather than from a package index.
    "bitsandbytes": "git+https://github.com/TimDettmers/bitsandbytes.git",
    "accelerate": "accelerate --upgrade",
    "chromadb": "chromadb",
    "magic": "python-magic"
}

def install_missing(pkg_map):
    """Pip-install every package in ``pkg_map`` whose module cannot be found.

    Args:
        pkg_map: Mapping of importable module name -> arguments for
            ``pip install`` (a requirement optionally followed by flags,
            separated by whitespace, e.g. ``"accelerate --upgrade"``).

    Raises:
        subprocess.CalledProcessError: If a ``pip install`` run exits non-zero.
    """
    for imp_name, pip_cmd in pkg_map.items():
        if importlib.util.find_spec(imp_name) is None:
            print(f"📦 Installing {pip_cmd}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install"] + pip_cmd.split())
            # Files were just added to site-packages while the interpreter is
            # running; refresh the import finders so later find_spec() probes
            # and the imports that follow can see them.
            importlib.invalidate_caches()

# Install anything from `required` that is not importable before the imports below run.
install_missing(required)

import io
import time
import torch
import pandas as pd
import mammoth
import docx
import fitz
import xlrd
import magic
import numpy as np
from io import StringIO
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import gradio as gr
from chromadb import PersistentClient

# Mount Google Drive when running in Colab; elsewhere the import fails and the
# mount is skipped.
try:
    from google.colab import drive
    drive.mount('/content/drive')
except ImportError:
    pass

# Persistent Chroma store queried by qa_with_llm() when no file is uploaded.
# NOTE(review): the ingestion cell earlier in this notebook writes its store to
# "/content/chroma_local_tmp", not to this Drive folder — confirm the ingested
# data is copied here, otherwise this collection may be empty.
CHROMA_PATH = "/content/drive/MyDrive/Glassdoor Chroma Store"
client = PersistentClient(path=CHROMA_PATH)
collection = client.get_or_create_collection("csv_documents")

# Hugging Face model id of the chat model used by qa_with_llm().
model_path = "mistralai/Mistral-7B-Instruct-v0.2"
# NOTE(review): trust_remote_code=True permits running Python code shipped in
# the model repository; confirm it is actually required for this model.
try:
    # First choice: 4-bit NF4 quantization with double quantization and
    # float16 compute.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=torch.float16
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        quantization_config=bnb_config,
        trust_remote_code=True
    )
    print("✅ Loaded model with 4-bit quantization (bnb)")
except Exception as e:
    # Report why the quantized load failed instead of discarding the reason,
    # then retry with plain float16 weights.
    print(f"⚠️ Failed to load with bitsandbytes, falling back to fp16: {e}")
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# The tokenizer is called with padding=True later; fall back to the EOS token
# when no pad token is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Embedder used to turn questions into query vectors for the Chroma collection.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

def _read_upload_bytes(file):
    """Return the raw bytes behind an uploaded-file value.

    Supported shapes: raw ``bytes``/``bytearray``, a filesystem path
    (``str`` or ``os.PathLike``), an object with ``read()``, or an object whose
    ``value`` or ``name`` attribute is an existing path.

    Raises:
        ValueError: If ``file`` matches none of the supported shapes or the
            path it refers to does not exist.
    """
    if isinstance(file, (bytes, bytearray)):
        return bytes(file)

    if isinstance(file, (str, os.PathLike)):
        if os.path.exists(file):
            with open(file, "rb") as f:
                return f.read()
        raise ValueError("Unsupported file object or missing file path.")

    if hasattr(file, "read"):
        data = file.read()
        # A handle opened in text mode yields str; the MIME sniffing and the
        # binary parsers below need bytes.
        return data.encode("utf-8") if isinstance(data, str) else data

    for attr in ("value", "name"):
        path = getattr(file, attr, None)
        if isinstance(path, (str, os.PathLike)) and os.path.exists(path):
            with open(path, "rb") as f:
                return f.read()

    raise ValueError("Unsupported file object or missing file path.")


def extract_text_from_file(file, max_chars=3000):
    """Extract up to ``max_chars`` characters of text from an uploaded file.

    The content type is detected from the bytes with ``magic.from_buffer`` and
    dispatched to a matching reader: plain text, PDF (``fitz``), .docx
    (``docx``), ``application/msword`` (``mammoth``), CSV and Excel (pandas).

    NOTE(review): spreadsheet detection only matches MIME types containing
    "excel"; verify which MIME type libmagic reports for .xlsx files, as they
    may currently fall through to the empty result.

    Args:
        file: ``None``, raw bytes, a path, a readable file object, or an object
            whose ``value``/``name`` attribute is an existing path.
        max_chars: Maximum number of characters returned.

    Returns:
        The extracted text truncated to ``max_chars``; an empty string when
        ``file`` is ``None``, the type is unsupported, or reading/extraction
        fails (the error is printed, not raised).
    """
    if file is None:
        return ""

    try:
        bytes_data = _read_upload_bytes(file)
    except Exception as e:
        print(f"❌ Failed to read uploaded file: {e}")
        return ""

    try:
        # MIME detection is inside the guarded region so a libmagic failure is
        # reported like any other extraction problem instead of crashing.
        mime_type = magic.from_buffer(bytes_data, mime=True)
        stream = io.BytesIO(bytes_data)

        if mime_type == "text/plain":
            text = bytes_data.decode("utf-8", errors="ignore")
        elif mime_type == "application/pdf":
            with fitz.open(stream=stream, filetype="pdf") as doc:
                text = "\n".join(page.get_text() for page in doc)
        elif mime_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            text = "\n".join(p.text for p in docx.Document(stream).paragraphs)
        elif mime_type == "application/msword":
            text = mammoth.extract_raw_text(stream).value
        elif mime_type == "text/csv":
            text = pd.read_csv(StringIO(bytes_data.decode("utf-8", errors="ignore"))).to_string()
        elif "excel" in mime_type:
            text = pd.read_excel(stream).to_string()
        else:
            # Unsupported content type.
            text = ""
    except Exception as e:
        print(f"❌ Extraction error: {e}")
        return ""

    return text[:max_chars]

def qa_with_llm(file, question):
    """Answer ``question`` with the chat model, grounded in a file or the vector store.

    Context comes from the uploaded ``file`` when one is given (via
    ``extract_text_from_file``); otherwise the three nearest documents for the
    question are fetched from the module-level Chroma ``collection``. The first
    1000 characters of the context are placed in the prompt.

    Args:
        file: Uploaded file value from the UI, or a falsy value for none.
        question: User question; ``None`` or blank means "summarise the upload".

    Returns:
        The generated answer text, or a warning string when there is neither a
        usable document nor a question, or when generation fails.
    """
    base_prompt = """You are a seasoned, trustworthy talent‑acquisition specialist.
Using insights from the Glassdoor Job Reviews dataset, distill employee feedback to describe a company’s culture, work environment, benefits, challenges, and overall satisfaction.
When asked, compare firms, spotlight strengths and weaknesses, and advise job‑seekers on career moves.

Your audience is actively exploring new opportunities and relies on you for clear, unbiased, actionable guidance in plain, professional language."""
    empty_input_message = "⚠️ Please upload a document or enter a question."

    # Normalise first: the UI/API may pass None, and whitespace is not a question.
    question = (question or "").strip()

    # With no file and no question there is nothing to retrieve for. Checking
    # this before the vector search keeps a blank query from pulling arbitrary
    # documents and turning them into an "uploaded content" summary.
    if not file and not question:
        return empty_input_message

    if file:
        context = extract_text_from_file(file)
    else:
        query_embedding = embed_model.encode(question).tolist()
        results = collection.query(query_embeddings=[query_embedding], n_results=3)
        context = "\n\n".join(results['documents'][0]) if results and results['documents'] else ""

    if not question:
        # A file was supplied without a question: summarise it, unless nothing
        # could be extracted from it.
        if not context:
            return empty_input_message
        question = "What is the most relevant summary or insight based on the uploaded content?"

    prompt = f"{base_prompt}\n\nQuestion: {question}\n\nContext:\n{context[:1000]}\n\nAnswer:"

    tokenized = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096, padding=True)
    inputs = {k: v.to(model.device) for k, v in tokenized.items()}
    prompt_len = inputs['input_ids'].shape[1]
    print(f"🧠 Prompt token count: {prompt_len}")

    try:
        start = time.time()
        output = model.generate(**inputs, max_new_tokens=512, temperature=0.7, do_sample=True)
        end = time.time()
        print(f"⏱️ Response time: {end - start:.2f} sec")
    except Exception as e:
        print(f"❌ Model generation failed: {e}")
        return "⚠️ An error occurred while generating the answer. Please try with a different file or question."

    # The generated sequence starts with the prompt tokens; decode only what
    # follows them. Splitting the decoded text on "Answer:" would cut the reply
    # short whenever the model repeats that marker.
    generated_ids = output[0][prompt_len:]
    final_answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    return final_answer

# Gradio UI: an optional file upload plus a free-text question, both passed to
# qa_with_llm(); the string it returns is shown as plain text.
app = gr.Interface(
    fn=qa_with_llm,
    inputs=[
        gr.File(label="Upload Document"),
        gr.Textbox(label="Ask a question", placeholder="What are the top-rated companies for software engineers?")
    ],
    outputs="text",
    title="AI Recruiter Assistant",
    description="Upload a job-related document and/or ask the AI for recommendations or summaries."
)

# Start the server from this cell.
# NOTE(review): share=True presumably publishes a public share link and
# debug=True presumably keeps the cell attached to the server — confirm against
# the installed Gradio version and whether public exposure is intended.
app.launch(share=True, debug=True)


In [None]:
# Close the interface launched in the previous cell.
app.close()