In [1]:
#Step 1: Environment Setup - Complete Implementation

In [2]:
#Check Python Environment

In [3]:
import sys
import platform

print(f"Python version: {sys.version}")
print(f"Platform: {platform.platform()}")
print(f"Architecture: {platform.architecture()}")

# Check if we're in a virtual environment (recommended)
if hasattr(sys, 'real_prefix') or (hasattr(sys, 'base_prefix') and sys.base_prefix != sys.prefix):
    print("✓ Running in virtual environment")
else:
    print("⚠️  Consider using a virtual environment for this project")

Python version: 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:17:27) [MSC v.1929 64 bit (AMD64)]
Platform: Windows-11-10.0.26100-SP0
Architecture: ('64bit', 'WindowsPE')
⚠️  Consider using a virtual environment for this project


In [4]:
#Install Core Packages

In [5]:
# Install all required packages for the RAG system
import subprocess
import sys

def install_package(package):
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        print(f"✓ Successfully installed {package}")
    except subprocess.CalledProcessError as e:
        print(f"✗ Failed to install {package}: {e}")

# Core packages for RAG system
packages = [
    "sentence-transformers",  # For local embeddings
    "faiss-cpu",             # Vector database (CPU version)
    "chromadb",              # Alternative vector database
    "pandas",                # Data manipulation
    "numpy",                 # Numerical operations
    "scikit-learn",          # ML utilities
    "nltk",                  # Natural language processing
    "tqdm",                  # Progress bars
    "matplotlib",            # Plotting
    "seaborn",               # Better plotting
    "jupyter",               # Jupyter notebook support
    "ipywidgets",            # Interactive widgets
]

print("Installing packages for RAG system...")
for package in packages:
    install_package(package)

Installing packages for RAG system...
✓ Successfully installed sentence-transformers
✓ Successfully installed faiss-cpu
✓ Successfully installed chromadb
✓ Successfully installed pandas
✓ Successfully installed numpy
✓ Successfully installed scikit-learn
✓ Successfully installed nltk
✓ Successfully installed tqdm
✓ Successfully installed matplotlib
✓ Successfully installed seaborn
✓ Successfully installed jupyter
✓ Successfully installed ipywidgets


In [7]:
#Install Optional Packages (GPU Support)

In [8]:
# Optional: Install GPU-accelerated packages if you have CUDA
import torch

def check_gpu_support():
    if torch.cuda.is_available():
        print(f"✓ CUDA available: {torch.cuda.get_device_name()}")
        return True
    else:
        print("ℹ️  CUDA not available, using CPU versions")
        return False

has_gpu = check_gpu_support()

# Install GPU versions if available
if has_gpu:
    gpu_packages = [
        "faiss-gpu",  # GPU version of FAISS (will replace faiss-cpu)
    ]
    
    print("\nInstalling GPU-accelerated packages...")
    for package in gpu_packages:
        install_package(package)

ℹ️  CUDA not available, using CPU versions


In [9]:
#Download and Setup NLTK Data

In [10]:
# Download required NLTK data for text processing
import nltk

nltk_downloads = [
    'punkt',        # Sentence tokenizer
    'stopwords',    # Stop words
    'wordnet',      # WordNet lemmatizer
    'averaged_perceptron_tagger',  # POS tagger
]

print("Downloading NLTK data...")
for item in nltk_downloads:
    try:
        nltk.download(item, quiet=True)
        print(f"✓ Downloaded {item}")
    except Exception as e:
        print(f"✗ Failed to download {item}: {e}")

Downloading NLTK data...
✓ Downloaded punkt
✓ Downloaded stopwords
✓ Downloaded wordnet
✓ Downloaded averaged_perceptron_tagger


In [11]:
# Import and Test All Libraries

In [12]:
# Import all libraries and test they work correctly
import warnings
warnings.filterwarnings('ignore')  # Suppress warnings for cleaner output

# Core imports
try:
    import pandas as pd
    import numpy as np
    import json
    import os
    import re
    from pathlib import Path
    from typing import List, Dict, Tuple, Optional
    print("✓ Core libraries imported")
except ImportError as e:
    print(f"✗ Core import error: {e}")

# ML and NLP imports
try:
    from sentence_transformers import SentenceTransformer
    import faiss
    import chromadb
    from sklearn.metrics.pairwise import cosine_similarity
    import nltk
    from nltk.tokenize import sent_tokenize, word_tokenize
    from nltk.corpus import stopwords
    print("✓ ML/NLP libraries imported")
except ImportError as e:
    print(f"✗ ML/NLP import error: {e}")

# Utility imports
try:
    from tqdm.notebook import tqdm  # Progress bars for Jupyter
    import matplotlib.pyplot as plt
    import seaborn as sns
    print("✓ Utility libraries imported")
except ImportError as e:
    print(f"✗ Utility import error: {e}")

✓ Core libraries imported
✓ ML/NLP libraries imported
✓ Utility libraries imported


In [13]:
#System Resource Check

In [15]:
# Check system resources for optimal configuration
import psutil
import gc
from pathlib import Path
import shutil

def _pretty_gb(x): 
    return f"{x/(1024**3):.2f} GB"

def safe_disk_usage(path="."):
    """Robust disk usage that avoids psutil edge cases on Windows."""
    try:
        # Try psutil with the resolved path first
        return psutil.disk_usage(str(Path(path).resolve()))
    except Exception:
        # Fall back to drive root (Windows) or "/" (POSIX)
        anchor = Path(path).resolve().anchor or (Path(path).anchor or "/")
        try:
            return psutil.disk_usage(anchor)
        except Exception:
            # Final fallback: shutil (cross-platform)
            total, used, free = shutil.disk_usage(anchor or "/")
            # Mimic psutil result
            from collections import namedtuple
            DiskUsage = namedtuple("sdiskusage", ["total", "used", "free", "percent"])
            percent = (used / total * 100) if total else 0.0
            return DiskUsage(total, used, free, percent)

def check_system_resources():
    # Memory info
    memory = psutil.virtual_memory()
    print(f"Total RAM: {_pretty_gb(memory.total)}")
    print(f"Available RAM: {_pretty_gb(memory.available)}")
    print(f"RAM Usage: {memory.percent}%")
    
    # CPU info
    print(f"CPU Cores: {psutil.cpu_count(logical=False)} physical, {psutil.cpu_count(logical=True)} logical")
    
    # Disk space (robust)
    disk = safe_disk_usage(".")
    print(f"Disk Space: {_pretty_gb(disk.free)} free of {_pretty_gb(disk.total)} ({disk.percent:.1f}% used)")
    
    # GPU info
    try:
        import torch
        if torch.cuda.is_available():
            gpu_count = torch.cuda.device_count()
            print(f"GPUs available: {gpu_count}")
            for i in range(gpu_count):
                props = torch.cuda.get_device_properties(i)
                print(f"  GPU {i}: {props.name} ({_pretty_gb(props.total_memory)})")
        else:
            print("No GPU available")
    except Exception:
        print("PyTorch not available for GPU check")

check_system_resources()

# Recommend configuration based on resources
def recommend_config():
    memory_gb = psutil.virtual_memory().total / (1024**3)
    print("\n=== Recommended Configuration ===")
    if memory_gb >= 32:
        print("High-end setup: Use large embedding models and ChromaDB")
        return {"embedding_model": "all-mpnet-base-v2", "vector_db": "chromadb", "batch_size": 100}
    elif memory_gb >= 16:
        print("Mid-range setup: Use medium embedding models and FAISS")
        return {"embedding_model": "all-MiniLM-L6-v2", "vector_db": "faiss", "batch_size": 50}
    elif memory_gb >= 8:
        print("Budget setup: Use lightweight models and small batches")
        return {"embedding_model": "all-MiniLM-L6-v2", "vector_db": "faiss", "batch_size": 25}
    else:
        print("Low memory: Use very lightweight setup")
        return {"embedding_model": "paraphrase-MiniLM-L3-v2", "vector_db": "faiss", "batch_size": 10}

config = recommend_config()


Total RAM: 15.86 GB
Available RAM: 3.98 GB
RAM Usage: 74.9%
CPU Cores: 4 physical, 8 logical
Disk Space: 13.02 GB free of 236.95 GB (94.5% used)
No GPU available

=== Recommended Configuration ===
Budget setup: Use lightweight models and small batches


In [16]:
#Create Project Directory Structure

In [17]:
# Create organized directory structure for your RAG project
import os
from pathlib import Path

def create_project_structure():
    directories = [
        "data/raw",           # Your original text files and JSON
        "data/processed",     # Cleaned and chunked data
        "data/embeddings",    # Saved embeddings
        "models",            # Downloaded models
        "vector_db",         # Vector database files
        "outputs",           # Query results and logs
        "notebooks",         # Additional notebooks
        "utils",             # Helper functions
    ]
    
    for directory in directories:
        Path(directory).mkdir(parents=True, exist_ok=True)
        print(f"✓ Created directory: {directory}")
    
    # Create .gitignore for the project
    gitignore_content = """
# Models and embeddings (too large for git)
models/
data/embeddings/
vector_db/

# Jupyter notebook checkpoints
.ipynb_checkpoints/

# Python cache
__pycache__/
*.pyc

# Environment variables
.env

# Large data files
*.bin
*.gguf
*.model
"""
    
    with open('.gitignore', 'w') as f:
        f.write(gitignore_content)
    print("✓ Created .gitignore file")

create_project_structure()

✓ Created directory: data/raw
✓ Created directory: data/processed
✓ Created directory: data/embeddings
✓ Created directory: models
✓ Created directory: vector_db
✓ Created directory: outputs
✓ Created directory: notebooks
✓ Created directory: utils
✓ Created .gitignore file
