## 1. Setup

In [1]:
import os
import json
import logging
from typing import List, Dict, Any, Optional
from pathlib import Path
from dotenv import load_dotenv
import warnings
warnings.filterwarnings('ignore')

# PDF Processing
import PyPDF2
import fitz  # PyMuPDF for better text extraction
import re

# Data Processing
import pandas as pd
import numpy as np

# OpenAI Integration
import openai
from openai import OpenAI

# Vector Database (we'll use Chroma for local development)
import chromadb
from chromadb.config import Settings

# Embeddings
from sentence_transformers import SentenceTransformer

# Utilities
from datetime import datetime
import hashlib

# Configure logging: INFO level on the root logger with a
# "timestamp - level - message" line format, then create the module-level
# logger that the classes and functions in this notebook log through.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [10]:
# Report only whether the key is present. Echoing the value itself (as a bare
# expression at the end of a cell does) would store the secret in the saved
# notebook output.
print(f"OPENAI_API_KEY set: {bool(os.getenv('OPENAI_API_KEY'))}")

In [12]:
# Load variables from a .env file into the process environment so that
# Config can pick up OPENAI_API_KEY through os.getenv.
load_dotenv()

# Configuration and Setup
class Config:
    """Configuration for the research assistant pipeline.

    On construction this reads the OpenAI API key from the ``OPENAI_API_KEY``
    environment variable, sets the model name, the working directories
    (relative to the current working directory), the embedding model name and
    the chunking parameters, and creates the output, cache and vector database
    directories.
    """

    def __init__(self):
        # API Configuration
        self.openai_api_key = os.getenv('OPENAI_API_KEY')
        self.openai_model = "gpt-4o"

        # File paths
        self.files_dir = Path("../files")
        self.output_dir = Path("../output")
        self.cache_dir = Path("../cache")

        # Vector Database
        self.vector_db_path = Path("../vector_db")
        self.embedding_model_name = "all-MiniLM-L6-v2"

        # Processing settings
        self.max_chunk_size = 1000  # characters per chunk
        self.chunk_overlap = 200    # characters overlap between chunks

        # Create directories if they don't exist
        self._create_directories()

    def _create_directories(self):
        """Create the output, cache and vector database directories."""
        for directory in (self.output_dir, self.cache_dir, self.vector_db_path):
            # parents=True so a missing intermediate directory does not abort
            # construction if one of the paths is changed to a nested location.
            directory.mkdir(parents=True, exist_ok=True)
            logger.info(f"Directory created/verified: {directory}")

    def validate_setup(self):
        """Check that the configuration is usable.

        Verifies that an API key is present, that the input files directory
        exists, that the chunking parameters are consistent, and (when a key
        is present) that a minimal chat completion request with the
        configured model succeeds.

        Returns:
            List of human-readable problem descriptions; empty when every
            check passed.
        """
        issues = []

        # Check OpenAI API key
        if not self.openai_api_key:
            issues.append("OpenAI API key not found. Please set OPENAI_API_KEY in your .env file.")

        # Check that the files directory exists and really is a directory
        if not self.files_dir.is_dir():
            issues.append(f"Files directory not found: {self.files_dir}")

        # An overlap that is not smaller than the chunk size cannot produce
        # meaningful overlapping chunks.
        if not 0 <= self.chunk_overlap < self.max_chunk_size:
            issues.append(
                f"chunk_overlap ({self.chunk_overlap}) must be non-negative and "
                f"smaller than max_chunk_size ({self.max_chunk_size})."
            )

        # Check if we can access OpenAI
        if self.openai_api_key:
            try:
                client = OpenAI(api_key=self.openai_api_key)
                # Minimal test call against the model the pipeline will
                # actually use, so a wrong model name is caught here too.
                client.chat.completions.create(
                    model=self.openai_model,
                    messages=[{"role": "user", "content": "Hello"}],
                    max_tokens=5
                )
                logger.info("✅ OpenAI API connection successful")
            except Exception as e:
                # Any failure (authentication, network, unknown model) is
                # reported as a setup issue instead of propagating.
                issues.append(f"OpenAI API connection failed: {str(e)}")

        return issues

# Build the shared configuration object (its constructor also creates the
# working directories)
config = Config()
print("🔧 Configuration initialized")

# Run the setup checks and report the outcome
issues = config.validate_setup()
if not issues:
    print("✅ Setup validation passed!")
else:
    print("❌ Setup issues found:")
    for issue in issues:
        print(f"  - {issue}")

2025-06-19 12:13:00,468 - INFO - Directory created/verified: ..\output
2025-06-19 12:13:00,470 - INFO - Directory created/verified: ..\cache
2025-06-19 12:13:00,475 - INFO - Directory created/verified: ..\vector_db


🔧 Configuration initialized


2025-06-19 12:13:02,367 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-19 12:13:02,383 - INFO - ✅ OpenAI API connection successful


✅ Setup validation passed!


## 2. PDF Processing

In [6]:
# PDF Processing Utilities
class PDFProcessor:
    """Handles PDF text extraction, section detection and chunking."""

    # (compiled pattern, title level) pairs, tried in order; the first match
    # wins. Levels are the values the patterns have always been assigned:
    # numbered titles are level 1, numbered subtitles level 2, the rest 0.
    _TITLE_PATTERNS = [
        (re.compile(r'^(\d+\.\s+[A-Z][^.\n]+)'), 1),                # "1. Title"
        (re.compile(r'^([A-Z][A-Z\s]{3,}[A-Z])'), 0),               # "ALL CAPS TITLE"
        (re.compile(r'^(\d+\.\d+\s+[A-Z][^.\n]+)'), 2),             # "1.1 Subtitle"
        (re.compile(r'^([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s*:)'), 0),  # "Title:"
    ]

    # How many characters before the size limit are searched for a sentence end
    _SENTENCE_LOOKBACK = 100

    def __init__(self, config: Config):
        self.config = config

    def extract_text_from_pdf(self, pdf_path: Path) -> Dict[str, Any]:
        """
        Extract text from PDF with page information and metadata

        Args:
            pdf_path: Path to the PDF file

        Returns:
            Dictionary with the keys 'file_path', 'file_name', 'total_pages',
            'pages' (one dict per page with 'page_number', 'text' and
            'text_length'), 'full_text' (all pages, each preceded by a
            "--- Page N ---" marker line), 'metadata' and
            'extraction_timestamp'

        Raises:
            Exception: any error raised while opening or reading the file is
                logged and re-raised unchanged
        """
        try:
            # Use PyMuPDF for better text extraction
            doc = fitz.open(pdf_path)
            try:
                total_pages = len(doc)
                pages = []
                text_parts = []

                # Extract text from each page
                for page_num in range(total_pages):
                    text = doc.load_page(page_num).get_text()
                    pages.append({
                        'page_number': page_num + 1,
                        'text': text,
                        'text_length': len(text)
                    })
                    text_parts.append(f"\n--- Page {page_num + 1} ---\n{text}")

                extracted_data = {
                    'file_path': str(pdf_path),
                    'file_name': pdf_path.name,
                    'total_pages': total_pages,
                    'pages': pages,
                    # Joined once instead of growing a string page by page
                    'full_text': "".join(text_parts),
                    'metadata': doc.metadata,
                    'extraction_timestamp': datetime.now().isoformat()
                }
            finally:
                # Release the file handle even when a page fails to load
                doc.close()

            logger.info(f"Successfully extracted text from {pdf_path.name} ({total_pages} pages)")
            return extracted_data

        except Exception as e:
            logger.error(f"Error extracting text from {pdf_path}: {str(e)}")
            raise

    def detect_titles_and_sections(self, text: str) -> List[Dict[str, Any]]:
        """
        Detect titles and sections in the text using regex patterns

        Lines are stripped and blank lines are skipped. A line matching one of
        the title patterns starts a new section; every following non-title
        line is appended to that section. Text that appears before the first
        detected title is not part of any section. When no title is found at
        all, a single 'Document Content' section holding the whole text is
        returned.

        Args:
            text: Full text of the document

        Returns:
            List of section dicts with the keys 'title', 'title_level',
            'content', 'start_line' and 'end_line' (0-based line indices)
        """
        lines = text.split('\n')
        sections = []
        current_section = None

        for line_num, raw_line in enumerate(lines):
            line = raw_line.strip()
            if not line:
                continue

            # Level of the first matching title pattern, or None for body text
            title_level = next(
                (level for pattern, level in self._TITLE_PATTERNS if pattern.match(line)),
                None,
            )

            if title_level is not None:
                # Save previous section if exists
                if current_section:
                    sections.append(current_section)

                # Start new section
                current_section = {
                    'title': line,
                    'title_level': title_level,
                    'content': line + '\n',
                    'start_line': line_num,
                    'end_line': line_num
                }
            elif current_section:
                # Add line to current section
                current_section['content'] += line + '\n'
                current_section['end_line'] = line_num

        # Add the last section
        if current_section:
            sections.append(current_section)

        # If no sections detected, create one section with all content
        if not sections:
            sections = [{
                'title': 'Document Content',
                'title_level': 0,
                'content': text,
                'start_line': 0,
                'end_line': len(lines) - 1
            }]

        logger.info(f"Detected {len(sections)} sections in the document")
        return sections

    def create_chunks(self, text: str, max_size: int = None, overlap: int = None) -> List[str]:
        """
        Create overlapping chunks from text for embedding

        Each chunk is at most about ``max_size`` characters long. When a chunk
        would end in the middle of the text, the end is moved back to just
        after the nearest '.', '!' or '?' found within the last 100 characters
        of the window, if there is one. Consecutive chunks share ``overlap``
        characters. Chunking stops as soon as a chunk reaches the end of the
        text, so no trailing chunk is emitted that is already fully contained
        in the previous one.

        Args:
            text: Text to chunk
            max_size: Maximum chunk size in characters; None or 0 selects the
                configured ``max_chunk_size``
            overlap: Overlap size in characters; None selects the configured
                ``chunk_overlap``, 0 disables overlapping

        Returns:
            List of text chunks (whitespace-stripped, empty chunks omitted)

        Raises:
            ValueError: if ``max_size`` or ``overlap`` is negative
        """
        if not max_size:
            max_size = self.config.max_chunk_size
        # `is None` rather than `or`: an explicit overlap of 0 is a valid request
        if overlap is None:
            overlap = self.config.chunk_overlap
        if max_size < 0:
            raise ValueError("max_size must not be negative")
        if overlap < 0:
            raise ValueError("overlap must not be negative")

        text_length = len(text)
        chunks = []
        start = 0

        while start < text_length:
            end = min(start + max_size, text_length)

            # Try to break at sentence boundary
            if end < text_length:
                lower_bound = max(start, end - self._SENTENCE_LOOKBACK)
                for i in range(end, lower_bound, -1):
                    if text[i] in '.!?':
                        end = i + 1
                        break

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)

            if end >= text_length:
                break

            # Step back by the overlap, but always move forward: if the
            # overlap would swallow the whole chunk, continue at its end.
            next_start = end - overlap
            start = next_start if next_start > start else end

        logger.info(f"Created {len(chunks)} chunks from text")
        return chunks

# Create the module-level PDFProcessor instance that test_pdf_processing() uses
pdf_processor = PDFProcessor(config)
print("📄 PDF Processor initialized")

📄 PDF Processor initialized


In [7]:
# Test the PDF processing with your extracted pages
def test_pdf_processing(pdf_name: str = "extracted_pages_134_149.pdf"):
    """Run text extraction, section detection and chunking on one PDF.

    Progress is printed along the way, including the titles of the first
    five detected sections (shortened to 50 characters).

    Args:
        pdf_name: File name of the PDF inside ``config.files_dir``

    Returns:
        Dict with the keys 'extracted_data', 'sections' and 'chunks', or None
        when the PDF file does not exist
    """
    title_preview_length = 50

    # Path to the extracted pages PDF
    extracted_pdf_path = config.files_dir / pdf_name

    if not extracted_pdf_path.exists():
        print(f"❌ Extracted PDF not found: {extracted_pdf_path}")
        print("Please run the extract_pdf_pages.py script first to create the extracted pages.")
        return None

    print(f"📖 Processing: {extracted_pdf_path}")

    # Extract text
    extracted_data = pdf_processor.extract_text_from_pdf(extracted_pdf_path)

    print(f"✅ Extracted {extracted_data['total_pages']} pages")
    print(f"📝 Total text length: {len(extracted_data['full_text'])} characters")

    # Detect sections
    sections = pdf_processor.detect_titles_and_sections(extracted_data['full_text'])

    print(f"📋 Detected {len(sections)} sections:")
    for position, section in enumerate(sections[:5], start=1):  # Show first 5 sections
        title = section['title']
        # Only mark a title with an ellipsis when it really was cut off
        suffix = "..." if len(title) > title_preview_length else ""
        print(f"  {position}. {title[:title_preview_length]}{suffix}")

    # Create chunks
    chunks = pdf_processor.create_chunks(extracted_data['full_text'])

    print(f"🔗 Created {len(chunks)} chunks for embedding")

    return {
        'extracted_data': extracted_data,
        'sections': sections,
        'chunks': chunks
    }

# Run the test; test_results is None when the PDF file is missing, otherwise
# a dict with the extracted data, the detected sections and the chunks
test_results = test_pdf_processing()

2025-06-19 12:06:34,340 - INFO - Successfully extracted text from extracted_pages_134_149.pdf (16 pages)
2025-06-19 12:06:34,348 - INFO - Detected 86 sections in the document
2025-06-19 12:06:34,350 - INFO - Created 59 chunks from text


📖 Processing: ..\files\extracted_pages_134_149.pdf
✅ Extracted 16 pages
📝 Total text length: 44942 characters
📋 Detected 86 sections:
  1. Keywords: Large Language Models · ChatGPT · Induct...
  2. 1. Assistance...
  3. 2. Encouragement...
  4. 3. Checking in/concern...
  5. 4. Comfort/consolation...
🔗 Created 59 chunks for embedding


## 3. Vector Database

In [8]:
# Vector Database Setup
class VectorDatabase:
    """Handles vector database operations for document embeddings.

    Wraps a persistent ChromaDB client, a default collection named
    "research_documents" and a SentenceTransformer model that produces the
    embeddings for both stored documents and search queries.
    """

    def __init__(self, config: Config):
        self.config = config
        self.client = None
        self.collection = None
        self.embedding_model = None
        self._initialize()

    def _initialize(self):
        """Initialize the vector database and embedding model.

        Raises:
            Exception: any initialization error is logged and re-raised
        """
        try:
            # Initialize ChromaDB
            self.client = chromadb.PersistentClient(
                path=str(self.config.vector_db_path),
                settings=Settings(anonymized_telemetry=False)
            )

            # Initialize embedding model
            self.embedding_model = SentenceTransformer(self.config.embedding_model_name)

            # Create or get collection
            self.collection = self.client.get_or_create_collection(
                name="research_documents",
                metadata={"description": "Research document embeddings"}
            )

            logger.info("✅ Vector database initialized successfully")

        except Exception as e:
            logger.error(f"Error initializing vector database: {str(e)}")
            raise

    def _get_collection(self, collection_name: str = None):
        """Return the default collection, or the named one (created on demand)."""
        if collection_name is None:
            return self.collection
        return self.client.get_or_create_collection(collection_name)

    @staticmethod
    def _fallback_id(index: int, text: str) -> str:
        """Build an ID for a document that was passed in without one.

        The position keeps IDs unique within a batch; the content digest keeps
        different documents from different batches from sharing an ID.
        """
        digest = hashlib.sha256(text.encode('utf-8')).hexdigest()[:12]
        return f"doc_{index}_{digest}"

    def add_documents(self, documents: List[Dict[str, Any]], collection_name: str = None) -> bool:
        """
        Add documents to the vector database

        Documents are upserted: a document whose ID already exists in the
        collection is replaced, so processing the same source again refreshes
        the stored chunks instead of leaving stale ones behind.

        Args:
            documents: List of document dictionaries with a 'text' key and
                optional 'metadata' and 'id' keys
            collection_name: Optional collection name; the default collection
                is used when omitted

        Returns:
            True when the documents were stored, False when the list was empty
            or an error occurred (the error is logged)
        """
        try:
            if not documents:
                logger.warning("No documents to add")
                return False

            # Prepare documents for embedding
            texts = [doc['text'] for doc in documents]
            # Missing/empty metadata is passed as None rather than {}
            # (NOTE(review): Chroma is believed to reject empty dicts and
            # accept None entries — confirm against the installed version)
            metadatas = [doc.get('metadata') or None for doc in documents]
            ids = [
                doc.get('id') or self._fallback_id(i, text)
                for i, (doc, text) in enumerate(zip(documents, texts))
            ]

            # Generate embeddings
            embeddings = self.embedding_model.encode(texts).tolist()

            # Insert or update in the target collection
            collection = self._get_collection(collection_name)
            collection.upsert(
                embeddings=embeddings,
                documents=texts,
                metadatas=metadatas if any(m is not None for m in metadatas) else None,
                ids=ids
            )

            logger.info(f"✅ Added {len(documents)} documents to vector database")
            return True

        except Exception as e:
            logger.error(f"Error adding documents to vector database: {str(e)}")
            return False

    def search_similar(self, query: str, n_results: int = 5, collection_name: str = None) -> List[Dict[str, Any]]:
        """
        Search for similar documents

        Args:
            query: Search query
            n_results: Maximum number of results to return
            collection_name: Optional collection name; the default collection
                is searched when omitted

        Returns:
            List of dicts with the keys 'document', 'metadata', 'distance' and
            'id', in the order returned by the database. Empty when the
            collection holds no documents, when ``n_results`` is not positive,
            or when an error occurred (the error is logged).
        """
        try:
            collection = self._get_collection(collection_name)

            # Never request more results than the collection holds
            available = collection.count()
            if available == 0 or n_results <= 0:
                return []

            # Generate query embedding
            query_embedding = self.embedding_model.encode([query]).tolist()

            # Search in collection
            results = collection.query(
                query_embeddings=query_embedding,
                n_results=min(n_results, available)
            )

            # Format results; index 0 selects the hits of the single query
            return [
                {
                    'document': document,
                    'metadata': metadata,
                    'distance': distance,
                    'id': doc_id
                }
                for document, metadata, distance, doc_id in zip(
                    results['documents'][0],
                    results['metadatas'][0],
                    results['distances'][0],
                    results['ids'][0],
                )
            ]

        except Exception as e:
            logger.error(f"Error searching vector database: {str(e)}")
            return []

# Create a standalone VectorDatabase instance; construction opens the
# persistent Chroma store and loads the embedding model. Note that
# ResearchAssistantPipeline builds its own VectorDatabase from the same config.
vector_db = VectorDatabase(config)
print("🗄️ Vector Database initialized")

2025-06-19 12:08:48,101 - INFO - Use pytorch device_name: cpu
2025-06-19 12:08:48,101 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-06-19 12:08:53,513 - INFO - ✅ Vector database initialized successfully


🗄️ Vector Database initialized


## 4. OpenAI Integration

In [13]:
# OpenAI Integration Setup
class OpenAIClient:
    """Handles OpenAI API interactions for document processing"""

    # Number of leading characters of the document that are sent for extraction
    _MAX_EXTRACTION_CHARS = 4000

    # Output structure requested from the model when the caller supplies no
    # schema. Kept as data and serialized with json.dumps so the prompt always
    # contains valid, double-quoted JSON.
    _DEFAULT_OUTPUT_SCHEMA = {
        "bibliographic_metadata": {
            "paper_id": "string",
            "title": "string",
            "authors": ["string"],
            "publication_year": "integer",
            "venue": "string",
            "doi_url": "string",
            "research_domain": "string",
        },
        "methodological_framework": {
            "analysis_type": "string",
            "theoretical_framework": "string",
            "sample_characteristics": {
                "interview_count": "integer",
                "participant_demographics": "string",
                "data_collection_method": "string",
            },
        },
        "technical_pipeline": {
            "automation_level": "string",
            "software_tools": ["string"],
            "computational_methods": ["string"],
            "workflow_architecture": {
                "preprocessing_steps": ["string"],
                "analysis_pipeline": ["string"],
                "postprocessing_steps": ["string"],
            },
            "coding_framework": "string",
        },
        "prompt_engineering": {
            "prompting_strategy": {
                "approach_type": "string",
                "prompt_examples": ["string"],
                "engineering_techniques": ["string"],
                "model_specifications": "string",
            },
        },
        "empirical_results": {
            "primary_findings": ["string"],
            "evaluation_framework": {
                "metrics_employed": ["string"],
                "validation_methodology": "string",
                "performance_indicators": "string",
            },
            "methodological_limitations": ["string"],
        },
        "quality_assurance": {
            "reliability_measures": ["string"],
            "validity_approaches": ["string"],
            "bias_mitigation_strategies": ["string"],
        },
        "research_impact": {
            "novel_contributions": ["string"],
            "practical_applications": ["string"],
            "future_research_directions": ["string"],
            "scalability_considerations": "string",
        },
    }

    def __init__(self, config: Config):
        self.config = config
        self.client = OpenAI(api_key=config.openai_api_key)

    def _build_extraction_prompt(self, text: str, schema: Dict[str, Any]) -> str:
        """Assemble the user prompt for extract_information.

        The instruction text is a plain (non-f) string on purpose: it is
        combined with the JSON structure and the document text by joining, so
        literal braces never pass through f-string interpolation.
        """
        from textwrap import dedent  # local import keeps the class self-contained

        # A caller-supplied schema defines the output structure; otherwise the
        # built-in template is used.
        output_structure = json.dumps(schema or self._DEFAULT_OUTPUT_SCHEMA, indent=2)

        instructions = dedent("""\
            You are an expert research assistant specializing in qualitative research methodology and computational text analysis. Your task is to perform systematic data extraction from academic literature focusing on deductive qualitative analysis pipelines for interview data. Extract information with precision and technical accuracy, maintaining methodological rigor throughout the analysis.

            # Task Definition
            Analyze the provided research document and extract structured information related to deductive qualitative analysis methodologies, computational workflows, and evaluation frameworks. Focus on identifying technical specifications, methodological approaches, and empirical findings relevant to automated or semi-automated qualitative data analysis pipelines.

            # Extraction Schema
            Extract the following information and return it as a structured JSON object with the specified keys:

            # Bibliographic Metadata

            paper_id: Generate a unique identifier (format: YYYY_AuthorLastName_KeywordAbbrev)
            title: Complete paper title
            authors: Array of author names
            publication_year: Year of publication
            venue: Journal name or conference proceedings
            doi_url: Digital Object Identifier or URL
            research_domain: Primary research field or application domain

            # Methodological Framework

            analysis_type: Classification of analytical approach (deductive, inductive, abductive, mixed-methods)
            theoretical_framework: Underlying theoretical model or conceptual framework guiding the deductive approach
            sample_characteristics: Object containing:
                interview_count: Number of interviews analyzed
                participant_demographics: Demographic composition of study participants
                data_collection_method: Interview format (structured, semi-structured, unstructured, focus_groups)

            # Technical Pipeline Architecture

            automation_level: Degree of computational automation (manual, semi_automated, fully_automated, hybrid)
            software_tools: Array of software platforms, libraries, or frameworks utilized
            computational_methods: Array of AI/ML techniques employed (llm_based, traditional_nlp, rule_based, statistical_methods)
            workflow_architecture: Object containing:
                preprocessing_steps: Data preparation and cleaning procedures
                analysis_pipeline: Sequential processing stages
                postprocessing_steps: Output refinement and validation procedures
            coding_framework: Predetermined coding scheme or categorization system

            # Prompt Engineering Specifications

            prompting_strategy: Object containing:
                approach_type: Prompting methodology (zero_shot, few_shot, chain_of_thought, tree_of_thought, role_based)
                prompt_examples: Array of actual prompt templates or examples (if available)
                engineering_techniques: Prompt optimization strategies employed
                model_specifications: LLM or AI model details (model_name, version, parameters)

            # Empirical Results and Evaluation

            primary_findings: Array of key thematic discoveries or pattern identifications
            evaluation_framework: Object containing:
                metrics_employed: Quantitative evaluation measures (inter_rater_reliability, precision, recall, f1_score, kappa_statistic)
                validation_methodology: Validation approach (expert_review, ground_truth_comparison, cross_validation)
                performance_indicators: Numerical results or performance benchmarks
            methodological_limitations: Acknowledged constraints or methodological shortcomings

            # Quality Assurance and Reliability

            reliability_measures: Statistical measures of consistency (cohens_kappa, fleiss_kappa, percentage_agreement, cronbach_alpha)
            validity_approaches: Validity enhancement strategies (member_checking, triangulation, peer_debriefing, audit_trail)
            bias_mitigation_strategies: Approaches to minimize analytical bias or systematic errors

            # Research Contributions and Impact

            novel_contributions: Methodological innovations or technical advances
            practical_applications: Real-world implementation scenarios or use cases
            future_research_directions: Suggested extensions or research opportunities
            scalability_considerations: Discussion of scalability to larger datasets or different contexts

            # Output Format Requirements
            Return a valid JSON object with the following structure:
            """).strip()

        guidelines = dedent("""\
            # Extraction Guidelines

            Completeness: Extract all available information; use "not_specified" for missing data
            Precision: Maintain technical terminology and methodological specificity
            Contextual Accuracy: Preserve the original meaning and technical context
            Standardization: Use consistent terminology across extractions
            Null Handling: Use empty arrays [] for missing list items, null for missing values

            # Quality Control Instructions

            Verify technical terminology accuracy
            Cross-reference methodological claims with reported procedures
            Distinguish between claimed capabilities and empirically demonstrated results
            Flag any ambiguous or contradictory information in the source document
            Prioritize explicit methodological descriptions over implicit assumptions
            """).strip()

        return "\n".join([
            instructions,
            output_structure,
            "",
            guidelines,
            "",
            "Text to analyze:",
            # Limit text length for API efficiency
            text[:self._MAX_EXTRACTION_CHARS],
            "",
            "Please return the extracted information in valid JSON format matching the structure above.",
        ])

    @staticmethod
    def _parse_json_response(content: Optional[str]) -> Dict[str, Any]:
        """Parse the model reply as a JSON object, tolerating a Markdown code fence.

        Raises:
            ValueError: if the reply is not valid JSON (json.JSONDecodeError
                is a ValueError) or is not a JSON object
        """
        cleaned = (content or "").strip()
        if cleaned.startswith("```"):
            # Drop the opening fence line (``` or ```json) and the closing fence
            cleaned = cleaned.split("\n", 1)[1] if "\n" in cleaned else ""
            cleaned = cleaned.rsplit("```", 1)[0]
        parsed = json.loads(cleaned)
        if not isinstance(parsed, dict):
            raise ValueError("Model response is not a JSON object")
        return parsed

    def extract_information(self, text: str, schema: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract structured information from text using OpenAI

        Only the first 4000 characters of ``text`` are sent to the model.

        Args:
            text: Text to analyze
            schema: JSON-serializable structure the model should fill in;
                when empty or None, the built-in research-extraction
                structure is used

        Returns:
            Extracted information as a dict, or an empty dict when the request
            or the parsing of the reply failed (the error is logged)
        """
        try:
            prompt = self._build_extraction_prompt(text, schema)

            response = self.client.chat.completions.create(
                model=self.config.openai_model,
                messages=[
                    {"role": "system", "content": "You are a research assistant that extracts structured information from academic documents. Always respond with valid JSON."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.1,  # Low temperature for consistent extraction
                max_tokens=10000
            )

            # Parse response
            extracted_data = self._parse_json_response(response.choices[0].message.content)
            logger.info("✅ Information extraction completed")
            return extracted_data

        except Exception as e:
            logger.error(f"Error extracting information: {str(e)}")
            return {}

    def answer_question(self, question: str, context: str) -> str:
        """
        Answer a question based on provided context

        Args:
            question: User question
            context: Relevant context from documents

        Returns:
            Answer to the question, or a fixed apology message when the
            request failed (the error is logged)
        """
        try:
            prompt = f"""
            Based on the following context, please answer the question. If the context doesn't contain enough information to answer the question, say so.
            
            Context:
            {context}
            
            Question: {question}
            
            Answer:
            """

            response = self.client.chat.completions.create(
                model=self.config.openai_model,
                messages=[
                    {"role": "system", "content": "You are a helpful research assistant. Answer questions based on the provided context."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3,
                max_tokens=500
            )

            return response.choices[0].message.content

        except Exception as e:
            logger.error(f"Error answering question: {str(e)}")
            return "Sorry, I encountered an error while processing your question."

# Create a standalone OpenAIClient from the shared configuration (the
# ResearchAssistantPipeline constructs its own client from the same config)
openai_client = OpenAIClient(config)
print("🤖 OpenAI Client initialized")

🤖 OpenAI Client initialized


## 5. Main Pipeline

In [None]:
# Main Pipeline Class
class ResearchAssistantPipeline:
    """Main pipeline that orchestrates document processing and AI interactions.

    Owns its own PDFProcessor, VectorDatabase and OpenAIClient, all built
    from the given configuration.
    """

    def __init__(self, config: Config):
        self.config = config
        self.pdf_processor = PDFProcessor(config)
        self.vector_db = VectorDatabase(config)
        self.openai_client = OpenAIClient(config)

    def process_document(self, pdf_path: Path) -> Dict[str, Any]:
        """
        Process a document through the complete pipeline

        Extracts the text, detects sections, splits the full text into
        chunks and stores the chunks in the vector database under IDs of the
        form "<file stem>_chunk_<index>".

        Args:
            pdf_path: Path to the PDF document (a string path is accepted too)

        Returns:
            Dict with the keys 'file_name', 'total_pages',
            'sections_detected', 'chunks_created', 'embeddings_stored' and
            'processing_timestamp'. 'embeddings_stored' is 0 when the vector
            database reported that nothing was stored.

        Raises:
            Exception: any error raised by a pipeline step is logged and
                re-raised unchanged
        """
        try:
            # Accept plain string paths as well; .name/.stem need a Path
            pdf_path = Path(pdf_path)
            logger.info(f"🚀 Starting pipeline for: {pdf_path.name}")

            # Step 1: Extract text from PDF
            extracted_data = self.pdf_processor.extract_text_from_pdf(pdf_path)

            # Step 2: Detect sections
            sections = self.pdf_processor.detect_titles_and_sections(extracted_data['full_text'])

            # Step 3: Create chunks for embedding
            chunks = self.pdf_processor.create_chunks(extracted_data['full_text'])

            # Step 4: Prepare documents for vector database
            processing_timestamp = datetime.now().isoformat()
            documents_for_embedding = [
                {
                    'id': f"{pdf_path.stem}_chunk_{i}",
                    'text': chunk,
                    'metadata': {
                        'source_file': pdf_path.name,
                        'chunk_index': i,
                        'total_chunks': len(chunks),
                        'processing_timestamp': processing_timestamp
                    }
                }
                for i, chunk in enumerate(chunks)
            ]

            # Step 5: Add to vector database. add_documents reports failure
            # through its return value instead of raising, so check it rather
            # than assuming every chunk was stored.
            stored = self.vector_db.add_documents(documents_for_embedding)
            if not stored:
                logger.warning(f"No embeddings were stored for {pdf_path.name}")

            # Step 6: Extract structured information (placeholder for now)
            # We'll implement this when we define the schema

            results = {
                'file_name': pdf_path.name,
                'total_pages': extracted_data['total_pages'],
                'sections_detected': len(sections),
                'chunks_created': len(chunks),
                'embeddings_stored': len(documents_for_embedding) if stored else 0,
                'processing_timestamp': datetime.now().isoformat()
            }

            logger.info(f"✅ Pipeline completed for {pdf_path.name}")
            return results

        except Exception as e:
            logger.error(f"Error in pipeline: {str(e)}")
            raise

    def ask_question(self, question: str, n_context_chunks: int = 3) -> str:
        """
        Ask a question about the processed documents

        Retrieves the most similar stored chunks, joins them with blank lines
        and passes them to the OpenAI client as context.

        Args:
            question: User question
            n_context_chunks: Number of context chunks to retrieve

        Returns:
            Answer to the question, a fixed message when no chunk was
            retrieved, or a fixed apology message when an error occurred
        """
        try:
            # Search for relevant context
            similar_docs = self.vector_db.search_similar(question, n_context_chunks)

            if not similar_docs:
                return "I couldn't find any relevant information in the processed documents."

            # Combine context
            context = "\n\n".join(doc['document'] for doc in similar_docs)

            # Get answer from OpenAI
            return self.openai_client.answer_question(question, context)

        except Exception as e:
            logger.error(f"Error answering question: {str(e)}")
            return "Sorry, I encountered an error while processing your question."

# Initialize the main pipeline; its constructor creates a fresh PDFProcessor,
# VectorDatabase and OpenAIClient from the shared configuration
pipeline = ResearchAssistantPipeline(config)
print("🎯 Research Assistant Pipeline initialized")