In [7]:
# Pydantic Models
from pydantic import BaseModel
from typing import List, Optional, Union


class BibliographicMetadata(BaseModel):
    """Bibliographic identification fields for one paper. All fields are required."""
    paper_id: str
    title: str
    authors: List[str]
    # NOTE(review): the extraction prompt tells the model to answer "not_specified"
    # for missing data; that string would not validate as an int here — confirm
    # how a missing year is meant to be handled.
    publication_year: int
    venue: str
    doi_url: str
    research_domain: str


class SampleCharacteristics(BaseModel):
    """Description of the study sample."""
    # Optional, and accepts either an int or a str, so a textual placeholder
    # such as "not_specified" validates as well as a number.
    interview_count: Optional[Union[int, str]] = None
    participant_demographics: str
    data_collection_method: str


class MethodologicalFramework(BaseModel):
    """Methodology section: analysis type, theoretical framework and nested sample details."""
    analysis_type: str
    theoretical_framework: str
    sample_characteristics: SampleCharacteristics


class WorkflowArchitecture(BaseModel):
    """Workflow stages, each stored as a list of step descriptions."""
    preprocessing_steps: List[str]
    analysis_pipeline: List[str]
    postprocessing_steps: List[str]


class TechnicalPipeline(BaseModel):
    """Technical pipeline section, including the nested workflow architecture."""
    automation_level: str
    software_tools: List[str]
    computational_methods: List[str]
    workflow_architecture: WorkflowArchitecture
    coding_framework: str


class PromptingStrategy(BaseModel):
    """Prompting details: approach, example prompts, techniques and model specifications."""
    approach_type: str
    prompt_examples: List[str]
    engineering_techniques: List[str]
    model_specifications: str


class PromptEngineering(BaseModel):
    """Prompt-engineering section; wraps a single PromptingStrategy."""
    prompting_strategy: PromptingStrategy


class EvaluationFramework(BaseModel):
    """Evaluation details: list of metrics plus free-text validation and performance fields."""
    metrics_employed: List[str]
    validation_methodology: str
    performance_indicators: str


class EmpiricalResults(BaseModel):
    """Results section: findings, nested evaluation framework and limitations."""
    primary_findings: List[str]
    evaluation_framework: EvaluationFramework
    methodological_limitations: List[str]


class QualityAssurance(BaseModel):
    """Quality-assurance section; every field is a list of strings."""
    reliability_measures: List[str]
    validity_approaches: List[str]
    bias_mitigation_strategies: List[str]


class ResearchImpact(BaseModel):
    """Impact section: contributions, applications, future directions and scalability notes."""
    novel_contributions: List[str]
    practical_applications: List[str]
    future_research_directions: List[str]
    scalability_considerations: str


class ResearchStudy(BaseModel):
    """Complete record for one paper in which all seven sections are required."""
    bibliographic_metadata: BibliographicMetadata
    methodological_framework: MethodologicalFramework
    technical_pipeline: TechnicalPipeline
    prompt_engineering: PromptEngineering
    empirical_results: EmpiricalResults
    quality_assurance: QualityAssurance
    research_impact: ResearchImpact

class ResearchState(BaseModel):
    """Partial record for one paper: the same seven sections, each optional.

    Every section defaults to None, so a record can be validated even when
    some sections could not be extracted.
    """
    bibliographic_metadata: Optional[BibliographicMetadata] = None
    methodological_framework: Optional[MethodologicalFramework] = None
    technical_pipeline: Optional[TechnicalPipeline] = None
    prompt_engineering: Optional[PromptEngineering] = None
    empirical_results: Optional[EmpiricalResults] = None
    quality_assurance: Optional[QualityAssurance] = None
    research_impact: Optional[ResearchImpact] = None

In [8]:
# Extraction Schema

# Per-section JSON skeletons that are serialized into the extraction prompt.
# Each top-level key is a section name; its value repeats that name as the single
# wrapper key, so the model's JSON answer is expected to be {"<section>": {...}}.
# Leaf values are type hints for the model ("string", "integer", ["string"]).
section_schemas = {
    "bibliographic_metadata": {
        "bibliographic_metadata": {
            "paper_id": "string",
            "title": "string",
            "authors": ["string"],
            "publication_year": "integer",
            "venue": "string",
            "doi_url": "string",
            "research_domain": "string"
        }
    },
    "methodological_framework": {
        "methodological_framework": {
            "analysis_type": "string",
            "theoretical_framework": "string",
            "sample_characteristics": {
                # NOTE(review): SampleCharacteristics.interview_count accepts int or str,
                # while this hint asks for an integer only — confirm which is intended.
                "interview_count": "integer",
                "participant_demographics": "string",
                "data_collection_method": "string"
            }
        }
    },
    "technical_pipeline": {
        "technical_pipeline": {
            "automation_level": "string",
            "software_tools": ["string"],
            "computational_methods": ["string"],
            "workflow_architecture": {
                "preprocessing_steps": ["string"],
                "analysis_pipeline": ["string"],
                "postprocessing_steps": ["string"]
            },
            "coding_framework": "string"
        }
    },
    "prompt_engineering": {
        "prompt_engineering": {
            "prompting_strategy": {
                "approach_type": "string",
                "prompt_examples": ["string"],
                "engineering_techniques": ["string"],
                "model_specifications": "string"
            }
        }
    },
    "empirical_results": {
        "empirical_results": {
            "primary_findings": ["string"],
            "evaluation_framework": {
                "metrics_employed": ["string"],
                "validation_methodology": "string",
                "performance_indicators": "string"
            },
            "methodological_limitations": ["string"]
        }
    },
    "quality_assurance": {
        "quality_assurance": {
            "reliability_measures": ["string"],
            "validity_approaches": ["string"],
            "bias_mitigation_strategies": ["string"]
        }
    },
    "research_impact": {
        "research_impact": {
            "novel_contributions": ["string"],
            "practical_applications": ["string"],
            "future_research_directions": ["string"],
            "scalability_considerations": "string"
        }
    }
}

In [9]:
# Base prompt shared by every section. It has two placeholders:
#   {schema}        - the JSON skeleton for the section being extracted
#   {document_text} - the full text of the paper
# NOTE(review): the guidelines ask for "not_specified" on missing data and also
# for null on missing values; the two instructions overlap, and a "not_specified"
# answer for an integer field would not validate against the Pydantic models —
# confirm the intended convention.
template_base = """
You are an expert research assistant specializing in qualitative research methodology and computational text analysis. Your task is to perform systematic data extraction from academic literature focusing on deductive qualitative analysis pipelines for interview data.

# Task Definition
Analyze the provided research document and extract structured information related to deductive qualitative analysis methodologies, computational workflows, and evaluation frameworks. Focus on identifying technical specifications, methodological approaches, and empirical findings relevant to automated or semi-automated qualitative data analysis pipelines.

# Extraction Schema

Extract the following information and return it as a structured JSON object with the specified schema:
{schema}

Extraction Guidelines
- Completeness: Extract all available information; use "not_specified" for missing data
- Precision: Maintain technical terminology and methodological specificity
- Contextual Accuracy: Preserve the original meaning and technical context
- Standardization: Use consistent terminology across extractions
- Null Handling: Use empty arrays [] for missing list items, null for missing values

# IMPORTANT: Return ONLY valid JSON without any additional text, explanations, or markdown formatting.

# Texto a analizar
{document_text}
"""

In [10]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
import json
from dotenv import load_dotenv

# Read settings from a local .env file before the client is created.
load_dotenv()

# Single chat model shared by all section chains.
llm = ChatOpenAI(temperature=0.002, model="gpt-4.1")

# 1. One prompt per section; each is a separate instance of the same base template.
prompts = {
    section: PromptTemplate(
        template=template_base,
        input_variables=["document_text", "schema"],
    )
    for section in section_schemas
}

# 2. One LLMChain per section. The section's schema is pre-filled as pretty-printed
#    JSON, so the only remaining input is the document text; the answer is returned
#    under a key named after the section.
chains = {
    section: LLMChain(
        llm=llm,
        prompt=section_prompt.partial(
            schema=json.dumps(section_schemas[section], indent=2)
        ),
        output_key=section,
    )
    for section, section_prompt in prompts.items()
}

In [17]:
import os
import json
import fitz  # PyMuPDF
from pathlib import Path
import pandas as pd
from typing import Dict
from langchain.chains import LLMChain


def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The page texts joined without any separator (empty string for a
        document with no pages).

    Raises:
        Whatever ``fitz.open`` or ``page.get_text`` raises for an unreadable file.
    """
    # The context manager closes the document even if a page fails to extract,
    # so file handles are not leaked while looping over many PDFs.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)


def _strip_code_fence(text: str) -> str:
    """Return ``text`` stripped of whitespace and of a leading Markdown code fence, if any."""
    text = text.strip()
    if not text.startswith("```"):
        return text
    # Keep what lies between the opening fence and the next fence (or end of text).
    inner = text[3:].split("```", 1)[0]
    # Drop an optional language tag on the opening fence ("json", "JSON", ...).
    if inner[:4].lower() == "json":
        inner = inner[4:]
    return inner.strip()


def run_extraction_pipeline(chains: Dict[str, LLMChain], output_dir="outputs", input_dir="../files"):
    """Run every section chain over each PDF and save the result as JSON and CSV.

    For each ``*.pdf`` in ``input_dir`` the text is extracted, each chain is
    invoked with that text, and the JSON answer is parsed. A section whose
    answer is empty, unparsable, raises an error, or lacks the expected
    top-level key is stored as None. The collected sections are validated as a
    ``ResearchState``; if validation fails the file is skipped.

    Args:
        chains: Mapping of section name to the chain that extracts it. The
            chain's answer is read from the result under the section name.
        output_dir: Directory for ``<stem>.json`` and ``<stem>.csv``; created
            if missing.
        input_dir: Directory scanned for PDF files. Defaults to the previously
            hard-coded ``"../files"``.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so that files are processed in a reproducible order; the order in
    # which glob yields directory entries is not guaranteed.
    for file in sorted(Path(input_dir).glob("*.pdf")):
        print(f"\n�� Procesando archivo: {file.name}")
        try:
            document_text = extract_text_from_pdf(str(file))
        except Exception as e:
            # One unreadable PDF should not abort the rest of the batch.
            print(f"❌ Error al leer el PDF '{file.name}': {e}")
            continue

        state_data = {}

        for section, chain in chains.items():
            try:
                print(f"  �� Extrayendo sección: {section}")
                response_dict = chain.invoke(input=document_text)

                # The model's text is stored under the section name in the result dict.
                response = response_dict[section]

                # Debug aid: show the beginning of the raw answer.
                print(f"    Raw response: {response[:200]}...")  # First 200 chars

                # Remove surrounding whitespace and a Markdown code fence, if present.
                response = _strip_code_fence(response)

                # Nothing left to parse.
                if not response:
                    print(f"    ⚠️ Empty response for {section}")
                    state_data[section] = None
                    continue

                parsed = json.loads(response)
                # The answer is expected to be wrapped as {"<section>": {...}}.
                if section in parsed:
                    state_data[section] = parsed[section]
                else:
                    print(f"  ⚠️ Clave esperada '{section}' no encontrada en la respuesta.")
                    state_data[section] = None

            except json.JSONDecodeError as e:
                print(f"  ⚠️ JSON parsing error for '{section}': {e}")
                print(f"    Response content: {response}")
                state_data[section] = None
            except Exception as e:
                print(f"  ⚠️ Error al procesar '{section}': {e}")
                state_data[section] = None

        # Build the ResearchState object; skip the file if the data does not validate.
        try:
            research_state = ResearchState.model_validate(state_data)
        except Exception as e:
            print(f"❌ Error al crear ResearchState: {e}")
            continue

        # Save JSON
        json_path = Path(output_dir) / f"{file.stem}.json"
        with open(json_path, "w", encoding="utf-8") as f:
            f.write(research_state.model_dump_json(indent=2))

        # Save CSV (flattened: nested keys become dotted column names)
        df = pd.json_normalize(research_state.model_dump())
        csv_path = Path(output_dir) / f"{file.stem}.csv"
        df.to_csv(csv_path, index=False)

        print(f"✅ Archivo procesado: {file.name} → {json_path.name}, {csv_path.name}")


In [18]:
# Run all section chains over the PDFs; results go to the default "outputs" directory.
run_extraction_pipeline(chains)


�� Procesando archivo: 2401.04122v3.pdf
  �� Extrayendo sección: bibliographic_metadata
    Raw response: {
  "bibliographic_metadata": {
    "paper_id": "arXiv:2401.04122v3",
    "title": "From Prompt Engineering to Prompt Science With Human in the Loop",
    "authors": [
      "Chirag Shah"
    ],
    "...
  �� Extrayendo sección: methodological_framework
    Raw response: {
  "methodological_framework": {
    "analysis_type": "deductive qualitative analysis with human-in-the-loop validation, inspired by qualitative codebook construction and coding",
    "theoretical_fr...
  �� Extrayendo sección: technical_pipeline
    Raw response: {
  "technical_pipeline": {
    "automation_level": "semi-automated (human-in-the-loop with LLMs for labeling, coding, and prompt generation; automation increases with validated prompts and codebooks,...
  �� Extrayendo sección: prompt_engineering
    Raw response: {
  "prompt_engineering": {
    "prompting_strategy": {
      "approach_type": "multi-pha

### Working data

In [25]:
def load_all_json_to_dataframe(outputs_path="../research_assistant/outputs"):
    """
    Load all JSON files from the outputs directory and combine them into a single DataFrame.

    Each file is flattened with ``pd.json_normalize`` (nested keys become dotted
    column names) and tagged with a ``source_file`` column holding its file name.
    Files are read in sorted path order so the row order of the result is
    reproducible. Files that cannot be read or parsed are reported and skipped.

    Args:
        outputs_path (str): Path to the directory containing JSON files

    Returns:
        pd.DataFrame: Combined DataFrame with all JSON data, or an empty
        DataFrame when no file was found or none could be loaded
    """
    import pandas as pd
    from pathlib import Path
    import json

    # Sort the matches: glob yields entries in filesystem order, which would
    # otherwise make the row order differ between machines and runs.
    json_files = sorted(Path(outputs_path).glob("*.json"))

    if not json_files:
        print("No JSON files found in the specified directory")
        return pd.DataFrame()

    # One flattened frame per successfully loaded file
    all_dataframes = []

    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Flatten nested sections into dotted columns
            df = pd.json_normalize(data)

            # Record which file the row(s) came from
            df['source_file'] = json_file.name

            all_dataframes.append(df)
            print(f"✅ Loaded: {json_file.name}")

        except Exception as e:
            # Best effort: report the bad file and keep loading the others
            print(f"❌ Error loading {json_file.name}: {e}")
            continue

    if not all_dataframes:
        print("No valid JSON files could be loaded")
        return pd.DataFrame()

    # Stack all frames; columns missing from a file are filled with NaN by concat
    combined_df = pd.concat(all_dataframes, ignore_index=True)

    print(f"\n📊 Combined DataFrame shape: {combined_df.shape}")
    print(f"📁 Total files processed: {len(all_dataframes)}")

    return combined_df

# Example usage: combine every JSON file found in the default outputs directory.
df_combined = load_all_json_to_dataframe()

✅ Loaded: 2401.04122v3.json
✅ Loaded: 2402.01386v1.json
✅ Loaded: 2411.14473v4.json
✅ Loaded: 3636555.3636910.json
✅ Loaded: 3706468.3706564.json
✅ Loaded: ChatGPT_ICQE_FinalVersion.json
✅ Loaded: extracted_pages_134_149.json
✅ Loaded: tai-et-al-2024-an-examination-of-the-use-of-large-language-models-to-aid-analysis-of-textual-data.json

📊 Combined DataFrame shape: (8, 36)
📁 Total files processed: 8


In [26]:
# Per-column summary (non-null count and dtype) of the combined frame.
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 36 columns):
 #   Column                                                                    Non-Null Count  Dtype 
---  ------                                                                    --------------  ----- 
 0   bibliographic_metadata.paper_id                                           8 non-null      object
 1   bibliographic_metadata.title                                              8 non-null      object
 2   bibliographic_metadata.authors                                            8 non-null      object
 3   bibliographic_metadata.publication_year                                   8 non-null      int64 
 4   bibliographic_metadata.venue                                              8 non-null      object
 5   bibliographic_metadata.doi_url                                            8 non-null      object
 6   bibliographic_metadata.research_domain                                    8 no

In [27]:
# Display the combined frame.
df_combined

Unnamed: 0,bibliographic_metadata.paper_id,bibliographic_metadata.title,bibliographic_metadata.authors,bibliographic_metadata.publication_year,bibliographic_metadata.venue,bibliographic_metadata.doi_url,bibliographic_metadata.research_domain,methodological_framework.analysis_type,methodological_framework.theoretical_framework,methodological_framework.sample_characteristics.interview_count,...,empirical_results.evaluation_framework.performance_indicators,empirical_results.methodological_limitations,quality_assurance.reliability_measures,quality_assurance.validity_approaches,quality_assurance.bias_mitigation_strategies,research_impact.novel_contributions,research_impact.practical_applications,research_impact.future_research_directions,research_impact.scalability_considerations,source_file
0,arXiv:2401.04122v3,From Prompt Engineering to Prompt Science With...,[Chirag Shah],2023,Proceedings of ACM Conference (Conference’17),https://doi.org/10.1145/nnnnnnn.nnnnnnn,"Computational qualitative analysis, Human-cent...",deductive qualitative analysis with human-in-t...,"systematic, multi-phase prompt science methodo...",not_specified,...,High inter-coder reliability between human ann...,[Increased cost and resource requirements due ...,[Involvement of at least two qualified researc...,[Establishment and iterative refinement of cle...,[Involvement of multiple researchers to dissol...,"[Proposes a multi-phase, human-in-the-loop met...",[Automated and semi-automated labeling of inte...,[Exploring the automation of human-in-the-loop...,The methodology increases process cost by at l...,2401.04122v3.json
1,arXiv:2402.01386v1,Can Large Language Models Serve as Data Analys...,"[Zeeshan Rasheed, Muhammad Waseem, Aakash Ahma...",2018,Proceedings of ACM Conference (Conference’17),https://doi.org/XXXXXXX.XXXXXXX,"Software Engineering, Qualitative Data Analysi...",deductive_qualitative_analysis,multi-agent LLM-based automation of qualitativ...,10,...,87% practitioner satisfaction rate; positive q...,[Limited sample size for practitioner evaluati...,[Practitioner-based evaluation involving 10 pr...,[Engagement of practitioners from academia and...,[Selection of practitioners from diverse domai...,[Introduction of an LLM-based multi-agent mode...,[Automated and expedited qualitative analysis ...,[Exploration of the model's performance and ad...,The multi-agent LLM-based model significantly ...,2402.01386v1.json
2,arXiv:2411.14473v4,Large Language Model for Qualitative Research:...,"[Cauã Ferreira Barros, Bruna Borges Azevedo, V...",2025,arXiv,https://arxiv.org/abs/2411.14473,"Qualitative Research Methodology, Computationa...",deductive_qualitative_analysis,content_analysis; grounded_theory; thematic_an...,3,...,LLMs demonstrated performance equivalent to or...,[Strong dependence on well-structured prompt e...,"[Comparison with human analysis, Accuracy, Pre...",[Manual review and full reading of articles to...,[Human verification of LLM outputs and transla...,[Systematic mapping of the state of the art on...,[Automation of open and axial coding in qualit...,[Refinement of prompt engineering techniques t...,LLMs enable scalable qualitative analysis by a...,2411.14473v4.json
3,not_specified,Prompt-based and Fine-tuned GPT Models for Con...,"[Chenyu Hou, Gaoxia Zhu, Juan Zheng, Lishan Zh...",2024,The 14th Learning Analytics and Knowledge Conf...,https://doi.org/10.1145/3636555.3636910,"Learning Analytics, Educational Technology, Co...",deductive qualitative coding; automated and se...,"coding scheme based on cognitive, emotional, a...",7482,...,Cohen's Kappa scores for each coding dimension...,[Limited size of expert-labeled dataset (204 c...,[Cohen’s Kappa calculated between human raters...,[Expert-labeled data used as ground truth for ...,[Iterative prompt engineering to reduce ambigu...,[First systematic evaluation of GPT-based prom...,[Automated or semi-automated deductive coding ...,[Expansion of expert-labeled datasets and vali...,The pipeline is designed to be scalable to lar...,3636555.3636910.json
4,not_specified,When the Prompt becomes the Codebook: Grounded...,"[Sriram Ramanathan, Lisa-Angelique Lim, Nazani...",2025,LAK 2025: The 15th International Learning Anal...,https://doi.org/10.1145/3706468.3706564,"Learning Analytics, Qualitative Research Metho...",deductive qualitative analysis using automated...,Four domains of belonging framework (Ahn & Dav...,860,...,Substantial agreement between human and LLM co...,[Human-coded sample comprised only 2% of the f...,[Inter-Rater Reliability (IRR) between human c...,[Theory-driven codebook development grounded i...,[Human-in-the-loop iterative prompt and codebo...,[Introduction of Grounded Prompt Engineering (...,[Automated or semi-automated deductive coding ...,[Testing the GROPROE process and developed pro...,GROPROE enables deductive qualitative analysis...,3706468.3706564.json
5,not_specified,From nCoder to ChatGPT: From Automated Coding ...,"[Andres Felipe Zambrano, Xiner Liu, Amanda Bar...",2024,not_specified,not_specified,"Qualitative Research Methodology, Computationa...",deductive qualitative coding; automated and se...,Quantitative Ethnography; construct validity; ...,200,...,nCoder achieved higher average Kappa (0.77 tra...,[nCoder's reliance on regular expressions limi...,"[Cohen's Kappa, Precision, Recall, Shaffer's r...",[Construct validity assessment via iterative r...,[Inclusion of original dataset author to reduc...,[Systematic comparison of ChatGPT (GPT-4) and ...,[Use of ChatGPT as a semi-automated coding ass...,[Exploration of ChatGPT and other LLMs for sup...,Manual coding is unsuitable for large datasets...,ChatGPT_ICQE_FinalVersion.json
6,not_specified,ChatGPT for Education Research: Exploring the ...,"[Amanda Barany, Nidhi Nasiar, Chelsea Porter, ...",2024,"AIED 2024, LNAI 14830, pp. 134–149, Springer N...",https://doi.org/10.1007/978-3-031-64299-9_10,Educational Research / Qualitative Data Analys...,inductive qualitative coding (codebook develop...,not_specified,4,...,Hybrid approaches achieved the highest ratings...,[Study focused on inductive codebook developme...,[Cohen’s kappa (κ) used to assess consistency ...,[Human review and revision of codebooks genera...,[Independent codebook development by separate ...,[Systematic comparison of four codebook develo...,[Acceleration of qualitative codebook developm...,[Further investigation into optimal division o...,Hybrid human-LLM approaches demonstrated impro...,extracted_pages_134_149.json
7,16094069241231168,An Examination of the Use of Large Language Mo...,"[Robert H. Tai, Lillian R. Bentley, Xin Xia, J...",2024,International Journal of Qualitative Methods,https://doi.org/10.1177/16094069241231168,"qualitative research methodology, computationa...",deductive qualitative coding using large langu...,codebook-driven deductive analysis based on pr...,125,...,LLMq values plateaued and stabilized after app...,[LLMs rely on patterns in their training data;...,[Inter-rater reliability using Cohen’s kappa s...,[Use of a detailed codebook with clear definit...,[Multiple coders independently coding and reso...,[Proposes a systematic methodology for using L...,[LLMs can be used as an efficient screening to...,[Exploration of LLMs for inductive qualitative...,LLMs offer high scalability for qualitative an...,tai-et-al-2024-an-examination-of-the-use-of-la...
