# In [3]:
import openai
import PyPDF2
import json
import re
import pandas as pd
from pathlib import Path
import time

class EnhancedAIClient:
    """Small wrapper around an OpenAI-compatible chat-completions client."""

    def __init__(self, api_key, base_url):
        # One SDK client is created up front and reused for every request.
        self.client = openai.OpenAI(base_url=base_url, api_key=api_key)

    def analyze_with_retry(self, prompt, max_retries=3, delay=2):
        """Send *prompt* as a single user message and return the reply text.

        The request is attempted up to ``max_retries`` times. After each
        failure the error is printed; the method then sleeps ``delay``
        seconds before the next attempt, or returns ``None`` if that was
        the last attempt.
        """
        final_attempt = max_retries - 1
        for attempt in range(max_retries):
            try:
                completion = self.client.chat.completions.create(
                    model="deepseek-chat",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0,
                    max_tokens=4000,
                )
                return completion.choices[0].message.content
            except Exception as err:
                print(f"Attempt {attempt+1} failed: {err}")
                if attempt >= final_attempt:
                    # Out of attempts: signal failure to the caller.
                    return None
                time.sleep(delay)

def extract_text_from_pdf_enhanced(pdf_path):
    """Read a PDF and return its text, one banner-prefixed block per page.

    Each page's text has its whitespace runs collapsed to single spaces and
    is preceded by a ``=== Page N ===`` line. Pages that yield no text, or
    whose extraction raises, are skipped (the error is printed). Returns
    ``None`` if the file cannot be opened or read at all.
    """
    try:
        with open(pdf_path, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)
            page_count = len(reader.pages)
            print(f"Total PDF pages: {page_count}")

            page_blocks = []
            for page_no, page in enumerate(reader.pages, start=1):
                try:
                    raw = page.extract_text()
                    if not raw:
                        continue
                    # The second pass also turns newlines into spaces, so
                    # every page ends up as one long line.
                    raw = re.sub(r'\n+', '\n', raw)
                    raw = re.sub(r'\s+', ' ', raw)
                    page_blocks.append(f"\n=== Page {page_no} ===\n{raw}\n")
                except Exception as page_err:
                    print(f"Page {page_no} extraction failed: {page_err}")
                    continue

            text = "".join(page_blocks)
            print(f"Extracted text length: {len(text)} characters")
            return text
    except Exception as read_err:
        print(f"PDF reading failed: {read_err}")
        return None

def chunk_text(text, max_length=15000):
    """Split *text* into chunks of at most ``max_length`` characters.

    Chunks are built from whole lines where possible. A single line longer
    than ``max_length`` is hard-split into ``max_length``-sized slices so
    that no chunk ever exceeds the limit. Concatenating the returned chunks
    reproduces *text* exactly (no newline is added or dropped).

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk.

    Returns:
        A list of chunks; ``[text]`` when the text already fits.

    Raises:
        ValueError: If the text needs splitting and ``max_length`` is not
            positive.
    """
    if len(text) <= max_length:
        return [text]
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")

    # Keep each line's own newline attached so the split is lossless and the
    # newline is counted against the limit.
    lines = text.split('\n')
    pieces = [line + '\n' for line in lines[:-1]]
    if lines[-1]:
        pieces.append(lines[-1])

    chunks = []
    current = ""
    for piece in pieces:
        if len(piece) > max_length:
            # Oversized line: flush what we have, then slice the line.
            if current:
                chunks.append(current)
            slices = [piece[i:i + max_length]
                      for i in range(0, len(piece), max_length)]
            chunks.extend(slices[:-1])
            current = slices[-1]
        elif len(current) + len(piece) <= max_length:
            current += piece
        else:
            # `current` is non-empty here because `piece` alone fits.
            chunks.append(current)
            current = piece

    if current:
        chunks.append(current)

    return chunks

def extract_key_sections(pdf_text, before=500, after=2000):
    """Collect context windows around section keywords in *pdf_text*.

    For every keyword hit, a window from ``before`` characters ahead of the
    match to ``after`` characters past it is taken. Windows that overlap or
    touch are merged, so text around a cluster of hits appears once instead
    of once per hit.

    Args:
        pdf_text: Full document text; ``None`` or empty is treated as "".
        before: Characters of context kept before each match.
        after: Characters of context kept after each match.

    Returns:
        A dict mapping each section name to its windows joined by a blank
        line (an empty string when the section has no hits).
    """
    pdf_text = pdf_text or ""

    patterns = {
        'method': r'(?i)(method|methodology|experimental design|procedure|participants|subjects)',
        'materials': r'(?i)(materials|stimuli|apparatus|equipment)',
        'participants': r'(?i)(participants|subjects)',
        'procedure': r'(?i)(procedure|experimental procedure|task)',
        'results': r'(?i)(results|findings)',
        'experiment': r'(?i)(experiment \d+|study \d+)'
    }

    sections = {}
    text_length = len(pdf_text)
    for section_name, pattern in patterns.items():
        # finditer yields matches left to right, so window starts never
        # decrease and merging only needs to look at the previous window.
        windows = []
        for match in re.finditer(pattern, pdf_text):
            start = max(0, match.start() - before)
            end = min(text_length, match.end() + after)
            if windows and start <= windows[-1][1]:
                windows[-1][1] = max(windows[-1][1], end)
            else:
                windows.append([start, end])
        sections[section_name] = '\n\n'.join(
            pdf_text[start:end] for start, end in windows
        )

    return sections

def multi_stage_analysis(ai_client, pdf_text, standard_info):
    """Run a three-stage LLM analysis of a paper and return the final report.

    Stage 1 asks for a structural overview using only the first 8000
    characters of ``pdf_text``. Stage 2 splits the full text into chunks of
    at most 12000 characters (via ``chunk_text``) and asks for a parameter
    table per chunk, giving the model the overview, ``standard_info`` and
    the first 1000 characters of the keyword-extracted method, materials and
    procedure sections. Stage 3 asks the model to merge the overview and all
    retained chunk answers into one report.

    Args:
        ai_client: Object exposing ``analyze_with_retry(prompt)``.
        pdf_text: Full extracted text of the paper.
        standard_info: Description of the standard task, inserted verbatim
            into the stage 2 and stage 3 prompts.

    Returns:
        Whatever ``ai_client.analyze_with_retry`` returns for the
        integration prompt.
    """
    
    # Stage 1: Quick overview
    # Only the first 8000 characters of the paper are shown to the model here.
    overview_prompt = f"""
You are a cognitive psychology expert. Please quickly review the following literature content and answer these questions:

1. How many experiments does this literature contain?
2. Which experiments are Self-matching tasks?
3. What is the general design of each Self-matching task experiment?
4. Where in the literature are the participant information, equipment information, and stimulus parameters mainly located?

Literature content:
{pdf_text[:8000]}...(Content is long, only the first part is shown)

Please answer concisely, focusing on identifying the experimental structure.
"""
    
    print("Stage 1: Quick overview analysis...")
    # NOTE(review): if ai_client is an EnhancedAIClient, this is None when every
    # retry fails; the later prompts would then contain the literal text "None".
    overview = ai_client.analyze_with_retry(overview_prompt)
    print(f"Overview results:\n{overview}\n")
    
    # Stage 2: Detailed analysis
    detailed_results = []
    # Keyword-based context windows; computed once and reused for every chunk.
    key_sections = extract_key_sections(pdf_text)
    text_chunks = chunk_text(pdf_text, max_length=12000)
    
    # NOTE(review): the bracket characters around the section headers and the
    # degree sign in the prompts below look mis-encoded (mojibake) - confirm the
    # file encoding before editing these strings.
    for i, chunk in enumerate(text_chunks):
        detailed_prompt = f"""
You are a cognitive psychology experimental design expert specializing in Self-matching task research.

„ÄêOverview Information„Äë
{overview}

„ÄêStandard Self-matching Task Information„Äë
{standard_info}

„ÄêCurrent Literature Segment„Äë(Part {i+1} of {len(text_chunks)})
{chunk}

„ÄêKey Extracted Sections„Äë
Method section: {key_sections.get('method', 'Not found')[:1000]}
Materials section: {key_sections.get('materials', 'Not found')[:1000]}
Procedure section: {key_sections.get('procedure', 'Not found')[:1000]}

Please carefully analyze the Self-matching task experimental information in this segment. If this segment contains experimental details, please output in the following format:

**Experiment Number:** Exp_X (if you can determine which experiment)

| Field | Information |
|-------|-------------|
| Collected Date | |
| Location | |
| Setting | |
| Modality | |
| Fixation size | |
| Shape size | |
| Label size | |
| Stimulus order | Options: Simultaneously/Shape first/Label first |
| Shape-label interval | |
| Stimulus color | |
| Background color | |
| Equipment for presenting | |
| Monitor properties | |
| Software for experiment | |
| Viewing distance | |
| Fixation presentation duration | |
| Stimulus presentation duration | |
| Response deadline | |
| ITI | |
| Feedback duration | |
| Number of practice trials | |
| Block number | |
| Trial Number | |

**Key Findings:**
(Summarize important experimental parameters and design details found in this segment)

**CRITICAL EXTRACTION FOCUS:**
After completing the initial analysis, please re-examine the text specifically looking for:
1. Sections containing "Matching Task" or "matching task" or "shape-label matching"
2. Sections about "stimuli" or "stimulus materials" or "experimental materials"
3. Sections about "participants" or "subjects"
4. Sections about "procedure" or "experimental procedure"
5. Any mentions of fixation cross/point size, fixation cross dimensions, central fixation stimulus
6. Equipment specifications, monitor details, software used
7. Timing parameters (presentation durations, intervals, ITI)

**FIXATION SIZE EXTRACTION:**
Pay special attention to finding fixation size information, which may be described as:
- "fixation cross" size or dimensions
- "fixation point" size
- "central fixation" stimulus size
- "+" symbol dimensions
- Cross or plus sign measurements
- Central stimulus before shape-label presentation

Important Notes:
1. If a parameter is explicitly not mentioned in the literature, please fill in "Not mentioned" in the corresponding field
2. If the literature mentions "same as Experiment 1" or similar expressions, please fill in "Reference Exp1"
3. Extract precise values including units (pixels px, visual angles ¬∞, etc.)
4. For Fixation size: Look for any mention of fixation cross, fixation point, or central fixation stimulus dimensions
5. If this segment has no relevant experimental details, please reply "This segment has no relevant experimental details"
"""
        
        print(f"Stage 2: Detailed analysis Part {i+1}...")
        result = ai_client.analyze_with_retry(detailed_prompt)
        # Drop failed calls (falsy result) and answers containing the
        # "no relevant details" sentinel phrase requested in the prompt.
        if result and "This segment has no relevant experimental details" not in result:
            detailed_results.append(result)
    
    # Stage 3: Integration
    # chr(10) is a newline; it is used because a backslash is not allowed inside
    # an f-string expression before Python 3.12.
    # NOTE(review): the output template below spells out exactly Exp1 and Exp2.
    integration_prompt = f"""
You are a cognitive psychology expert. Now you need to integrate the previous analysis results and generate a final complete report.

„ÄêOverview Analysis„Äë
{overview}

„ÄêDetailed Analysis Results„Äë
{chr(10).join(detailed_results)}

„ÄêStandard Task Information„Äë
{standard_info}

Please integrate all information and generate a complete analysis report for each Self-matching task experiment. Important requirements:

1. Parameter inheritance rules: If Experiments 2, 3, etc. do not explicitly mention a parameter but Experiment 1 has this parameter, inherit the value from Experiment 1
2. Not mentioned marking: If all experiments do not mention a parameter, mark it as "Not mentioned"
3. Precise values: Retain exact values and units from the original text
4. DO NOT include "Differences from standard paradigm" field
5. For inherited parameters, DO NOT add "Inherited from Exp1:" prefix - just provide the clean value

Format EXACTLY as follows:

## Exp1
| Field | Information |
|-------|-------------|
| Collected Date | |
| Location | |
| Setting | |
| Modality | |
| Fixation size | |
| Shape size | |
| Label size | |
| Stimulus order | |
| Shape-label interval | |
| Stimulus color | |
| Background color | |
| Equipment for presenting | |
| Monitor properties | |
| Software for experiment | |
| Viewing distance | |
| Fixation presentation duration | |
| Stimulus presentation duration | |
| Response deadline | |
| ITI | |
| Feedback duration | |
| Number of practice trials | |
| Block number | |
| Trial Number | |

## Exp2
| Field | Information |
|-------|-------------|
| Collected Date | |
| Location | |
| Setting | |
| Modality | |
| Fixation size | |
| Shape size | |
| Label size | |
| Stimulus order | |
| Shape-label interval | |
| Stimulus color | |
| Background color | |
| Equipment for presenting | |
| Monitor properties | |
| Software for experiment | |
| Viewing distance | |
| Fixation presentation duration | |
| Stimulus presentation duration | |
| Response deadline | |
| ITI | |
| Feedback duration | |
| Number of practice trials | |
| Block number | |
| Trial Number | |

**Overall Assessment:**
(Evaluation and suggestions for the overall research design)
"""
    
    print("Stage 3: Information integration...")
    final_result = ai_client.analyze_with_retry(integration_prompt)
    return final_result

def process_parameter_inheritance(exp_dict):
    """Normalise each experiment's table, inheriting gaps from Experiment 1.

    For every known field the value is taken from the experiment's table.
    Missing or "Not mentioned" values, and values that explicitly refer back
    to Experiment 1, are replaced by Experiment 1's cleaned value when it has
    one, otherwise by "/". Everything else is passed through
    ``clean_parameter_value``. Progress is printed for debugging.

    Yes/no answers are matched as whole words: a plain substring test for
    "no" also matches "Not mentioned" and would fabricate a stimulus order
    or a 0 ms interval for parameters the paper never reported.

    Args:
        exp_dict: Mapping of experiment name (e.g. ``'exp1'``) to a dict with
            a ``'table'`` dict of string values and an optional ``'detail'``.

    Returns:
        A new dict with the same experiment names, each holding the processed
        ``'table'`` and the original ``'detail'`` (or "").
    """
    # Output fields, in the order they appear in the result tables.
    fields = [
        "Collected Date", "Location", "Setting", "Modality", "Fixation size", "Shape size",
        "Label size", "Stimulus order", "Shape-label interval", "Stimulus color",
        "Background color", "Equipment for presenting", "Monitor properties",
        "Software for experiment", "Viewing distance", "Fixation presentation duration",
        "Stimulus presentation duration", "Response deadline", "ITI",
        "Feedback duration", "Number of practice trials", "Block number",
        "Trial Number"
    ]
    missing_markers = ("Not mentioned", "Not explicitly mentioned", "/", "")
    # Older prompt versions asked this yes/no question instead of "Stimulus order".
    legacy_order_key = "Do Shape and Label appear simultaneously?"

    def has_word(text, word):
        """Return True when *word* occurs in *text* as a whole word."""
        return re.search(rf'\b{re.escape(word)}\b', text, re.IGNORECASE) is not None

    # Experiment 1 is the baseline that later experiments inherit from.
    exp1_params = {}
    if 'exp1' in exp_dict and 'table' in exp_dict['exp1']:
        exp1_params = exp_dict['exp1']['table']
        print(f"Experiment 1 parameters: {list(exp1_params.keys())}")

    def from_exp1(field):
        """Return Experiment 1's cleaned value for *field*, or "/" if unusable."""
        if field not in exp1_params:
            return "/"
        exp1_value = exp1_params[field].strip()
        if exp1_value and exp1_value not in missing_markers:
            return clean_parameter_value(exp1_value)
        return "/"

    processed_dict = {}
    for exp_name, exp_data in exp_dict.items():
        table = exp_data.get('table', {})
        processed_table = {}
        print(f"\nProcessing experiment: {exp_name}")

        for field in fields:
            if field == "Stimulus order":
                value = table.get(field, "") or table.get(legacy_order_key, "")
                # A literal "No" to the legacy question means sequential
                # presentation; only exp2 is mapped, as in the original code.
                if exp_name == 'exp2' and has_word(value, "no"):
                    value = "Shape first"
            elif field == "Shape-label interval":
                value = table.get(field, "")
                # "\u672a\u63d0\u53ca" is Chinese for "not mentioned".
                if not value or value.strip() in ["", "Not mentioned", "/", "\u672a\u63d0\u53ca"]:
                    stimulus_order = table.get("Stimulus order", "") or table.get(legacy_order_key, "")
                    if "simultaneously" in stimulus_order.lower() or has_word(stimulus_order, "yes"):
                        value = "0 ms"
                    elif has_word(stimulus_order, "no"):
                        # NOTE(review): 0 ms for sequential presentation is a
                        # carried-over assumption, not derived from the paper.
                        value = "0 ms"
                    else:
                        value = "/"
            else:
                value = table.get(field, "")

            print(f"  {field}: '{value}' \u2192 ", end="")

            stripped = value.strip() if value else ""
            if stripped in ("", "Not mentioned", "Not explicitly mentioned"):
                # Nothing usable here: later experiments inherit from exp1.
                result = from_exp1(field) if exp_name != 'exp1' else "/"
            elif "Reference Exp1" in value or "same as Experiment 1" in value or "inherited" in value:
                result = from_exp1(field)
            elif stripped == "/":
                # Already the canonical "missing" marker.
                result = "/"
            else:
                result = clean_parameter_value(value)

            processed_table[field] = result
            print(f"'{result}'")

        print(f"Before minimal processing - {exp_name}:")
        for key, value in processed_table.items():
            print(f"  {key}: '{value}'")

        # Second cleaning pass: catches prefixes that only become visible
        # after markdown markers were stripped in the first pass.
        for key, value in processed_table.items():
            if value and value != "/":
                processed_table[key] = clean_parameter_value(value)

        print(f"After minimal processing - {exp_name}:")
        for key, value in processed_table.items():
            print(f"  {key}: '{value}'")

        processed_dict[exp_name] = {
            'table': processed_table,
            'detail': exp_data.get('detail', '')
        }

    return processed_dict

def clean_parameter_value(value):
    """Strip inheritance prefixes and markdown bold markers from a value.

    Empty values and the "not mentioned" markers collapse to "/".
    """
    text = value.strip() if value else ""
    if text in ("", "/", "Not mentioned", "Not explicitly mentioned"):
        return "/"

    # Prefixes the model sometimes prepends despite being told not to.
    prefix_patterns = (
        r'Inherited from Exp\d+:\s*',
        r'Reference Exp\d+:\s*',
        r'Same as Experiment \d+:\s*',
    )
    for prefix_pattern in prefix_patterns:
        text = re.sub(prefix_pattern, '', text)

    # Drop markdown bold markers, then any whitespace they exposed.
    return text.replace('**', '').strip()

def format_time_parameter(value):
    """Normalise a duration to ``"<number> ms"`` (or ``"<low> - <high> ms"``).

    A bare number is assumed to be milliseconds. Values given in seconds
    ("2 s", "1.5 seconds") are converted to milliseconds rather than being
    relabelled. A range is kept only when its upper bound carries a unit
    (e.g. "1000-1500 ms"); otherwise the first number is used.

    Returns "/" for empty input, "/" itself, or text without any number.
    """
    if not value or value == "/":
        return "/"

    number = r'(\d+(?:\.\d+)?)'
    unit = r'\s*(milliseconds?|msecs?|ms|seconds?|secs?|s)\b'
    optional_unit = r'(?:' + unit + r')?'

    def to_ms(number_text, unit_text):
        # Every seconds-style unit starts with "s"; millisecond ones with "m".
        if unit_text and unit_text.lower().startswith('s'):
            millis = round(float(number_text) * 1000, 3)
            return str(int(millis)) if millis == int(millis) else str(millis)
        return number_text

    # \u2013 is an en dash, commonly used for ranges in papers.
    range_match = re.search(
        number + optional_unit + r'\s*[-\u2013]\s*' + number + unit,
        value, re.IGNORECASE)
    if range_match:
        low, low_unit, high, high_unit = range_match.groups()
        # "1-2 s": the lower bound shares the unit written after the upper.
        return f"{to_ms(low, low_unit or high_unit)} - {to_ms(high, high_unit)} ms"

    single_match = re.search(number + optional_unit, value, re.IGNORECASE)
    if single_match:
        return f"{to_ms(*single_match.groups())} ms"

    return "/"

def validate_and_format_parameters(table):
    """Return a copy of *table* with each value normalised for its field.

    Empty, "/" and "Not mentioned" values become "/". Other values are
    cleaned with ``clean_parameter_value`` and then handed to the formatter
    for that field; fields without a formatter keep the cleaned value.
    """
    size_fields = ("Shape size", "Label size")
    count_fields = ("Number of practice trials", "Block number", "Trial Number")
    descriptive_count_fields = ("Block number", "Trial Number")
    duration_fields = ("Fixation presentation duration", "Stimulus presentation duration",
                       "ITI", "Feedback duration")

    def mentions_any(text, keywords):
        lowered = text.lower()
        return any(keyword in lowered for keyword in keywords)

    def format_field(field, value):
        if field in size_fields:
            return format_size_parameter(value)
        if field == "Stimulus order":
            return format_stimulus_order(value)
        if field == "Equipment for presenting":
            # Full monitor descriptions are kept verbatim.
            if "monitor" in value.lower():
                return value
            return format_equipment(value)
        if field == "Monitor properties":
            # Keep descriptions that carry refresh-rate information.
            if mentions_any(value, ["refresh", "hz"]):
                return value
            return format_monitor_properties(value)
        if field == "Response deadline":
            if mentions_any(value, ["not applied", "until response"]):
                return value
            return format_response_deadline(value)
        if field in count_fields:
            # Block/trial descriptions such as "3 blocks, 360 total" stay as is.
            if field in descriptive_count_fields and mentions_any(value, ["practice", "block", "total"]):
                return value
            return format_number_only(value)
        if field == "Viewing distance":
            return format_viewing_distance(value)
        if field == "Shape-label interval":
            return format_time_parameter(value)
        if field in duration_fields:
            # Feedback-style descriptions are not reducible to one number.
            if mentions_any(value, ["until response", "correct:", "error:", "green", "red", "cross", "tick"]):
                return value
            return format_time_parameter(value)
        return value

    formatted_table = {}
    for field, raw_value in table.items():
        if not raw_value or raw_value.strip() in ("", "/", "Not mentioned"):
            formatted_table[field] = "/"
        else:
            formatted_table[field] = format_field(field, clean_parameter_value(raw_value))

    return formatted_table

def format_size_parameter(value):
    """Normalise a stimulus size (Shape size / Label size).

    Output forms:
      * ``"W x H pixels"`` / ``"W x H<deg>"`` for a single size,
      * ``"A - B<deg> x H<deg>"`` (or the pixel form) for an explicit range,
      * a min-max range when several degree sizes are listed,
    where ``x`` is the multiplication sign (U+00D7) and ``<deg>`` the degree
    sign (U+00B0). Non-ASCII characters are written as ``\\u`` escapes so the
    patterns do not depend on how the source file is encoded.

    Values naming specific labels (father/stranger/relative) or containing
    "~" are returned unchanged. Returns "/" when no size can be found.
    """
    if not value or value == "/":
        return "/"

    lowered = value.lower()
    # Per-label descriptions and approximations are kept verbatim.
    if any(keyword in lowered for keyword in ("father", "stranger", "relative", "close-relative")) or "~" in value:
        return value

    number = r'(\d+\.?\d*)'
    times = r'\s*[\u00d7x]\s*'
    degree = r'(?:\u00b0|degree)'
    has_degree = "\u00b0" in value or "degree" in lowered
    has_pixel = "pixel" in lowered or "px" in lowered

    # Explicit range on the first dimension, e.g. "1.5 - 2.0<deg> x 1.0<deg>".
    range_match = re.search(
        number + r'\s*[-\u2013]\s*' + number + r'\s*' + degree + r'?'
        + times + number + r'\s*' + degree + r'?',
        value, re.IGNORECASE)
    if range_match:
        start, end, height = range_match.groups()
        if has_degree:
            return f"{start} - {end}\u00b0 \u00d7 {height}\u00b0"
        if has_pixel:
            return f"{start} - {end} pixel \u00d7 {height} pixel"
        return f"{start} - {end}\u00b0 \u00d7 {height}\u00b0"  # Default to degrees

    # Single "W x H" sizes; unit-specific patterns are tried first so a
    # sized pair wins over an unrelated "N x M" earlier in the text.
    single_patterns = (
        # The pixel unit may also follow the first number ("100 px x 100 px").
        number + r'(?:\s*(?:pixels?|px))?' + times + number + r'\s*(?:pixel|px)',
        number + times + number + r'\s*' + degree,
        number + times + number,
    )
    for pattern in single_patterns:
        match = re.search(pattern, value, re.IGNORECASE)
        if match:
            width, height = match.groups()
            if has_pixel:
                return f"{width} \u00d7 {height} pixels"
            if has_degree:
                return f"{width} \u00d7 {height}\u00b0"
            # No unit given: pixels is the historical default here.
            return f"{width} \u00d7 {height} pixels"

    # Degree sizes with the unit after the first number ("3.8<deg> x 3.8<deg>").
    # "o" is accepted because PDF extraction can render the degree sign as "o".
    size_matches = re.findall(
        number + r'\s*(?:\u00b0|o)' + times + number + r'\s*(?:\u00b0|o)?',
        value, re.IGNORECASE)
    if len(size_matches) == 1:
        # One size only: keep the numbers exactly as written.
        width, height = size_matches[0]
        return f"{width}\u00b0 \u00d7 {height}\u00b0"
    if len(size_matches) >= 2:
        widths = [float(match[0]) for match in size_matches]
        heights = [float(match[1]) for match in size_matches]

        min_width, max_width = min(widths), max(widths)
        min_height, max_height = min(heights), max(heights)

        if min_width == max_width and min_height == max_height:
            return f"{min_width}\u00b0 \u00d7 {min_height}\u00b0"
        if min_height == max_height:
            return f"{min_width} - {max_width}\u00b0 \u00d7 {min_height}\u00b0"
        return f"{min_width} - {max_width}\u00b0 \u00d7 {min_height} - {max_height}\u00b0"

    return "/"

def format_stimulus_order(value):
    """Map a free-text answer to one of the three allowed stimulus orders.

    Returns "Simultaneously", "Shape first" or "Label first", or "/" when the
    text matches none of them. Legacy yes/no answers to "do shape and label
    appear simultaneously?" are accepted, but only as whole words, so that
    "Not mentioned" or "unknown" are not read as "no".
    """
    if not value or value == "/":
        return "/"

    value_lower = value.lower()

    def has_word(word):
        return re.search(rf'\b{word}\b', value_lower) is not None

    # The \u escapes are the Chinese keywords for "simultaneously",
    # "shape first" and "label first".
    if "simultaneously" in value_lower or "\u540c\u65f6" in value:
        return "Simultaneously"
    if "shape first" in value_lower or "\u56fe\u5f62\u5148" in value or ("shape" in value_lower and "first" in value_lower):
        return "Shape first"
    if "label first" in value_lower or "\u6807\u7b7e\u5148" in value or ("label" in value_lower and "first" in value_lower):
        return "Label first"
    if has_word("yes"):
        return "Simultaneously"
    if has_word("no"):
        return "Shape first"  # Assume shape first if sequential
    return "/"

def format_equipment(value):
    """Reduce an equipment description to ``'<size>" monitor'``.

    A number counts as a monitor size only when it is followed by an inch
    mark or the word "inch" (optionally hyphenated, as in "17-inch"). Other
    numbers such as "60 Hz" or "100 cm" are ignored. If no size is found but
    the text mentions a monitor, the text is returned unchanged; otherwise
    "/" is returned.
    """
    if not value or value == "/":
        return "/"

    inch_unit = r'(?:"|-?\s*inch(?:es)?\b)'
    monitor_patterns = [
        # Prefer a size that is directly tied to the word "monitor".
        r'(\d+\.?\d*)\s*' + inch_unit + r'.*?monitor',
        r'(\d+\.?\d*)"',
        r'(\d+\.?\d*)\s*-?\s*inch',
    ]

    for pattern in monitor_patterns:
        match = re.search(pattern, value, re.IGNORECASE)
        if match:
            size = match.group(1)
            return f'{size}" monitor'

    # If contains "monitor" but no size, keep as is
    if "monitor" in value.lower():
        return value

    return "/"

def format_monitor_properties(value):
    """Extract a screen resolution and return it as ``"W x H"``.

    The separator in the result is the multiplication sign (U+00D7), written
    as a ``\\u`` escape so it does not depend on the source file's encoding.
    Accepted input separators are the multiplication sign, "x"/"X", "*" and
    the word "by". Returns "/" when no resolution is found.
    """
    if not value or value == "/":
        return "/"

    # Symbol separators take priority over the word "by", as before.
    resolution_patterns = [
        r'(\d+)\s*[\u00d7x*]\s*(\d+)',
        r'(\d+)\s*by\s*(\d+)',
    ]

    for pattern in resolution_patterns:
        match = re.search(pattern, value, re.IGNORECASE)
        if match:
            width, height = match.groups()
            return f"{width} \u00d7 {height}"

    return "/"

def format_response_deadline(value):
    """Normalise a response deadline to ``"<n> ms"`` or ``"<a> - <b> ms"``.

    "No deadline"-style answers ("no", "none", "unlimited" as whole words,
    or "until response") yield "/". Whole-word matching matters: a substring
    test for "no" would also discard values that merely contain "not" or
    "noted". Values in seconds are converted to milliseconds; a bare number
    is assumed to be milliseconds. Returns "/" when no number is found.
    """
    if not value or value == "/":
        return "/"

    value_lower = value.lower()

    # Check for "no deadline" or similar
    if re.search(r'\b(?:no|none|unlimited)\b', value_lower) or "until response" in value_lower:
        return "/"

    number = r'(\d+(?:\.\d+)?)'
    unit = r'\s*(milliseconds?|msecs?|ms|seconds?|secs?|s)\b'

    def to_ms(number_text, unit_text):
        # Every seconds-style unit starts with "s"; millisecond ones with "m".
        if unit_text and unit_text.lower().startswith('s'):
            millis = round(float(number_text) * 1000, 3)
            return str(int(millis)) if millis == int(millis) else str(millis)
        return number_text

    # Time range such as "1000 - 1500ms"; \u2013 is an en dash.
    range_match = re.search(number + r'\s*[-\u2013]\s*' + number + unit, value, re.IGNORECASE)
    if range_match:
        start, end, range_unit = range_match.groups()
        return f"{to_ms(start, range_unit)} - {to_ms(end, range_unit)} ms"

    # Single value with an explicit unit
    time_match = re.search(number + unit, value, re.IGNORECASE)
    if time_match:
        return f"{to_ms(*time_match.groups())} ms"

    # Bare number (assume ms)
    number_match = re.search(number, value)
    if number_match:
        return f"{number_match.group(1)} ms"

    return "/"

def format_number_only(value):
    """Return the first run of digits found in *value*, or "/" if none."""
    if value and value != "/":
        digits = re.search(r'(\d+)', value)
        if digits is not None:
            return digits.group(1)
    return "/"

def format_viewing_distance(value):
    """Normalise a viewing distance to ``"<number> <unit>"``.

    Units are recognised in priority order cm, mm, m, inch, and the unit in
    the result always comes from the pattern that matched the number.
    Deciding the unit from a substring of the whole value turns e.g.
    "1 m from the dummy head" into "1 mm" (the "mm" in "dummy").

    Without a unit, a number after "approximately/about/around" becomes
    ``"~<number> cm"`` and any other number is assumed to be centimetres.
    Returns "/" when no number is found.
    """
    if not value or value == "/":
        return "/"

    number = r'(\d+\.?\d*)'
    # \b keeps "m" from matching "ms"/"min" and "in" from matching "into".
    unit_patterns = (
        (number + r'\s*(?:cm|centimet(?:er|re)s?)\b', "cm"),
        (number + r'\s*(?:mm|millimet(?:er|re)s?)\b', "mm"),
        (number + r'\s*(?:m|met(?:er|re)s?)\b', "m"),
        (number + r'\s*(?:(?:inch(?:es)?|in)\b|")', "inch"),
    )
    for pattern, unit in unit_patterns:
        match = re.search(pattern, value, re.IGNORECASE)
        if match:
            return f"{match.group(1)} {unit}"

    # Unit-less approximation, e.g. "about 60"
    approx_match = re.search(r'(?:approximately|about|around)\s*(\d+\.?\d*)', value, re.IGNORECASE)
    if approx_match:
        return f"~{approx_match.group(1)} cm"

    # Extract any number and assume cm
    number_match = re.search(number, value)
    if number_match:
        return f"{number_match.group(1)} cm"

    return "/"

def save_to_excel(exp_dict, pdf_path):
    """Write one workbook per experiment next to the source PDF.

    Each workbook is named ``<pdf stem>_<experiment>_analysis.xlsx`` and has
    an 'Experiment Parameters' sheet with a single row (one column per
    parameter). When the experiment has a non-empty 'detail', a second
    'Detailed Description' sheet is added. Returns the list of written paths.
    """
    source = Path(pdf_path)
    output_dir = source.parent
    stem = source.stem

    written = []
    for exp_name, exp_data in exp_dict.items():
        # One-row frame: every parameter is a column holding its value.
        params_df = pd.DataFrame(
            {param: [value] for param, value in exp_data['table'].items()}
        )
        target = output_dir / f"{stem}_{exp_name}_analysis.xlsx"

        with pd.ExcelWriter(target, engine='openpyxl') as writer:
            params_df.to_excel(writer, sheet_name='Experiment Parameters', index=False)

            if exp_data.get('detail'):
                detail_rows = [{'Detailed Description': exp_data['detail']}]
                pd.DataFrame(detail_rows).to_excel(
                    writer, sheet_name='Detailed Description', index=False
                )

        written.append(target)
        print(f"‚úÖ {exp_name} saved to: {target}")

    return written

def parse_markdown_table(md_text):
    """Parse a two-column markdown table into a ``{field: value}`` dict.

    Only lines starting with "|" are considered. Separator rows in any
    spelling (``|---|``, ``| --- | --- |``, ``|:--|--:|``) and the header row
    are skipped. Cells are read by position, so a row with an empty value
    cell is skipped instead of having a later cell shifted into the value.
    Progress is printed for debugging.

    Returns:
        Dict of field name to value; later rows overwrite earlier ones with
        the same field name.
    """
    print("Parsing markdown table...")
    table = {}

    # Header labels in English and Chinese ("field" / "information").
    header_fields = ("Field", "\u5b57\u6bb5")
    header_values = ("Information", "\u4fe1\u606f")

    for raw_line in md_text.splitlines():
        line = raw_line.strip()
        if not line.startswith('|'):
            continue
        # Separator rows consist solely of pipes, dashes, colons and spaces.
        if re.fullmatch(r'[|\s:\-]+', line):
            continue

        # Drop the outer pipes, then split: cell positions are preserved.
        cells = [cell.strip() for cell in line.strip('|').split('|')]
        if len(cells) < 2:
            continue

        field, value = cells[0], cells[1]
        if not field or not value:
            continue
        if field in header_fields or value in header_values:
            continue

        table[field] = value
        print(f"  Parsed field: '{field}' = '{value}'")

    print(f"Total parsed {len(table)} fields")
    return table

def extract_experiments(md_text):
    """Split the AI's markdown answer into per-experiment tables and details.

    Three strategies are tried in order:

    1. Split on ``## ExpN`` / ``## Experiment N`` / ``## Expt. N`` headings
       that stand alone on their line (optionally followed by a colon).
    2. If no such heading splits the text, look for the same headings with
       arbitrary trailing text and slice the document between them.
    3. If no heading is found at all, treat the whole text as one experiment.

    Whatever the heading style, experiments are keyed ``exp<N>`` (``exp1``,
    ``exp2``, ...).

    Args:
        md_text (str): Markdown text returned by the AI analysis.

    Returns:
        dict: ``{"exp<N>": {"table": dict, "detail": str}}``; always contains
        at least one entry.
    """
    print("Parsing experiment data...")
    print(f"Original text length: {len(md_text)}")

    # Each pattern has exactly one capture group (the heading name), so
    # re.split yields [preface, name1, body1, name2, body2, ...].
    split_patterns = [
        r'##\s*(Exp\d+)[：: ]*\n',
        r'##\s*(Experiment\s*\d+)[：: ]*\n',
        r'##\s*(Expt\.?\s*\d+)[：: ]*\n',
    ]

    exp_blocks = [md_text]
    for pattern in split_patterns:
        exp_blocks = re.split(pattern, md_text)
        if len(exp_blocks) > 1:
            print(f"Successfully split using pattern: {pattern}")
            break

    exps = {}

    if len(exp_blocks) > 1:
        print(f"Number of blocks after splitting: {len(exp_blocks)}")

        # Block 0 is the preface; after it names and bodies alternate.
        for raw_name, exp_content in zip(exp_blocks[1::2], exp_blocks[2::2]):
            # Key on the experiment number only, so "Exp1", "Experiment 1"
            # and "Expt. 1" all become "exp1".
            number = re.search(r'\d+', raw_name)
            exp_name = f"exp{int(number.group())}" if number else 'exp1'

            print(f"\nProcessing experiment: {exp_name}")
            print(f"Content length: {len(exp_content)}")

            exps[exp_name] = {
                "table": extract_table_from_content(exp_content),
                "detail": extract_detail_from_content(exp_content),
            }
    else:
        print("Could not split into multiple experiments, trying alternative approach...")
        heading = r'##\s*(?:Exp|Experiment|Expt\.?)\s*'
        flags = re.DOTALL | re.IGNORECASE

        # Every experiment number that has a heading, in ascending order.
        found = re.findall(heading + r'(\d+)', md_text, re.IGNORECASE)
        numbers = sorted({int(digits) for digits in found})

        for position, number in enumerate(numbers):
            # A section runs from its heading to the next experiment's
            # heading or the end of the text. (?!\d) stops "1" from matching
            # the prefix of "10".
            stops = []
            if position + 1 < len(numbers):
                stops.append(heading + '0*' + str(numbers[position + 1]) + r'(?!\d)')
            if number > 1:
                # Sections after experiment 1 also end at the first
                # "Overall" (case-insensitive) - presumably a trailing
                # overall-summary section; experiment 1 is left untrimmed.
                stops.append('Overall')
            stops.append('$')

            section_pattern = (
                '(' + heading + '0*' + str(number) + r'(?!\d).*?)'
                + '(?=' + '|'.join(stops) + ')'
            )
            section = re.search(section_pattern, md_text, flags)
            if not section:
                continue

            content = section.group(1)
            print(f"Found Exp{number} content, length: {len(content)}")
            exps[f"exp{number}"] = {
                "table": extract_table_from_content(content),
                "detail": extract_detail_from_content(content),
            }

        if not exps:
            print("Fallback: treating as single experiment")
            table = parse_markdown_table(md_text)
            detail_match = re.search(r"\*\*.*?Description.*?\*\*\s*(.*)", md_text, re.DOTALL)
            detail = detail_match.group(1).strip() if detail_match else ""
            exps["exp1"] = {"table": table, "detail": detail}

    print(f"\nFinally extracted {len(exps)} experiments: {list(exps.keys())}")
    return exps

def extract_table_from_content(content):
    """Locate the parameter table in one experiment's text and parse it.

    Patterns are tried from most to least specific: a table headed
    ``| Field | Information |``, one headed ``| 字段 | 信息 |``, and finally
    any run of two-column pipe rows. The first pattern that matches decides
    the result; its matched text is handed to ``parse_markdown_table``.

    Args:
        content (str): Markdown text of a single experiment section.

    Returns:
        dict: Parsed ``{field: value}`` pairs, or ``{}`` when no table-like
        text is found.
    """
    # Shared tail: an optional delimiter row followed by one or more
    # two-column rows.
    rows = r'(\|[-\s|]+\|\s*\n)?((?:\|[^|]*\|[^|]*\|\s*\n?)+)'
    table_patterns = [
        r'\|\s*Field\s*\|\s*Information\s*\|.*?\n' + rows,
        r'\|\s*字段\s*\|\s*信息\s*\|.*?\n' + rows,
        r'(\|[^|]*\|[^|]*\|\s*\n(\|[-\s|]+\|\s*\n)?(?:\|[^|]*\|[^|]*\|\s*\n?)+)',
    ]

    for pattern in table_patterns:
        table_match = re.search(pattern, content, re.DOTALL | re.IGNORECASE)
        if not table_match:
            continue

        # The whole match (header included) is passed on; the parser drops
        # header and delimiter rows itself.
        table_text = table_match.group(0)
        print(f"Found table using pattern, length: {len(table_text)}")
        table = parse_markdown_table(table_text)
        print(f"Parsed {len(table)} fields")
        return table

    print("No table found in content")
    return {}

def extract_detail_from_content(content):
    """Extract the free-text detailed description from one experiment's text.

    Recognised introductions, tried in order:

    1. a bold heading on one line containing "Description"
       (e.g. ``**Detailed Description:**``),
    2. a bold heading starting with ``详细说明``,
    3. a plain ``Description:`` / ``详细说明：`` label.

    The description runs up to the next bold line, a ``---`` rule, the next
    ``##`` heading, or the end of the text, whichever comes first.

    Args:
        content (str): Markdown text of a single experiment section.

    Returns:
        str: The stripped description, or ``""`` if none is found.
    """
    terminator = r"(?=\n\*\*|---|\n##|$)"
    detail_patterns = [
        # [^*\n]* keeps the bold heading on a single line, so the match
        # cannot start at an unrelated earlier "**" and latch onto a
        # "Description" that appears in a table cell in between.
        r"\*\*[^*\n]*Description[^*\n]*\*\*\s*(.*?)" + terminator,
        r"\*\*详细说明[^*\n]*\*\*\s*(.*?)" + terminator,
        r"(?:Description|详细说明)[：:]\s*(.*?)" + terminator,
    ]

    for pattern in detail_patterns:
        detail_match = re.search(pattern, content, re.DOTALL | re.IGNORECASE)
        if detail_match:
            detail = detail_match.group(1).strip()
            print(f"Found detailed description, length: {len(detail)}")
            return detail

    return ""

# NOTE(security): the default api_key below is a credential committed to
# source. It should be revoked and supplied by the caller (for example from
# an environment variable); the default is kept only so existing
# zero-argument call sites keep working.
def analyze_self_matching_pdf(pdf_path, api_key="sk-5d29d3906e6b4ce8a0313cf552930711", base_url="https://api.deepseek.com"):
    """
    Main function to analyze Self-matching task PDF literature

    Extracts the PDF text, runs the multi-stage AI analysis, parses the
    answer into per-experiment tables, applies parameter inheritance, and
    writes the result as ``<pdf stem>_enhanced_analysis.json`` next to the
    PDF plus the Excel files produced by ``save_to_excel``.

    Args:
        pdf_path (str): Path to the PDF file
        api_key (str): API key for the OpenAI-compatible endpoint at base_url
        base_url (str): API base URL

    Returns:
        dict: Processed experiment data keyed by experiment name, or None if
        the file is missing, no text could be extracted, or the AI analysis
        returned nothing
    """
    if not Path(pdf_path).exists():
        print(f"File does not exist: {pdf_path}")
        return None

    print("=== Complete Self-matching Task Literature Analyzer ===")
    print(f"Analyzing file: {pdf_path}")
    print("-" * 70)

    print("Extracting PDF full text (enhanced version)...")
    pdf_text = extract_text_from_pdf_enhanced(pdf_path)
    if not pdf_text:
        return None

    # Initialize AI client
    ai_client = EnhancedAIClient(api_key=api_key, base_url=base_url)

    # Define standard task information (background knowledge passed to the model)
    standard_info = """
    【Standard Self-matching Task Introduction】
    The Self-matching task is a classic cognitive psychology experimental paradigm established by Sui et al. (2012) to study the processing advantage effect of self-related information.
    
    Standard task design:
    1. Learning phase: Participants learn arbitrary associations between three geometric shapes (such as circle, triangle, square, trapezoid, diamond) and three personal labels ("you", "friend", "stranger")
    2. Test phase: Participants judge whether the presented shape-label pairs match the previously learned associations
    
    Typical stimulus parameters:
    - Fixation size: Usually a small cross or point, often 0.5° × 0.5° or similar small dimensions
    - Shape size: Units may be visual angles ° or pixels px
    - Label size: Units may be visual angles ° or pixels px
    - Presentation time: 100-200ms (brief presentation)
    - Stimulus color: Usually white stimuli on gray background, generally default shape and label colors are consistent
    - Presentation order: Shape and label appear simultaneously, shape above, label below, with fixation point in between
    - Parameter inheritance: If subsequent experiments do not explicitly mention a parameter, they usually inherit the settings from the first experiment
    
    CRITICAL TIMING PARAMETERS - DO NOT CONFUSE:
    - **Shape-label interval**: Time gap between shape and label presentation WITHIN one trial
      * 0 ms = simultaneous presentation
      * >0 ms = sequential presentation (e.g., shape appears, then after X ms, label appears)
      * Example: "shape for 100ms, then 200ms blank, then label" → Shape-label interval = 200ms
    - **ITI (Inter-Trial Interval)**: Time gap between END of one trial and START of next trial
      * Time between trials, not within trials
      * Example: "trials separated by 1000ms" → ITI = 1000ms
    - **Stimulus presentation duration**: How long each stimulus is displayed
    - **Fixation presentation duration**: How long fixation cross is shown before stimuli
    
    CRITICAL STIMULUS PARAMETERS - EXTRACT CAREFULLY:
    - **Fixation size**: Look for "fixation cross", "fixation point", "central fixation", "+" symbol dimensions, cross measurements
    - **Shape size**: Geometric shapes (circle, triangle, square, etc.) dimensions
    - **Label size**: Text labels ("you", "friend", "stranger", etc.) dimensions
    - All sizes may be reported in visual angles (°) or pixels (px)
    
    FIXATION CROSS IDENTIFICATION:
    - Fixation cross is typically a small "+" symbol presented at screen center before stimuli
    - May be described as "fixation cross", "fixation point", "central cross", or "+" symbol
    - Often has dimensions like 0.5° × 0.5°, 10 × 10 pixels, or similar small measurements
    - Usually white or black color on background
    - Presented for 500ms or similar duration before each trial
    
    【Standard Experimental Procedure】
    1. Instruction phase: Explain task rules
    2. Learning phase: Learn shape-label associations
    3. Practice phase: Conduct practice trials (usually 24 trials or more)
    4. Formal experimental phase:
       - Present shape-label pairs (50% matching/mismatching each)
       - Participants make quick and accurate key responses (usually F/J keys)
       - Record reaction time and accuracy
    5. Multiple experimental blocks: Usually 2-6 blocks, about 120 trials per block
    
    【Standard Experimental Conditions】
    - Self condition: Shape-label pairs related to "you"
    - Friend condition: Shape-label pairs related to "friend"
    - Stranger condition: Shape-label pairs related to "stranger"
    - Match/mismatch condition: Whether shape-label pairs conform to learned associations
    """

    # Use multi-stage analysis
    print("\nStarting multi-stage AI analysis...")
    result = multi_stage_analysis(ai_client, pdf_text, standard_info)

    if not result:
        print("❌ Analysis failed, please check API call and PDF content.")
        return None

    print("\n=== AI Expert Analysis Results ===")
    # Long answers are previewed only; the full text is still parsed below.
    print((result[:2000] + "...") if len(result) > 2000 else result)

    # Extract all experiments
    print("\n=== Starting Experiment Data Extraction ===")
    exp_dict = extract_experiments(result)

    print("\n=== Original Extraction Results ===")
    for exp_name, exp_data in exp_dict.items():
        print(f"{exp_name}: {len(exp_data['table'])} fields")
        for key, value in list(exp_data['table'].items())[:3]:  # Show first 3 fields as example
            print(f"  {key}: {value}")

    # Process parameter inheritance and not mentioned markings
    print("\n=== Starting Parameter Inheritance Processing ===")
    processed_exp_dict = process_parameter_inheritance(exp_dict)

    # Save as JSON. The name is derived from the file stem rather than by
    # replacing ".pdf" in the path string: a ".PDF" (or any other) suffix
    # would leave the string unchanged and the JSON would overwrite the
    # source PDF, and a directory containing ".pdf" would be rewritten too.
    pdf_file = Path(pdf_path)
    output_file = pdf_file.with_name(f"{pdf_file.stem}_enhanced_analysis.json")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(processed_exp_dict, f, ensure_ascii=False, indent=2)
    print(f"JSON results saved to: {output_file}")

    # Save as Excel files
    print("\nGenerating Excel files...")
    excel_files = save_to_excel(processed_exp_dict, pdf_path)

    # Print brief statistics
    print("\n=== Analysis Statistics ===")
    print(f"Identified {len(processed_exp_dict)} experiments")
    for exp_name, exp_data in processed_exp_dict.items():
        # "/" and empty values count as unfilled.
        filled_fields = sum(1 for v in exp_data['table'].values() if v and v != "/")
        total_fields = len(exp_data['table'])
        print(f"{exp_name}: filled {filled_fields}/{total_fields} fields")

    print(f"\nGenerated {len(excel_files)} Excel files:")
    for file in excel_files:
        print(f"  📊 {file}")

    return processed_exp_dict

In [9]:
if __name__ == "__main__":
    # Simply modify this path to analyze different PDFs
    # (non-ASCII path segments restored from mis-decoded UTF-8)
    pdf_path = r"C:\Users\蔡振辛\Desktop\项目\他人项目\王琪惠\Self_Database\Self_Database\Datasets\t28_Amodeo_2024_CABN（无）\Source\Paper_t28.pdf"

    # Run analysis; a falsy result (None or an empty dict) counts as failure
    results = analyze_self_matching_pdf(pdf_path)

    if results:
        print("\n✅ Analysis completed successfully!")
    else:
        print("\n❌ Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: C:\Users\蔡振辛\Desktop\项目\他人项目\王琪惠\Self_Database\Self_Database\Datasets\t28_Amodeo_2024_CABN（无）\Source\Paper_t28.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 17
Extracted text length: 75019 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on a quick review of the provided text, here are the answers to your questions:

1.  **Number of Experiments:** The literature describes **one** experiment.

2.  **Self-matching Tasks:** The experiment uses the **Shape-label matching task**.

3.  **General Design of the Self-matching Task:**
    *   **Learning Phase:** Participants first learn to pair geometric shapes with either self-related (e.g., "your own") or other-related (e.g., "stranger's") labels.
    *   **Testing Phase:** Participants are the

In [21]:
if __name__ == "__main__":
    # Simply modify this path to analyze different PDFs
    pdf_path = r"D:/xiazai/Self_Database/Datasets/n4_Marius Golubickis_2021_AP/Source/n4.pdf"

    # Run analysis; a falsy result (None or an empty dict) counts as failure
    results = analyze_self_matching_pdf(pdf_path)

    if results:
        print("\n✅ Analysis completed successfully!")
    else:
        print("\n❌ Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/n4_Marius Golubickis_2021_AP/Source/n4.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 8
Extracted text length: 44743 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on the provided content, here are the answers to your questions:

1.  **Two experiments.**

2.  **Both Experiment 1 and Experiment 2** are self-matching tasks.

3.  **The general design** for both experiments is a **shape-label matching task**. Participants first learn associations between geometric shapes (e.g., triangle, square, circle) and person labels (e.g., "you", "friend", "stranger"). They then perform a task where they must judge whether subsequently presented shape-label pairs match the learned associations. The key manipulation is whether the stimuli are presen

In [57]:
if __name__ == "__main__":
    # Simply modify this path to analyze different PDFs
    pdf_path = r"D:/xiazai/Self_Database/Datasets/n13_Mayan Navon_2021/Source/n13.pdf"

    # Run analysis; a falsy result (None or an empty dict) counts as failure
    results = analyze_self_matching_pdf(pdf_path)

    if results:
        print("\n✅ Analysis completed successfully!")
    else:
        print("\n❌ Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/n13_Mayan Navon_2021/Source/n13.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 32
Extracted text length: 55801 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on the provided content:

1.  The literature contains **4 experiments**.

2.  All four experiments (Experiments 1, 2, 3, and 4) are **Self-matching tasks** (specifically, shape-label matching tasks).

3.  The general design of each Self-matching task experiment is:
    *   Geometric shapes (circle, square, triangle) are verbally associated with different labels (e.g., self, friend, stranger, father).
    *   Participants then perform a matching task where they must indicate whether a presented shape-label pair matches the association they learned.

4.  The participant informati

In [23]:
if __name__ == "__main__":
    # Simply modify this path to analyze different PDFs
    pdf_path = r"D:/xiazai/Self_Database/Datasets/n14_Orellana-Corrales_2021_EP/Source/n14.pdf"

    # Run analysis; a falsy result (None or an empty dict) counts as failure
    results = analyze_self_matching_pdf(pdf_path)

    if results:
        print("\n✅ Analysis completed successfully!")
    else:
        print("\n❌ Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/n14_Orellana-Corrales_2021_EP/Source/n14.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 15
Extracted text length: 79110 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on a quick review of the provided content:

1.  The literature contains **two studies** (Study 1 and Study 2). Each study is comprised of multiple tasks. The text mentions a "matching task" and a "dot-probe task" being implemented in Study 1, suggesting it contains at least two experiments. Study 2 also implemented these tasks to compare modalities. Therefore, the literature contains **at least four experiments** (two tasks x two studies).

2.  The **Self-matching task** (or shape-label matching task) is used in **both Study 1 and Study 2**.

3.  The general design of 

In [24]:
if __name__ == "__main__":
    # Simply modify this path to analyze different PDFs
    pdf_path = r"D:/xiazai/Self_Database/Datasets/n16_Saga L. Svensson_2021_PR/Source/n16.pdf"

    # Run analysis; a falsy result (None or an empty dict) counts as failure
    results = analyze_self_matching_pdf(pdf_path)

    if results:
        print("\n✅ Analysis completed successfully!")
    else:
        print("\n❌ Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/n16_Saga L. Svensson_2021_PR/Source/n16.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 20
Extracted text length: 100215 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on the provided text, here are the answers to your questions:

1.  **3 experiments.** (Expt. 1, Expt. 2, and Expt. 3 are mentioned).

2.  **All three experiments** are Self-matching tasks. The literature states the core task is "a matching task" where participants report if shape-label pairs match learned associations (e.g., triangle = self, square = friend).

3.  **General Design:** The general design for each Self-matching task experiment involves:
    *   **Learning Phase:** Participants learn associations between geometric shapes and social labels (e.g., self, frie

In [25]:
if __name__ == "__main__":
    # Simply modify this path to analyze different PDFs
    pdf_path = r"D:/xiazai/Self_Database/Datasets/n23_Xu Yang_2021_CP/Source/n23.pdf"

    # Run analysis; a falsy result (None or an empty dict) counts as failure
    results = analyze_self_matching_pdf(pdf_path)

    if results:
        print("\n✅ Analysis completed successfully!")
    else:
        print("\n❌ Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/n23_Xu Yang_2021_CP/Source/n23.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 13
Extracted text length: 61925 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on a quick review of the provided text:

1.  The literature contains **one experiment**.

2.  The Self-matching task is used in this single experiment. The label-shape pair task is the self-matching task.

3.  **General Design:** After receiving romantic feedback (acceptance/rejection) from 30 opposite-sex individuals, participants completed a label-shape matching task. They were shown shapes (triangle, circle, square) and labels (“self,” “friend,” “stranger”) and had to judge whether the pair was matched or mismatched.

4.  **Participant information** (51 men, 54 wo

In [26]:
if __name__ == "__main__":
    # Simply modify this path to analyze different PDFs
    pdf_path = r"D:/xiazai/Self_Database/Datasets/p5_Constable_2019_EPHPP/Source/Constable_2019_EPHPP.pdf"

    # Run analysis; a falsy result (None or an empty dict) counts as failure
    results = analyze_self_matching_pdf(pdf_path)

    if results:
        print("\n✅ Analysis completed successfully!")
    else:
        print("\n❌ Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/p5_Constable_2019_EPHPP/Source/Constable_2019_EPHPP.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 13
Extracted text length: 72876 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on the provided content, here is a concise summary of the experimental structure.

1.  **The literature contains four experiments.**

2.  **Experiments 1, 2, and 3 are Self-matching tasks.** The fourth experiment uses a different design.

3.  **The general design of the Self-matching task experiments** is a modified version of the shape–label matching task by Sui, He, and Humphreys (2012). In this task, participants must judge whether a presented shape and a presented label (e.g., "we", "they", "you", "I") match according to the associations they learned.


In [27]:
if __name__ == "__main__":
    # Simply modify this path to analyze different PDFs
    pdf_path = r"D:/xiazai/Self_Database/Datasets/p6_Constable_2020_AP/Source/Sticking together Re-binding previous other-associated stimuli interferes.pdf"

    # Run analysis; a falsy result (None or an empty dict) counts as failure
    results = analyze_self_matching_pdf(pdf_path)

    if results:
        print("\n✅ Analysis completed successfully!")
    else:
        print("\n❌ Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/p6_Constable_2020_AP/Source/Sticking together Re-binding previous other-associated stimuli interferes.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 6
Extracted text length: 44759 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on a quick review of the provided text, here are the answers to your questions.

1.  **One.** The literature describes a single experiment. The abstract states "the present experiment was developed..." and the text refers to it as "the present work" and "the present experiment."

2.  **The main task is a Self-matching task.** The core task used is described as "a standard self-prioritisation task" and "a shape-to-label matching task," which is a variant of the Self-matching task paradigm.

3.  **General Design

In [30]:
if __name__ == "__main__":
    # Simply modify this path to analyze different PDFs
    pdf_path = r"D:/xiazai/Self_Database/Datasets/P19_Bukowski_2021_AP/Source/19.pdf"

    # Run analysis; a falsy result (None or an empty dict) counts as failure
    results = analyze_self_matching_pdf(pdf_path)

    if results:
        print("\n✅ Analysis completed successfully!")
    else:
        print("\n❌ Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/P19_Bukowski_2021_AP/Source/19.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 14
Extracted text length: 97941 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on the provided text:

1.  **Number of experiments:** The literature contains **two experiments**.

2.  **Self-matching tasks:** The **Shape-matching task (Sui & Humphreys, 2012)** is the self-matching task.

3.  **General design of the Self-matching task:** The Shape-matching task is described as a **perceptual matching task measuring perceptual self-salience**.

4.  **Location of key information:** The participant information, equipment information, and stimulus parameters are **not present in the provided text excerpts**. This information is typically found in a dedicated **"

In [33]:
if __name__ == "__main__":
    # Simply modify this path to analyze different PDFs
    pdf_path = r"D:/xiazai/Self_Database/Datasets/p20_Hu_2020_CP/Source/20_n.pdf"

    # Run analysis; a falsy result (None or an empty dict) counts as failure
    results = analyze_self_matching_pdf(pdf_path)

    if results:
        print("\n✅ Analysis completed successfully!")
    else:
        print("\n❌ Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/p20_Hu_2020_CP/Source/20_n.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 13
Extracted text length: 63280 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on the provided text, here are the answers to your questions:

1.  **Number of Experiments:** The literature contains **two experiments**.

2.  **Self-matching Tasks:** Both **Experiment 1 and Experiment 2** are Self-matching tasks.

3.  **General Design of Each Self-matching Task:**
    *   **General Procedure:** Both experiments used a standard shape-label association task.
    *   **Phase 1 - Learning:** Participants first learned associations between geometric shapes (e.g., circle, triangle, diamond, square) and personal labels representing different valenced identities: **Good-

In [34]:
if __name__ == "__main__":
    # Simply modify this path to analyze different PDFs
    pdf_path = r"D:/xiazai/Self_Database/Datasets/p34_Kolvoort_2020_HBM/Source/34.pdf"

    # Run analysis; a falsy result (None or an empty dict) counts as failure
    results = analyze_self_matching_pdf(pdf_path)

    if results:
        print("\n✅ Analysis completed successfully!")
    else:
        print("\n❌ Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/p34_Kolvoort_2020_HBM/Source/34.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 20
Extracted text length: 89747 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on the provided content (primarily the abstract and introduction), here is a concise review focusing on the experimental structure.

1.  **Number of Experiments:** The literature contains **one experiment**. This experiment has two main parts: a behavioral self-matching task and a resting-state EEG recording.

2.  **Self-matching Tasks:** The self-matching task is the core behavioral paradigm used in the **single experiment**.

3.  **General Design of the Self-matching Task:** The design is a modified version of a well-established self-matching paradigm. The key modification wa

In [36]:
if __name__ == "__main__":
    # Simply modify this path to analyze different PDFs
    pdf_path = r"D:/xiazai/Self_Database/Datasets/p45_Mcivor_2020_EJN/Source/45.pdf"

    # Run analysis; a falsy result (None or an empty dict) counts as failure
    results = analyze_self_matching_pdf(pdf_path)

    if results:
        print("\n✅ Analysis completed successfully!")
    else:
        print("\n❌ Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/p45_Mcivor_2020_EJN/Source/45.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 19
Extracted text length: 66625 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on a quick review of the provided content:

1.  The literature contains **three experiments** (mentioned in the abstract and introduction of the full paper, though only the first page of the methods is shown here).

2.  **All three experiments** are Self-matching tasks. The text states: "We examined... using a perceptual matching task" and later refers to "three experiments using the matching task."

3.  The general design of each Self-matching task experiment is:
    *   **Task:** A perceptual matching task.
    *   **Stimuli:** Geometric shapes are arbitrarily assigned to perso

In [37]:
if __name__ == "__main__":
    # Simply modify this path to analyze different PDFs
    pdf_path = r"D:/xiazai/Self_Database/Datasets/p46_Merryn Dale Constable_2020_CE/Source/46.pdf"

    # Run analysis; a falsy result (None or an empty dict) counts as failure
    results = analyze_self_matching_pdf(pdf_path)

    if results:
        print("\n✅ Analysis completed successfully!")
    else:
        print("\n❌ Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/p46_Merryn Dale Constable_2020_CE/Source/46.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 15
Extracted text length: 66400 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on the provided text:

1.  **Number of Experiments:** The literature contains **two experiments**.

2.  **Self-matching Tasks:** Both **Experiment 1** and **Experiment 2** are self-matching tasks.

3.  **General Design of Self-matching Tasks:** In both experiments, participants were first asked to **associate themselves with either a positive or a negative concept**. They then performed a matching task where they had to **indicate if a given stimulus and an identity label (e.g., "self") matched**.
    *   **Experiment 1** used **emotional faces** as the stimuli.
   

In [38]:
if __name__ == "__main__":
    # Simply modify this path to analyze different PDFs
    pdf_path = r"D:/xiazai/Self_Database/Datasets/p51_Haoyue Qian_2019_QJEP/Source/51.pdf"

    # Run analysis; a falsy result (None or an empty dict) counts as failure
    results = analyze_self_matching_pdf(pdf_path)

    if results:
        print("\n✅ Analysis completed successfully!")
    else:
        print("\n❌ Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/p51_Haoyue Qian_2019_QJEP/Source/51.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 10
Extracted text length: 48935 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on the provided text:

1.  **Three experiments** are contained in this literature.

2.  **Experiments 1 and 2** are Self-matching tasks (specifically, the label–shape matching task).

3.  **General design of the Self-matching task experiments:**
    *   **Experiment 1:** A label–shape matching task was performed under four different induced mood states (happiness, anxiety, serenity, depression). Response times (RTs) to shapes associated with the self were compared to those associated with a celebrity and an unknown individual.
    *   **Experiment 2:** The same label–

In [39]:
if __name__ == "__main__":
    # Simply modify this path to analyze different PDFs
    # (the "ä" in the folder name is restored from mis-decoded UTF-8)
    pdf_path = r"D:/xiazai/Self_Database/Datasets/p54_Sarah Schäfer_2019_CP/Source/54.pdf"

    # Run analysis; a falsy result (None or an empty dict) counts as failure
    results = analyze_self_matching_pdf(pdf_path)

    if results:
        print("\n✅ Analysis completed successfully!")
    else:
        print("\n❌ Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/p54_Sarah Schäfer_2019_CP/Source/54.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 13
Extracted text length: 52756 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on the provided content, here is a concise analysis of the experimental structure:

1.  The literature contains **three experiments**.

2.  All three experiments are **Self-matching tasks**. The text states: "we conducted the following study" and then describes "three experiments using the matching task paradigm."

3.  The general design of each Self-matching task experiment is:
    *   **Phase 1 (Association):** Participants learn to associate formerly neutral geometric shapes (e.g., a circle, triangle, square) with labels for "self," "friend," and "stranger."
    *   **P

In [40]:
if __name__ == "__main__":
    # Target PDF for this run; edit this path to analyze a different paper
    pdf_path = r"D:/xiazai/Self_Database/Datasets/p95_Mateusz Wo≈∫niak_2018_PLOS/Source/95.pdf"

    # Run the analyzer (analyze_self_matching_pdf is defined elsewhere in this notebook)
    results = analyze_self_matching_pdf(pdf_path)

    # Report success when the analyzer returned a truthy result, failure otherwise
    print("\n‚úÖ Analysis completed successfully!" if results else "\n‚ùå Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/p95_Mateusz Wo≈∫niak_2018_PLOS/Source/95.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 22
Extracted text length: 79308 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on a quick review of the provided literature content:

1.  **Number of Experiments:** The literature contains **two** experiments.

2.  **Self-matching Tasks:** Both **Experiment 1** and **Experiment 2** are self-matching tasks.

3.  **General Design of Each Self-matching Task:**
    *   **General Procedure:** In both experiments, participants first learned to associate three unfamiliar faces with verbal labels ("you," "friend," "stranger"). They then performed a matching task where two stimuli were presented in succession with a 1500ms inter-stimulus interval (ISI), a

In [41]:
if __name__ == "__main__":
    # Target PDF for this run; edit this path to analyze a different paper
    pdf_path = r"D:/xiazai/Self_Database/Datasets/s1_Liu_2023_CP/Source/s1.pdf"

    # Run the analyzer (analyze_self_matching_pdf is defined elsewhere in this notebook)
    results = analyze_self_matching_pdf(pdf_path)

    # Report success when the analyzer returned a truthy result, failure otherwise
    print("\n‚úÖ Analysis completed successfully!" if results else "\n‚ùå Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/s1_Liu_2023_CP/Source/s1.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 16
Extracted text length: 82904 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on the provided text, here is a concise summary of the experimental structure.

1.  **Number of Experiments:** The literature contains **three experiments**.

2.  **Self-matching Tasks:** All three experiments are Self-matching tasks. The text states: "In the current study, we conducted three experiments using a self-matching task to investigate the effect of self-relevance on the recognition of emotional facial expressions."

3.  **General Design of Self-matching Tasks:** The general design for each experiment involves a two-phase procedure:
    *   **Phase 1 (Association):** Partici

In [42]:
if __name__ == "__main__":
    # Target PDF for this run; edit this path to analyze a different paper
    pdf_path = r"D:/xiazai/Self_Database/Datasets/s2_Hu_2023_PA/Source/Self-referencing prioritizes moral character on perceptual matching.pdf"

    # Run the analyzer (analyze_self_matching_pdf is defined elsewhere in this notebook)
    results = analyze_self_matching_pdf(pdf_path)

    # Report success when the analyzer returned a truthy result, failure otherwise
    print("\n‚úÖ Analysis completed successfully!" if results else "\n‚ùå Analysis failed!")

File does not exist: D:/xiazai/Self_Database/Datasets/s2_Hu_2023_PA/Source/Self-referencing prioritizes moral character on perceptual matching.pdf

‚ùå Analysis failed!


In [43]:
if __name__ == "__main__":
    # Target PDF for this run; edit this path to analyze a different paper
    pdf_path = r"D:/xiazai/Self_Database/Datasets/s4_Sui_2015Ôºàunpublished)/17470218.2015.1101477.pdf"

    # Run the analyzer (analyze_self_matching_pdf is defined elsewhere in this notebook)
    results = analyze_self_matching_pdf(pdf_path)

    # Report success when the analyzer returned a truthy result, failure otherwise
    print("\n‚úÖ Analysis completed successfully!" if results else "\n‚ùå Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/s4_Sui_2015Ôºàunpublished)/17470218.2015.1101477.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 35
Extracted text length: 51830 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on the provided content, here is a concise review focusing on the experimental structure.

1.  **Number of Experiments:** The literature contains **three experiments**.

2.  **Self-matching Tasks:** All three experiments are self-matching tasks. The core paradigm involves participants judging whether a shape-label (Experiments 1 and 3) or shape-face (Experiment 2) pair matches the association they learned at the start of the experiment.

3.  **General Design of Each Self-matching Task:**
    *   **Experiment 1:** Participants learned associations between geomet

In [44]:
if __name__ == "__main__":
    # Target PDF for this run; edit this path to analyze a different paper
    pdf_path = r"D:/xiazai/Self_Database/Datasets/t1_Haciahmet_2023_Psy/Source/t1.pdf"

    # Run the analyzer (analyze_self_matching_pdf is defined elsewhere in this notebook)
    results = analyze_self_matching_pdf(pdf_path)

    # Report success when the analyzer returned a truthy result, failure otherwise
    print("\n‚úÖ Analysis completed successfully!" if results else "\n‚ùå Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/t1_Haciahmet_2023_Psy/Source/t1.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 16
Extracted text length: 84099 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on the provided content:

1.  The literature contains **one** experiment.
2.  The experiment is a **Self-matching task**.
3.  The general design of the Self-matching task experiment is:
    *   Participants (N=40) learned to associate arbitrary geometric shapes with labels for "the self" and "a stranger."
    *   In the task, they were presented with label-shape pairings.
    *   Their task was to decide if the pairing matched the association they learned or if it was a re-paired (non-matching) combination.
4.  The participant information (N=40), equipment information (EEG), an

In [45]:
if __name__ == "__main__":
    # Target PDF for this run; edit this path to analyze a different paper
    pdf_path = r"D:/xiazai/Self_Database/Datasets/t2_Hobbs_2023_PM/Source/t2.pdf"

    # Run the analyzer (analyze_self_matching_pdf is defined elsewhere in this notebook)
    results = analyze_self_matching_pdf(pdf_path)

    # Report success when the analyzer returned a truthy result, failure otherwise
    print("\n‚úÖ Analysis completed successfully!" if results else "\n‚ùå Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/t2_Hobbs_2023_PM/Source/t2.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 38
Extracted text length: 75385 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on the provided content, here is a concise summary of the experimental structure:

1.  **Number of Experiments:** The literature contains **three** cognitive tasks/experiments.

2.  **Self-matching Tasks:** All three experiments are self-matching tasks. They are:
    *   A simple associative learning task (self-processing measured independently).
    *   A self-esteem go/no-go task (self and emotion processing in combination).
    *   A social evaluation learning task (self processed in relation to emotion and reward).

3.  **General Design of Each Self-matching Task:**
    *   **Si

In [46]:
if __name__ == "__main__":
    # Target PDF for this run; edit this path to analyze a different paper
    pdf_path = r"D:/xiazai/Self_Database/Datasets/t3_Liang_2021_HBM/Source/t3.pdf"

    # Run the analyzer (analyze_self_matching_pdf is defined elsewhere in this notebook)
    results = analyze_self_matching_pdf(pdf_path)

    # Report success when the analyzer returned a truthy result, failure otherwise
    print("\n‚úÖ Analysis completed successfully!" if results else "\n‚ùå Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/t3_Liang_2021_HBM/Source/t3.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 13
Extracted text length: 68625 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on a quick review of the provided content:

1.  The literature contains **one experiment**.
2.  The **Self-matching task** is the core experimental paradigm used in this single experiment.
3.  The general design of the Self-matching task is: Participants make matching judgments about newly learned associations between a geometric shape and a social concept (self or others). Performance is compared before and after targeted TMS stimulation to measure the self-prioritization effect.
4.  **Participant information** is on Page 1 (N=109, group assignments). **Equipment information** (TM

In [47]:
if __name__ == "__main__":
    # Target PDF for this run; edit this path to analyze a different paper
    pdf_path = r"D:/xiazai/Self_Database/Datasets/t6_Vicovaro_2022_EPHPP/Source/t6.pdf"

    # Run the analyzer (analyze_self_matching_pdf is defined elsewhere in this notebook)
    results = analyze_self_matching_pdf(pdf_path)

    # Report success when the analyzer returned a truthy result, failure otherwise
    print("\n‚úÖ Analysis completed successfully!" if results else "\n‚ùå Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/t6_Vicovaro_2022_EPHPP/Source/t6.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 30
Extracted text length: 76377 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on the provided content:

1.  **Number of experiments:** The literature contains **two experiments** (Experiment 1 and Experiment 2).

2.  **Self-matching tasks:** Both **Experiment 1 and Experiment 2** are Self-matching tasks.

3.  **General design of Self-matching tasks:** The general design for both experiments is a matching task. Participants first learn an association between identities ("you" or "stranger") and a stimulus property (symmetric or asymmetric shapes). On each trial, a shape (symmetric/asymmetric) and a label ("you"/"stranger") are presented. Participants mus

In [48]:
if __name__ == "__main__":
    # Target PDF for this run; edit this path to analyze a different paper
    pdf_path = r"D:/xiazai/Self_Database/Datasets/t7_Perrykkad_2022_BMC/Source/t7.pdf"

    # Run the analyzer (analyze_self_matching_pdf is defined elsewhere in this notebook)
    results = analyze_self_matching_pdf(pdf_path)

    # Report success when the analyzer returned a truthy result, failure otherwise
    print("\n‚úÖ Analysis completed successfully!" if results else "\n‚ùå Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/t7_Perrykkad_2022_BMC/Source/t7.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 16
Extracted text length: 80778 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on a quick review of the provided content:

1.  The literature contains **one experiment** (a within-subjects study).

2.  The Self-matching task is the **shape-label matching task**.

3.  The general design of the Self-matching task is an **implicit measure of self-prioritisation where participants match arbitrarily-paired self-labels (e.g., their own name) to shapes, with performance (reaction time and sensitivity) compared to matching other-labels or neutral labels**.

4.  Participant information (n=288, general population), equipment, and stimulus parameters are **not detai

In [49]:
if __name__ == "__main__":
    # Target PDF for this run; edit this path to analyze a different paper
    pdf_path = r"D:/xiazai/Self_Database/Datasets/t9_Wo≈∫niak_2022_PR/Source/t9.pdf"

    # Run the analyzer (analyze_self_matching_pdf is defined elsewhere in this notebook)
    results = analyze_self_matching_pdf(pdf_path)

    # Report success when the analyzer returned a truthy result, failure otherwise
    print("\n‚úÖ Analysis completed successfully!" if results else "\n‚ùå Analysis failed!")

File does not exist: D:/xiazai/Self_Database/Datasets/t9_Wo≈∫niak_2022_PR/Source/t9.pdf

‚ùå Analysis failed!


In [50]:
if __name__ == "__main__":
    # Target PDF for this run; edit this path to analyze a different paper
    pdf_path = r"D:/xiazai/Self_Database/Datasets/t10_Zhang_2022_NI/Source/t10.pdf"

    # Run the analyzer (analyze_self_matching_pdf is defined elsewhere in this notebook)
    results = analyze_self_matching_pdf(pdf_path)

    # Report success when the analyzer returned a truthy result, failure otherwise
    print("\n‚úÖ Analysis completed successfully!" if results else "\n‚ùå Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/t10_Zhang_2022_NI/Source/t10.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 10
Extracted text length: 74972 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on a quick review of the provided text:

1.  The literature contains **one** primary experiment, which is a **resting-state fMRI** study. The behavioral self-prioritization data used for correlation was likely collected in a separate, prior session using the perceptual matching task.

2.  The **Self-matching task** is the perceptual matching task developed by Sui et al. (2012), as referenced on Page 2. It is the tool used to measure the self-prioritization effect (SPE) in behavior.

3.  The **general design** of the Self-matching task (as referenced) involves participants learning

In [51]:
if __name__ == "__main__":
    # Target PDF for this run; edit this path to analyze a different paper
    pdf_path = r"D:/xiazai/Self_Database/Datasets/t13_Feldborg_2021_ERPH/Source/t13.pdf"

    # Run the analyzer (analyze_self_matching_pdf is defined elsewhere in this notebook)
    results = analyze_self_matching_pdf(pdf_path)

    # Report success when the analyzer returned a truthy result, failure otherwise
    print("\n‚úÖ Analysis completed successfully!" if results else "\n‚ùå Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/t13_Feldborg_2021_ERPH/Source/t13.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 14
Extracted text length: 65672 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on a quick review of the provided content:

1.  The literature contains **one experiment**.
2.  The experiment is a **Self-matching task** (specifically, a self-emotional shape-label matching task).
3.  The general design is a **shape-label perceptual matching paradigm**. Participants are shown shapes that represent themselves or others (e.g., a friend, a stranger) and are paired with emotional labels. Their task is to judge whether the shape and label match as quickly and accurately as possible.
4.  The participant information, equipment information, and stimulus parameters 

In [52]:
if __name__ == "__main__":
    # Target PDF for this run; edit this path to analyze a different paper
    pdf_path = r"D:/xiazai/Self_Database/Datasets/t18_Sui_2023_CC/Source/t18.pdf"

    # Run the analyzer (analyze_self_matching_pdf is defined elsewhere in this notebook)
    results = analyze_self_matching_pdf(pdf_path)

    # Report success when the analyzer returned a truthy result, failure otherwise
    print("\n‚úÖ Analysis completed successfully!" if results else "\n‚ùå Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/t18_Sui_2023_CC/Source/t18.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 13
Extracted text length: 66558 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on the provided content (primarily the abstract and introduction), here is the information I can extract:

**1. How many experiments does this literature contain?**
The literature describes **one** experiment. The abstract states, "we explored this matter in the **current experiment**" and refers to it as "the first study of its kind."

**2. Which experiments are Self-matching tasks?**
The single experiment described is a self-matching task. It is explicitly called a **"shape-label matching task."**

**3. What is the general design of each Self-matching task experiment?**
The genera

In [53]:
if __name__ == "__main__":
    # Target PDF for this run; edit this path to analyze a different paper
    pdf_path = r"D:/xiazai/Self_Database/Datasets/t27_Mart√≠nez-P√©rez_2024_CC/Source/t27.pdf"

    # Run the analyzer (analyze_self_matching_pdf is defined elsewhere in this notebook)
    results = analyze_self_matching_pdf(pdf_path)

    # Report success when the analyzer returned a truthy result, failure otherwise
    print("\n‚úÖ Analysis completed successfully!" if results else "\n‚ùå Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/t27_Mart√≠nez-P√©rez_2024_CC/Source/t27.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 8
Extracted text length: 39977 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on the provided content:

1.  **Number of experiments:** The literature contains **2 experiments**.

2.  **Self-matching tasks:** The **shape-label matching task** is the self-matching task used. It is explicitly mentioned as the training procedure in **Experiment 2**.

3.  **General design of the Self-matching task:**
    Participants learn to associate specific shapes (circle, square, triangle) with specific personal labels: "you" (self), "friend" (close other), and "stranger". The task involves matching these shape-label pairs, and performance (speed/accuracy) is meas

In [54]:
if __name__ == "__main__":
    # Target PDF for this run; edit this path to analyze a different paper
    pdf_path = r"D:/xiazai/Self_Database/Datasets/t28_Amodeo_2024_CABN/Source/t28.pdf"

    # Run the analyzer (analyze_self_matching_pdf is defined elsewhere in this notebook)
    results = analyze_self_matching_pdf(pdf_path)

    # Report success when the analyzer returned a truthy result, failure otherwise
    print("\n‚úÖ Analysis completed successfully!" if results else "\n‚ùå Analysis failed!")

File does not exist: D:/xiazai/Self_Database/Datasets/t28_Amodeo_2024_CABN/Source/t28.pdf

‚ùå Analysis failed!


In [55]:
if __name__ == "__main__":
    # Target PDF for this run; edit this path to analyze a different paper
    pdf_path = r"D:/xiazai/Self_Database/Datasets/Wang_2016_EPHPP/Wang 2016.pdf"

    # Run the analyzer (analyze_self_matching_pdf is defined elsewhere in this notebook)
    results = analyze_self_matching_pdf(pdf_path)

    # Report success when the analyzer returned a truthy result, failure otherwise
    print("\n‚úÖ Analysis completed successfully!" if results else "\n‚ùå Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/Wang_2016_EPHPP/Wang 2016.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 10
Extracted text length: 46448 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on the provided text:

1.  **Number of Experiments:** The literature contains **2 experiments**.

2.  **Self-matching Tasks:** Both **Experiment 1** and **Experiment 2** are self-matching tasks.

3.  **General Design of Each Self-matching Task:**
    *   **Phase 1 (Association):** Participants first learn associations between geometric shapes and personal labels (e.g., triangle-self, square-friend, circle-stranger).
    *   **Phase 2 (Relearning/Switching):** The shape-label assignments are rearranged. In **Experiment 1**, the switch is from self to stranger. In **Experiment 2**, the

In [4]:
if __name__ == "__main__":
    # Target PDF for this run; edit this path to analyze a different paper
    pdf_path = r"D:/xiazai/Self_Database/Datasets/Wozniak_2020_PLOS/Wozniak_2020_PLOS/Wozniak-2020-Stranger-to-my-face-top-down-and-bo.pdf"

    # Run the analyzer (analyze_self_matching_pdf is defined elsewhere in this notebook)
    results = analyze_self_matching_pdf(pdf_path)

    # Report success when the analyzer returned a truthy result, failure otherwise
    print("\n‚úÖ Analysis completed successfully!" if results else "\n‚ùå Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/Wozniak_2020_PLOS/Wozniak_2020_PLOS/Wozniak-2020-Stranger-to-my-face-top-down-and-bo.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 21
Extracted text length: 75373 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on the provided text, here is a concise answer focusing on the experimental structure:

1.  **Number of Experiments:** The literature contains **three experiments** (Experiments 4, 5, and 6 are mentioned).

2.  **Self-matching Tasks:** Experiments **4, 5, and 6** are self-matching tasks.

3.  **General Design of Self-matching Tasks:**
    *   The core design involves participants matching a stimulus to their **own face**.
    *   The experiments compare performance across three stimulus categories: **"self"** (own face), **"f

In [4]:
if __name__ == "__main__":
    # Target PDF for this run; edit this path to analyze a different paper
    pdf_path = r"D:/xiazai/Self_Database/Datasets/Kirk_2025_BJP/British J of Psychology - 2024 - Kirk - Listen to yourself  Prioritization of self‚Äêassociated and own voice cues.pdf"

    # Run the analyzer (analyze_self_matching_pdf is defined elsewhere in this notebook)
    results = analyze_self_matching_pdf(pdf_path)

    # Report success when the analyzer returned a truthy result, failure otherwise
    print("\n‚úÖ Analysis completed successfully!" if results else "\n‚ùå Analysis failed!")

=== Complete Self-matching Task Literature Analyzer ===
Analyzing file: D:/xiazai/Self_Database/Datasets/Kirk_2025_BJP/British J of Psychology - 2024 - Kirk - Listen to yourself  Prioritization of self‚Äêassociated and own voice cues.pdf
----------------------------------------------------------------------
Extracting PDF full text (enhanced version)...
Total PDF pages: 18
Extracted text length: 73433 characters

Starting multi-stage AI analysis...
Stage 1: Quick overview analysis...
Overview results:
Based on the provided content:

1.  **Number of experiments:** The literature contains **two experiments** (Exp 1 and Exp 2).

2.  **Self-matching tasks:** Both **Experiment 1** and **Experiment 2** are self-matching tasks.

3.  **General design of each Self-matching task:**
    *   **General Design:** Both experiments use a voice-label matching task, an adaptation of Sui et al.'s (2012) shape-label matching paradigm.
    *   **Task:** Participants learn arbitrary associations between voi

In [2]:
if __name__ == "__main__":
    # Target PDF for this run; edit this path to analyze a different paper
    pdf_path = r"D:/xiazai/Self_Database/Datasets/Kirk_2025_BJP/British J of Psychology - 2024 - Kirk - Listen to yourself  Prioritization of self‚Äêassociated and own voice cues.pdf"

    # Run the analyzer (analyze_self_matching_pdf is defined elsewhere in this notebook)
    results = analyze_self_matching_pdf(pdf_path)

    # Report success when the analyzer returned a truthy result, failure otherwise
    print("\n‚úÖ Analysis completed successfully!" if results else "\n‚ùå Analysis failed!")

File does not exist: D:/xiazai/Self_Database/Datasets/Kirk_2025_BJP/British J of Psychology - 2024 - Kirk - Listen to yourself  Prioritization of self‚Äêassociated and own voice cues.pdf

‚ùå Analysis failed!


In [3]:
import json
import os
from pathlib import Path
import glob
from typing import Optional, Dict, Any, List
import openpyxl

def excel_to_json_simple(file_path: str, options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    Convert an Excel workbook to JSON and save it next to the source file.

    The workbook is read with openpyxl (no pandas dependency) and written as
    ``<stem>.json`` in the same directory as the input file.

    Args:
        file_path (str): Full path of the Excel file (.xlsx or .xlsm).
        options (dict, optional): Conversion settings merged over the defaults.
            - sheet_name: Name of the worksheet to convert; None (default)
              converts every worksheet.
            - include_headers: Use the first row as column names (default True);
              otherwise columns are named Column_0, Column_1, ...
            - skip_empty_rows: Skip rows whose cells are all None or ''
              (default True).

    Returns:
        dict: On success, ``success=True`` plus ``original_file``,
        ``json_file``, ``data`` and ``total_rows``. On any exception raised
        during validation, reading or writing, ``success=False`` plus
        ``error`` and ``original_file``.
    """
    settings = {
        'sheet_name': None,
        'include_headers': True,
        'skip_empty_rows': True
    }
    if options:
        settings.update(options)

    def _plain(cell):
        # None and JSON-native scalars pass through; any other cell type is stringified
        if cell is None or isinstance(cell, (int, float, str, bool)):
            return cell
        return str(cell)

    try:
        # Validate the input before opening the workbook
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Êñá‰ª∂‰∏çÂ≠òÂú®: {file_path}")
        if not file_path.lower().endswith(('.xlsx', '.xlsm')):
            raise ValueError(f"‰ªÖÊîØÊåÅ.xlsxÂíå.xlsmÊ†ºÂºè: {file_path}")

        print(f"üìñ Ê≠£Âú®ËØªÂèñÊñá‰ª∂: {file_path}")

        workbook = openpyxl.load_workbook(file_path, data_only=True)
        available = workbook.sheetnames
        requested = settings['sheet_name']
        # A requested sheet produces a flat list of rows; otherwise rows are grouped by sheet name
        flat_output = bool(requested)
        selected = [requested] if requested else available
        drop_blank = settings['skip_empty_rows']

        json_data = {}
        for sheet_name in selected:
            if sheet_name not in available:
                print(f"‚ö†Ô∏è  Â∑•‰ΩúË°® '{sheet_name}' ‰∏çÂ≠òÂú®ÔºåË∑≥Ëøá")
                continue

            worksheet = workbook[sheet_name]
            print(f"üìä Â§ÑÁêÜÂ∑•‰ΩúË°®: {sheet_name}")

            rows = list(worksheet.iter_rows(values_only=True))
            if not rows:
                print(f"‚ö†Ô∏è  Â∑•‰ΩúË°® '{sheet_name}' ‰∏∫Á©∫")
                continue

            # Derive column names: first row when headers are enabled, positional names otherwise
            first_row = rows[0]
            if settings['include_headers']:
                headers = [
                    f"Column_{idx}" if title is None else str(title)
                    for idx, title in enumerate(first_row)
                ]
                body = rows[1:]
            else:
                headers = [f"Column_{idx}" for idx in range(len(first_row))]
                body = rows

            # zip() stops at the shorter sequence, so cells beyond the header count are ignored
            records = [
                {header: _plain(cell) for header, cell in zip(headers, row)}
                for row in body
                if not (drop_blank and all(cell is None or cell == '' for cell in row))
            ]

            if flat_output:
                json_data = records
            else:
                json_data[sheet_name] = records

        workbook.close()

        # The JSON file sits beside the workbook and shares its stem
        source = Path(file_path)
        json_file_path = source.with_name(f"{source.stem}.json")

        with open(json_file_path, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, ensure_ascii=False, indent=2, default=str)

        print(f"‚úÖ ËΩ¨Êç¢ÊàêÂäüÔºÅ")
        print(f"üìÅ ÂéüÊñá‰ª∂: {file_path}")
        print(f"üìÑ JSONÊñá‰ª∂: {json_file_path}")

        # Flat output is a list of rows; grouped output maps sheet name -> list of rows
        if isinstance(json_data, list):
            total_rows = len(json_data)
        else:
            total_rows = sum(len(sheet_rows) for sheet_rows in json_data.values())

        return {
            'success': True,
            'original_file': str(file_path),
            'json_file': str(json_file_path),
            'data': json_data,
            'total_rows': total_rows
        }

    except Exception as e:
        message = str(e)
        print(f"‚ùå ËΩ¨Êç¢Â§±Ë¥•: {message}")
        return {
            'success': False,
            'error': message,
            'original_file': str(file_path)
        }

def batch_excel_to_json_simple(folder_path: str, options: Optional[Dict[str, Any]] = None, suffix_filter: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Convert every Excel workbook in a folder to JSON (openpyxl-based, no pandas).

    Args:
        folder_path (str): Folder searched (non-recursively) for *.xlsx and
            *.xlsm files.
        options (dict, optional): Passed through unchanged to
            excel_to_json_simple for each file.
        suffix_filter (str, optional): When given, only files whose stem
            (name without extension) ends with this string are converted,
            e.g. "_analysis".

    Returns:
        list[dict]: One result dict per processed file, as returned by
        excel_to_json_simple. An empty list when no file matches. If the
        batch itself raises, a single failure dict with ``success=False``,
        ``error`` and ``original_file`` (the folder path), so callers can read
        the same keys as for per-file failures.
    """
    try:
        # Escape the folder so glob metacharacters in its name ('[', ']', '*', '?')
        # are matched literally instead of being treated as a pattern
        search_root = glob.escape(folder_path)

        excel_files = []
        for pattern in ('*.xlsx', '*.xlsm'):
            excel_files.extend(glob.glob(os.path.join(search_root, pattern)))

        # Keep only files whose stem ends with the requested suffix
        if suffix_filter:
            excel_files = [
                candidate for candidate in excel_files
                if Path(candidate).stem.endswith(suffix_filter)
            ]
            print(f"üîç ‰ΩøÁî®ÂêéÁºÄËøáÊª§Âô® '{suffix_filter}'ÔºåÊâæÂà∞ÂåπÈÖçÊñá‰ª∂: {len(excel_files)} ‰∏™")

        if not excel_files:
            print(f"‚ùå Âú® {folder_path} ‰∏≠Ê≤°ÊúâÊâæÂà∞ExcelÊñá‰ª∂")
            return []

        print(f"üìÅ Âú® {folder_path} ‰∏≠ÊâæÂà∞ {len(excel_files)} ‰∏™ExcelÊñá‰ª∂")

        results = []
        for i, file_path in enumerate(excel_files, 1):
            print(f"\n[{i}/{len(excel_files)}] Â§ÑÁêÜÊñá‰ª∂: {os.path.basename(file_path)}")
            results.append(excel_to_json_simple(file_path, options))

        # Summarize how many conversions succeeded
        successful = sum(1 for r in results if r['success'])
        print(f"\nüìà ÊâπÈáèËΩ¨Êç¢ÂÆåÊàê: {successful}/{len(excel_files)} ‰∏™Êñá‰ª∂ÊàêÂäüËΩ¨Êç¢")

        return results

    except Exception as e:
        print(f"‚ùå ÊâπÈáèËΩ¨Êç¢Â§±Ë¥•: {str(e)}")
        # Include 'original_file' so the failure dict has the same keys as per-file failures
        return [{'success': False, 'error': str(e), 'original_file': str(folder_path)}]


In [4]:
# Folder whose '*_analysis' workbooks are converted to JSON
folder = 'D:/xiazai/Self_Database/Datasets/Kirk_2025_BJP/'
results = batch_excel_to_json_simple(folder, suffix_filter='_analysis')

for result in results:
    # A batch-level failure dict may carry no 'original_file'; fall back to the
    # folder so reporting the failure does not itself raise KeyError
    source = result.get('original_file', folder)
    if result['success']:
        print(f"‚úÖ {source} -> {result['json_file']}")
    else:
        print(f"‚ùå {source}: {result['error']}")

üîç ‰ΩøÁî®ÂêéÁºÄËøáÊª§Âô® '_analysis'ÔºåÊâæÂà∞ÂåπÈÖçÊñá‰ª∂: 3 ‰∏™
üìÅ Âú® D:/xiazai/Self_Database/Datasets/Kirk_2025_BJP/ ‰∏≠ÊâæÂà∞ 3 ‰∏™ExcelÊñá‰ª∂

[1/3] Â§ÑÁêÜÊñá‰ª∂: British J of Psychology - 2024 - Kirk - Listen to yourself  Prioritization of self‚Äêassociated and own voice cues_exp1_analysis.xlsx
üìñ Ê≠£Âú®ËØªÂèñÊñá‰ª∂: D:/xiazai/Self_Database/Datasets/Kirk_2025_BJP\British J of Psychology - 2024 - Kirk - Listen to yourself  Prioritization of self‚Äêassociated and own voice cues_exp1_analysis.xlsx
üìä Â§ÑÁêÜÂ∑•‰ΩúË°®: Experiment Parameters
‚úÖ ËΩ¨Êç¢ÊàêÂäüÔºÅ
üìÅ ÂéüÊñá‰ª∂: D:/xiazai/Self_Database/Datasets/Kirk_2025_BJP\British J of Psychology - 2024 - Kirk - Listen to yourself  Prioritization of self‚Äêassociated and own voice cues_exp1_analysis.xlsx
üìÑ JSONÊñá‰ª∂: D:\xiazai\Self_Database\Datasets\Kirk_2025_BJP\British J of Psychology - 2024 - Kirk - Listen to yourself  Prioritization of self‚Äêassociated and own voice cues_exp1_analysis.json

[2/3] Â§ÑÁêÜÊñá‰ª∂: British 

In [32]:
import json
import os
from pathlib import Path
import glob
from typing import Optional, Dict, Any, List
import openpyxl

def merge_excel_to_json_by_exp_id(folder_path: str, 
                                  output_filename: str = "merged_experiments.json",
                                  paper_id: str = "PAPER_001",
                                  paper_info: Optional[Dict[str, str]] = None,
                                  options: Optional[Dict[str, Any]] = None,
                                  suffix_filter: Optional[str] = None) -> Dict[str, Any]:
    """
    Merge the Excel workbooks of a folder into one JSON file grouped by Exp_ID.

    Every data row of every matching workbook becomes a dict keyed by the
    header row; rows are grouped under the string form of their Exp_ID value.
    The JSON written to ``folder_path/output_filename`` has the shape
    ``{"Paper_ID": {paper_id: {<paper info>, "Experiments": {exp_id: [rows]}}}}``.

    Args:
        folder_path: Folder searched (non-recursively) for ``*.xlsx``,
            ``*.xlsm`` and ``*.xls`` files. Excel lock files (``~$...``) are
            ignored.
        output_filename: Name of the JSON file created inside ``folder_path``.
        paper_id: Key under ``"Paper_ID"`` that holds this paper's data.
        paper_info: Optional metadata. Recognised keys: ``Paper_name``,
            ``Summary``, ``Year``, ``Author``, ``Journal``, ``Country``,
            ``City``, ``Extra_Var``, ``Email``. Missing keys default to ``''``.
            Without ``Paper_name`` the name is taken from the first matching
            file name (the part before ``'_exp'``).
        options: Optional overrides for:
            - ``exp_id_column``: header of the Exp_ID column (default ``'Exp_ID'``)
            - ``include_filename``: add ``'_source_file'`` to each row (default False)
            - ``sheet_name``: worksheet to read; falsy means the active sheet
            - ``skip_empty_rows``: drop rows whose cells are all None/'' (default True)
        suffix_filter: If given, only files whose stem ends with this suffix
            (e.g. ``"_analysis"``) are processed.

    Returns:
        On success: ``{'success': True, 'output_file', 'paper_id',
        'total_experiments', 'experiments_list', 'paper_info', 'errors'}``
        where ``errors`` lists per-file problems as ``{'file', 'error'}``.
        On failure: ``{'success': False, 'error': <message>}``.
        Exceptions are caught and reported in the result rather than raised.
    """

    default_options = {
        'exp_id_column': 'Exp_ID',
        'include_filename': False,
        'sheet_name': None,  # None means: use the active worksheet
        'skip_empty_rows': True
    }

    if options:
        default_options.update(options)

    def extract_paper_name_from_filename(filename):
        """Return the file stem up to (not including) the first '_exp'."""
        name = Path(filename).stem
        if '_exp' in name:
            return name.split('_exp')[0]
        return name

    def clean_text(text):
        """Normalise quotes in string metadata; non-strings pass through."""
        if isinstance(text, str):
            # NOTE(review): as written, each replace maps a quote character to
            # itself, so this is effectively a no-op. json.dump escapes quotes
            # on its own, so no cleaning is required for valid JSON.
            return text.replace("'", "'").replace('"', '"')
        return text

    try:
        # Collect candidate files. glob order is unspecified, so sort each
        # pattern's matches to make the run (and the auto-derived paper name)
        # reproducible.
        excel_files = []
        for pattern in ('*.xlsx', '*.xlsm', '*.xls'):
            excel_files.extend(sorted(glob.glob(os.path.join(folder_path, pattern))))

        # '~$name.xlsx' files are lock files Excel creates next to an open
        # workbook; they match '*.xlsx' but are not workbooks.
        excel_files = [p for p in excel_files
                       if not os.path.basename(p).startswith('~$')]

        # Keep only files whose stem ends with the requested suffix
        if suffix_filter:
            excel_files = [p for p in excel_files
                           if Path(p).stem.endswith(suffix_filter)]
            print(f"üîç ‰ΩøÁî®ÂêéÁºÄËøáÊª§Âô® '{suffix_filter}'ÔºåÊâæÂà∞ÂåπÈÖçÊñá‰ª∂: {len(excel_files)} ‰∏™")

        if not excel_files:
            print(f"‚ùå Âú® {folder_path} ‰∏≠Ê≤°ÊúâÊâæÂà∞ExcelÊñá‰ª∂")
            return {'success': False, 'error': 'Ê≤°ÊúâÊâæÂà∞ExcelÊñá‰ª∂'}

        # Paper name: explicit value wins, otherwise derive it from the first file
        info = paper_info or {}
        if 'Paper_name' in info:
            paper_name = info['Paper_name']
        else:
            paper_name = extract_paper_name_from_filename(excel_files[0])

        final_paper_info = {'Paper_name': clean_text(paper_name)}
        for key in ('Summary', 'Year', 'Author', 'Journal',
                    'Country', 'City', 'Extra_Var', 'Email'):
            final_paper_info[key] = clean_text(info.get(key, ''))

        print(f"üìÅ ÊâæÂà∞ {len(excel_files)} ‰∏™ExcelÊñá‰ª∂ÔºåÂºÄÂßãÂêàÂπ∂...")
        print(f"üìã Paper ID: {paper_id}")
        print(f"üìã ËÆ∫ÊñáÂêçÁß∞: {final_paper_info['Paper_name']}")

        # exp_id -> list of row dicts, in file/row order
        clean_data = {}
        errors = []
        exp_id_col = default_options['exp_id_column']

        for i, file_path in enumerate(excel_files, 1):
            file_name = os.path.basename(file_path)
            print(f"\n[{i}/{len(excel_files)}] Â§ÑÁêÜÊñá‰ª∂: {file_name}")

            workbook = None
            try:
                # data_only=True yields cached formula results instead of formulas
                workbook = openpyxl.load_workbook(file_path, data_only=True)

                # Pick the requested worksheet, falling back to the active one
                if default_options['sheet_name']:
                    if default_options['sheet_name'] in workbook.sheetnames:
                        worksheet = workbook[default_options['sheet_name']]
                    else:
                        print(f"‚ö†Ô∏è  Â∑•‰ΩúË°® '{default_options['sheet_name']}' ‰∏çÂ≠òÂú®Ôºå‰ΩøÁî®Á¨¨‰∏Ä‰∏™Â∑•‰ΩúË°®")
                        worksheet = workbook.active
                else:
                    worksheet = workbook.active

                rows = list(worksheet.iter_rows(values_only=True))

                if not rows:
                    print(f"‚ö†Ô∏è  Êñá‰ª∂ '{file_name}' ‰∏∫Á©∫ÔºåË∑≥Ëøá")
                    continue

                # First row is the header; unnamed columns get a positional name
                headers = [str(cell) if cell is not None else f"Column_{col}"
                           for col, cell in enumerate(rows[0])]

                if exp_id_col not in headers:
                    error_msg = f"Êñá‰ª∂ '{file_name}' ‰∏≠Ê≤°ÊúâÊâæÂà∞ '{exp_id_col}' Âàó"
                    print(f"‚ùå {error_msg}")
                    errors.append({'file': file_name, 'error': error_msg})
                    continue

                exp_id_index = headers.index(exp_id_col)
                exp_ids_in_file = set()

                # Data starts on spreadsheet row 2 (row 1 is the header)
                for row_num, row in enumerate(rows[1:], 2):
                    if default_options['skip_empty_rows'] and all(cell is None or cell == '' for cell in row):
                        continue

                    row_dict = {}
                    exp_id = None

                    # zip stops at the shorter side, so cells beyond the
                    # header width are ignored
                    for col_idx, (header, cell) in enumerate(zip(headers, row)):
                        # JSON-native scalars are kept; anything else
                        # (dates, times, ...) is stored as its str() form
                        if cell is None or isinstance(cell, (int, float, str, bool)):
                            value = cell
                        else:
                            value = str(cell)

                        row_dict[header] = value

                        if col_idx == exp_id_index:
                            exp_id = str(value) if value is not None else None

                    if default_options['include_filename']:
                        row_dict['_source_file'] = file_name

                    # Rows without a usable Exp_ID cannot be grouped
                    if exp_id is None or exp_id == '' or exp_id == 'None':
                        print(f"‚ö†Ô∏è  Á¨¨{row_num}Ë°åÁöÑExp_ID‰∏∫Á©∫ÔºåË∑≥Ëøá")
                        continue

                    exp_ids_in_file.add(exp_id)
                    clean_data.setdefault(exp_id, []).append(row_dict)

                print(f"‚úÖ ÊàêÂäüÂ§ÑÁêÜ {file_name}ÔºåÊâæÂà∞Exp_ID: {sorted(exp_ids_in_file)}")

            except Exception as e:
                # One unreadable file must not abort the whole merge
                error_msg = f"Â§ÑÁêÜÊñá‰ª∂ '{file_name}' Êó∂Âá∫Èîô: {str(e)}"
                print(f"‚ùå {error_msg}")
                errors.append({'file': file_name, 'error': error_msg})
            finally:
                # Close on every path, including the early `continue`s above
                if workbook is not None:
                    workbook.close()

        # Final layout: paper metadata and experiments live under the paper ID
        paper_data = {
            **final_paper_info,
            'Experiments': clean_data
        }

        final_json = {
            'Paper_ID': {
                paper_id: paper_data
            }
        }

        output_path = os.path.join(folder_path, output_filename)

        # default=str is a safety net for any value json cannot serialise
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(final_json, f, ensure_ascii=False, indent=2, default=str)

        print(f"\nüéâ ÂêàÂπ∂ÂÆåÊàêÔºÅ")
        print(f"üìä ÊÄªÂÆûÈ™åÊï∞Èáè: {len(clean_data)}")
        print(f"üìÑ ËæìÂá∫Êñá‰ª∂: {output_path}")

        exp_ids = list(clean_data.keys())
        print(f"üî¨ ÂÆûÈ™åID: {exp_ids}")

        if errors:
            print(f"\n‚ö†Ô∏è  Â§ÑÁêÜËøáÁ®ã‰∏≠ÈÅáÂà∞ {len(errors)} ‰∏™ÈîôËØØ")

        return {
            'success': True,
            'output_file': output_path,
            'paper_id': paper_id,
            'total_experiments': len(clean_data),
            'experiments_list': exp_ids,
            'paper_info': final_paper_info,
            # Expose per-file problems so callers need not parse the console output
            'errors': errors
        }

    except Exception as e:
        error_msg = f"ÂêàÂπ∂ËøáÁ®ãÂ§±Ë¥•: {str(e)}"
        print(f"‚ùå {error_msg}")
        return {
            'success': False,
            'error': error_msg
        }

# The updated function defined above is now ready to use

In [None]:
# Usage example
if __name__ == "__main__":
    # Minimal call: only the folder and the output file name are given,
    # everything else uses the function's defaults.
    result = merge_excel_to_json_by_exp_id(
        output_filename='experiments_merged.json',
        folder_path='./data/',
    )

    # Full call: custom paper ID, explicit paper metadata, and a suffix
    # filter so that only '*_analysis' workbooks are merged.
    result = merge_excel_to_json_by_exp_id(
        folder_path='./data/',
        suffix_filter='_analysis',
        paper_id='KIRK_2024_BJP',
        paper_info=dict(
            Paper_name='Listen to yourself: Prioritization of self‚Äêassociated and own voice cues',
            Summary='This study investigates self-prioritization effects...',
            Year='2024',
            Author='Kirk et al.',
            Journal='British Journal of Psychology',
        ),
        output_filename='experiments_merged.json',
    )

    # Report the outcome of the second (full) call
    if not result['success']:
        print(f"‚ùå ÂêàÂπ∂Â§±Ë¥•: {result['error']}")
    else:
        print(f"‚úÖ ÊàêÂäüÂêàÂπ∂ {result['total_experiments']} ‰∏™ÂÆûÈ™å")
        print(f"ÂÆûÈ™åIDÂàóË°®: {result['experiments_list']}")

‚ùå Âú® ./data/ ‰∏≠Ê≤°ÊúâÊâæÂà∞ExcelÊñá‰ª∂


TypeError: merge_excel_to_json_by_exp_id() got an unexpected keyword argument 'paper_id'

In [33]:
# Merge the Kirk et al. '*_analysis' workbooks into a single JSON file stored
# next to them, under a custom paper ID and with explicit paper metadata.
result = merge_excel_to_json_by_exp_id(
    folder_path='D:/xiazai/Self_Database/Datasets/Kirk_2025_BJP/',
    output_filename='kirk_experiments.json',
    paper_id='KIRK_2024_BJP',  # custom Paper ID
    paper_info={
        'Paper_name': 'Listen to yourself: Prioritization of self‚Äêassociated and own voice cues',
        # Abstract text; words that were split by PDF line breaks
        # ("re flecting", "associ ated", "pro duction") are rejoined so the
        # stored metadata is clean.
        'Summary': """Self-cues such as one's own name or face attract attention, reflecting a bias for stimuli connected to self to be prioritized in cognition. Recent evidence suggests that even external voices can elicit this self-prioritization effect; in a voice-label matching task, external voices assigned to the Self-identity label ‚Äòyou‚Äô elicited faster responses than those assigned to ‚Äòfriend‚Äô or ‚Äòstranger‚Äô (Payne et al., Br. J. Psychology, 112, 585-610). However, it is not clear whether external voices assigned to Self are prioritized over participants' own voices. We explore this issue in two experiments. In Exp 1 (N=35), a voice-label matching task comprising three external voices confirmed that reaction time and accuracy are improved when an external voice cue is assigned to Self rather than Friend or Stranger. In Exp 2 (N=90), one of the voice cues was replaced with a recording of the participant's own voice. Reaction time and accuracy showed a consistent advantage for the participant's own-voice, even when it was assigned to the ‚Äòfriend‚Äô or ‚Äòstranger‚Äô identity. These findings show that external voices can elicit self-prioritization effects if associated with Self, but they are not prioritized above individuals' own voices. This has implications for external voice production technology, suggesting own-voice imitation may be beneficial.""",
        'Year': '2024',
        'Author': 'Kirk, T. J., et al.',
        'Journal': 'British Journal of Psychology'
    },
    suffix_filter='_analysis'
)

if result['success']:
    print(f"‚úÖ ÊàêÂäüÂàõÂª∫Paper ID: {result['paper_id']}")
    print(f"ÂåÖÂê´ {result['total_experiments']} ‰∏™ÂÆûÈ™å: {result['experiments_list']}")
else:
    # Report failures explicitly, as the usage-example cell does
    print(f"‚ùå ÂêàÂπ∂Â§±Ë¥•: {result['error']}")

üîç ‰ΩøÁî®ÂêéÁºÄËøáÊª§Âô® '_analysis'ÔºåÊâæÂà∞ÂåπÈÖçÊñá‰ª∂: 2 ‰∏™
üìÅ ÊâæÂà∞ 2 ‰∏™ExcelÊñá‰ª∂ÔºåÂºÄÂßãÂêàÂπ∂...
üìã Paper ID: KIRK_2024_BJP
üìã ËÆ∫ÊñáÂêçÁß∞: Listen to yourself: Prioritization of self‚Äêassociated and own voice cues

[1/2] Â§ÑÁêÜÊñá‰ª∂: British J of Psychology - 2024 - Kirk - Listen to yourself  Prioritization of self‚Äêassociated and own voice cues_exp1_analysis.xlsx
‚úÖ ÊàêÂäüÂ§ÑÁêÜ British J of Psychology - 2024 - Kirk - Listen to yourself  Prioritization of self‚Äêassociated and own voice cues_exp1_analysis.xlsxÔºåÊâæÂà∞Exp_ID: ['TY1']

[2/2] Â§ÑÁêÜÊñá‰ª∂: British J of Psychology - 2024 - Kirk - Listen to yourself  Prioritization of self‚Äêassociated and own voice cues_exp2_analysis.xlsx
‚úÖ ÊàêÂäüÂ§ÑÁêÜ British J of Psychology - 2024 - Kirk - Listen to yourself  Prioritization of self‚Äêassociated and own voice cues_exp2_analysis.xlsxÔºåÊâæÂà∞Exp_ID: ['TY2']

üéâ ÂêàÂπ∂ÂÆåÊàêÔºÅ
üìä ÊÄªÂÆûÈ™åÊï∞Èáè: 2
üìÑ ËæìÂá∫Êñá‰ª∂: D:/xiazai/Self_Database/Datasets/Kirk_