In [3]:
import csv
import re
from datetime import date
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import pandas as pd
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field, field_validator
from tqdm.auto import tqdm

In [None]:
# Define a structured output model for the detailed MIC information
class MICDetailedInfo(BaseModel):
    """Schema for detailed information about a Militarized Interstate Confrontation.

    ``MICdate`` is ``None`` when the date is unknown. The extraction prompt asks the
    model to answer ``0000-00-00`` in that case, which is not a valid calendar date,
    so that placeholder (and an empty string) is converted to ``None`` before
    validation instead of rejecting the whole record.

    ``initiator_country`` and ``target_country`` default to ``None`` so that a
    response which omits them still validates.
    """
    MICdate: Optional[date] = Field(default=None, description="The date when the MIC occurred (YYYY-MM-DD format, or as specific as possible)")
    fatality_min: int = Field(description="The minimum number of fatalities (use same number as max if precise)")
    fatality_max: int = Field(description="The maximum number of fatalities (use same number as min if precise)")
    countries_involved: List[str] = Field(description="List of countries involved in the confrontation")
    initiator_country: Optional[str] = Field(default=None, description="The country that initiated the confrontation, if identifiable")
    target_country: Optional[str] = Field(default=None, description="The country that was targeted in the confrontation, if identifiable")

    @field_validator("MICdate", mode="before")
    @classmethod
    def coerce_unknown_date(cls, value):
        """Map the 'unknown date' placeholders to None; pass everything else through."""
        if isinstance(value, str) and value.strip() in ("", "0000-00-00"):
            return None
        return value

def init_llm() -> ChatOpenAI:
    """Create the ChatOpenAI client configured for the server at http://localhost:1234/v1."""
    # Keep every connection/model setting in one mapping so it is easy to scan.
    client_settings = {
        "base_url": "http://localhost:1234/v1",
        "api_key": "LMStudio",
        "model_name": "qwen2.5-7b-instruct-1m",
        "temperature": 0.1,
    }
    return ChatOpenAI(**client_settings)

def create_extraction_chain():
    """Assemble the prompt -> LLM -> output-parser pipeline that extracts MIC details.

    Returns:
        A runnable that is invoked with ``{"article": <text>}``; its final stage is a
        ``PydanticOutputParser`` for ``MICDetailedInfo``.
    """
    chat_model = init_llm()
    output_parser = PydanticOutputParser(pydantic_object=MICDetailedInfo)

    # Few-shot prompt. Doubled braces are literal JSON braces in the examples;
    # {article} and {format_instructions} are the template variables.
    template_text = """You are an expert analyst of international relations and military conflicts.
        
        This article has been identified as describing a Militarized Interstate Confrontation (MIC).
        
        Extract the following specific details about this confrontation:
        1. The date when the confrontation occurred (be as precise as possible, use YYYY-MM-DD format if date is known else return 0000-00-00) 
        2. The number of fatalities (provide a range with minimum and maximum values; use the same number for both if precise)
        3. All countries involved in the confrontation
        4. If possible, identify which country initiated the confrontation and which was the target
        
        If any information is not explicitly stated in the article, make your best estimate based on context clues.
        If you cannot determine a piece of information at all, use null for that field.

        Example 1:
        Article: "On March 15, 2022, tensions escalated between Nation A and Nation B, leading to armed skirmishes. Reports confirm at least 50 casualties."
        
        Extracted Details:
        ```json
        {{
            "MICdate": "2022-03-15",
            "fatality_min": 50,
            "fatality_max": 50,
            "countries_involved": ["Nation A", "Nation B"],
            "initiator_country": "Nation A",
            "target_country": "Nation B"
        }}
        ```
        
        Example 2:
        Article: "In early 1998, a naval conflict arose between Country X and Country Y. The exact number of casualties remains unknown."
        
        Extracted Details:
        ```
        {{
            "MICdate": "1998-01-01",
            "fatality_min": 0,
            "fatality_max": 0,
            "countries_involved": ["Country X", "Country Y"],
            "initiator_country": null,
            "target_country": null
        }}
        ```
        
        Article:
        {article}
        
        {format_instructions}
        """

    extraction_prompt = PromptTemplate(
        template=template_text,
        input_variables=['article'],
        # The parser's schema description is bound once here, so callers only supply the article.
        partial_variables={'format_instructions': output_parser.get_format_instructions()},
    )

    # LCEL composition: render the prompt, call the model, parse the reply.
    return extraction_prompt | chat_model | output_parser

def read_article_file(file_path: Path) -> Optional[str]:
    """Read an article as text, trying UTF-8 first and Latin-1 as a fallback.

    Args:
        file_path: Path of the article file to read.

    Returns:
        The file content, or ``None`` if the file could not be read (missing file,
        permission problem, or other I/O error). A message is printed in that case.
    """
    for encoding in ("utf-8", "latin-1"):
        try:
            return file_path.read_text(encoding=encoding)
        except UnicodeDecodeError:
            # Not valid in this encoding; try the next candidate.
            continue
        except OSError as e:
            # Missing/unreadable file: report it and let the caller skip this article
            # instead of aborting the whole run.
            print(f"Error reading {file_path.name}: {e}")
            return None

    print(f"Error reading {file_path.name}: could not decode file")
    return None

def extract_mic_details(chain, article_text: str) -> MICDetailedInfo:
    """Run the extraction chain on one article, falling back to an empty record on failure.

    Args:
        chain: Runnable invoked with ``{"article": article_text}``.
        article_text: Full text of the article.

    Returns:
        Whatever the chain returns on success. If the chain raises (LLM error or
        unparseable output), a placeholder ``MICDetailedInfo`` with no date, zero
        fatalities and no countries.
    """
    try:
        return chain.invoke({"article": article_text})
    except Exception as e:
        # Report the failure (truncated, parser errors can embed the whole LLM reply)
        # rather than hiding it behind the placeholder row.
        print(f"Error extracting MIC details ({type(e).__name__}): {str(e)[:200]}")
        # model_construct skips validation: the placeholder has no real date, and
        # building it through the validating constructor with "0000-00-00" would
        # raise a ValidationError from inside this handler.
        return MICDetailedInfo.model_construct(
            MICdate=None,
            fatality_min=0,
            fatality_max=0,
            countries_involved=[],
            initiator_country=None,
            target_country=None,
        )
    
def process_mic_articles(base_dir: Path, classified_dir: Path, output_dir: Path, years_to_process: list[str]) -> None:
    """Extract details for every MIC-labelled article and append them to per-year CSV files.

    For each year this reads ``<classified_dir>/<year>_classification.csv``, keeps the
    rows whose ``Label`` equals 1, reads the matching article from
    ``<base_dir>/<year>/``, runs the extraction chain on it and appends one row to
    ``<output_dir>/<year>_mic_details.csv``. Rows whose ``Index`` is already present
    in the output file are skipped, so an interrupted run can be resumed.

    Args:
        base_dir: Directory containing one sub-directory of article files per year.
        classified_dir: Directory containing the ``<year>_classification.csv`` files.
        output_dir: Directory for the ``<year>_mic_details.csv`` files (created if missing).
        years_to_process: Years to process, e.g. ``["2008"]``.
    """
    output_dir.mkdir(parents=True, exist_ok=True)  # Ensure output directory exists
    extraction_chain = create_extraction_chain()   # Initialize extraction chain

    for year in years_to_process:
        csv_file = classified_dir / f"{year}_classification.csv"
        output_file = output_dir / f"{year}_mic_details.csv"

        if not csv_file.exists():
            print(f"Classification file for {year} not found. Skipping.")
            continue

        # Read classified data and filter MIC articles
        df = pd.read_csv(csv_file)
        mic_articles = df[df['Label'] == 1]

        if mic_articles.empty:
            print(f"No MIC articles found for {year}. Skipping.")
            continue

        print(f"Found {len(mic_articles)} MIC articles for {year}.")

        # Indices already written by a previous run; these are not re-extracted.
        existing_entries = _load_processed_indices(output_file)

        # A missing file and a zero-byte leftover both need the header row.
        needs_header = not output_file.exists() or output_file.stat().st_size == 0

        written_count = 0
        already_done = 0
        skipped_count = 0
        with open(output_file, mode="a", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)

            if needs_header:
                writer.writerow([
                    'Index', 'MICdate', 'Fatality_Min', 'Fatality_Max',
                    'Countries_Involved', 'Initiator_Country', 'Target_Country'
                ])

            for _, row in tqdm(mic_articles.iterrows(), total=len(mic_articles), desc=f"Extracting details for {year}"):
                # Compare as text: the values loaded from the output CSV are strings.
                file_index = str(row['Index'])

                if file_index in existing_entries:
                    already_done += 1
                    continue

                # The index is expected to look like "<prefix>_<article filename>".
                _prefix, separator, article_filename = file_index.partition('_')
                if not separator or not article_filename:
                    print(f"Unexpected index format '{file_index}'. Skipping.")
                    skipped_count += 1
                    continue

                content = read_article_file(base_dir / str(year) / article_filename)
                if content is None:
                    skipped_count += 1
                    continue

                details = extract_mic_details(extraction_chain, content)

                writer.writerow([
                    file_index,
                    _format_mic_date(details.MICdate),
                    details.fatality_min,
                    details.fatality_max,
                    ', '.join(details.countries_involved),
                    details.initiator_country,
                    details.target_country
                ])
                # Push the row to disk now so a killed kernel does not lose finished work.
                f.flush()
                # Guard against duplicate Index values inside the same classification file.
                existing_entries.add(file_index)
                written_count += 1

        print(
            f"Saved detailed information for {written_count} MIC articles from {year} "
            f"({already_done} already processed, {skipped_count} skipped)."
        )


def _load_processed_indices(output_file: Path) -> set:
    """Return the first-column (Index) values of an existing output CSV, or an empty set."""
    processed = set()
    if output_file.exists():
        with open(output_file, mode="r", newline="", encoding="utf-8") as f:
            reader = csv.reader(f)
            next(reader, None)  # Skip header
            for row in reader:
                if row:
                    processed.add(row[0])
    return processed


def _format_mic_date(value) -> str:
    """Render an extracted date for the CSV, writing '0000-00-00' when it is unknown."""
    if value is None or value == "":
        return "0000-00-00"
    if isinstance(value, date):
        return value.isoformat()
    return str(value)

In [6]:
# Input and output folders are siblings of the directory the notebook runs from.
base_dir, classified_dir, detailed_dir = (
    Path.cwd().parent / folder_name
    for folder_name in ("processed_files", "classified_files", "detailed_files")
)

# Run the extraction for the selected year(s).
process_mic_articles(base_dir, classified_dir, detailed_dir, years_to_process=["2008"])

Found 304 MIC articles for year 2008.


Extracting details for 2008:   0%|          | 0/304 [00:00<?, ?it/s]

Saved detailed information for 304 MIC articles from 2008.
