# Causality mining using DSPy and Llama

In [None]:
# Install DSPy with the %pip magic so the package lands in the environment of the running kernel.
%pip install dspy

In [6]:
# Install openpyxl with the %pip magic so the package lands in the environment of the running kernel.
%pip install openpyxl

Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Using cached et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: C:\Users\amahai00\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


In [2]:
import dspy

In [3]:
from typing import Literal

In [7]:
import pandas as pd
import openpyxl

In [12]:
# Read the raw corpus CSV, then slice it by the value of its "InformationType" column.
who_data = pd.read_csv("../data/raw/corpus.csv")

# One subset per information type.
who_data_epi = who_data.loc[who_data["InformationType"].eq("Epidemiology")]
who_data_assessment = who_data.loc[who_data["InformationType"].eq("Assessment")]

# Rows that belong to either of the two types.
who_data_epi_and_assessment = who_data.loc[
    who_data["InformationType"].isin(["Epidemiology", "Assessment"])
]

In [13]:
# Preview the first five rows of the "Assessment" subset.
who_data_assessment.head(n=5)

Unnamed: 0,DonID_standardized,UrlName,DonId,InformationType,Text
3,2024-DON540,2024-DON540,2024-DON540,Assessment,Marburg virus disease (MVD) is caused by the s...
9,2024-DON538,2024-DON538,2024-DON538,Assessment,Although\nno cases of WNV have been documented...
15,2024-DON536,2024-DON536,2024-DON536,Assessment,Since the first report of MERS-CoV in the King...
21,2024-DON537,2024-DON537,2024-DON537,Assessment,Marburg virus disease (MVD) is caused by the s...
27,2024-DON534,2024-DON534,2024-DON534,Assessment,This is the first human case of infection with...


In [8]:
# Load the driver-clustering workbook, reading its "V2_Peter_PSJ_EP_PSJ_EP" sheet.
driver_cat = pd.read_excel(
    io="../data/raw/clustering_drivers.xlsx",
    sheet_name="V2_Peter_PSJ_EP_PSJ_EP",
)

In [None]:
# Build a {Group: [Consolidated Name, ...]} lookup of the predefined driver categories.
# Repeated (Group, Consolidated Name) rows are dropped first; missing names are
# skipped when each group's list is assembled.
driver_cat_rm_dups = driver_cat.drop_duplicates(
    subset=["Group", "Consolidated Name"]
)

predefined_driver_category = (
    driver_cat_rm_dups
    .groupby("Group")["Consolidated Name"]
    .apply(lambda names: names.dropna().unique().tolist())
    .to_dict()
)

# Show the resulting mapping.
print(predefined_driver_category)

In [6]:
# Register the llama3.2 chat model behind the local Ollama endpoint as DSPy's default LM.
lm = dspy.LM(model="ollama_chat/llama3.2", api_base="http://localhost:11434")
dspy.configure(lm=lm)

In [None]:
# Smoke test: a chain-of-thought module built from the inline signature "question -> answer".
qa = dspy.ChainOfThought("question -> answer")

# The call is served by the LM registered through `dspy.configure`.
response = qa(question="What is the capital city of Sweden?")
print(response.answer)

Stockholm


In [13]:
# Chain-of-thought extractor declared with an inline signature: one input, two outputs.
extract_causal_parts = dspy.ChainOfThought("text -> cause, effect")

# Try it on a single sentence.
text = "High mobility in urban areas has facilitated the rapid spread of the virus."
response = extract_causal_parts(text=text)

# Show what the model returned for each output field.
print(f"Cause: {response.cause}")
print(f"Effect: {response.effect}")

Cause: Human movement and interaction in densely populated cities.
Effect: Increased transmission of the virus among a large population.


In [14]:
# Sentences to push through the extractor.
texts = [
    "High mobility in urban areas has facilitated the rapid spread of the virus.",
    "Deforestation leads to increased interaction between humans and wildlife, causing outbreaks.",
]

# Extract and report the cause/effect of each sentence, separated by a rule.
for text in texts:
    response = extract_causal_parts(text=text)
    print(f"Text: {text}")
    print(f"Cause: {response.cause}")
    print(f"Effect: {response.effect}")
    print("-" * 50)


Text: High mobility in urban areas has facilitated the rapid spread of the virus.
Cause: Human movement and interaction in densely populated cities.
Effect: Increased transmission of the virus among a large population.
--------------------------------------------------
Text: Deforestation leads to increased interaction between humans and wildlife, causing outbreaks.
Cause: Human activities such as agriculture, urbanization, and logging.
Effect: Outbreaks of zoonotic diseases, conflicts with wildlife, and loss of biodiversity.
--------------------------------------------------


In [15]:
# `response` is whatever the preceding loop left bound, i.e. the prediction for its last sentence.
print(f"Reasoning: {response.reasoning}")

Reasoning: Deforestation is a human activity that leads to habitat loss and fragmentation, resulting in increased interaction between humans and wildlife.


In [18]:
class CausalExtraction(dspy.Signature):
    """Extract cause, effect, and additional metadata from a text."""

    # NOTE: DSPy uses the class docstring as the task instructions and each field's
    # `desc` as that field's description in the prompt, so maintainer notes are kept
    # in `#` comments rather than in the docstring.

    # Input field
    text: str = dspy.InputField()

    # Output fields
    cause: str = dspy.OutputField()
    effect: str = dspy.OutputField()
    # Spell out what the two labels mean; without a definition the model has
    # nothing to base the choice on.
    causality_type: Literal['intra-sentence', 'inter-sentence'] = dspy.OutputField(
        desc=(
            "'intra-sentence' if the cause and effect are within a single sentence; "
            "'inter-sentence' if the cause and effect span multiple sentences."
        )
    )
    # Confidence score for the extraction; the scale is stated so values are comparable.
    confidence: float = dspy.OutputField(
        desc="Confidence in the extraction, as a number between 0 and 1."
    )

# Rebind `extract_causal_parts` to a plain Predict module over the typed signature.
extract_causal_parts = dspy.Predict(CausalExtraction)

# Single-sentence example.
text = "Deforestation leads to increased interaction between humans and wildlife, which can cause disease outbreaks."
response = extract_causal_parts(text=text)

# Report every output field of the prediction.
print(f"Cause: {response.cause}")
print(f"Effect: {response.effect}")
print(f"Causality Type: {response.causality_type}")
print(f"Confidence Score: {response.confidence}")

Cause: Deforestation
Effect: Disease outbreaks
Causality Type: inter-sentence
Confidence Score: 0.8


In [19]:
# Sentences to push through the typed extractor.
texts = [
    "High mobility in urban areas has facilitated the rapid spread of the virus.",
    "Deforestation leads to increased interaction between humans and wildlife, which can cause disease outbreaks.",
    "Climate change is exacerbating extreme weather events, leading to widespread flooding.",
]

# Extract and report all four output fields per sentence, separated by a rule.
for text in texts:
    response = extract_causal_parts(text=text)
    print(f"Text: {text}")
    print(f"Cause: {response.cause}")
    print(f"Effect: {response.effect}")
    print(f"Causality Type: {response.causality_type}")
    print(f"Confidence Score: {response.confidence}")
    print("-" * 50)


Text: High mobility in urban areas has facilitated the rapid spread of the virus.
Cause: High mobility in urban areas
Effect: Rapid spread of the virus
Causality Type: inter-sentence
Confidence Score: 0.8
--------------------------------------------------
Text: Deforestation leads to increased interaction between humans and wildlife, which can cause disease outbreaks.
Cause: Deforestation
Effect: Disease outbreaks
Causality Type: inter-sentence
Confidence Score: 0.8
--------------------------------------------------
Text: Climate change is exacerbating extreme weather events, leading to widespread flooding.
Cause: Climate change
Effect: extreme weather events and widespread flooding
Causality Type: inter-sentence
Confidence Score: 0.9
--------------------------------------------------


In [22]:
class CausalExtractionPrompt(dspy.Signature):
    """
    Signature for extracting causes and effects with additional metadata.
    Dynamically generates prompts based on included components.
    """
    # NOTE(review): nothing in this notebook calls `generate_prompt`, and DSPy builds
    # its prompt from the class docstring and the declared fields. As written, the
    # `include_*` flags therefore reach the LM only as boolean input values. For the
    # selected sections to take effect, the generated text has to be supplied as the
    # signature's instructions (e.g. `Signature.with_instructions`) - TODO confirm intent.

    # Input fields
    text: str = dspy.InputField()
    include_persona: bool = dspy.InputField(default=False)
    include_domain: bool = dspy.InputField(default=False)
    include_causality: bool = dspy.InputField(default=True)
    include_guidance: bool = dspy.InputField(default=True)
    include_examples: bool = dspy.InputField(default=False)
    include_negative: bool = dspy.InputField(default=False)
    include_mechanism: bool = dspy.InputField(default=False)
    include_sign: bool = dspy.InputField(default=False)

    # Output fields
    cause: str = dspy.OutputField()
    effect: str = dspy.OutputField()
    causality_type: Literal['intra-sentence', 'inter-sentence'] = dspy.OutputField()
    confidence: float = dspy.OutputField()

    def generate_prompt(self) -> str:
        """
        Assemble the prompt text for the sections whose ``include_*`` flag is set.

        Sections are emitted in a fixed order (persona, domain, causality, guidance,
        examples, negative, mechanism, sign), separated by blank lines, and the text
        to analyse is appended once at the end.

        Returns:
            The assembled prompt as a single string.
        """
        # Each section is a plain string literal so that no method indentation
        # leaks into the prompt.
        sections = []

        if self.include_persona:
            sections.append(
                "You are an epidemiologist tasked with identifying sentences or phrases "
                "from outbreak reports that describe the drivers or contributors to the "
                "emergence or transmission of pests and pathogens.\n"
            )

        if self.include_domain:
            sections.append(
                "Here is the definition of the DPSIR (Drivers, Pressure, State, Impacts, and Responses) framework:\n"
                "- Drivers: Underlying socio-economic, environmental, or ecological forces that create conditions favorable for disease emergence.\n"
                "- Pressure: Human activities responsible for spillover events and pathogen transmission.\n"
                "- State: The current circulation status of pests or pathogens.\n"
                "- Impacts: Effects on individuals, communities, and socio-economic conditions.\n"
                "- Responses: Actions by governments to manage drivers, pressures, and impacts.\n"
            )

        if self.include_causality:
            sections.append(
                "Causality can take two forms:\n"
                "- Intra-sentence causality: The cause and effect are within a single sentence.\n"
                "- Inter-sentence causality: The cause and effect span multiple sentences.\n"
            )

        if self.include_guidance:
            # The previous version embedded a literal "{text}" placeholder here that
            # was never substituted. The text is appended once at the end instead, so
            # this section only describes the expected output.
            sections.append(
                "Expected output:\n"
                "1. Raw text with marked causes and effects\n"
                "2. Extracted causes and effects:\n"
                "   - Cause: [cause] -> Effect: [effect]\n"
                "   - Type: [intra-sentence/inter-sentence]\n"
            )

        if self.include_examples:
            sections.append(
                "Examples:\n"
                "- (C1) High population density (C1) -> (E1) Rapid spread of the virus (E1)\n"
                "- Multiple causes: (C1) Poverty (C1) and (C2) Poor sanitation (C2) -> (E1) Increased outbreaks (E1)\n"
            )

        if self.include_negative:
            sections.append(
                "Avoid irrelevant causality:\n"
                '- Example: "MVD is a highly virulent disease" is not causality related to disease transmission.\n'
            )

        if self.include_mechanism:
            sections.append(
                "Mechanisms describe physiological interactions between the host, pathogen, "
                "and environment that drive transmission or emergence.\n"
            )

        if self.include_sign:
            sections.append(
                "For each cause and effect, indicate whether it is positive or negative. "
                "Positive: improve, facilitate, enable. Negative: reduce, hinder, impair.\n"
            )

        # The text to analyse is always included, whichever sections were selected.
        sections.append(f"Text to analyze: {self.text}\n")
        return "\n".join(sections)

# Plain Predict module over the flag-carrying signature.
extract_causal_parts = dspy.Predict(CausalExtractionPrompt)

# Multi-sentence example passage.
text = """
Deforestation increases human-wildlife interaction, leading to the emergence of zoonotic diseases.
In rural areas, limited healthcare access exacerbates the impact of outbreaks. Climate change has amplified
the frequency of extreme weather events, which in turn disrupts ecosystems and increases vulnerability
to infectious diseases. Urbanization and high population density further accelerate the spread of viruses.
"""

# Every include_* input field is supplied explicitly.
# NOTE(review): no call to `generate_prompt` is visible in this notebook, so these
# flags are handed to the LM as plain input values - confirm they shape the prompt
# the way this cell expects.
response = extract_causal_parts(
    text=text,
    include_persona=True,
    include_domain=False,
    include_causality=True,
    include_guidance=True,
    include_examples=True,
    include_negative=False,
    include_mechanism=False,
    include_sign=False,
)

# Report every output field of the prediction.
print(f"Cause: {response.cause}")
print(f"Effect: {response.effect}")
print(f"Causality Type: {response.causality_type}")
print(f"Confidence Score: {response.confidence}")


Cause: Deforestation
Effect: Zoonotic diseases
Causality Type: inter-sentence
Confidence Score: 0.8


In [30]:
from typing import TypedDict, List, Dict


class CausalPair(TypedDict):
    # One extracted cause-effect relationship. Used in this notebook as the element
    # type of the `causal_pairs` output field of `MultiCausalExtractionPrompt`.
    # Documented with `#` comments only, so the type's runtime metadata is unchanged.
    cause: str  # the cause side of the relationship
    effect: str  # the effect side of the relationship
    relationship_type: Literal['primary', 'intermediate']  # one of the two lowercase labels
    confidence: float  # confidence score; the prompt text in this notebook asks for 0-1

class MultiCausalExtractionPrompt(dspy.Signature):
    """
    Signature for extracting multiple cause-effect pairs from text, including primary/intermediate relationships
    and confidence scores.
    """
    # NOTE(review): nothing in this notebook calls `generate_prompt`, and DSPy builds
    # its prompt from the class docstring and the declared fields. As written, the
    # `include_*` flags therefore reach the LM only as boolean input values. For the
    # selected sections to take effect, the generated text has to be supplied as the
    # signature's instructions (e.g. `Signature.with_instructions`) - TODO confirm intent.

    # Input fields
    text: str = dspy.InputField()
    include_persona: bool = dspy.InputField(default=False)
    include_domain: bool = dspy.InputField(default=False)
    include_causality: bool = dspy.InputField(default=True)
    include_guidance: bool = dspy.InputField(default=True)
    include_examples: bool = dspy.InputField(default=False)

    # Output field: A list of cause-effect pairs with metadata
    causal_pairs: List[CausalPair] = dspy.OutputField()

    def generate_prompt(self) -> str:
        """
        Assemble the prompt text for the sections whose ``include_*`` flag is set.

        Sections are emitted in a fixed order (persona, domain, causality, guidance,
        examples) and concatenated without extra separators. The text to analyse is
        embedded in the guidance section only, so it is absent from the result when
        ``include_guidance`` is false.

        Returns:
            The assembled prompt as a single string (empty if no section is selected).
        """
        sections = []

        if getattr(self, "include_persona", False):
            sections.append(
                "You are an epidemiologist tasked with identifying causality in the text.\n"
            )
        if getattr(self, "include_domain", False):
            sections.append(
                "Here is the DPSIR framework definition: Drivers, Pressures, States, Impacts, and Responses...\n"
            )
        if getattr(self, "include_causality", True):
            sections.append(
                "Causality can take two forms:\n"
                "1. Primary relationships: Direct cause-effect pairs.\n"
                "2. Intermediate relationships: Indirect cause-effect pairs that link primary causes and final effects.\n"
            )
        if getattr(self, "include_guidance", True):
            sections.append(
                "Analyze the following text and extract all cause-effect pairs with confidence scores:\n"
                f"{self.text}\n\n"
                "Output format:\n"
                "- Cause: [cause]\n"
                "- Effect: [effect]\n"
                "- Relationship Type: [primary/intermediate]\n"
                "- Confidence: [confidence score (0-1)]\n"
            )
        if getattr(self, "include_examples", False):
            # Relationship types are written in lowercase so the examples agree with
            # the "[primary/intermediate]" format line above and with the
            # Literal['primary', 'intermediate'] values declared on CausalPair.
            sections.append(
                "Examples:\n"
                "- Cause: Deforestation\n"
                "  Effect: Increased human-wildlife interaction\n"
                "  Relationship Type: primary\n"
                "  Confidence: 0.95\n"
                "- Cause: Increased human-wildlife interaction\n"
                "  Effect: Emergence of zoonotic diseases\n"
                "  Relationship Type: intermediate\n"
                "  Confidence: 0.90\n"
            )
        return "".join(sections)

# Chain-of-thought module over the multi-pair signature.
extract_causal_pairs = dspy.ChainOfThought(MultiCausalExtractionPrompt)

# Run it on a multi-sentence passage, supplying every include_* input explicitly.
response = extract_causal_pairs(
    text="""
    Deforestation increases human-wildlife interaction, leading to the emergence of zoonotic diseases.
    In rural areas, limited healthcare access exacerbates the impact of outbreaks. Climate change has amplified
    the frequency of extreme weather events, which in turn disrupts ecosystems and increases vulnerability
    to infectious diseases. Urbanization and high population density further accelerate the spread of viruses.
    """,
    include_persona=True,
    include_domain=False,     # same as the field default
    include_causality=True,   # same as the field default
    include_guidance=True,    # same as the field default
    include_examples=True,
)

# Print each returned pair; the two metadata keys fall back to a placeholder if absent.
for pair in response.causal_pairs:
    print(f"Cause: {pair['cause']}")
    print(f"Effect: {pair['effect']}")
    print(f"Relationship Type: {pair.get('relationship_type', 'Unknown')}")
    print(f"Confidence Score: {pair.get('confidence', 'N/A')}")
    print("-" * 50)


Cause: deforestation
Effect: human-wildlife interaction
Relationship Type: primary
Confidence Score: 0.8
--------------------------------------------------
Cause: climate change
Effect: disruption of ecosystems
Relationship Type: intermediate
Confidence Score: 0.7
--------------------------------------------------
Cause: urbanization
Effect: acceleration of virus spread
Relationship Type: primary
Confidence Score: 0.9
--------------------------------------------------
