In [1]:
import pandas as pd
import os
from transformers import pipeline
import torch
from sentence_transformers import SentenceTransformer, util
from tqdm.auto import tqdm

# --- 1. Load GEOGRAPHICALLY FILTERED Data ---
print("--- Step 1: Loading Geographically Filtered Data ---")
DATA_DIR = '../data'
PROCESSED_DATA_DIR = os.path.join(DATA_DIR, '02_processed')

filtered_data_path = os.path.join(PROCESSED_DATA_DIR, 'news_geographically_filtered.pkl')
df_filtered = pd.read_pickle(filtered_data_path)

print("Geographically filtered data loaded successfully.")
print(f"  - Total relevant articles: {len(df_filtered):,} articles")
print("-" * 30, "\n")


# --- 2. Load Risk Factors ---
print("--- Step 2: Loading Risk Factors ---")
risk_factors_path = os.path.join(DATA_DIR, '01_raw/risk-factors.xlsx')
df_risk_factors_eng = pd.read_excel(risk_factors_path)
df_risk_factors_eng.dropna(subset=['risk_factor_english'], inplace=True)
risk_factor_labels = df_risk_factors_eng['risk_factor_english'].tolist()
print(f"Loaded {len(risk_factor_labels)} English risk factors.")
print("-" * 30, "\n")


# --- 3. Initialize Models ---
print("--- Step 3: Initializing Models ---")
device = 0 if torch.cuda.is_available() else -1
if device == 0:
    print("GPU found. Models will run on the GPU for maximum speed.")
else:
    print("No GPU found. Models will run on the CPU.")

MODEL_NAME = 'MoritzLaurer/deberta-v3-xsmall-zeroshot-v1.1-all-33'
classifier = pipeline("zero-shot-classification", model=MODEL_NAME, device=device)
print(f"Main classifier initialized: {MODEL_NAME}")

FAST_MODEL_NAME = 'paraphrase-multilingual-MiniLM-L12-v2'
fast_embedder = SentenceTransformer(FAST_MODEL_NAME, device=device)
print(f"Fast pre-filtering model initialized: {FAST_MODEL_NAME}")
print("-" * 30, "\n")


# --- 4. Define the Extraction Function ---
# (No changes needed in this section)
print("--- Step 4: Defining the Extraction Function ---")
risk_factor_embeddings = fast_embedder.encode(risk_factor_labels, convert_to_tensor=True)
print("Risk factor embeddings pre-computed.")

def extract_risk_factors_optimized(
    df, classifier, labels, threshold, batch_size,
    sentence_embedder, risk_factor_embeddings, sentence_similarity_threshold
):
    if 'article_id' not in df.columns:
        df['article_id'] = df.index
    df_sentences = df.explode('sentences').rename(columns={'sentences': 'sentence_text'})
    df_sentences = df_sentences[['article_id', 'date', 'sentence_text']].dropna(subset=['sentence_text'])
    all_sentences = df_sentences['sentence_text'].tolist()
    if not all_sentences: return pd.DataFrame()

    print(f"\nOriginal sentence count: {len(all_sentences):,}")
    print(f"Pre-filtering sentences with threshold: {sentence_similarity_threshold}")
    sentence_embeddings = sentence_embedder.encode(all_sentences, convert_to_tensor=True, show_progress_bar=True)
    hits = util.semantic_search(sentence_embeddings, risk_factor_embeddings, top_k=1)
    relevant_indices = [i for i, hit_list in enumerate(hits) if hit_list and hit_list[0]['score'] >= sentence_similarity_threshold]
    filtered_sentences_df = df_sentences.iloc[relevant_indices]
    sentence_list = filtered_sentences_df['sentence_text'].tolist()
    if not sentence_list: return pd.DataFrame()

    print(f"Reduced to {len(sentence_list):,} sentences after filtering.")
    print(f"Running classifier with confidence threshold: {threshold}")
    results_list = []
    for i, result in tqdm(enumerate(classifier(sentence_list, labels, multi_label=True, batch_size=batch_size)), total=len(sentence_list)):
        for label, score in zip(result['labels'], result['scores']):
            if score >= threshold:
                original_row = filtered_sentences_df.iloc[i]
                results_list.append({
                    'article_id': original_row['article_id'],
                    'date': original_row['date'],
                    'sentence_text': result['sequence'],
                    'risk_factor': label,
                    'confidence_score': score
                })
    return pd.DataFrame(results_list)

# --- 5. Set Parameters and Run on a SAMPLE (CHANGED) ---
print("\n--- Step 5: Running on a SAMPLE of 10 Articles ---")
CLASSIFIER_BATCH_SIZE = 128
SENTENCE_SIMILARITY_THRESHOLD = 0.55
CLASSIFIER_CONFIDENCE_THRESHOLD = 0.90

# Create a small sample to test the pipeline
df_sample = df_filtered.head(10).copy()

print(f"Processing a sample of {len(df_sample):,} articles...")

all_risk_mentions = extract_risk_factors_optimized(
    df_sample,  # Use the sample DataFrame
    classifier,
    risk_factor_labels,
    threshold=CLASSIFIER_CONFIDENCE_THRESHOLD,
    batch_size=CLASSIFIER_BATCH_SIZE,
    sentence_embedder=fast_embedder,
    risk_factor_embeddings=risk_factor_embeddings,
    sentence_similarity_threshold=SENTENCE_SIMILARITY_THRESHOLD
)
print("\nSample risk factor extraction complete.")
print(f"Found {len(all_risk_mentions):,} potential risk mentions in the sample.")
print("-" * 30, "\n")


# --- 6. Post-Processing: Keep Only the Top Label per Sentence ---
print("--- Step 6: Refining Sample Results (Post-Processing) ---")
if not all_risk_mentions.empty:
    print(f"Original number of mentions: {len(all_risk_mentions):,}")
    idx = all_risk_mentions.groupby('sentence_text')['confidence_score'].idxmax()
    all_risk_mentions_refined = all_risk_mentions.loc[idx]
    print(f"Refined to {len(all_risk_mentions_refined):,} high-confidence, unique mentions.")
else:
    all_risk_mentions_refined = pd.DataFrame()
    print("No risk mentions found to refine.")
print("-" * 30, "\n")


# --- 7. Save the SAMPLE Results (CHANGED) ---
print("--- Step 7: Saving SAMPLE Results ---")
OUTPUT_DIR = os.path.join(DATA_DIR, '03_models')
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Save the sample results to a separate file
output_path = os.path.join(OUTPUT_DIR, 'risk_mentions_SAMPLE_FINAL.csv')
all_risk_mentions_refined.to_csv(output_path, index=False)
print(f"Successfully saved {len(all_risk_mentions_refined):,} sample risk mentions to: {output_path}")
print("-" * 30, "\n")

print("--- Final Extracted Risk Factors (from Sample) ---")
if all_risk_mentions_refined.empty:
    print("No risk factors found with the current settings.")
else:
    display(all_risk_mentions_refined.head())

--- Step 1: Loading Geographically Filtered Data ---
Geographically filtered data loaded successfully.
  - Total relevant articles: 96,516 articles
------------------------------ 

--- Step 2: Loading Risk Factors ---
Loaded 167 English risk factors.
------------------------------ 

--- Step 3: Initializing Models ---
GPU found. Models will run on the GPU for maximum speed.


Device set to use cuda:0


Main classifier initialized: MoritzLaurer/deberta-v3-xsmall-zeroshot-v1.1-all-33
Fast pre-filtering model initialized: paraphrase-multilingual-MiniLM-L12-v2
------------------------------ 

--- Step 4: Defining the Extraction Function ---
Risk factor embeddings pre-computed.

--- Step 5: Running on a SAMPLE of 10 Articles ---
Processing a sample of 10 articles...

Original sentence count: 1,294
Pre-filtering sentences with threshold: 0.55


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

Reduced to 80 sentences after filtering.
Running classifier with confidence threshold: 0.9


  0%|          | 0/80 [00:00<?, ?it/s]


Sample risk factor extraction complete.
Found 188 potential risk mentions in the sample.
------------------------------ 

--- Step 6: Refining Sample Results (Post-Processing) ---
Original number of mentions: 188
Refined to 45 high-confidence, unique mentions.
------------------------------ 

--- Step 7: Saving SAMPLE Results ---
Successfully saved 45 sample risk mentions to: ../data/03_models/risk_mentions_SAMPLE_FINAL.csv
------------------------------ 

--- Final Extracted Risk Factors (from Sample) ---


Unnamed: 0,article_id,date,sentence_text,risk_factor,confidence_score
87,6,2024-07-15,A progressive U.S.,rise,0.950273
136,6,2024-07-15,And the Middle East presents a special challen...,conflict,0.993686
101,6,2024-07-15,Create direct American diplomatic channels wit...,conflict,0.996039
172,8,2024-07-03,Even maverick figures such as Muqtada al-Sadr ...,conflict,0.997864
153,6,2024-07-15,Fund an international climate finance agency.,call for donations,0.986623
