In [13]:
# --- 1. Load Processed Data ---
import pandas as pd
import os
from transformers import pipeline
import torch
from sentence_transformers import SentenceTransformer, util
from tqdm.auto import tqdm

print("--- Step 1: Loading Data ---")
DATA_DIR = '../data'
# Loading the complete preprocessed data, not the filtered version
PROCESSED_DATA_DIR = os.path.join(DATA_DIR, '02_processed')

df_eng = pd.read_pickle(os.path.join(PROCESSED_DATA_DIR, 'news_eng_processed.pkl'))
df_ara = pd.read_pickle(os.path.join(PROCESSED_DATA_DIR, 'news_ara_processed.pkl'))
print("Full preprocessed data loaded successfully.")
print(f"  - English: {len(df_eng):,} articles")
print(f"  - Arabic:  {len(df_ara):,} articles")
print("-" * 30, "\n")


# --- 2. Load Risk Factors ---
print("--- Step 2: Loading Risk Factors ---")
risk_factors_path = os.path.join(DATA_DIR, '01_raw/risk-factors.xlsx')
df_risk_factors_eng = pd.read_excel(risk_factors_path)
df_risk_factors_eng.dropna(subset=['risk_factor_english'], inplace=True)
risk_factor_labels = df_risk_factors_eng['risk_factor_english'].tolist()
print(f"Loaded {len(risk_factor_labels)} English risk factors.")
print("-" * 30, "\n")


# --- 3. Initialize Models ---
print("--- Step 3: Initializing Models ---")
device = 0 if torch.cuda.is_available() else -1
if device == 0:
    print("GPU found. Models will run on the GPU for maximum speed.")
else:
    print("No GPU found. Models will run on the CPU.")

MODEL_NAME = 'MoritzLaurer/deberta-v3-xsmall-zeroshot-v1.1-all-33'
classifier = pipeline("zero-shot-classification", model=MODEL_NAME, device=device)
print(f"Main classifier initialized: {MODEL_NAME}")

FAST_MODEL_NAME = 'paraphrase-multilingual-MiniLM-L12-v2'
fast_embedder = SentenceTransformer(FAST_MODEL_NAME, device=device)
print(f"Fast pre-filtering model initialized: {FAST_MODEL_NAME}")
print("-" * 30, "\n")


# --- 4. Define the Extraction Function ---
print("--- Step 4: Defining the Extraction Function ---")
risk_factor_embeddings = fast_embedder.encode(risk_factor_labels, convert_to_tensor=True)
print("Risk factor embeddings pre-computed.")

def extract_risk_factors_optimized(
    df, classifier, labels, threshold, batch_size,
    sentence_embedder, risk_factor_embeddings, sentence_similarity_threshold
):
    if 'article_id' not in df.columns:
        df['article_id'] = df.index
    df_sentences = df.explode('sentences').rename(columns={'sentences': 'sentence_text'})
    df_sentences = df_sentences[['article_id', 'date', 'sentence_text']].dropna(subset=['sentence_text'])
    all_sentences = df_sentences['sentence_text'].tolist()
    if not all_sentences: return pd.DataFrame()

    print(f"\nOriginal sentence count: {len(all_sentences):,}")
    print(f"Pre-filtering sentences with threshold: {sentence_similarity_threshold}")
    sentence_embeddings = sentence_embedder.encode(all_sentences, convert_to_tensor=True, show_progress_bar=True)
    hits = util.semantic_search(sentence_embeddings, risk_factor_embeddings, top_k=1)
    relevant_indices = [i for i, hit_list in enumerate(hits) if hit_list and hit_list[0]['score'] >= sentence_similarity_threshold]
    filtered_sentences_df = df_sentences.iloc[relevant_indices]
    sentence_list = filtered_sentences_df['sentence_text'].tolist()
    if not sentence_list: return pd.DataFrame()

    print(f"Reduced to {len(sentence_list):,} sentences after filtering.")
    print(f"Running classifier with confidence threshold: {threshold}")
    results_list = []
    for i, result in tqdm(enumerate(classifier(sentence_list, labels, multi_label=True, batch_size=batch_size)), total=len(sentence_list)):
        for label, score in zip(result['labels'], result['scores']):
            if score >= threshold:
                original_row = filtered_sentences_df.iloc[i]
                results_list.append({
                    'article_id': original_row['article_id'],
                    'date': original_row['date'],
                    'sentence_text': result['sequence'],
                    'risk_factor': label,
                    'confidence_score': score
                })
    return pd.DataFrame(results_list)

# --- 5. Set Parameters and Run on a SAMPLE ---
print("\n--- Step 5: Running on a SAMPLE ---")
CLASSIFIER_BATCH_SIZE = 128
SENTENCE_SIMILARITY_THRESHOLD = 0.55
CLASSIFIER_CONFIDENCE_THRESHOLD = 0.90

# Create samples to test the pipeline
df_eng_sample = df_eng.head(100).copy()
df_ara_sample = df_ara.head(100).copy()
df_sample = pd.concat([df_eng_sample, df_ara_sample], ignore_index=True)

print(f"Processing a sample of {len(df_sample):,} articles...")

all_risk_mentions = extract_risk_factors_optimized(
    df_sample,
    classifier,
    risk_factor_labels,
    threshold=CLASSIFIER_CONFIDENCE_THRESHOLD,
    batch_size=CLASSIFIER_BATCH_SIZE,
    sentence_embedder=fast_embedder,
    risk_factor_embeddings=risk_factor_embeddings,
    sentence_similarity_threshold=SENTENCE_SIMILARITY_THRESHOLD
)
print("\nSample risk factor extraction complete.")
print(f"Found {len(all_risk_mentions):,} potential risk mentions in the sample.")
print("-" * 30, "\n")


# --- 6. Post-Processing: Keep Only the Top Label per Sentence ---
print("--- Step 6: Refining Sample Results (Post-Processing) ---")
if not all_risk_mentions.empty:
    print(f"Original number of mentions: {len(all_risk_mentions):,}")
    # For each sentence, find the index of the row with the max confidence score
    idx = all_risk_mentions.groupby('sentence_text')['confidence_score'].idxmax()
    # Use the index to select the corresponding rows
    all_risk_mentions_refined = all_risk_mentions.loc[idx]
    print(f"Refined to {len(all_risk_mentions_refined):,} high-confidence, unique mentions.")
else:
    all_risk_mentions_refined = pd.DataFrame() # Create empty df if no mentions were found
    print("No risk mentions found to refine.")
print("-" * 30, "\n")


# --- 7. Save the SAMPLE Results ---
print("--- Step 7: Saving SAMPLE Results ---")
OUTPUT_DIR = os.path.join(DATA_DIR, '03_models')
os.makedirs(OUTPUT_DIR, exist_ok=True)

output_path = os.path.join(OUTPUT_DIR, 'risk_mentions_SAMPLE_FINAL.csv')
all_risk_mentions_refined.to_csv(output_path, index=False)
print(f"Successfully saved {len(all_risk_mentions_refined):,} final sample risk mentions to: {output_path}")
print("-" * 30, "\n")

print("--- Final Extracted Risk Factors (from Sample) ---")
if all_risk_mentions_refined.empty:
    print("No risk factors found with the current settings.")
else:
    display(all_risk_mentions_refined.head())

--- Step 1: Loading Data ---


Full preprocessed data loaded successfully.
  - English: 86,660 articles
  - Arabic:  85,511 articles
------------------------------ 

--- Step 2: Loading Risk Factors ---
Loaded 167 English risk factors.
------------------------------ 

--- Step 3: Initializing Models ---
GPU found. Models will run on the GPU for maximum speed.


Device set to use cuda:0


Main classifier initialized: MoritzLaurer/deberta-v3-xsmall-zeroshot-v1.1-all-33
Fast pre-filtering model initialized: paraphrase-multilingual-MiniLM-L12-v2
------------------------------ 

--- Step 4: Defining the Extraction Function ---
Risk factor embeddings pre-computed.

--- Step 5: Running on a SAMPLE ---
Processing a sample of 200 articles...

Original sentence count: 13,394
Pre-filtering sentences with threshold: 0.55


Batches:   0%|          | 0/419 [00:00<?, ?it/s]

Reduced to 1,001 sentences after filtering.
Running classifier with confidence threshold: 0.9


  0%|          | 0/1001 [00:00<?, ?it/s]


Sample risk factor extraction complete.
Found 3,345 potential risk mentions in the sample.
------------------------------ 

--- Step 6: Refining Sample Results (Post-Processing) ---
Original number of mentions: 3,345
Refined to 572 high-confidence, unique mentions.
------------------------------ 

--- Step 7: Saving SAMPLE Results ---
Successfully saved 572 final sample risk mentions to: ../data/03_models/risk_mentions_SAMPLE_FINAL.csv
------------------------------ 

--- Final Extracted Risk Factors (from Sample) ---


Unnamed: 0,article_id,date,sentence_text,risk_factor,confidence_score
3250,169,2024-06-27,""".",without international aid,0.960398
1365,37,2024-06-29,"""African narco-jihadism among Al Qaeda and Isl...",jihadist groups,0.998151
1778,62,2024-06-28,"""Al-Baghdadi´s sermon - an extension of the ex...",jihadist groups,0.993771
1393,37,2024-06-29,"""Counternetwork.",blockade,0.99657
1325,37,2024-06-29,"""El Sahel: epicentro yihadista en África Occid...",drought,0.983723
