In [2]:
import pandas as pd
import os
from transformers import pipeline
import torch
import pickle
import numpy as np
from tqdm.auto import tqdm

# --- Step 1: Loading All Necessary Data & SAMPLING ---
print("# Step 1: Loading All Necessary Data & SAMPLING")

# Directory layout, relative to the notebook's working directory.
DATA_DIR = '../data'
MODELS_DIR = os.path.join(DATA_DIR, '03_models')
PROCESSED_DATA_DIR = os.path.join(DATA_DIR, '02_processed')

# Risk-mention table; later steps read its 'article_id' and 'sentence_text'
# columns.
df_risk_sentences = pd.read_csv(
    os.path.join(MODELS_DIR, 'risk_mentions_SAMPLE_FINAL.csv')
)

# SAMPLING: keep only the first 100 rows so the notebook runs quickly.
# Drop this truncation to process the whole file.
df_risk_sentences = df_risk_sentences.head(100).copy()
print(f"--- RUNNING ON A SAMPLE of {len(df_risk_sentences)} rows ---")

# English articles: tag the language and, when the frame carries no
# 'article_id' column, fall back to the frame's own index.
df_eng = pd.read_pickle(
    os.path.join(PROCESSED_DATA_DIR, 'news_eng_processed.pkl')
)
df_eng['language'] = 'english'
if 'article_id' not in df_eng.columns:
    df_eng['article_id'] = df_eng.index

# Arabic articles: same treatment.
df_ara = pd.read_pickle(
    os.path.join(PROCESSED_DATA_DIR, 'news_ara_processed.pkl')
)
df_ara['language'] = 'arabic'
if 'article_id' not in df_ara.columns:
    df_ara['article_id'] = df_ara.index

# NOTE(review): the concatenation keeps each frame's original index, and
# index-derived article ids from the two frames may overlap -- confirm that
# 'article_id' is unique across languages before relying on it as a key.
df_articles = pd.concat([df_eng, df_ara])

# Location gazetteers; Step 2 iterates them as {location_id: names}.
# NOTE(review): pickle.load can execute arbitrary code, so these files are
# assumed to be trusted project artifacts.
with open('../data/01_raw/id_english_location_name.pkl', 'rb') as f:
    eng_locations = pickle.load(f)
with open('../data/01_raw/id_arabic_location_name.pkl', 'rb') as f:
    ara_locations = pickle.load(f)

print("All data loaded successfully.")
print("-" * 30, "\n")


# --- Step 2: Building Location Resolvers ---
print("# Step 2: Building Location Resolvers")
def create_name_to_id_lookup(location_dict):
    """Invert a {location_id: names} mapping into {lowercased name: location_id}.

    Args:
        location_dict: Mapping from a location id to an iterable of name
            strings for that location.

    Returns:
        A dict keyed by each name in lower case. When the same lowercased
        name occurs more than once, the id encountered last wins.
    """
    return {
        alias.lower(): location_id
        for location_id, aliases in location_dict.items()
        for alias in aliases
    }
# One name -> id table covering both languages. The Arabic entries are merged
# second, so they override an English entry with the same lowercased name.
location_lookup = {
    **create_name_to_id_lookup(eng_locations),
    **create_name_to_id_lookup(ara_locations),
}

# Display name per location id: the first English name listed for it.
# Assumes every English entry has at least one name (names[0] must exist).
id_to_english_name_lookup = {
    location_id: english_names[0]
    for location_id, english_names in eng_locations.items()
}
print("Location resolvers created.")
print("-" * 30, "\n")


# --- Step 3: Initialize Hugging Face NER Pipeline ---
print("# Step 3: Loading Hugging Face Model for NER")

# Device index handed to the pipeline: 0 when CUDA is available, -1 otherwise.
if torch.cuda.is_available():
    device = 0
    print("GPU found.")
else:
    device = -1
    print("No GPU found.")

# aggregation_strategy="simple" is requested so that results carry the
# 'entity_group' and 'word' keys that Step 4 reads.
ner_pipeline = pipeline(
    "ner",
    model="Babelscape/wikineural-multilingual-ner",
    aggregation_strategy="simple",
    device=device,
)
print("NER pipeline loaded.")
print("-" * 30, "\n")


# --- Step 4: Hybrid Geotagging (Optimized with Batching) ---
print("# Step 4: Hybrid Geotagging (Optimized with Batching)")

# Keep only the articles referenced by at least one risk sentence, and only
# the columns needed for article-level geotagging.
articles_with_risks_ids = df_risk_sentences['article_id'].unique()
df_articles_with_risks = df_articles[df_articles['article_id'].isin(articles_with_risks_ids)][['article_id', 'body', 'language']].copy()
print(f"Found {len(df_articles_with_risks)} unique articles for context.")

# The count above is a ROW count. When the same article_id occurs on several
# rows (e.g. an English and an Arabic article sharing an id), the Step 5 merge
# on 'article_id' pairs every risk sentence with each of those rows, attaching
# the wrong language and locations to some of them. Surface that instead of
# letting it pass silently.
n_context_rows = len(df_articles_with_risks)
n_distinct_ids = df_articles_with_risks['article_id'].nunique()
if n_context_rows != n_distinct_ids:
    print(
        f"WARNING: {n_context_rows} article rows cover only {n_distinct_ids} "
        "distinct article_id values; sentences of the duplicated ids will be "
        "matched to more than one article in Step 5."
    )

# Helper function to resolve locations from pre-computed entities
def resolve_locations_from_entities(entities, lookup):
    """Map the LOC entities of one text to known location ids.

    Args:
        entities: Iterable of entity dicts, each with an 'entity_group' label
            and a 'word' string.
        lookup: Mapping from lowercased location name to location id.

    Returns:
        A list of the distinct location ids whose name matched a LOC entity,
        in order of first appearance. Entities with another label, or whose
        lowercased word is not in ``lookup``, are ignored.
    """
    # A dict serves as an insertion-ordered set. A plain set would make the
    # result order depend on string hashing, which changes from run to run.
    found_ids = {}
    for entity in entities:
        if entity['entity_group'] != 'LOC':
            continue
        loc_name_lower = entity['word'].lower()
        if loc_name_lower in lookup:
            found_ids[lookup[loc_name_lower]] = None
    return list(found_ids)

# 4a: BATCH process ARTICLE-level locations
print("Batch processing article-level locations...")
# Missing bodies become '' so every item passed to the pipeline is a string.
article_bodies = df_articles_with_risks['body'].fillna('').tolist()
article_entities_list = ner_pipeline(article_bodies, batch_size=128)
# One list of location ids per article, in the same order as the frame rows.
article_locations = [
    resolve_locations_from_entities(article_entities, location_lookup)
    for article_entities in article_entities_list
]
df_articles_with_risks['article_locations'] = article_locations
print("Extracted article-level locations.")

# 4b: BATCH process SENTENCE-level locations
print("Batch processing sentence-level locations...")
# Same procedure as 4a, applied to the individual risk sentences.
sentence_texts = df_risk_sentences['sentence_text'].fillna('').tolist()
sentence_entities_list = ner_pipeline(sentence_texts, batch_size=128)
sentence_locations = [
    resolve_locations_from_entities(sentence_entities, location_lookup)
    for sentence_entities in sentence_entities_list
]
df_risk_sentences['sentence_locations'] = sentence_locations
print("Extracted sentence-level locations.")
print("-" * 30, "\n")


# --- Step 5: Merge and Apply Hierarchical Logic ---
print("# Step 5: Applying Hierarchical Logic and Finalizing Data")

# Attach each sentence's article context (language + article-level locations).
# Inner join: sentences whose article_id has no article row are dropped.
# NOTE(review): an article_id present on several article rows yields one
# output row per match -- confirm ids are unique in the article table.
df_merged = df_risk_sentences.merge(
    df_articles_with_risks[['article_id', 'language', 'article_locations']],
    how='inner',
    on='article_id',
)

def choose_locations(row):
    """Pick the location ids to attach to one merged sentence row.

    Sentence-level locations take priority over article-level ones, and
    within each level ids longer than two characters take priority over
    shorter ones (presumably two-character ids are country-level codes and
    longer ids are more specific places -- confirm against the gazetteer).

    Args:
        row: Mapping or Series with 'sentence_locations' and
            'article_locations', each a list of location-id strings.

    Returns:
        The first non-empty option among: long sentence ids, all sentence
        ids, long article ids, all article ids. The result may be an empty
        list. Filtered results are deduplicated and keep their input order,
        so the output is reproducible between runs.
    """
    for column in ('sentence_locations', 'article_locations'):
        candidates = row[column]
        # dict.fromkeys dedupes while keeping first-seen order. A set would
        # make the order depend on per-process string hashing.
        specific = [loc for loc in dict.fromkeys(candidates) if len(loc) > 2]
        if specific:
            return specific
        if candidates:
            return candidates
    # Both levels were empty: hand back the (empty) article-level list.
    return row['article_locations']

# Resolve the location list for every sentence row.
df_merged['final_locations'] = df_merged.apply(choose_locations, axis=1)

# One output row per (sentence, location) pair. A sentence with an empty
# location list explodes to a missing value and is removed by dropna.
df_final_exploded = (
    df_merged
    .explode('final_locations')
    .rename(columns={'final_locations': 'location_id'})
    .dropna(subset=['location_id'])
)
# Human-readable English name for each id (missing when the id has no entry).
df_final_exploded['location_name_english'] = (
    df_final_exploded['location_id'].map(id_to_english_name_lookup)
)

# Columns kept in the final table, in output order.
final_columns = [
    'article_id',
    'date',
    'language',
    'sentence_text',
    'risk_factor',
    'confidence_score',
    'location_id',
    'location_name_english',
]
df_final_exploded = df_final_exploded[final_columns]

print(f"Created {len(df_final_exploded):,} final, high-precision risk-location pairs.")
print("Sample of final data:")
display(df_final_exploded.head())


# --- Step 6: Save the Geotagged Results ---
print("\n# Step 6: Saving the Final Geotagged Data")

# The results go in the same '03_models' directory the input CSV was read
# from; create it if it does not exist yet.
OUTPUT_DIR = os.path.join(DATA_DIR, '03_models')
output_path = os.path.join(OUTPUT_DIR, 'risk_mentions_geotagged_FINAL.csv')
os.makedirs(OUTPUT_DIR, exist_ok=True)

# index=False: the frame's index is not written to the CSV.
df_final_exploded.to_csv(output_path, index=False)
print(
    f"Successfully saved {len(df_final_exploded):,} geotagged risk mentions to: {output_path}"
)

# Step 1: Loading All Necessary Data & SAMPLING
--- RUNNING ON A SAMPLE of 100 rows ---
All data loaded successfully.
------------------------------ 

# Step 2: Building Location Resolvers
Location resolvers created.
------------------------------ 

# Step 3: Loading Hugging Face Model for NER
GPU found.


Device set to use cuda:0


NER pipeline loaded.
------------------------------ 

# Step 4: Hybrid Geotagging (Optimized with Batching)
Found 148 unique articles for context.
Batch processing article-level locations...
Extracted article-level locations.
Batch processing sentence-level locations...
Extracted sentence-level locations.
------------------------------ 

# Step 5: Applying Hierarchical Logic and Finalizing Data
Created 232 final, high-precision risk-location pairs.
Sample of final data:


Unnamed: 0,article_id,date,language,sentence_text,risk_factor,confidence_score,location_id,location_name_english
0,26712,2024-06-27,english,"""",without international aid,0.91626,iq,iraq
1,26712,2024-06-27,arabic,"""",without international aid,0.91626,sy_dy_2,dayr az-zawr
1,26712,2024-06-27,arabic,"""",without international aid,0.91626,lb_ba_1,beirut
1,26712,2024-06-27,arabic,"""",without international aid,0.91626,sy_hl,aleppo
2,32920,2024-07-22,english,""" The West, Abulhawa reminds us, has a long an...",destructive pattern,0.995453,ps_gz_2,gaza



# Step 6: Saving the Final Geotagged Data
Successfully saved 232 geotagged risk mentions to: ../data/03_models/risk_mentions_geotagged_FINAL.csv
