In [17]:
import pandas as pd
import os
from transformers import pipeline
import torch
import pickle
import numpy as np

# Step 1: Loading All Necessary Data
print("# Step 1: Loading All Necessary Data")
DATA_DIR = '../data'
MODELS_DIR = os.path.join(DATA_DIR, '03_models')
PROCESSED_DATA_DIR = os.path.join(DATA_DIR, '02_processed')

df_risk_sentences = pd.read_csv(os.path.join(MODELS_DIR, 'risk_mentions_SAMPLE_FINAL.csv'))
df_eng = pd.read_pickle(os.path.join(PROCESSED_DATA_DIR, 'news_eng_processed.pkl'))
df_eng['language'] = 'english'
df_ara = pd.read_pickle(os.path.join(PROCESSED_DATA_DIR, 'news_ara_processed.pkl'))
df_ara['language'] = 'arabic'
if 'article_id' not in df_eng: df_eng['article_id'] = df_eng.index
if 'article_id' not in df_ara: df_ara['article_id'] = df_ara.index
df_articles = pd.concat([df_eng, df_ara])
with open('../data/01_raw/id_english_location_name.pkl', 'rb') as f: eng_locations = pickle.load(f)
with open('../data/01_raw/id_arabic_location_name.pkl', 'rb') as f: ara_locations = pickle.load(f)
print("All data loaded successfully.")
print("-" * 30, "\n")


# Step 2: Building Location Resolvers
print("# Step 2: Building Location Resolvers")
def create_name_to_id_lookup(location_dict):
    lookup = {}
    for loc_id, names in location_dict.items():
        for name in names: lookup[name.lower()] = loc_id
    return lookup
location_lookup = create_name_to_id_lookup(eng_locations)
location_lookup.update(create_name_to_id_lookup(ara_locations))
id_to_english_name_lookup = {loc_id: names[0] for loc_id, names in eng_locations.items()}
print("Location resolvers created.")
print("-" * 30, "\n")


# Step 3: Initialize Hugging Face NER Pipeline
print("# Step 3: Loading Hugging Face Model for NER")
device = 0 if torch.cuda.is_available() else -1
if device == 0: print("GPU found.")
else: print("No GPU found.")
ner_pipeline = pipeline("ner", model="Babelscape/wikineural-multilingual-ner", aggregation_strategy="simple", device=device)
print("NER pipeline loaded.")
print("-" * 30, "\n")


# Step 4: Hybrid Geotagging (Article and Sentence Level)
print("# Step 4: Hybrid Geotagging (Article & Sentence Levels)")
articles_with_risks_ids = df_risk_sentences['article_id'].unique()
df_articles_with_risks = df_articles[df_articles['article_id'].isin(articles_with_risks_ids)][['article_id', 'body', 'language']].copy()
print(f"Found {len(df_articles_with_risks)} unique articles for context.")

def resolve_locations(text, pipeline, lookup):
    """Shared function to extract and resolve locations from any text."""
    entities = pipeline(text)
    found_ids = set()
    for entity in entities:
        if entity['entity_group'] == 'LOC':
            loc_name_lower = entity['word'].lower()
            if loc_name_lower in lookup: found_ids.add(lookup[loc_name_lower])
    return list(found_ids)

# 4a: Get ARTICLE-level locations (the broad context)
df_articles_with_risks['article_locations'] = df_articles_with_risks['body'].apply(
    lambda text: resolve_locations(text, ner_pipeline, location_lookup)
)
print("Extracted article-level locations.")

# 4b: Get SENTENCE-level locations (the specific context)
df_risk_sentences['sentence_locations'] = df_risk_sentences['sentence_text'].apply(
    lambda text: resolve_locations(text, ner_pipeline, location_lookup)
)
print("Extracted sentence-level locations.")
print("-" * 30, "\n")


# Step 5: Merge and Apply Hierarchical Logic
print("# Step 5: Applying Hierarchical Logic and Finalizing Data")
# Merge article context (language and locations) into the sentence dataframe
df_merged = pd.merge(
    df_risk_sentences,
    df_articles_with_risks[['article_id', 'language', 'article_locations']],
    on='article_id'
)

# HIERARCHICAL LOGIC: Choose the most specific location available
def choose_locations(row):
    # Prioritize specific (longer ID) sentence-level locations
    sentence_specific = {loc for loc in row['sentence_locations'] if len(loc) > 2}
    if sentence_specific:
        return list(sentence_specific)
    
    # Fallback to any sentence-level locations
    if row['sentence_locations']:
        return row['sentence_locations']
        
    # Fallback to specific article-level locations
    article_specific = {loc for loc in row['article_locations'] if len(loc) > 2}
    if article_specific:
        return list(article_specific)
        
    # Finally, use any article-level location as the last resort
    return row['article_locations']

df_merged['final_locations'] = df_merged.apply(choose_locations, axis=1)

# Explode, clean, and add the English name
df_final_exploded = df_merged.explode('final_locations').rename(columns={'final_locations': 'location_id'})
df_final_exploded = df_final_exploded.dropna(subset=['location_id'])
df_final_exploded['location_name_english'] = df_final_exploded['location_id'].map(id_to_english_name_lookup)

# Reorder columns for final output
final_columns = [
    'article_id', 'date', 'language', 'sentence_text', 'risk_factor',
    'confidence_score', 'location_id', 'location_name_english'
]
df_final_exploded = df_final_exploded[final_columns]

print(f"Created {len(df_final_exploded):,} final, high-precision risk-location pairs.")
print("Sample of final data:")
display(df_final_exploded.head())

# Step 1: Loading All Necessary Data


All data loaded successfully.
------------------------------ 

# Step 2: Building Location Resolvers
Location resolvers created.
------------------------------ 

# Step 3: Loading Hugging Face Model for NER
GPU found.


Device set to use cuda:0


NER pipeline loaded.
------------------------------ 

# Step 4: Hybrid Geotagging (Article & Sentence Levels)
Found 244 unique articles for context.
Extracted article-level locations.
Extracted sentence-level locations.
------------------------------ 

# Step 5: Applying Hierarchical Logic and Finalizing Data
Created 1,924 final, high-precision risk-location pairs.
Sample of final data:


Unnamed: 0,article_id,date,language,sentence_text,risk_factor,confidence_score,location_id,location_name_english
0,169,2024-06-27,english,""".",without international aid,0.960398,iq_da_2,dahuk
0,169,2024-06-27,english,""".",without international aid,0.960398,iq_ni_7,shingal
0,169,2024-06-27,english,""".",without international aid,0.960398,iq_bg,baghdad
0,169,2024-06-27,english,""".",without international aid,0.960398,iq_ni_1,akre
0,169,2024-06-27,english,""".",without international aid,0.960398,iq_ts_4,kirkuk
