In [1]:
import pandas as pd
import os
from transformers import pipeline
import torch
import pickle
from tqdm.auto import tqdm

# --- 1. Define Paths and Check if Output Already Exists ---
print("--- Step 1: Initializing and Checking for Existing File ---")
# Every input/output location is derived from DATA_DIR so the notebook can be
# pointed at another data root by editing a single constant.
DATA_DIR = '../data'
RAW_DATA_DIR = os.path.join(DATA_DIR, '01_raw')
PROCESSED_DATA_DIR = os.path.join(DATA_DIR, '02_processed')
output_path = os.path.join(PROCESSED_DATA_DIR, 'news_geographically_filtered.pkl')

# The filtered file doubles as a cache: when it is already on disk the
# expensive loading / model set-up below is skipped.
if os.path.exists(output_path):
    print(f"Output file already exists: {output_path}")
    print("Skipping the filtering process.")
else:
    print("Output file not found. Starting the filtering process...")
    # --- 2. Load All Necessary Data ---
    print("--- Step 2: Loading All Necessary Data ---")
    df_eng = pd.read_pickle(os.path.join(PROCESSED_DATA_DIR, 'news_eng_processed.pkl'))
    df_ara = pd.read_pickle(os.path.join(PROCESSED_DATA_DIR, 'news_ara_processed.pkl'))
    # ignore_index=True gives the combined frame a fresh 0..n-1 index.
    df_articles = pd.concat([df_eng, df_ara], ignore_index=True)

    # Location dictionaries; create_lookup later iterates them as
    # {location_id: iterable of names}.
    # NOTE: pickle.load can execute arbitrary code contained in the file, so
    # only load pickles produced by this project.
    with open(os.path.join(RAW_DATA_DIR, 'id_english_location_name.pkl'), 'rb') as f:
        eng_locations = pickle.load(f)
    with open(os.path.join(RAW_DATA_DIR, 'id_arabic_location_name.pkl'), 'rb') as f:
        ara_locations = pickle.load(f)
    print(f"Loaded {len(df_articles):,} total articles to filter.")
    print("-" * 30, "\n")


    # --- 3. Create a Fast Location Lookup ---
    print("--- Step 3: Building Location Resolver ---")
    def create_lookup(location_dict):
        """Build a case-insensitive alias -> location id mapping.

        Args:
            location_dict: Mapping of location id to its name(s). A value may
                be an iterable of name strings or a single bare string.

        Returns:
            dict mapping each stripped, lower-cased name to its location id.
            When the same alias occurs under several ids, the id encountered
            last wins.
        """
        lookup = {}
        for loc_id, names in location_dict.items():
            # A bare string is one alias; iterating it directly would register
            # every individual character as a location name.
            if isinstance(names, str):
                names = [names]
            for name in names:
                # Surrounding whitespace would prevent an exact match later on.
                alias = name.strip().lower()
                if alias:
                    lookup[alias] = loc_id
        return lookup

    # Merge both gazetteers into one alias table; the Arabic entries are
    # applied second, so they win if an alias appears in both.
    location_lookup = {
        **create_lookup(eng_locations),
        **create_lookup(ara_locations),
    }
    print(f"Created a lookup with {len(location_lookup):,} unique location aliases.")
    print("-" * 30, "\n")


    # --- 4. Initialize NER Pipeline ---
    print("--- Step 4: Loading NER Model ---")
    # Device index handed to the pipeline: 0 when CUDA is available, else -1.
    if torch.cuda.is_available():
        device = 0
        print("GPU found. Filtering will be fast.")
    else:
        device = -1
        print("No GPU found.")

    ner_pipeline = pipeline(
        "ner",
        model="Babelscape/wikineural-multilingual-ner",
        aggregation_strategy="simple",
        device=device,
    )
    print("NER pipeline loaded.")
    print("-" * 30, "\n")


# --- 5. Filter Articles by Location Mentions (Batched NER) ---
# Steps 5-6 need `df_articles`, `location_lookup` and `ner_pipeline`, which only
# exist when the cached output was missing in step 1. The cache check is
# therefore repeated here: on a hit the saved frame is loaded, on a miss the
# filter runs and its result is saved. Either way `df_filtered` ends up defined
# for the cells that follow.
if os.path.exists(output_path):
    df_filtered = pd.read_pickle(output_path)
    print(f"Loaded {len(df_filtered):,} previously filtered articles from: {output_path}")
else:
    print("Step 5: Filtering All Articles (Optimized for Batching)")

    # Only the first 2000 characters of each body are sent to the model;
    # missing bodies are treated as empty strings.
    article_bodies = df_articles['body'].fillna('').tolist()
    texts_to_process = [text[:2000] for text in article_bodies]

    print(f"Running NER on {len(texts_to_process):,} articles. This will take a while...")

    # A single batched call over the whole list. Lower batch_size if the GPU
    # runs out of memory. An empty input is short-circuited instead of being
    # handed to the pipeline.
    all_article_entities = (
        ner_pipeline(texts_to_process, batch_size=128) if texts_to_process else []
    )

    print("NER processing complete. Identifying relevant articles...")

    # An article is kept when at least one LOC entity matches a known alias;
    # any() stops scanning an article at its first match.
    relevance_mask = [
        any(
            entity['entity_group'] == 'LOC'
            and entity['word'].strip().lower() in location_lookup
            for entity in article_entities
        )
        for article_entities in tqdm(all_article_entities, desc="Checking NER Results")
    ]

    # Positional boolean mask: one flag per row of df_articles, in order.
    df_filtered = df_articles[relevance_mask]

    print("\nFiltering complete.")
    print(f"  Original number of articles: {len(df_articles):,}")
    print(f"  Geographically relevant articles: {len(df_filtered):,}")
    print("-" * 30, "\n")


    # --- 6. Save the Filtered Data ---
    print("--- Step 6: Saving the Filtered Dataset ---")
    df_filtered.to_pickle(output_path)
    print(f"Successfully saved the filtered data to: {output_path}")

--- Step 1: Loading All Necessary Data ---
Loaded 172,171 total articles to filter.
------------------------------ 

--- Step 2: Building Location Resolver ---
Created a lookup with 918 unique location aliases.
------------------------------ 

--- Step 3: Loading NER Model ---
GPU found. Filtering will be fast.


Device set to use cuda:0


NER pipeline loaded.
------------------------------ 

--- Step 4: Defining the Filtering Function ---
--- Step 5: Filtering All Articles ---


Filtering Articles:   0%|          | 0/172171 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



Filtering complete.
  - Original number of articles: 172,171
  - Geographically relevant articles: 96,516
  - Reduction of 75,655 articles.
------------------------------ 

--- Step 6: Saving the Filtered Dataset ---
Successfully saved the filtered data to: ../data/02_processed/news_geographically_filtered.pkl


In [2]:
# Show the geographically filtered articles (long frames are rendered truncated).
df_filtered

Unnamed: 0,uri,lang,isDuplicate,date,time,dateTime,dateTimePub,dataType,sim,url,...,source,authors,image,eventUri,sentiment,wgt,relevance,userHasPermissions,body_cleaned,sentences
0,8216521939,eng,False,2024-07-09,08:15:07,2024-07-09T08:15:07Z,2024-07-09T08:14:39Z,news,0.631373,https://english.enabbaladi.net/archives/2024/0...,...,"{'uri': 'english.enabbaladi.net', 'dataType': ...","[{'uri': 'enab10_ula@english.enabbaladi.net', ...",https://cdn.enabbaladi.net/english/wp-content/...,eng-9713305,-0.505882,503,503,,Hussam al-Mahmoud | Yamen Moghrabi | Hassan Ib...,[Hussam al-Mahmoud | Yamen Moghrabi | Hassan I...
1,8235923227,eng,False,2024-07-21,08:16:52,2024-07-21T08:16:52Z,2024-07-21T08:16:12Z,news,0.000000,https://www.jewishpress.com/indepth/analysis/j...,...,"{'uri': 'jewishpress.com', 'dataType': 'news',...","[{'uri': 'j_e_dyer@jewishpress.com', 'name': '...",https://www.jewishpress.com/wp-content/uploads...,,0.003922,477,477,,t's an excellent axiom to never put your citiz...,[t's an excellent axiom to never put your citi...
2,2024-07-412131568,eng,False,2024-07-05,08:47:41,2024-07-05T08:47:41Z,2024-07-05T08:36:20Z,news,0.788235,https://eu.poconorecord.com/story/opinion/2024...,...,"{'uri': 'eu.poconorecord.com', 'dataType': 'ne...",[],https://www.gannett-cdn.com/authoring/authorin...,eng-9716644,-0.003922,423,423,,Iran and Hamas pose an escalating threat to th...,[Iran and Hamas pose an escalating threat to t...
3,8191876537,eng,False,2024-06-23,21:11:10,2024-06-23T21:11:10Z,2024-06-23T21:09:27Z,news,0.635294,https://syrianobserver.com/foreign-actors/syri...,...,"{'uri': 'syrianobserver.com', 'dataType': 'new...",[],https://syrianobserver.com/wp-content/uploads/...,eng-9669018,-0.545098,423,423,,The simmering conflict between Israel and Hezb...,[The simmering conflict between Israel and Hez...
4,8210853386,eng,False,2024-07-05,08:54:07,2024-07-05T08:54:07Z,2024-07-05T08:53:35Z,news,0.780392,https://www.yahoo.com/news/iran-hamas-targetin...,...,"{'uri': 'yahoo.com', 'dataType': 'news', 'titl...",[],https://s.yimg.com/ny/api/res/1.2/DmwzdEvMx2_4...,eng-9716644,-0.129412,418,418,,Iran and Hamas pose an escalating threat to th...,[Iran and Hamas pose an escalating threat to t...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172165,8190806398,ara,False,2024-06-23,01:05:51,2024-06-23T01:05:51Z,2024-06-23T01:02:00Z,news,0.000000,https://www.al-jazirah.com/2024/20240623/ria1.htm,...,"{'uri': 'al-jazirah.com', 'dataType': 'news', ...",[],http://www.al-jazirah.com/2024/20240623/1075.jpg,,,1,1,,هناك احتمال بانهيار السلطة الفلسطينية هذا الصي...,[هناك احتمال بانهيار السلطة الفلسطينية هذا الص...
172166,8190800198,ara,False,2024-06-23,00:50:53,2024-06-23T00:50:53Z,2024-06-23T00:49:57Z,news,0.000000,https://www.arabstoday.net/w-2737/002658-%D8%A...,...,"{'uri': 'arabstoday.net', 'dataType': 'news', ...",[],https://img.arabstoday.net//SusanShaer.jpg,,,1,1,,أتفق مع القول بأنَّ الرأي العام العالمي تغير ل...,[أتفق مع القول بأنَّ الرأي العام العالمي تغير ...
172167,8190795913,ara,False,2024-06-23,00:44:04,2024-06-23T00:44:04Z,2024-06-23T00:42:22Z,news,0.905882,https://www.almamlakatv.com//news/145045,...,"{'uri': 'almamlakatv.com', 'dataType': 'news',...",[],https://www.almamlakatv.com//images/articles/b...,ara-1665868,,1,1,,منظر عام لمبانٍ في مدينة عمّان ويظهر معها علم ...,[منظر عام لمبانٍ في مدينة عمّان ويظهر معها علم...
172168,8190781913,ara,False,2024-06-23,00:18:10,2024-06-23T00:18:10Z,2024-06-23T00:17:13Z,news,0.000000,https://www.albawabhnews.com/5025005,...,"{'uri': 'albawabhnews.com', 'dataType': 'news'...","[{'uri': 'عبد_الله_جمال@albawabhnews.com', 'na...",https://www.albawabhnews.com/UploadCache/libfi...,,,1,1,,تابع أحدث الأخبار عبر تطبيق أفادت قناة القاهرة...,[تابع أحدث الأخبار عبر تطبيق أفادت قناة القاهر...
