## One

In [1]:
import pandas as pd
import json
from pathlib import Path
from utils import *
import math
from datetime import datetime
from dateutil.relativedelta import relativedelta
from sklearn.metrics import root_mean_squared_error
import numpy as np

In [2]:
p = pd.read_csv("outputs/merged_df_new.csv")
p.shape

(5569, 5)

In [4]:

articles = list(Path("files").glob("*")) + list(Path("district_articles_translated").glob("*"))
articles = [str(article) for article in articles]

all_articles = pd.DataFrame()

for article in articles:
    with open(article, "r") as f:
        try:
            data = json.load(f)
            all_articles = pd.concat([all_articles, pd.json_normalize(data).drop(["url", "d"], axis=1, errors="ignore")], ignore_index=True)

        except json.JSONDecodeError:
            print(f"Error decoding JSON from file: {article}")
            continue


In [5]:
all_articles.columns.to_list()

['title', 'date', 'content']

In [6]:
output_cleaned_2 = pd.read_csv("outputs/output.csv").dropna(axis=0)
output_cleaned_urdupoint_2 = pd.read_csv("outputs/output_cleaned_urdupoint_2.csv").rename(columns={"districts_mentioned": "location"})

output_cleaned_2["title"] = output_cleaned_2["title"].str.strip()
output_cleaned_urdupoint_2["title"] = output_cleaned_urdupoint_2["title"].str.strip()
all_articles["title"] = all_articles["title"].str.strip()

# all_articles.drop("date", axis=1, inplace=True, errors="ignore")

print(f"The shapes are {output_cleaned_2.shape}, {output_cleaned_urdupoint_2.shape}, {all_articles.shape}")
print(f"Sum of the shapes of output files is : {output_cleaned_2.shape[0] + output_cleaned_urdupoint_2.shape[0]}")


The shapes are (7722, 4), (2695, 4), (163405, 3)
Sum of the shapes of output files is : 10417


In [7]:
output_cleaned_2.head(10)

Unnamed: 0,title,date,location,relevant_features
0,"Lucky Marwat: Terrorist attack on police post,...",2023-01-01,Lakki Marwat,terrorism
1,"Flood, 'Early Warning System' for Floods, will...",2023-01-01,Punjab,"floods, humanitarian situation, economic crisis"
2,Inflation reached 24.5 % in December,2023-01-02,Punjab,"rising inflation, economic crisis, food insecu..."
3,"'Best, even the worst': political leaders' rea...",2023-01-01,Balochistan,"floods, economic crisis, politically engineere..."
4,"In 2023, terrorist incidents in Pakistan are l...",2023-01-01,Khyber Pakhtunkhwa,conflict
5,"'Make home, family and life - now go to the land'",2023-01-01,Karak,"displaced, migration, humanitarian situation"
6,"Protests and terrorism in Balochistan, 'Wake -...",2023-01-01,Balochistan,"conflict, terrorism, economic crisis, humanita..."
7,National Security Committee meeting: Repeat te...,2023-01-02,Balochistan,"economic crisis, terrorism"
8,"Sugar, flour, ghee prices at utility stores",2023-01-01,Punjab,"rising food prices, economic crisis"
9,"If I do not go to the IMF, they will be defaul...",2023-01-01,Sindh,"economic crisis, rising inflation, aid appeal"


In [8]:
output_cleaned_urdupoint_2.head(10)

Unnamed: 0,title,date,location,relevant_features
0,District Administration Quetta crackdown again...,2025-03-25,Quetta,"price rise, corruption, economic crisis"
1,No Heading Found,2025-03-27,Quetta,"terrorism, tragedy, human rights abuses, unres..."
2,"Despite the PPP government in Balochistan, Jay...",2025-03-28,Quetta,"corruption, displaced, human rights abuses, ec..."
3,Haji Atta Mohammad Bangalzai has turned the ci...,2025-03-26,Quetta,"epidemics, humanitarian situation, health crisis"
4,District administration will not allow anyone ...,2025-04-11,Quetta,"disruption to farming, economic crisis, price ..."
5,) Jamaat -e -Islami condemns Balochistan's sta...,2025-03-30,Quetta,"repression, human rights abuses, political ins..."
6,Food department warehouses arrested three pers...,2025-04-09,Quetta,"corruption, food insecurity, theft, economic c..."
7,"BNP's peaceful long march is condemnable, Nawa...",2025-03-29,Quetta,"conflict, repression, displaced, human rights ..."
8,DC Chaman Habib Ahmed Bangalzai's Facility and...,2025-04-09,Quetta,"refugees, displaced"
9,Governor Balochistan Jafar Khan Mandokhel is t...,2025-04-08,Quetta,"migration, economic crisis, lack of basic faci..."


In [9]:
print(f"Output cleaned 2 columns are {output_cleaned_2.columns.to_list()}")
print(f"Output cleaned urdupoint 2 columns are {output_cleaned_urdupoint_2.columns.to_list()}")
print(f"All articles columns are {all_articles.columns.to_list()}")

Output cleaned 2 columns are ['title', 'date', 'location', 'relevant_features']
Output cleaned urdupoint 2 columns are ['title', 'date', 'location', 'relevant_features']
All articles columns are ['title', 'date', 'content']


In [10]:
merged_df1 = output_cleaned_2.merge(all_articles, on=["title", "date"], how="inner")
merged_df2 = output_cleaned_urdupoint_2.merge(all_articles, on=["title", "date"], how="inner")
merged_df = pd.concat([merged_df1, merged_df2], ignore_index=True)
merged_df.shape


(10485, 5)

In [11]:
merged_df.shape

(10485, 5)

In [12]:
merged_df.isnull().sum()

title                0
date                 0
location             0
relevant_features    0
content              0
dtype: int64

In [13]:
merged_df.drop_duplicates(subset=["title", "content"], inplace=True)

In [14]:
merged_df.shape

(10400, 5)

In [15]:
merged_df.head(10)

Unnamed: 0,title,date,location,relevant_features,content
0,"Lucky Marwat: Terrorist attack on police post,...",2023-01-01,Lakki Marwat,terrorism,A terrorist attack on a police post in Shahbaz...
1,"Flood, 'Early Warning System' for Floods, will...",2023-01-01,Punjab,"floods, humanitarian situation, economic crisis",Chief Minister Punjab Chaudhry Pervaiz Elahi h...
2,Inflation reached 24.5 % in December,2023-01-02,Punjab,"rising inflation, economic crisis, food insecu...","Inflation is not taking place in the country, ..."
3,"'Best, even the worst': political leaders' rea...",2023-01-01,Balochistan,"floods, economic crisis, politically engineere...",Pakistani politicians expressed their best wis...
4,"In 2023, terrorist incidents in Pakistan are l...",2023-01-01,Khyber Pakhtunkhwa,conflict,In a decade for security personnel in Pakistan...
5,"'Make home, family and life - now go to the land'",2023-01-01,Karak,"displaced, migration, humanitarian situation","In the scorching sun of Karachi, Qari Zainuddi..."
6,"Protests and terrorism in Balochistan, 'Wake -...",2023-01-01,Balochistan,"conflict, terrorism, economic crisis, humanita...",The system of life has been suspended in Baloc...
7,National Security Committee meeting: Repeat te...,2023-01-02,Balochistan,"economic crisis, terrorism",The National Security Committee meeting reaffi...
8,"Sugar, flour, ghee prices at utility stores",2023-01-01,Punjab,"rising food prices, economic crisis","Sugar, flour and ghee prices have been increas..."
9,"If I do not go to the IMF, they will be defaul...",2023-01-01,Sindh,"economic crisis, rising inflation, aid appeal",Former Prime Minister and Pakistan Tehreek -e ...


In [16]:
merged_df.iloc[3235:3245, 1]

3235    February 17, 2023
3236     04 October, 2023
3237       04 April, 2024
3238        07 June, 2023
3239        06 July, 2024
3240    07 November, 2023
3241       April 16, 2024
3242      06 August, 2023
3243    February 13, 2023
3244    February 25, 2024
Name: date, dtype: object

In [17]:
merged_df['date'] = pd.to_datetime(merged_df['date'], errors='raise', dayfirst=True, format="mixed").dt.strftime('%Y-%m-%d')

In [18]:
merged_df.iloc[3235:3245, 1]

3235    2023-02-17
3236    2023-10-04
3237    2024-04-04
3238    2023-06-07
3239    2024-07-06
3240    2023-11-07
3241    2024-04-16
3242    2023-08-06
3243    2023-02-13
3244    2024-02-25
Name: date, dtype: object

In [19]:
merged_df.head(10)

Unnamed: 0,title,date,location,relevant_features,content
0,"Lucky Marwat: Terrorist attack on police post,...",2023-01-01,Lakki Marwat,terrorism,A terrorist attack on a police post in Shahbaz...
1,"Flood, 'Early Warning System' for Floods, will...",2023-01-01,Punjab,"floods, humanitarian situation, economic crisis",Chief Minister Punjab Chaudhry Pervaiz Elahi h...
2,Inflation reached 24.5 % in December,2023-01-02,Punjab,"rising inflation, economic crisis, food insecu...","Inflation is not taking place in the country, ..."
3,"'Best, even the worst': political leaders' rea...",2023-01-01,Balochistan,"floods, economic crisis, politically engineere...",Pakistani politicians expressed their best wis...
4,"In 2023, terrorist incidents in Pakistan are l...",2023-01-01,Khyber Pakhtunkhwa,conflict,In a decade for security personnel in Pakistan...
5,"'Make home, family and life - now go to the land'",2023-01-01,Karak,"displaced, migration, humanitarian situation","In the scorching sun of Karachi, Qari Zainuddi..."
6,"Protests and terrorism in Balochistan, 'Wake -...",2023-01-01,Balochistan,"conflict, terrorism, economic crisis, humanita...",The system of life has been suspended in Baloc...
7,National Security Committee meeting: Repeat te...,2023-01-02,Balochistan,"economic crisis, terrorism",The National Security Committee meeting reaffi...
8,"Sugar, flour, ghee prices at utility stores",2023-01-01,Punjab,"rising food prices, economic crisis","Sugar, flour and ghee prices have been increas..."
9,"If I do not go to the IMF, they will be defaul...",2023-01-01,Sindh,"economic crisis, rising inflation, aid appeal",Former Prime Minister and Pakistan Tehreek -e ...


In [20]:
merged_df["location"] = merged_df["location"].str.split(",").apply(lambda x: [loc.strip() for loc in x])
merged_df

Unnamed: 0,title,date,location,relevant_features,content
0,"Lucky Marwat: Terrorist attack on police post,...",2023-01-01,[Lakki Marwat],terrorism,A terrorist attack on a police post in Shahbaz...
1,"Flood, 'Early Warning System' for Floods, will...",2023-01-01,[Punjab],"floods, humanitarian situation, economic crisis",Chief Minister Punjab Chaudhry Pervaiz Elahi h...
2,Inflation reached 24.5 % in December,2023-01-02,[Punjab],"rising inflation, economic crisis, food insecu...","Inflation is not taking place in the country, ..."
3,"'Best, even the worst': political leaders' rea...",2023-01-01,[Balochistan],"floods, economic crisis, politically engineere...",Pakistani politicians expressed their best wis...
4,"In 2023, terrorist incidents in Pakistan are l...",2023-01-01,[Khyber Pakhtunkhwa],conflict,In a decade for security personnel in Pakistan...
...,...,...,...,...,...
10480,G* tribal disputes are a major obstacle to the...,2021-07-11,[Ziarat],"conflict, tribal disputes, political instability",F Ziarat (Urdu point newspaper - online. July ...
10481,"Snowfall continues in Ziarat Valley, people fa...",2008-12-20,[Ziarat],"migration, humanitarian situation, economic cr...","Ziarat (UrduPoint Latest News December 20, 200..."
10482,"Ziarat suffers from drought, lack of rainfall ...",2016-01-27,[Ziarat],"lack of rains, water availability, agricultura...",Ziarat (Urdu Point Newsletter - Online - Janua...
10483,US diplomat visits earthquake -hit areas in pi...,2008-11-04,[Ziarat],"humanitarian situation, foreign aid, economic ...",Ziarat (UrduPoint News Updated 04 Nov 2008) An...


In [21]:
merged_df = merged_df.explode("location")
merged_df["location"] = merged_df["location"].str.strip()

In [22]:
merged_df = merged_df[merged_df["location"]!=""]
merged_df.drop_duplicates(inplace=True)
merged_df.dropna(inplace=True)
merged_df.shape

(10962, 5)

In [23]:
indices = []

for index, row in merged_df.iterrows():
    if row["location"].lower() not in row["title"].lower() and row["location"].lower() not in row["content"].lower():
        print(f"Location {row['location']} not found in title or content for row {row['title']}")
        indices.append(index)
merged_df.drop(index=indices, inplace=True)
merged_df.shape

Location Punjab not found in title or content for row Inflation reached 24.5 % in December
Location Balochistan not found in title or content for row 'Best, even the worst': political leaders' reaction to the year 2022
Location Karak not found in title or content for row 'Make home, family and life - now go to the land'
Location Punjab not found in title or content for row Sugar, flour, ghee prices at utility stores
Location Karak not found in title or content for row KARACHI: Sui failed to provide gas according to promise, urban torture
Location Punjab not found in title or content for row Pakistan will have to run from the swamp that we have not run till today, Imran Khan
Location Punjab not found in title or content for row PTI's White Paper: Country debt will be 6.4 trillion in 3 months, Shaukat Tareen
Location Sohbatpur not found in title or content for row In extreme conditions, the government took over, no doubt is on the rise, the Prime Minister
Location Khairpur not found in t

(5569, 5)

In [37]:
merged_df.to_csv("outputs/merged_df_new.csv", index=False)


In [25]:
grouped_locations = merged_df.groupby(by=["location"]).size().reset_index(name="count").sort_values(by="count", ascending=False)

In [26]:
grouped_locations

Unnamed: 0,location,count
41,Punjab,794
3,Balochistan,547
48,Sindh,498
26,Khyber Pakhtunkhwa,472
42,Quetta,199
11,Gwadar,193
27,Kurram,176
43,Sanghar,156
53,Tharparkar,148
40,Pishin,142


In [26]:
grouped_locations.shape

(58, 2)

In [27]:
print(f"Number of districts {len(DISTRICTS)}")
print(f"Number of Provinces {len(PROVINCES)}")

Number of districts 68
Number of Provinces 6


In [28]:
sorted(grouped_locations["location"].unique())

['Azad Jammu and Kashmir',
 'Badin',
 'Bajaur',
 'Balochistan',
 'Bannu',
 'Batagram',
 'Buner',
 'Chagai',
 'Dadu',
 'Dera Ismail Khan',
 'Ghotki',
 'Gwadar',
 'Hangu',
 'Jacobabad',
 'Jaffarabad',
 'Jamshoro',
 'Jhal Magsi',
 'Kachhi',
 'Kalat',
 'Karak',
 'Kashmore',
 'Kech',
 'Khairpur',
 'Kharan',
 'Khuzdar',
 'Khyber',
 'Khyber Pakhtunkhwa',
 'Kurram',
 'Lakki Marwat',
 'Larkana',
 'Lasbela',
 'Loralai',
 'Lower Dir',
 'Mirpur Khas',
 'Mohmand',
 'Nasirabad',
 'North Waziristan',
 'Nushki',
 'Orakzai',
 'Panjgur',
 'Pishin',
 'Punjab',
 'Quetta',
 'Sanghar',
 'Shaheed Benazir Abad',
 'Shangla',
 'Shikarpur',
 'Sibi',
 'Sindh',
 'South Waziristan',
 'Sujawal',
 'Swat',
 'Tank',
 'Tharparkar',
 'Thatta',
 'Upper Dir',
 'Zhob',
 'Ziarat']

In [29]:
average_word_count = math.ceil(merged_df["content"].apply(lambda x: len(x.split())).mean())
print(f"Average word count per article: {average_word_count}")

Average word count per article: 370


In [30]:
nine_months_ago = datetime(*PREDICTION_DATE) - relativedelta(months=BACKDATE_MONTHS)
print(f"Date 9 months ago: {nine_months_ago.strftime('%Y-%m-%d')}")

Date 9 months ago: 2024-02-01


In [31]:
nine_months_ago_df = merged_df[merged_df["date"] >= nine_months_ago.strftime('%Y-%m-%d')]

In [None]:
nine_months_ago_df.head(10)

In [None]:
nine_months_ago_df.shape

In [None]:
grouped_nine_months_df = nine_months_ago_df.groupby(by=["location"]).size().reset_index(name="count").sort_values(by="count", ascending=False)
grouped_nine_months_df

In [None]:
nine_months_ago_df["location"].unique()

In [None]:
grouped_nine_months_df[grouped_nine_months_df["count"]>=30]


In [3]:
gt = pd.read_csv("inputs/ground_truth_ipc.csv")
our_results = pd.read_csv(RESULTS_FILE).dropna(subset=["ipc_phase"])

print(f"Results file is {RESULTS_FILE}")

month, year = map(str.strip, PREDICTION_PERIOD.split(","))

gt_filtered = gt.query("Months == @month and Year == @year").copy()

gt_filtered.rename(columns={"District": "district", "Area Phase": "gt_ipc"}, inplace=True)
our_results.rename(columns={"ipc_phase": "predicted_ipc"}, inplace=True)

combined = gt_filtered.merge(our_results, on="district", how="left")
combined["Mismatch"] = (combined["gt_ipc"] != combined["predicted_ipc"]).map({True: "Yes", False: "No"})
combined.drop(columns=["Months", "Year", "Province"], inplace=True, errors='ignore')  

column_order = ["district", "gt_ipc", "predicted_ipc", "Mismatch"] + [col for col in combined.columns if col not in {"district", "gt_ipc", "predicted_ipc", "Mismatch"}]
combined = combined[column_order]
combined.dropna(inplace=True)


Results file is outputs/results_ipc_gpt-4o_Nov-Mar,2024-2025.csv


In [4]:
combined.head(10)

Unnamed: 0,district,gt_ipc,predicted_ipc,Mismatch,province,prediction_period,summary,articles,justification,features,weather_data
0,Chagai,3,3,No,Balochistan,"Nov-Mar,2024-2025",### Key Findings\n\n**Food Availability and Ac...,"[{""title"": ""Afghan refugees continue to return...","Based on the information provided, several key...","{""2024-01"": {""temperature_2m_mean"": 11.5985832...","{""2024-01"": {""temperature_2m_mean"": 11.5985832..."
1,Gwadar,3,3,No,Balochistan,"Nov-Mar,2024-2025",### Key Findings:\n\n#### Food Availability an...,"[{""title"": ""Lumpy Skin Disease is spreading ra...","Based on the provided information, Gwadar is l...","{""2024-01"": {""temperature_2m_mean"": 22.3987522...","{""2024-01"": {""temperature_2m_mean"": 22.3987522..."
2,Jaffarabad,3,3,No,Balochistan,"Nov-Mar,2024-2025",**Key Findings:**\n\n**Food Availability and A...,"[{""title"": ""The first case of polio in Jaffara...",The information from the summary suggests sign...,"{""2024-01"": {""temperature_2m_mean"": 18.2669181...","{""2024-01"": {""temperature_2m_mean"": 18.2669181..."
3,Jhal Magsi,3,4,Yes,Balochistan,"Nov-Mar,2024-2025",**Key Findings**\n\n1. **Food Availability and...,"[{""title"": ""River rivers flowed after torrenti...","Based on the information provided, Jhal Magsi ...","{""2024-01"": {""temperature_2m_mean"": 16.8127498...","{""2024-01"": {""temperature_2m_mean"": 16.8127498..."
4,Kachhi,3,3,No,Balochistan,"Nov-Mar,2024-2025",## Key Findings:\n\n### Food Availability and ...,"[{""title"": ""27 terrorists killed"", ""date"": ""20...",The situation in Kachhi reveals significant st...,"{""2024-01"": {""temperature_2m_mean"": 18.9614162...","{""2024-01"": {""temperature_2m_mean"": 18.9614162..."
5,Kalat,3,3,No,Balochistan,"Nov-Mar,2024-2025",## Key Findings:\n\n### Food Availability and ...,"[{""title"": ""Balochistan: Akhtar Mengal and gov...",The situation in Kalat suggests an IPC Phase 3...,"{""2024-01"": {""temperature_2m_mean"": 4.04041719...","{""2024-01"": {""temperature_2m_mean"": 4.04041719..."
6,Kech,3,3,No,Balochistan,"Nov-Mar,2024-2025",## Key Findings\n\n### Food Availability and A...,"[{""title"": ""Balochistan: Armed men in Turbat s...",The district of Kech is facing significant cha...,"{""2024-01"": {""temperature_2m_mean"": 12.0801649...","{""2024-01"": {""temperature_2m_mean"": 12.0801649..."
7,Kharan,3,3,No,Balochistan,"Nov-Mar,2024-2025",### Key Findings:\n\n#### Food Availability an...,"[{""title"": ""Kharan: Grenade attack on home, 3 ...",Based on the provided information and weather ...,"{""2024-01"": {""temperature_2m_mean"": 12.6739149...","{""2024-01"": {""temperature_2m_mean"": 12.6739149..."
8,Khuzdar,3,3,No,Balochistan,"Nov-Mar,2024-2025",**Key Findings:**\n\n- **Food Availability and...,"[{""title"": ""The problem of Balochistan has gon...",Based on the reported conditions and available...,"{""2024-01"": {""temperature_2m_mean"": 12.4886655...","{""2024-01"": {""temperature_2m_mean"": 12.4886655..."
9,Killa Abdullah,3,3,No,Balochistan,"Nov-Mar,2024-2025",### Key Findings\n\n1. **Food Availability and...,"[{""title"": ""\""Recent incidents of terrorism in...",The combination of ongoing terrorism and secur...,"{""2024-01"": {""temperature_2m_mean"": 4.17566680...","{""2024-01"": {""temperature_2m_mean"": 4.17566680..."


In [5]:
print(f"Combined shape is {combined.shape}")

Combined shape is (68, 11)


In [6]:
combined[combined["Mismatch"] == "No"].shape[0]

56

In [7]:
combined[combined["Mismatch"] == "No"].shape[0]/combined.shape[0] * 100

82.35294117647058

In [8]:
print(f"Root mean squared error is : {root_mean_squared_error(combined['gt_ipc'], combined['predicted_ipc'])}")

Root mean squared error is : 0.42008402520840293


In [33]:
y_true = combined["gt_ipc"].values
y_pred = combined["predicted_ipc"].values

In [34]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_true, y_pred)
print(f"Accuracy: {acc:.2%}")

Accuracy: 80.88%


In [35]:
from sklearn.metrics import classification_report

print(classification_report(y_true, y_pred, zero_division=0.0))


              precision    recall  f1-score   support

           2       0.00      0.00      0.00         4
           3       0.93      0.86      0.89        64
           4       0.00      0.00      0.00         0

    accuracy                           0.81        68
   macro avg       0.31      0.29      0.30        68
weighted avg       0.88      0.81      0.84        68



In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

cm = confusion_matrix(y_true, y_pred)

sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

## New Section

In [None]:
import pandas as pd

In [None]:
afg = pd.read_json("inputs/Afghanistan_data.json")
lebanon = pd.read_json("inputs/Lebanon_articles.json")

In [None]:
afg.head(10)

In [None]:
print(f"Afg shape is {afg.shape}")
print(f"Lebanon shape is {lebanon.shape}")

In [None]:
ipc_afg = pd.read_csv("inputs/IPC classifications  - Afghanistan.csv").rename(columns={"Area Phase": "gt_ipc"})
ipc_lebanon = pd.read_csv("inputs/IPC classifications  - Lebanon.csv").rename(columns={"Area Phase": "gt_ipc"})

In [None]:
ipc_afg.head(10)

In [None]:
ipc_lebanon.head(10)

In [None]:
ipc_lebanon.columns.values

In [None]:
ipc_lebanon.head(10)

In [None]:
afg["district"] = [[] for _ in range(len(afg))]
unique_districts = ipc_afg["District"].dropna().unique()
for district in unique_districts:
    pattern = district.lower().strip()
    mask = (afg["title"].str.lower().str.contains(pattern, na=False) | afg["content"].str.lower().str.contains(pattern, na=False))
    afg.loc[mask, "district"] = afg.loc[mask, "district"].apply(lambda x : x + [district])

In [None]:
lebanon["district"] = [[] for _ in range(len(lebanon))]
unique_districts = ipc_lebanon["District"].dropna().unique()
for district in unique_districts:
    pattern = district.lower().strip()
    mask = (lebanon["title"].str.lower().str.contains(pattern, na=False) | lebanon["content"].str.lower().str.contains(pattern, na=False))
    lebanon.loc[mask, "district"] = lebanon.loc[mask, "district"].apply(lambda x : x + [district])

In [None]:
afg.head(10)

In [None]:
afg["district"] = afg["district"].apply(lambda x: None if x == [] else x)
lebanon["district"] = lebanon["district"].apply(lambda x: None if x == [] else x)

In [None]:
gt = pd.read_csv("inputs/ground_truth_ipc.csv")


In [None]:
afg.dropna(subset=["district"], inplace=True)
lebanon.dropna(subset=["district"], inplace=True)
print(f"Shape of afg after dropping NaN districts: {afg.shape}")
print(f"Shape of lebanon after dropping NaN districts: {lebanon.shape}")

In [None]:
afg = afg.explode("district")
lebanon = lebanon.explode("district")
afg["district"] = afg["district"].str.strip()
lebanon["district"] = lebanon["district"].str.strip()

In [None]:
afg.dropna(subset=["district"], inplace=True)
lebanon.dropna(subset=["district"], inplace=True)
afg.drop_duplicates(inplace=True)
lebanon.drop_duplicates(inplace=True)
print(f"Shape of afg after dropping NaN districts: {afg.shape}")
print(f"Shape of lebanon after dropping NaN districts: {lebanon.shape}")

In [None]:
afg["date"] = pd.to_datetime(afg["date"], errors='raise', dayfirst=True, format="mixed").dt.strftime('%Y-%m-%d')
lebanon["date"] = pd.to_datetime(lebanon["date"], errors='raise', dayfirst=True, format="mixed").dt.strftime('%Y-%m-%d')

In [None]:
print(f"Range of dates in afg: {afg['date'].min()} to {afg['date'].max()}")
print(f"Range of dates in lebanon: {lebanon['date'].min()} to {lebanon['date'].max()}")

In [None]:
ipc_afg["Months"] = ipc_afg["Months"].astype(str).str.replace(r"\s+", "", regex=True)
ipc_afg["Year"] = ipc_afg["Year"].astype(str).str.replace(r"\s+", "", regex=True)
ipc_lebanon["Year"] = ipc_lebanon["Year"].astype(str).str.replace(r"\s+", "", regex=True)
ipc_lebanon["Months"] = ipc_lebanon["Months"].astype(str).str.replace(r"\s+", "", regex=True)
gt["Months"] = gt["Months"].astype(str).str.replace(r"\s+", "", regex=True)
gt["Year"] = gt["Year"].astype(str).str.replace(r"\s+", "", regex=True)

In [None]:
ipc_afg.to_csv("inputs/afg_ipc.csv", index=False)
ipc_lebanon.to_csv("inputs/leb_ipc.csv", index=False)
gt.to_csv("inputs/pak_ipc.csv", index=False)
afg.to_csv("outputs/afg_articles.csv", index=False)
lebanon.to_csv("outputs/lebanon_articles.csv", index=False)

In [None]:
unique_periods_afg = ipc_afg[["Months", "Year"]].drop_duplicates()
unique_periods_lebanon = ipc_lebanon[["Months", "Year"]].drop_duplicates()
unique_periods_gt = gt[["Months", "Year"]].drop_duplicates()

In [None]:
unique_periods_afg

In [None]:
unique_periods_lebanon

In [None]:
unique_periods_gt

In [None]:
afg["district"].unique()

In [None]:
lebanon["district"].unique()