In [8]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

# --- 1. Load inputs ---
print("--- Step 1: Loading Processed Data ---")

# Every path is derived from a single data root.
DATA_DIR = '../data'
RAW_DATA_DIR = os.path.join(DATA_DIR, '01_raw')
MODELS_DIR = os.path.join(DATA_DIR, '03_models')

# Geotagged risk mentions (per the original note, written by notebook 05).
risk_mentions_path = os.path.join(MODELS_DIR, 'risk_mentions_geotagged_FINAL.csv')
df_final_exploded = pd.read_csv(risk_mentions_path)

# Lookup table assigning each risk factor to a thematic cluster.
cluster_lookup_path = os.path.join(RAW_DATA_DIR, 'risk-factors-categories.xlsx')
df_clusters = pd.read_excel(cluster_lookup_path)

# Geographically filtered articles; used in step 4 as the normalization denominator.
geo_articles_path = os.path.join(DATA_DIR, '02_processed', 'news_geographically_filtered.pkl')
df_geo_articles = pd.read_pickle(geo_articles_path)

print("Data loaded successfully.")
print("-" * 30, "\n")

# --- 2. Prepare the Data ---
print("--- Step 2: Preparing Data for Aggregation ---")

# Show the lookup-table headers so a renamed spreadsheet column is easy to spot.
print("Columns in df_clusters (from risk-factors-categories.xlsx):")
print(df_clusters.columns)

# Fail early, with a readable message, if the lookup table lacks the columns
# that the merge below ('risk_factor') and the step-3 groupby ('cluster') need.
missing_cluster_cols = {'risk_factor', 'cluster'} - set(df_clusters.columns)
if missing_cluster_cols:
    raise KeyError(
        f"risk-factors-categories.xlsx is missing expected column(s): {sorted(missing_cluster_cols)}"
    )

# Ensure the 'date' column is in datetime format (required by pd.Grouper in step 3).
df_final_exploded['date'] = pd.to_datetime(df_final_exploded['date'])

# The merge is an inner join, so any mention whose risk_factor is absent from
# the lookup table disappears from df_merged. Identify those rows up front so
# the loss is reported instead of silently lowering the risk counts.
unmatched_mask = ~df_final_exploded['risk_factor'].isin(df_clusters['risk_factor'])

# Attach the thematic cluster to every risk mention.
df_merged = pd.merge(df_final_exploded, df_clusters, on='risk_factor')

print("\nMerged risk mentions with thematic clusters.")
if unmatched_mask.any():
    unmatched_examples = (
        df_final_exploded.loc[unmatched_mask, 'risk_factor'].drop_duplicates().head(5).tolist()
    )
    print(
        f"WARNING: {int(unmatched_mask.sum())} risk mention(s) have no cluster and were "
        f"dropped by the merge. Examples: {unmatched_examples}"
    )
print("-" * 30, "\n")


# --- 3. Aggregate Risk Mentions ---
print("--- Step 3: Aggregating Daily Risk Counts ---")

# One output row per (calendar day, location id, location name, cluster);
# the value is the number of mention rows that fall into that group.
daily_groups = df_merged.groupby(
    [
        pd.Grouper(key='date', freq='D'),
        'location_id',
        'location_name_english',
        'cluster',
    ]
)
df_daily_counts = daily_groups.size().reset_index(name='risk_mention_count')

print("Calculated raw daily counts of risk mentions per location and theme.")
display(df_daily_counts.head())
print("-" * 30, "\n")


# --- 4. Normalization (Crucial Step) ---
# News volume varies from day to day, so raw mention counts are divided by the
# number of articles published on the same day.
# NOTE: This is a simplified normalization. A finer-grained denominator would be
# the article count *per location*, which would require geotagging ALL articles,
# not only those containing risk mentions. Total daily volume is used instead.

print("--- Step 4: Normalizing Risk Counts ---")
df_geo_articles['date'] = pd.to_datetime(df_geo_articles['date'])
daily_article_volume = df_geo_articles.groupby(pd.Grouper(key='date', freq='D')).size().reset_index(name='total_articles_published')

# Grouping by a daily pd.Grouper emits a row for every calendar day between the
# first and last article, including days with no articles (count 0). Dividing
# by those zeros would produce inf, which then poisons the mean in step 5.
# Drop them so such days are treated like any other day without volume data.
daily_article_volume = daily_article_volume[
    daily_article_volume['total_articles_published'] > 0
].reset_index(drop=True)

# Attach the day's article volume to every risk count (inner join on the day).
df_normalized = pd.merge(df_daily_counts, daily_article_volume, on='date')

# Dates are unique in daily_article_volume, so the join cannot duplicate rows;
# any length difference is rows lost because their day has no article volume.
rows_without_volume = len(df_daily_counts) - len(df_normalized)
if rows_without_volume:
    print(
        f"WARNING: {rows_without_volume} daily risk count row(s) had no article volume "
        "for their date and were excluded from normalization."
    )

df_normalized['normalized_risk'] = df_normalized['risk_mention_count'] / df_normalized['total_articles_published']

print("Normalized risk scores by total daily article volume.")
display(df_normalized.head())
print("-" * 30, "\n")


# --- 5. Calculate Thematic and Composite Risk Indices ---
print("--- Step 5: Constructing Risk Indices ---")
# Each row's 'normalized_risk' already serves as the per-theme index.
# The Composite Risk Index (CRI) is the mean of those values over the themes
# present for a given day and location.

cri_keys = ['date', 'location_id', 'location_name_english']
df_cri = (
    df_normalized
    .groupby(cri_keys)['normalized_risk']
    .mean()
    .rename('composite_risk_index')
    .reset_index()
)

print("Calculated Composite Risk Index (CRI).")
display(df_cri.head())
print("-" * 30, "\n")


# --- 6. Save the Final Time-Series Data ---
print("--- Step 6: Saving Final Index Data ---")
OUTPUT_DIR = os.path.join(DATA_DIR, '04_feature')
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Thematic indices first, then the composite index; both written without the row index.
for csv_name, frame in (
    ('thematic_risk_indices.csv', df_normalized),
    ('composite_risk_index.csv', df_cri),
):
    frame.to_csv(os.path.join(OUTPUT_DIR, csv_name), index=False)

print(f"Final time-series data saved to: {OUTPUT_DIR}")
print("-" * 30, "\n")


# --- 7. Example Visualization ---
print("--- Step 7: Example Visualization ---")
# Plot the CRI over time for one location, chosen by its English name.
location_to_plot = 'Baghdad'

# Location names in the data are stored in lowercase (e.g. 'syria', 'gaza'),
# so an exact comparison against a capitalised name never matches. Compare
# case-insensitively and ignore surrounding whitespace instead.
name_matches = (
    df_cri['location_name_english'].str.strip().str.casefold()
    == location_to_plot.strip().casefold()
)
df_plot = df_cri[name_matches]

if not df_plot.empty:
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(15, 7))

    # Several location_ids can share one English name (e.g. 'ps_gz' and
    # 'ps_gz_2' are both 'gaza'). Drawing them as a single line would connect
    # unrelated points on the same date, so each id gets its own series.
    for loc_id, df_loc in df_plot.groupby('location_id'):
        ax.plot(
            df_loc['date'],
            df_loc['composite_risk_index'],
            marker='o',
            linestyle='-',
            label=f"Composite Risk Index ({loc_id})",
        )
    ax.set_title(f"Daily Composite Risk Index for {location_to_plot}", fontsize=16)
    ax.set_ylabel("Risk Index (Normalized Score)")
    ax.set_xlabel("Date")
    ax.legend()
    plt.tight_layout()
    plt.show()
else:
    print(f"No data found for location: {location_to_plot}")

--- Step 1: Loading Processed Data ---


Data loaded successfully.
------------------------------ 

--- Step 2: Preparing Data for Aggregation ---
Columns in df_clusters (from risk-factors-categories.xlsx):
Index(['risk_factor', 'cluster'], dtype='object')

Merged risk mentions with thematic clusters.
------------------------------ 

--- Step 3: Aggregating Daily Risk Counts ---
Calculated raw daily counts of risk mentions per location and theme.


Unnamed: 0,date,location_id,location_name_english,cluster,risk_mention_count
0,2024-06-24,sy,syria,humanitarian aid,2
1,2024-06-25,ps_gz,gaza,food crisis,1
2,2024-06-25,ps_gz_2,gaza,conflicts and violence,2
3,2024-06-25,ps_gz_2,gaza,food crisis,2
4,2024-06-25,ps_gz_5,rafah,food crisis,1


------------------------------ 

--- Step 4: Normalizing Risk Counts ---
Normalized risk scores by total daily article volume.


Unnamed: 0,date,location_id,location_name_english,cluster,risk_mention_count,total_articles_published,normalized_risk
0,2024-06-24,sy,syria,humanitarian aid,2,3778,0.000529
1,2024-06-25,ps_gz,gaza,food crisis,1,3847,0.00026
2,2024-06-25,ps_gz_2,gaza,conflicts and violence,2,3847,0.00052
3,2024-06-25,ps_gz_2,gaza,food crisis,2,3847,0.00052
4,2024-06-25,ps_gz_5,rafah,food crisis,1,3847,0.00026


------------------------------ 

--- Step 5: Constructing Risk Indices ---
Calculated Composite Risk Index (CRI).


Unnamed: 0,date,location_id,location_name_english,composite_risk_index
0,2024-06-24,sy,syria,0.000529
1,2024-06-25,ps_gz,gaza,0.00026
2,2024-06-25,ps_gz_2,gaza,0.00052
3,2024-06-25,ps_gz_5,rafah,0.00026
4,2024-06-25,sy_hl,aleppo,0.00026


------------------------------ 

--- Step 6: Saving Final Index Data ---
Final time-series data saved to: ../data/04_feature
------------------------------ 

--- Step 7: Example Visualization ---
No data found for location: Baghdad


In [9]:
import os
import pickle

import folium
import matplotlib.pyplot as plt
import pandas as pd
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
from tqdm.auto import tqdm

# --- 1. Load All Necessary Data ---
print("--- Step 1: Loading Processed Data ---")
DATA_DIR = '../data'
RAW_DATA_DIR = os.path.join(DATA_DIR, '01_raw')
FEATURE_DIR = os.path.join(DATA_DIR, '04_feature')

# Composite risk index time series.
cri_csv_path = os.path.join(FEATURE_DIR, 'composite_risk_index.csv')
df_cri = pd.read_csv(cri_csv_path)

# Mapping from location id to its English name(s).
# NOTE(review): pickle.load can execute arbitrary code; only load this file
# from a trusted source.
location_names_path = os.path.join(RAW_DATA_DIR, 'id_english_location_name.pkl')
with open(location_names_path, 'rb') as names_file:
    eng_locations = pickle.load(names_file)

print("Data loaded successfully.")
print("-" * 30, "\n")


# --- 2. Geocode All Locations ---
print(f"--- Step 2: Geocoding All {len(eng_locations)} Locations ---")
# Nominatim is the free geocoding service from OpenStreetMap.
geolocator = Nominatim(user_agent="food_crisis_mapper")

# RateLimiter spaces requests at least one second apart so the service does not block us.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Results: {location_id: {'lat': lat, 'lon': lon}}
location_coords = {}
# Registers tqdm's pandas integration; not used in this step, kept in case
# other cells rely on it.
tqdm.pandas(desc="Geocoding Locations")

# Only the primary (first) English name of each location is geocoded.
# Assumes each value is a sequence of names -- TODO confirm against the pickle.
# IDs with no name are recorded as unresolved instead of crashing on names[0].
location_names_to_geocode = {}
unresolved_ids = []
for loc_id, names in eng_locations.items():
    if names is not None and len(names) > 0:
        location_names_to_geocode[loc_id] = names[0]
    else:
        unresolved_ids.append(loc_id)

for loc_id, name in tqdm(location_names_to_geocode.items(), desc="Fetching Coordinates"):
    try:
        location = geocode(name)
    except Exception as e:
        # Best effort: one failed lookup must not abort the whole run.
        print(f"Error geocoding '{name}': {e}")
        unresolved_ids.append(loc_id)
        continue
    if location:
        location_coords[loc_id] = {'lat': location.latitude, 'lon': location.longitude}
    else:
        # No result for this name.
        unresolved_ids.append(loc_id)

print(f"\nSuccessfully geocoded {len(location_coords)} out of {len(eng_locations)} locations.")
if unresolved_ids:
    # List the IDs that will be missing from the map, rather than only a count.
    print(f"Locations without coordinates ({len(unresolved_ids)}): {unresolved_ids}")
print("-" * 30, "\n")


# --- 3. Merge Coordinates and Aggregate Data ---
print("--- Step 3: Merging Coordinates and Aggregating Risk Scores ---")
# Look up lat/lon by location_id; IDs that were not geocoded map to NaN.
lat_by_id = {loc_id: coords['lat'] for loc_id, coords in location_coords.items()}
lon_by_id = {loc_id: coords['lon'] for loc_id, coords in location_coords.items()}
df_cri['lat'] = df_cri['location_id'].map(lat_by_id)
df_cri['lon'] = df_cri['location_id'].map(lon_by_id)

# Drop rows where we couldn't find coordinates.
df_cri = df_cri.dropna(subset=['lat', 'lon'])

# Average risk for each location over the whole period.
df_map_data = df_cri.groupby(['location_name_english', 'lat', 'lon'])['composite_risk_index'].mean().reset_index()

# Min-max scale the average risk to 0-1. When every location has the same
# value (or only one location remains) the span is zero, and dividing by it
# would give NaN and therefore NaN marker radii on the map. In that case, and
# when no rows remain at all, use 0.0 for every row.
risk_min = df_map_data['composite_risk_index'].min()
risk_span = df_map_data['composite_risk_index'].max() - risk_min
if risk_span > 0:
    df_map_data['normalized_risk'] = (df_map_data['composite_risk_index'] - risk_min) / risk_span
else:
    df_map_data['normalized_risk'] = 0.0

print("Aggregated and normalized data for mapping:")
display(df_map_data.head())
print("-" * 30, "\n")


# --- 4. Create the Full Interactive Map ---
print("--- Step 4: Generating the Full Interactive Map ---")
# Initial map view, given as [lat, lon].
map_center = [33, 38]
risk_map_full = folium.Map(location=map_center, zoom_start=5, tiles='CartoDB positron')

# Marker radius grows linearly with the normalized risk: base + risk * scale.
base_radius = 5
radius_scale = 20

for _, row in df_map_data.iterrows():
    # A NaN risk would produce a NaN radius, which cannot be drawn.
    # Treat it as the lowest risk so the location still appears on the map.
    risk = row['normalized_risk']
    if pd.isna(risk):
        risk = 0.0

    radius = base_radius + (risk * radius_scale)

    # Continuous colour scale; needs matplotlib.pyplot imported as plt.
    # The colormap returns RGBA floats in 0-1, converted to 0-255 integers below.
    color = plt.cm.viridis(risk)

    folium.CircleMarker(
        location=[row['lat'], row['lon']],
        radius=radius,
        color=f"rgb({int(color[0]*255)}, {int(color[1]*255)}, {int(color[2]*255)})",
        fill=True,
        fill_opacity=0.7,
        popup=folium.Popup(f"<b>{row['location_name_english']}</b><br>Avg. Risk Index: {row['composite_risk_index']:.6f}", max_width=300)
    ).add_to(risk_map_full)

output_path_full = os.path.join(FEATURE_DIR, 'risk_hotspot_map_FULL.html')
risk_map_full.save(output_path_full)

print(f"Full interactive map saved to: {output_path_full}")
print("You can open this HTML file in your browser.")

# Last expression of the cell: renders the map inline.
risk_map_full

ModuleNotFoundError: No module named 'folium'