# Summary from EDA and next steps. 

#### Remove duplicate entries
#### Handle missing values
#### Filter or correct suspicious URLs
#### Normalize and clean titles
#### Drop or impute null-like text values
#### Standardize and validate categorical fields

In [65]:
# imports 
import pandas as pd
from urllib.parse import urlparse
import tldextract
import numpy as np
import requests
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
import unicodedata
import re
import spacy
from transformers import pipeline

tqdm.pandas()

from utils import *


In [46]:
data = pd.read_csv('Nat Cat Events.csv')
data['seendate'] = pd.to_datetime(data['seendate'], format='%Y%m%dT%H%M%SZ', errors='coerce')

I will be removing duplicated urls and titiles as in real world news dataset it might indicate syndicated content across news outlets, re-posts of the same story, duplicate scraping or ingestion

In [47]:
# Step 1: Remove full-row duplicates
before = data.shape[0]
data = data.drop_duplicates()
after_full = data.shape[0]
print(f"Removed {before - after_full} exact duplicate rows.")

# Step 2: Remove duplicate URLs (keep first occurrence)
before_url = data.shape[0]
data = data.drop_duplicates(subset='url')
after_url = data.shape[0]
print(f"Removed {before_url - after_url} duplicate URLs.")

# Step 3: Remove duplicate titles (keep first occurrence)
before_title = data.shape[0]
data = data.drop_duplicates(subset='title')
after_title = data.shape[0]
print(f"Removed {before_title - after_title} duplicate titles.")

print(f"Final remaining rows: {data.shape[0]}")

Removed 2176 exact duplicate rows.
Removed 0 duplicate URLs.
Removed 24144 duplicate titles.
Final remaining rows: 65159


In [48]:
# Apply to your data
domain_parts = data['domain'].apply(extract_domain_parts)
data = pd.concat([data, domain_parts], axis=1)

In [49]:
# Define null-like values
null_like = ['none', 'null', 'NaN', 'nan', '', '-', 'n/a', 'unknown']
# Columns to clean
cat_cols = ['language', 'sourcecountry', 'subdomain', 'domain_root', 'tld']
# Replace them with actual np.nan
for col in cat_cols:
    data[col] = data[col].replace(null_like, np.nan, regex=True)

# Flag and View Suspecious URLs

In [50]:
# Flag potentially suspicious URLs by keywords
pattern = r'404|notfound|error|invalid|missing'
suspicious_mask = data['url'].str.contains(pattern, case=False, na=False)
suspicious_urls = data[suspicious_mask]['url']
print('this is the number of suspicious urls ', len(suspicious_urls))

this is the number of suspicious urls  633


In [51]:
accessible_mask = suspicious_urls.progress_apply(is_url_accessible)

# Filter out inaccessible suspicious URLs
urls_to_remove = suspicious_urls[~accessible_mask]
print(f"Removing {len(urls_to_remove)} truly inaccessible URLs.")
data = data[~data['url'].isin(urls_to_remove)]


Checking accessibility of suspicious URLs...


100%|██████████| 633/633 [06:35<00:00,  1.60it/s]


Removing 220 truly inaccessible URLs.


In [52]:
data.shape

(64939, 11)

# Processing Titles

In [53]:
# Check for missing titiles
empty_titles = data[data['title'].isna() | (data['title'].str.strip() == '')]
print(f"Empty title rows: {len(empty_titles)}")

Empty title rows: 1


In [54]:
# Identify empty titles
empty_mask = data['title'].isna() | (data['title'].str.strip() == '')
empty_titles = data[empty_mask]
print(f"Empty title rows: {len(empty_titles)}")

# Attempt to recover titles
print("Attempting to recover titles from URLs using <title> and <h1>/<h2>...")
recovered_titles = empty_titles['url'].progress_apply(get_best_title_from_url)
data.loc[empty_titles.index, 'title'] = recovered_titles

# Count what was recovered
recovered_count = recovered_titles.notna().sum()
still_empty_count = recovered_titles.isna().sum()
print(f"Recovered {recovered_count} titles from URLs.")
print(f"{still_empty_count} rows still have empty titles and will be dropped.")

# Drop rows still missing titles
data = data


Empty title rows: 1
Attempting to recover titles from URLs using <title> and <h1>/<h2>...


100%|██████████| 1/1 [00:00<00:00,  2.05it/s]

Recovered 0 titles from URLs.
1 rows still have empty titles and will be dropped.

🔍 Sample recovered titles:
Series([], Name: url, dtype: object)





In [55]:
# Corrupted / Non-English Titles 
non_ascii_mask = ~data['title'].apply(lambda x: isinstance(x, str) and x.isascii())
print("Non-ASCII/Corrupted Titles Found:", non_ascii_mask.sum())
print(data[non_ascii_mask]['title'].head())


Non-ASCII/Corrupted Titles Found: 3095
58                Magnitude 4 . 3 earthquake jolts Nepal
103    Japan issues tsunami alert after series of str...
138    Japan earthquake – live : Major tsunami warnin...
203    Major tsunami hits Japan coast amid magnitude ...
Name: title, dtype: object


These titles are relevant and valid, but they were likely flagged as “corrupted” because they contain non-ASCII characters, such as:

Türkiye → contains a special ü

– → is an en dash, not a simple hyphen -

’ → a curly quote instead of '

… or em dashes —, special accents, etc.
Rather than removing these rows, you can normalize or clean these characters:

I will be using Unicode normalization with unicodedata
This converts curly quotes, accented letters, en dashes, etc. to ASCII where possible.


In [56]:
# Apply normalisation to the title (replacing non english characters)
data['normalized_title'] = data['title'].apply(normalize_unicode)


In [57]:
# Check if there are any Corrupted / Non-English Titles left
non_ascii_mask = ~data['normalized_title'].apply(lambda x: isinstance(x, str) and x.isascii())
print("Non-ASCII/Corrupted Titles Found:", non_ascii_mask.sum())
# so now all titles are normalised

Non-ASCII/Corrupted Titles Found: 0


In [59]:
data['clean_title'] = data['normalized_title'].apply(clean_title)

In [60]:
# Named Entity Recognition (NER) for Location using Spacy
nlp = spacy.load("en_core_web_sm")  # lightweight but effective for GPE/LOC

def extract_location(text):
    doc = nlp(text)
    locations = [ent.text for ent in doc.ents if ent.label_ in ["GPE", "LOC"]]
    return locations if locations else None

data['locations'] = data['clean_title'].progress_apply(extract_location)
data['has_location'] = data['locations'].apply(lambda x: bool(x))

100%|██████████| 64939/64939 [14:37<00:00, 74.02it/s] 


In [61]:
# Extracting nat_cat events from titles useing exact keyword match
natcat_keywords = [
    'earthquake', 'flood', 'storm', 'tornado', 'hurricane', 
    'landslide', 'tsunami', 'volcano', 'wildfire', 'mudslide', 'eruption'
]

def natcat_keyword_info(text):
    text_lower = text.lower()
    for keyword in natcat_keywords:
        if keyword in text_lower:
            return pd.Series([True, keyword])
    return pd.Series([False, None])

data[['has_natcat_keyword', 'matched_keyword']] = data['clean_title'].apply(natcat_keyword_info)

In [62]:
# Count how many rows satisfy BOTH conditions
rule_based_matches = data[data['has_location'] & data['has_natcat_keyword']]
print(f"Rule-based matches: {len(rule_based_matches)}")

# Create a mask for rows NOT matching both
not_matched_mask = ~(data['has_location'] & data['has_natcat_keyword'])
to_classify = data[not_matched_mask]
print(f"Rows needing smart classification: {len(to_classify)}")

Rule-based matches: 20599
Rows needing smart classification: 44340


In [64]:
# Will be saving that spacy_and_exact_match processed data for further usage 
rule_based_matches.to_csv('rule_based_nat_cat_4.csv', index=False)

# Summary of this notebook:
## Data Cleaning & Feature Engineering for Natural Disaster Event Detection
This pipeline focuses on preparing and enriching news/event data to support downstream prediction of natural disaster-related content.
 1. Data Deduplication
Removed full-row duplicates, then dropped repeated url and title entries.

Reduced redundancy and ensured each row represents a unique event.

2. Domain Parsing & Categorical Cleaning
Extracted subdomain, domain_root, and tld from domain using tldextract.

Standardized null-like values (e.g., 'none', 'n/a') across categorical features like language and sourcecountry.

3. URL Quality Checks
Flagged and removed suspicious or broken URLs based on keywords (404, error, etc.) and HTTP accessibility checks.

4. Title Normalization & Recovery
Recovered missing titles using header and title, etc. from webpage content (requests, BeautifulSoup).

Removed non-ASCII characters and normalized text to produce clean_title.

5. Location Extraction (NER)
Used spaCy to extract geographic entities (e.g., cities, countries) from clean_title.

Added binary has_location and locations columns.

6. Natural Disaster Keyword Matching
Identified mentions of events like earthquake, flood, wildfire, etc.

Created features: has_natcat_keyword and matched_keyword.

Outcome & Next Steps
Rule-based matches (with both location and disaster keyword): 20,599 rows

Remaining for ML classification: 44,340 rows
