# Banking Dive Article Scraper and Fraud Detector

This notebook scrapes articles from Banking Dive and detects fraud-related content for analysis.

## Overview
- **Step 1**: Scrape 200 articles from bankingdive.com
- **Step 2**: Detect fraud-related articles and categorize them
- **Output**: CSV files ready for Supabase upload


In [57]:
# Import required libraries
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
from datetime import datetime
import time
import re
from pathlib import Path

DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)


## Step 1: Web Scraping

This section scrapes articles from Banking Dive. We'll collect:
- Article title
- Full article content
- Article link
- Publish date
- Topics/tags


In [58]:
def scrape_article_details(article_url, headers):
    """
    Scrapes detailed information from an individual article page.
    
    Args:
        article_url: URL of the article to scrape
        headers: HTTP headers to use for the request
    
    Returns:
        Dictionary with article details or None if scraping fails
    """
    try:
        response = requests.get(article_url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract full article content
        content_parts = []
        # Look for article body content
        article_body = soup.find('div', class_='article-body') or soup.find('div', class_='article-content')
        if article_body:
            paragraphs = article_body.find_all('p')
            content_parts = [p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)]
        
        # If no article-body found, try alternative selectors
        if not content_parts:
            content_div = soup.find('div', {'data-module': 'ArticleBody'}) or soup.find('div', class_='article-text')
            if content_div:
                paragraphs = content_div.find_all('p')
                content_parts = [p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)]
        
        full_content = ' '.join(content_parts)
        
        # Extract publish date - look for published-info class first
        publish_date = None
        # First try the published-info class (most specific)
        date_element = soup.find('div', class_='published-info')
        
        if date_element:
            # Extract text like "Published Nov. 21, 2025"
            date_text = date_element.get_text(strip=True)
            # Remove "Published" prefix if present
            if date_text.lower().startswith('published'):
                date_text = date_text[9:].strip()  # Remove "Published" (9 chars)
            publish_date = date_text
        
        # If not found, try alternative selectors
        if not publish_date:
            date_element = (soup.find('time') or 
                           soup.find('span', class_='date') or 
                           soup.find('div', class_='article-date') or
                           soup.find('span', class_='publish-date'))
            
            if date_element:
                if date_element.get('datetime'):
                    publish_date = date_element.get('datetime')
                else:
                    publish_date = date_element.get_text(strip=True)
        
        # Extract topics/tags
        topics = []
        # Look for topic tags
        topic_elements = soup.find_all('a', class_='tag') or soup.find_all('a', class_='topic-tag')
        if not topic_elements:
            # Try alternative selectors
            topic_section = soup.find('div', class_='topics') or soup.find('div', class_='tags')
            if topic_section:
                topic_elements = topic_section.find_all('a')
        
        for topic_elem in topic_elements:
            topic_text = topic_elem.get_text(strip=True)
            if topic_text:
                topics.append(topic_text)
        
        # If no topics found, try meta tags
        if not topics:
            meta_topics = soup.find_all('meta', {'property': 'article:tag'})
            topics = [meta.get('content', '') for meta in meta_topics if meta.get('content')]
        
        return {
            'content': full_content,
            'publish_date': publish_date,
            'topics': ', '.join(topics) if topics else ''
        }
    
    except Exception as e:
        print(f"Error scraping article {article_url}: {e}")
        return None


In [59]:
def scrape_banking_dive(num_articles=200):
    """
    Scrapes articles from Banking Dive website.
    
    Args:
        num_articles: Number of articles to scrape (default: 200)
    
    Returns:
        List of dictionaries containing article information
    """
    base_url = "https://www.bankingdive.com/news/"
    articles_data = []
    seen_links = set()  # Track unique article links to avoid duplicates
    page_num = 1
    
    # Set headers to mimic a real browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    print(f"Starting scraper... targeting {num_articles} articles.\n")
    
    try:
        while len(articles_data) < num_articles:
            # Construct URL for current page
            current_url = f"{base_url}?page={page_num}"
            print(f"Fetching page {page_num}: {len(articles_data)}/{num_articles} articles collected")
            
            # Make HTTP request
            response = requests.get(current_url, headers=headers, timeout=10)
            response.raise_for_status()
            
            # Parse HTML
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Find all article items
            articles = soup.find_all('li', class_='row feed__item')
            
            # Filter out advertisements
            articles = [article for article in articles if 'feed-item-ad' not in article.get('class', [])]
            
            if not articles:
                print("No more articles found. Stopping.")
                break
            
            # Process each article
            for article in articles:
                if len(articles_data) >= num_articles:
                    break
                
                # Find title and link
                title_element = article.find('h3', class_='feed__title')
                if title_element:
                    title_link = title_element.find('a')
                    if title_link:
                        title = title_link.get_text(strip=True)
                        article_link = title_link.get('href', '')
                        
                        # Make sure link is complete
                        if article_link and not article_link.startswith('http'):
                            article_link = f"https://www.bankingdive.com{article_link}"
                        
                        # Check if we've already seen this article (avoid duplicates)
                        if article_link in seen_links:
                            print(f"  Skipping duplicate: {title[:60]}...")
                            continue
                        
                        seen_links.add(article_link)
                        
                        # Get summary from listing page
                        summary_element = article.find('p', class_='feed__description')
                        summary = summary_element.get_text(strip=True) if summary_element else ''
                        
                        # Scrape detailed article information
                        print(f"  Scraping: {title[:60]}...")
                        details = scrape_article_details(article_link, headers)
                        
                        # Create article data - use details if available, otherwise use summary
                        article_data = {
                            'title': title,
                            'content': details['content'] if details and details['content'] else summary,
                            'link': article_link,
                            'publish_date': details['publish_date'] if details and details['publish_date'] else '',
                            'topics': details['topics'] if details and details['topics'] else ''
                        }
                        articles_data.append(article_data)
                        
                        # Be polite - add a small delay between requests
                        time.sleep(0.5)
            
            page_num += 1
            
            # Safety check to avoid infinite loops
            if page_num > 50:  # Adjust based on typical page count
                print("Reached maximum page limit. Stopping.")
                break
    
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error: {e}")
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
    
    print(f"\n--- Successfully collected {len(articles_data)} articles ---")
    return articles_data


In [60]:
# Run the scraper to collect 200 articles
articles = scrape_banking_dive(num_articles=400)


Starting scraper... targeting 400 articles.

Fetching page 1: 0/400 articles collected
  Scraping: Revolut valued at $75B...
  Scraping: Comerica, Fifth Third sued by activist investor...
  Scraping: Fulton Financial to buy NJ’s Blue Foundry in $243M deal...
  Scraping: MoneyLion to pay $1.75M to settle CFPB lawsuit...
  Scraping: U.S. Bank hires up for greater Southeast growth...
  Scraping: VALT Bank applies for de novo charter...
  Scraping: CFPB can’t ignore judge’s order: workers union...
  Scraping: Stripe faces bank charter pushback...
  Scraping: How the open banking rule skidded...
  Scraping: Ex-Truist employee sentenced in ‘complex’ check-cashing sche...
  Scraping: Citi CFO Mark Mason to step down in March...
  Scraping: Wells Fargo taps Saul Van Beurden to scale AI...
  Scraping: CFPB is transferring its cases to DOJ...
  Scraping: TD illegally targeted, fired Chinese-heritage employees: law...
  Scraping: Truist taps Microsoft alum as AI chief...
  Scraping: Capital One j

In [61]:
# Convert to DataFrame and save to CSV
df_articles = pd.DataFrame(articles)

# Remove any duplicates based on article link (safety check)
initial_count = len(df_articles)
df_articles = df_articles.drop_duplicates(subset=['link'], keep='first')
duplicates_removed = initial_count - len(df_articles)
if duplicates_removed > 0:
    print(f"Removed {duplicates_removed} duplicate articles based on link")

# Clean the data - remove newlines and extra whitespace
df_articles['title'] = df_articles['title'].str.replace('\n', ' ').str.replace('\r', '').str.strip()
df_articles['content'] = df_articles['content'].str.replace('\n', ' ').str.replace('\r', '').str.strip()
df_articles['topics'] = df_articles['topics'].str.replace('\n', ' ').str.replace('\r', '').str.strip()

# Save to CSV
output_file = DATA_DIR / 'banking_dive_articles.csv'
df_articles.to_csv(output_file, index=False, encoding='utf-8-sig')
print(f"Saved {len(df_articles)} unique articles to {output_file}")
print(f"\nColumns: {list(df_articles.columns)}")
print(f"\nFirst few rows:")
df_articles.head()


Saved 25 unique articles to data\banking_dive_articles.csv

Columns: ['title', 'content', 'link', 'publish_date', 'topics']

First few rows:


Unnamed: 0,title,content,link,publish_date,topics
0,Revolut valued at $75B,Revolut is valued at $75 billion after complet...,https://www.bankingdive.com/news/revolut-value...,,
1,"Comerica, Fifth Third sued by activist investor",Just days after threatening legal action again...,https://www.bankingdive.com/news/comerica-fift...,,
2,Fulton Financial to buy NJ’s Blue Foundry in $...,Fulton Financial Corp.will acquire Blue Foundr...,https://www.bankingdive.com/news/fulton-financ...,,
3,MoneyLion to pay $1.75M to settle CFPB lawsuit,MoneyLion agreed Friday topay $1.75 millionto ...,https://www.bankingdive.com/news/cfpb-moneylio...,,
4,U.S. Bank hires up for greater Southeast growth,After adding bankers to serve businesses in Da...,https://www.bankingdive.com/news/us-bank-addin...,,


## Step 2: Fraud Detection

This section filters articles for fraud-related content and adds:
- Detected fraud category (if applicable)
- Risk level (if applicable)

Articles that don't match fraud keywords will have None values for these fields.


In [62]:
# Define fraud-related keywords and categories
fraud_keywords = {
    'Payment Fraud': ['payment fraud', 'card fraud', 'credit card fraud', 'debit fraud', 'transaction fraud'],
    'Identity Theft': ['identity theft', 'identity fraud', 'stolen identity', 'identity breach'],
    'Account Takeover': ['account takeover', 'unauthorized access', 'account breach', 'hacked account'],
    'Phishing': ['phishing', 'phishing attack', 'email scam', 'phishing scheme'],
    'Wire Fraud': ['wire fraud', 'wire transfer fraud', 'wire scam'],
    'Check Fraud': ['check fraud', 'check kiting', 'forged check', 'fake check'],
    'Loan Fraud': ['loan fraud', 'mortgage fraud', 'loan scam', 'fraudulent loan'],
    'Money Laundering': ['money laundering', 'laundering', 'aml violation', 'anti-money laundering'],
    'Insider Fraud': ['insider fraud', 'employee fraud', 'internal fraud', 'insider threat'],
    'Synthetic Identity': ['synthetic identity', 'synthetic fraud', 'fake identity'],
    'Business Email Compromise': ['business email compromise', 'bec fraud', 'email compromise'],
    'ATM Fraud': ['atm fraud', 'atm skimming', 'atm scam']
}

# Risk level keywords (higher risk = more severe)
risk_keywords = {
    'High': ['millions', 'billion', 'major breach', 'large-scale', 'widespread', 'systemic', 
             'regulatory action', 'fined', 'lawsuit', 'criminal charges'],
    'Medium': ['thousands', 'significant', 'investigation', 'alert', 'warning', 'concern'],
    'Low': ['potential', 'possible', 'risk', 'threat', 'monitoring', 'awareness']
}


In [63]:
def detect_fraud_category(text):
    """
    Detects fraud category based on keywords in the article text.
    
    Args:
        text: Article title and content combined
    
    Returns:
        String with fraud category or None
    """
    if pd.isna(text) or text == '':
        return None
    
    text_lower = text.lower()
    detected_categories = []
    
    # Check each fraud category
    for category, keywords in fraud_keywords.items():
        for keyword in keywords:
            if keyword.lower() in text_lower:
                detected_categories.append(category)
                break  # Only add category once
    
    if detected_categories:
        # Return the first detected category, or combine if multiple
        return ', '.join(detected_categories[:2])  # Limit to 2 categories max
    return None


def detect_risk_level(text):
    """
    Detects risk level based on keywords in the article text.
    
    Args:
        text: Article title and content combined
    
    Returns:
        String with risk level ('High', 'Medium', 'Low') or None
    """
    if pd.isna(text) or text == '':
        return None
    
    text_lower = text.lower()
    risk_scores = {'High': 0, 'Medium': 0, 'Low': 0}
    
    # Check for risk level keywords
    for level, keywords in risk_keywords.items():
        for keyword in keywords:
            if keyword.lower() in text_lower:
                risk_scores[level] += 1
    
    # Determine risk level (prioritize higher risk)
    if risk_scores['High'] > 0:
        return 'High'
    elif risk_scores['Medium'] > 0:
        return 'Medium'
    elif risk_scores['Low'] > 0:
        return 'Low'
    return None


In [64]:
# Apply fraud detection to all articles
# Combine title and content for analysis
df_articles['combined_text'] = df_articles['title'].fillna('') + ' ' + df_articles['content'].fillna('')

# First detect fraud category
df_articles['detected_fraud_category'] = df_articles['combined_text'].apply(detect_fraud_category)

# Only apply risk level detection to fraud-detected articles
# For articles without fraud category, risk_level will be None
def detect_risk_level_for_fraud_articles(row):
    """
    Only assigns risk level if article has a detected fraud category.
    Otherwise returns None.
    """
    if pd.isna(row['detected_fraud_category']) or row['detected_fraud_category'] == '':
        return None
    # Only check High and Medium keywords for fraud articles
    text_lower = row['combined_text'].lower()
    risk_scores = {'High': 0, 'Medium': 0}
    
    # Check for High and Medium risk keywords only
    for level in ['High', 'Medium']:
        for keyword in risk_keywords[level]:
            if keyword.lower() in text_lower:
                risk_scores[level] += 1
    
    # Determine risk level (prioritize higher risk)
    if risk_scores['High'] > 0:
        return 'High'
    elif risk_scores['Medium'] > 0:
        return 'Medium'
    else:
        return 'Low'  # Fraud article but no high/medium keywords = Low

df_articles['risk_level'] = df_articles.apply(detect_risk_level_for_fraud_articles, axis=1)

# Drop the temporary combined_text column
df_articles = df_articles.drop('combined_text', axis=1)

# Show summary
fraud_count = df_articles['detected_fraud_category'].notna().sum()
print(f"Found {fraud_count} articles with fraud-related content out of {len(df_articles)} total articles")
print(f"\nFraud category breakdown:")
print(df_articles['detected_fraud_category'].value_counts())
print(f"\nRisk level breakdown:")
print(df_articles['risk_level'].value_counts())


Found 2 articles with fraud-related content out of 25 total articles

Fraud category breakdown:
detected_fraud_category
Wire Fraud, Check Fraud    1
Money Laundering           1
Name: count, dtype: int64

Risk level breakdown:
risk_level
Low     1
High    1
Name: count, dtype: int64


In [65]:
# Save the complete dataset with fraud detection to CSV
output_file_fraud = DATA_DIR / 'banking_dive_articles_with_fraud.csv'
df_articles.to_csv(output_file_fraud, index=False, encoding='utf-8-sig')
print(f"Saved complete dataset with fraud detection to {output_file_fraud}")

# Also create a filtered CSV with only fraud-related articles
df_fraud_only = df_articles[df_articles['detected_fraud_category'].notna()].copy()
if len(df_fraud_only) > 0:
    output_file_fraud_only = DATA_DIR / 'banking_dive_fraud_articles.csv'
    df_fraud_only.to_csv(output_file_fraud_only, index=False, encoding='utf-8-sig')
    print(f"Saved {len(df_fraud_only)} fraud-related articles to {output_file_fraud_only}")
else:
    print("No fraud-related articles found to save separately.")

# Display sample of fraud-detected articles
print(f"\nSample of fraud-detected articles:")
if len(df_fraud_only) > 0:
    display(df_fraud_only[['title', 'detected_fraud_category', 'risk_level']].head(10))
else:
    print("No fraud-related articles detected.")


Saved complete dataset with fraud detection to data\banking_dive_articles_with_fraud.csv
Saved 2 fraud-related articles to data\banking_dive_fraud_articles.csv

Sample of fraud-detected articles:


Unnamed: 0,title,detected_fraud_category,risk_level
9,Ex-Truist employee sentenced in ‘complex’ chec...,"Wire Fraud, Check Fraud",Low
13,"TD illegally targeted, fired Chinese-heritage ...",Money Laundering,High


## Summary

The scraping and fraud detection is complete.

1. **data/banking_dive_articles.csv** - All scraped articles with original data
2. **data/banking_dive_articles_with_fraud.csv** - All articles with fraud detection columns added
3. **data/banking_dive_fraud_articles.csv** - Only fraud-related articles (if any were detected)

These CSV files are ready to be uploaded to Supabase for dashboard analysis.


In [66]:
#counting all fraud related articles
import pandas as pd

# Read the CSV (note: typo fixed to match requested filename, but likely should be 'banking_dive_articles_with_fraud.csv'):
fraud_csv = DATA_DIR / 'banking_dive_articles_with_fraud.csv'

# Load data
df = pd.read_csv(fraud_csv)

# Filter fraud-related articles
df_fraud = df[df['detected_fraud_category'].notna()].copy()

# Show high-level stats
print("Total articles with fraud-related content:", len(df_fraud))
print("\nFraud category frequency:")
print(df_fraud['detected_fraud_category'].value_counts())

print("\nRisk level count:")
print(df_fraud['risk_level'].value_counts())

print("\nTop 5 titles of fraud-related articles:")
print(df_fraud['title'].head(5).to_string(index=False))

print("\nFraud article table sample:")
display(df_fraud.head(10))


Total articles with fraud-related content: 2

Fraud category frequency:
detected_fraud_category
Wire Fraud, Check Fraud    1
Money Laundering           1
Name: count, dtype: int64

Risk level count:
risk_level
Low     1
High    1
Name: count, dtype: int64

Top 5 titles of fraud-related articles:
Ex-Truist employee sentenced in ‘complex’ check...
TD illegally targeted, fired Chinese-heritage e...

Fraud article table sample:


Unnamed: 0,title,content,link,publish_date,topics,detected_fraud_category,risk_level
9,Ex-Truist employee sentenced in ‘complex’ chec...,A former Truist branch employee is one of eigh...,https://www.bankingdive.com/news/ex-truist-emp...,,,"Wire Fraud, Check Fraud",Low
13,"TD illegally targeted, fired Chinese-heritage ...",Five former TD Bank employees of Chinese herit...,https://www.bankingdive.com/news/td-illegally-...,,,Money Laundering,High
