# Banking Dive Fraud Analysis

Quick exploratory notebook to derive project insights, keyword trends, and fraud patterns for the Banking Dive data.


In [1]:
from pathlib import Path
import pandas as pd
import re
from collections import Counter
from itertools import islice

# All input CSVs live under DATA_DIR, a path relative to the working directory.
DATA_DIR = Path('data')
articles_path = DATA_DIR.joinpath('banking_dive_articles.csv')
fraud_articles_path = DATA_DIR.joinpath('banking_dive_articles_with_fraud.csv')
fraud_only_path = DATA_DIR.joinpath('banking_dive_fraud_articles.csv')

# Load the three files with default read_csv settings, in the same order as
# their path definitions above.
articles_df, fraud_augmented_df, fraud_only_df = map(
    pd.read_csv,
    (articles_path, fraud_articles_path, fraud_only_path),
)

# Preview the first rows of the full article table.
articles_df.head()


Unnamed: 0,title,content,link,publish_date,topics
0,Revolut valued at $75B,Revolut is valued at $75 billion after complet...,https://www.bankingdive.com/news/revolut-value...,,
1,"Comerica, Fifth Third sued by activist investor",Just days after threatening legal action again...,https://www.bankingdive.com/news/comerica-fift...,,
2,Fulton Financial to buy NJ’s Blue Foundry in $...,Fulton Financial Corp.will acquire Blue Foundr...,https://www.bankingdive.com/news/fulton-financ...,,
3,MoneyLion to pay $1.75M to settle CFPB lawsuit,MoneyLion agreed Friday topay $1.75 millionto ...,https://www.bankingdive.com/news/cfpb-moneylio...,,
4,U.S. Bank hires up for greater Southeast growth,After adding bankers to serve businesses in Da...,https://www.bankingdive.com/news/us-bank-addin...,,


In [2]:
# Headline numbers: row counts of the three loaded tables, plus the number of
# distinct non-missing values in the topic and detected-category columns.
# NOTE(review): nunique() counts whole cell values, so a combined label such as
# 'Wire Fraud, Check Fraud' is one category here - confirm that is intended.
summary = dict(
    total_articles=len(articles_df),
    fraud_augmented_rows=len(fraud_augmented_df),
    fraud_only_rows=len(fraud_only_df),
    unique_topics=articles_df['topics'].nunique(),
    unique_detected_fraud_categories=fraud_augmented_df['detected_fraud_category'].nunique(),
)
summary


{'total_articles': 200,
 'fraud_augmented_rows': 200,
 'fraud_only_rows': 2,
 'unique_topics': 0,
 'unique_detected_fraud_categories': 2}

In [3]:
# Keep only rows that carry a detected fraud category. The explicit .copy()
# makes fraud_records an independent frame, so adding columns to it later does
# not touch fraud_augmented_df.
# The former fillna('Unlabeled') on this column was removed: the notna() filter
# already guarantees there is nothing left to fill, so it never had any effect.
# NOTE(review): fraud_only_df holds 2 rows, yet each category below was recorded
# 8 times - the augmented file may contain duplicated rows; confirm before
# trusting these counts.
fraud_records = fraud_augmented_df[fraud_augmented_df['detected_fraud_category'].notna()].copy()

# Ten most frequent category labels among the fraud rows.
category_counts = fraud_records['detected_fraud_category'].value_counts().head(10)

# Risk-level distribution; rows without a risk level are bucketed as 'Unknown'.
risk_counts = fraud_records['risk_level'].fillna('Unknown').value_counts()
category_counts, risk_counts


(detected_fraud_category
 Wire Fraud, Check Fraud    8
 Money Laundering           8
 Name: count, dtype: int64,
 risk_level
 Low     8
 High    8
 Name: count, dtype: int64)

In [4]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Stop-word lookup set used to filter tokens below.
stopwords = set(ENGLISH_STOP_WORDS)

# Pool every non-null fraud article body into one lower-cased string.
text = ' '.join(fraud_records['content'].dropna()).lower()

# Tokenise on runs of ASCII letters/apostrophes, then keep only tokens longer
# than three characters that are not stop words.
words = re.findall(r"[a-zA-Z']+", text)
filtered_words = [
    token
    for token in words
    if len(token) > 3 and token not in stopwords
]

# Frequency table of the surviving tokens and its ten most common entries.
word_counts = Counter(filtered_words)
top_words = word_counts.most_common(10)

def ngrams(tokens, n=2):
    """Yield each run of ``n`` consecutive tokens as one space-joined string.

    Args:
        tokens: Iterable of string tokens. Lists, tuples, generators and other
            one-shot iterators are all accepted.
        n: Size of the sliding window; must be at least 1. Defaults to 2.

    Yields:
        str: The tokens of each window joined by a single space, in input
        order. Nothing is yielded when fewer than ``n`` tokens are supplied.

    Raises:
        ValueError: If ``n`` is smaller than 1 (raised when iteration starts).
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    # Materialise once so inputs without len()/slicing (e.g. generators) work.
    seq = list(tokens)
    for start in range(len(seq) - n + 1):
        yield ' '.join(seq[start:start + n])

# Reuse the token list already built for the unigram counts (filtered_words)
# instead of re-running the identical regex and stop-word/length filter over
# `text`. The copy keeps filtered_tokens a separate list object, as before.
filtered_tokens = list(filtered_words)

# Count adjacent token pairs. The pairs are formed after filtering and over the
# pooled text of all articles, so a bigram can bridge removed words (e.g.
# "chinese chinese") or the boundary between two articles.
bigram_counts = Counter(ngrams(filtered_tokens, 2))
top_bigrams = bigram_counts.most_common(10)

top_words, top_bigrams


([('chinese', 184),
  ('bank', 168),
  ('check', 104),
  ('said', 104),
  ('employees', 72),
  ('plaintiffs', 72),
  ('american', 64),
  ('truist', 56),
  ('prosecutors', 56),
  ('fraud', 56)],
 [('chinese american', 64),
  ('prosecutors said', 48),
  ('chinese chinese', 48),
  ('american employees', 40),
  ('plaintiffs said', 32),
  ('check cashing', 24),
  ('check information', 24),
  ('commit bank', 24),
  ('bank fraud', 24),
  ('south carolina', 24)])

In [5]:
# Parse the publish dates and derive a monthly period column. assign() applies
# the keyword arguments in order, so year_month sees the parsed publish_date.
fraud_records = fraud_records.assign(
    publish_date=lambda frame: pd.to_datetime(frame['publish_date']),
    year_month=lambda frame: frame['publish_date'].dt.to_period('M'),
)

# Articles per month, limited to the last twelve months that have data.
# Rows whose date is missing drop out of the groupby, so the result is empty
# when no publish dates are present.
monthly_counts = fraud_records.groupby('year_month').size().iloc[-12:]
monthly_counts


Series([], Freq: M, dtype: int64)