This notebook demonstrates how to clean a set of documents to prepare them for analysis. It was designed by Greta Frei to analyze articles published by the National Catholic Welfare Council from 1920 to 1950 and hosted online by the Catholic News Archive. Questions can be directed to mrauch2@bu.edu. Cleaning is particular to a dataset, so not all of the steps here may be necessary for your application, and additional steps may be needed.

In [11]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import spacy
from concurrent.futures import ThreadPoolExecutor
from nltk.corpus import stopwords
from collections import defaultdict
import contractions

The OCR text for the articles downloaded from the Catholic News Archive (CNA) contains HTML tags. Each paragraph in the text corresponds to a different article on the page, not to a different paragraph of a single article. Therefore, this function removes the HTML tags and keeps only the paragraphs that contain "China" or "Chinese".

In [12]:
# Strip the HTML from one page of OCR text and keep only China-related paragraphs
def clean_html_tags(text):
    """Return the text of every <p> element in ``text`` that mentions China.

    The markup is parsed with BeautifulSoup's ``html.parser``. A paragraph is
    kept when its text contains the whole word "china" or "chinese"
    (case-insensitive). Kept paragraphs are joined with single spaces; the
    result is an empty string when no paragraph matches.

    Side effects: increments the module-level counters ``paragraphs_total``
    (once per <p> element seen) and ``paragraphs_kept`` (once per paragraph
    kept). Both must already be defined when this function is called.
    """
    global paragraphs_kept
    global paragraphs_total
    kept = []
    for node in BeautifulSoup(text, 'html.parser').find_all('p'):
        paragraphs_total += 1
        body = node.get_text()
        # Skip paragraphs that never reference China
        if re.search(r'\b(?:china|chinese)\b', body, re.IGNORECASE) is None:
            continue
        kept.append(body)
        paragraphs_kept += 1
    return ' '.join(kept)

This section of code reads in the original scraped articles, filters out all publications other than the NCWC bulletin (which has the id `cns`), and filters by date. It then cleans up the OCR text by running the HTML cleaning function defined above. This code assumes that the data has already been downloaded and stored in a CSV file called gathered_sections.csv.

In [13]:
# Read the scraped sections, indexed by their section id
df = pd.read_csv('gathered_sections.csv', index_col='section_id')

# Keep only the "cns" publication. The explicit copy makes df an independent
# frame, so the column assignments below do not raise pandas'
# SettingWithCopyWarning.
df = df[df['publication_id'] == "cns"].copy()
print(f'total cns articles: {len(df.index)}')

# Parse the dates and derive the publication year
df['date'] = pd.to_datetime(df['date'])
df['publication_year'] = df['date'].dt.year

# Limit to articles published between 1920 and 1950 (both years inclusive)
df = df[(df['publication_year'] >= 1920) & (df['publication_year'] <= 1950)].copy()
print(f'total cns articles in year range: {len(df.index)}')

# Counters updated by clean_html_tags; they are reset here so that re-running
# this cell does not accumulate counts from a previous run
paragraphs_kept = 0
paragraphs_total = 0

# Apply the HTML cleaning function
df['ocr_text_cleaned'] = df['ocr_text'].apply(clean_html_tags)

# Drop the original OCR text column
df = df.drop(columns=['ocr_text'])

# Save the cleaned dataframe
df.to_csv('cns_articles_cleaned.csv')
print(f'Total paragraphs: {paragraphs_total} \nParagraphs kept: {paragraphs_kept}')

total cns articles: 19959
total cns articles in year range: 8428
Total paragraphs: 52761 
Paragraphs kept: 9443


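The following dictionary maps terms to the text that replaces them in each article. The terms include common abbreviations of countries and of various Catholic organizations, as well as recurring OCR misspellings. Replacements are applied as plain substring substitutions in dictionary order, so the order of the entries matters. The entries were developed organically, in an iterative process of inspecting the output of the cleaning process.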

In [14]:
# Replace abbreviations, demonyms and recurring OCR misspellings.
#
# preprocess_entity applies these entries to the lower-cased article text one
# after another, in insertion order, with str.replace. Consequences:
#   * ORDER MATTERS: a longer key must come before any shorter key it contains
#     (e.g. "haitians" before "haitian"), otherwise the longer key never matches.
#   * Keys are plain substrings, not whole words. Keys padded with spaces
#     (" un ", " st ", ...) only match when surrounded by spaces.
#   * A value produced by one entry can be rewritten by a later entry; a few
#     "repair" entries below undo such collisions.
replace_terms = {
    " chinee ": " china ",
    # University spellings mapped to "fujen": the longer spellings come first so
    # the generic "catholic u." entry cannot consume their prefix
    "catholic u. of peking": "fujen",
    "catholic u. of pekin ": "fujen ",
    "peking catholic u.": "fujen",
    "peking cath. u.": "fujen",
    "peking u.": "fujen",
    "catholic u ": "fujen ",
    "catholic u.": "fujen",
    "catholic university of pekin ": "fujen ",
    "catholic university of peking": "fujen ",
    "catholic university at peking": "fujen ",
    "news service": "news",
    "national catholic welfare council": "ncwc",
    "national catholic welfare committee": "ncwc",
    "national catholic war committee": "ncwc",
    "national catholic war council": "ncwc",
    "n.c.w.c.": "ncwc",
    "n c w c": "ncwc",
    "n. c. w. c.": "ncwc",
    "n. c. iv. c.": "ncwc",
    "n. c. if. c.": "ncwc",
    'national council of catholic women': "nccw",
    'n.c.c.w.': "nccw",
    # Space-delimited on purpose: a bare "cwc" key would also match inside
    # "ncwc" and turn every "ncwc" into "nccw"
    ' cwc ': " ccw ",
    'c.w.c.': "ccw",
    'c. w. c.': "ccw",
    'c w c': "ccw",
    "u. n.": "unitednations",
    "u.n.": "unitednations",
    " un ": " unitednations ",
    " united nations ": " unitednations ",
    'o.p.': "dominican",
    'o. p.': "dominican",
    'c.s.c.': "holycross",
    'c. s. c.': "holycross",
    'c s c': "holycross",
    " cdl ": " cardinal ",
    " bp ": " bishop ",
    " st. ": " saint ",
    " st ": " saint ",
    " sr ": " sister ",
    " sr. ": " sister ",
    "chinese": "china",
    "japanese": "japan",
    "vietnamese": "vietnam",
    "hong kong": "hongkong",
    "u.s.a.": "usa",
    "u.s.": "usa",
    "u. s.": "usa",
    "united states of america": "usa",
    "united states": "usa",
    "americans": "usa",
    "american": "usa",
    "america": "usa",
    "russians": "russia",
    "russian": "russia",
    "african": "africa",
    "indian": "india",
    "irish": "ireland",
    "belgian": "belgium",
    "canadian": "canada",
    "asians": "asia",
    "asian": "asia",
    "koreans": "korea",
    "korean": "korea",
    "spanish": "spain",
    "germans": "germany",
    "german": "germany",
    # Repair: the "german" entry also matches inside "germany" (original or
    # just produced from "germans"), leaving "germanyy"
    "germanyy": "germany",
    "french": "france",
    "british": "britain",
    "cubans": "cuba",
    "cuban": "cuba",
    "polish": "poland",
    "europeans": "europe",
    "european": "europe",
    "italians": "italy",
    "italian": "italy",
    "catholics ": "catholic ",
    "tibetan": "tibet",
    "mary knoller": "maryknoll",
    "mary knoll": "maryknoll",
    "maryknoller": "maryknoll",
    "mexicans": "mexico",
    "mexican": "mexico",
    "haitians": "haiti",
    "haitian": "haiti",
    " s.j. ": " jesuit ",
    " s j ": " jesuit ",
    " s. j. ": " jesuit ",
    "n.j.": "newjersey",
    "n. j.": "newjersey",
    "n.y.": "newyork",
    "n. y.": "newyork",
    " ny ": " newyork ",
    "c.p.": "passionist",
    "c. p.": "passionist",
    "s.v.d.": "svd",
    "s. v. d.": "svd",
    "k of c ": "kofc ",
    "greeks": "greece",
    "greek": "greece",
    "washington d c": "washingtondc",
    "washington d. c.": "washingtondc",
    "washington d.c.": "washingtondc",
    " rev ": " priest ",
    " fr ": " priest ",
    " rev. ": " priest ",
    " fr. ": " priest ",
    # Recurring OCR misreadings
    " hy ": " by ",
    "yanllng": "yuanling",
    "passlonlst": "passionist",
    "passlonist": "passionist",
    "passionlst": "passionist",
    "jugoslavia": "yugoslavia",
    # Repair: the "indian" entry above turns "indianapolis" into "indiaapolis"
    "indiaapolis": "indianapolis",
    "mlssioner": "missioner",
    "chlna": "china",
    "chima": "china",
    " xl ": " xi ",
    "tslngtao": "tsingtao",
}

The set below lists words to be removed from each article (in addition to a standard list of stop words). I developed this list using the same iterative process described above.

In [15]:
# Tokens dropped from every article after stop-word removal (see
# preprocess_entity). Membership is tested per whitespace-separated token,
# so every entry is a single lower-case word.
words_to_remove = {
    # Cardinal numbers
    'million', 'thousand', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
    'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen',
    'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety', 'hundred',
    # Calendar and clock units
    'year', 'years', 'month', 'months', 'week', 'weeks', 'day', 'days',
    'yearly', 'monthly', 'weekly', 'daily',
    'today', 'yesterday', 'tomorrow', 'night', 'nights',
    'hour', 'hours', 'minute', 'minutes',
    # Ordinals ('eigth' is kept next to the correct 'eighth' so the misspelled
    # form is still removed if it occurs in the text)
    'first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eighth', 'eigth', 'ninth', 'tenth',
    'eleventh', 'fifteenth',
    # Month names and abbreviations
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december',
    'jan', 'feb', 'mar', 'apr', 'aug', 'sept', 'oct', 'nov', 'dec',
    'sursum', 'corda', 'halfh', 'ago', 'annual',
    # Weekdays and seasons
    'sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday',
    'summer', 'winter', 'fall', 'spring',
    'quarter', 'half', 'halves', 'quarters',
    'pm', 'am', 'cst', 'est',
    # Short fragments, presumably OCR noise collected while inspecting output
    "tha", "th", "la", "ha", "bt", "hi", "ho", "hie", "aa", "ths", "bo", "lo", "le", "aad", "te", "ara", "ba", "tba",
    'tte', 'tbs', 'tbo', 'mm', 'ad', 'oa', 'ii', 'ia', 'af', 'oo', 'ar', 'ta', 'al', 'si', 'de', "tht", "tto", "bs", "tb", "ft", "iba",
    "hla", "wwa", "haa", "ao", "ib", "da", "ef", "tor", "aro", "baa", "oro", "froa", "toa", "ml", "lha", "mi", "ti", "ay", "wat",
    "fea", "fro", "waa", "wa", "thl", "thalr", "lon", "ae", "bov", "ot", "wae", "nr", "ll", "na", "lev", "ua", "ro", "biabar", "oei",
    "thc", "oara", "hy", "iho", "oho", "bor", "tollo", "boro", "tta", "tod", "mu", "hor", "apo", "lata", "hwa",
    "tbe", "til", "ssi", "tnat", "flret", "sha", "hlo", "rov", "jt", "sal", "bf", "lia", "ono", "tj", "pari", "ei", "flrat", "dm", "rc",
    "wr", "har", "pl", "aald", "flr", "ke", "cu", "ub", "pln", "ser", "thia", "val", "burl", "ji", "nas", "les", "oc", "der", "ul", "ec", "fh",
    "vl", "sn", "mw", "tut", "ru", "ber", "fhe", "pt", "ov", "ssr", "ret", "ds", "iw", "mac", "kt", "ag", "els", "je", "hoy", "loa", "hls",
    "ami", "ow", "ko", "jm", "wno", "mn", "tw", "nf", "nh", "inf", "ior", "ew", "fer", "oot", "hj", "bn", "wn", "patna", "jh", "ij", "lh",
    "nal", "lira", "ste", "prl", "und", "pal", "js", "ge", "pj", "mae", "cd", "oen", "tim", "ity", "tp", "ond", "eald", "ilf", "ito", "uc",
    "pp", "nm", "taka", "aay", "ort", "iy", "lng", "iff", "ga", "wt", "sv", "croe", "laat", "iha", "thr", "tro", "sy", "sac", "bv", "rea",
    "ame", "jp", "uu", "oon", "ud", "yoar", "td", "fj", "nev", "tse", "hn", "fs", "esse", "xt", "mj", "fre", "fha", "wor", "jn",
}


The following code defines the method *preprocess_entity*. This method cleans the text of one given article. 

In [16]:
# Load the SpaCy model
# NOTE: the "en_core_web_sm" pipeline has to be installed in the environment
# before this call can succeed.
nlp = spacy.load("en_core_web_sm")
# Copy the pipeline's default stop-word list into a set; preprocess_entity
# uses it to filter words again after lemmatization.
stop_words = set(nlp.Defaults.stop_words)

# Preprocess a single article
def preprocess_entity(article):
    """Normalize one article's text into a space-separated string of lemmas.

    Steps, in order:
      1. collapse runs of whitespace and squeeze dotted upper-case
         abbreviations (e.g. "N. C. W. C." -> "NCWC"),
      2. lower-case the text and expand contractions,
      3. apply the ``replace_terms`` substring replacements in dictionary order,
      4. replace every character other than ASCII letters and "_" with a space,
      5. lemmatize with SpaCy, dropping stop words and one-letter words,
      6. drop the tokens listed in ``words_to_remove``.

    Args:
        article: raw article text.

    Returns:
        The cleaned text, or ``None`` when the article should be discarded:
        the input is not a string (e.g. a missing value) or is blank, no words
        are left after step 5, or step 6 removed more than half of the words
        that were left after step 5.
    """
    # Missing values (None / NaN) and blank text cannot yield any words; bail
    # out before running the expensive SpaCy pipeline.
    if not isinstance(article, str) or not article.strip():
        return None

    article = re.sub(r'\s{2,}', ' ', article)
    # Squeeze dotted abbreviations of two or more capital letters into one
    # token followed by a space; this must happen before lower-casing.
    article = re.sub(r'([A-Z]\. ?)+([A-Z])\. ?', lambda match: match.group().replace(".", "").replace(" ", "") + " ", article)
    article = article.lower().strip()

    article = contractions.fix(article)

    # Pad with spaces so that the space-delimited keys in replace_terms
    # (" st. ", " un ", ...) can also match the first and last word.
    article = f' {article} '
    for key, value in replace_terms.items():
        article = article.replace(key, value)

    # Keep only ASCII letters and underscores, then tidy the whitespace
    article = re.sub(r'[^A-Za-z\_]', ' ', article)
    article = re.sub(r'\s{2,}', ' ', article).strip()

    # SpaCy tokenization and lemmatization, skipping stop-word tokens
    doc = nlp(article)
    lemmas = [token.lemma_ for token in doc if not token.is_stop]

    # A lemma can itself be a stop word even when the surface token was not,
    # so filter again; also drop one-letter words. Re-splitting the joined
    # lemmas discards whitespace-only lemmas.
    words = [word for word in ' '.join(lemmas).split()
             if word not in stop_words and len(word) > 1]
    original_word_count = len(words)

    # Remove redundant words
    # NOTE(review): the author listed "hcvc, ncnc, iccw, nowc, usgr, nctc" at
    # this point - presumably OCR variants still to be handled; confirm.
    kept_words = [word for word in words if word not in words_to_remove]
    remaining_word_count = len(kept_words)

    # Get rid of the article if too many words were removed
    if (original_word_count == 0) or (100 * remaining_word_count / original_word_count) < 50:
        return None

    # No periods survive the character filter above, so no further
    # abbreviation clean-up is needed here.
    return ' '.join(kept_words)

def clean_article(article):
    """Clean one article by delegating to ``preprocess_entity``.

    Returns exactly what ``preprocess_entity`` returns for ``article``.
    """
    return preprocess_entity(article)

The preprocessing method is run on each article in the corpus.

In [17]:
# Use ThreadPoolExecutor to preprocess every article (a single pass:
# preprocess_entity is deterministic, so repeating the pass on the surviving
# rows would only recompute the same values)
with ThreadPoolExecutor() as executor:
    preprocessed_articles = list(executor.map(preprocess_entity, df['ocr_text_cleaned']))
df['preprocessed_article'] = preprocessed_articles

print(f'original number of articles: {len(df.index)}')
# preprocess_entity returns None for articles to discard; drop those rows.
# The copy keeps df an independent frame, so later column assignments do not
# raise pandas' SettingWithCopyWarning.
df = df.dropna(subset=['preprocessed_article'], axis=0).copy()
print(f'final number of articles after word replacement: {len(df.index)}')

A frequency dictionary is built for occurrences of words throughout the entire corpus. Words with low frequency are deleted. Articles that had too many words deleted are removed from the corpus. Then, the final data frame is saved.

In [None]:
# Compute how often each token occurs across the whole corpus
FREQ_MIN = 10
frequency = defaultdict(int)
for text in df['preprocessed_article']:
    # split() with no argument ignores repeated, leading and trailing
    # whitespace (the same tokenization delete_low_freq_words uses), so an
    # empty-string token can never be counted
    for token in text.split():
        frequency[token] += 1

# Tokens that occur FREQ_MIN times or fewer are marked for deletion
# (the comparison is inclusive)
frequency_filtered = {token: freq for token, freq in frequency.items() if freq <= FREQ_MIN}

# Save the tokens marked for deletion, with their counts, for inspection
df_freq = pd.DataFrame(list(frequency_filtered.items()), columns=['token', 'frequency'])
df_freq.to_csv('words_deleted.csv')

def delete_low_freq_words(article):
    """Remove the low-frequency tokens from one preprocessed article.

    Tokens are the whitespace-separated words of ``article``; a token is
    removed when it is a key of the module-level ``frequency_filtered``
    mapping.

    Returns:
        The remaining words joined by single spaces, or ``None`` when the
        article has no words or fewer than 50% of its words remain.
    """
    words = article.split()
    if not words:
        return None

    survivors = [word for word in words if word not in frequency_filtered]

    # Get rid of the article if too many words were removed
    if 100 * len(survivors) / len(words) < 50:
        return None

    return ' '.join(survivors)

# Rebind preprocessed_articles to an empty list; the per-article results of
# the preprocessing step are no longer referenced through this name.
preprocessed_articles = []

# Strip the low-frequency words from every article
with ThreadPoolExecutor() as pool:
    cleaned_articles = list(pool.map(delete_low_freq_words, df['preprocessed_article']))

# delete_low_freq_words returns None for articles to discard; drop those rows
df['preprocessed_article'] = cleaned_articles
df = df.dropna(axis=0, subset=['preprocessed_article'])
print(f'final number of articles after word frequency test: {len(df.index)}')

# Save the final dataframe
df.to_csv('cns_preprocessed_articles.csv')