# Cleaning the Fake News Corpus

In [34]:
# Our imports
import pandas as pd
import nltk
from cleantext.clean import clean
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from tqdm import tqdm
import swifter
import ast

# Register tqdm's pandas integration (adds progress-aware methods such as `progress_apply`).
# NOTE(review): the cells visible here drive their progress bars through swifter's
# `.progress_bar(True)` and never call `progress_apply`, so this call may be unused -- confirm.
tqdm.pandas()

In [None]:
# Load the raw corpus into `df_995`; the cleaning cells below read and overwrite its 'content' column.
df_995 = pd.read_csv('data/995,000_rows.csv')

In [36]:
def clean_space(txt):
    """Normalise a single document with the clean-text library.

    The input is coerced to ``str`` and handed to ``clean`` with unicode fixing,
    ASCII transliteration, lower-casing and line-break removal switched on.
    ``lang="en"`` is forwarded as clean-text's language setting.

    NOTE(review): an earlier comment claimed this also removes non-English text;
    nothing in this call filters by language -- confirm against the clean-text docs.

    Returns:
        Whatever ``clean`` returns for the given text.
    """
    text = str(txt)
    options = dict(
        fix_unicode=True,
        to_ascii=True,
        lower=True,
        no_line_breaks=True,
        lang="en",
    )
    return clean(text, **options)
# Run clean_space over every article.
# Missing articles are filled with '' first: astype(str) alone would turn NaN into the
# literal text "nan", which would then be treated as real article content.
df_995['content'] = df_995['content'].fillna('').astype(str).swifter.progress_bar(True).apply(clean_space)

# Checkpoint this stage to its own CSV so later stages can start from it
output_path = 'data/995,000_row_cleaned_spaces.csv'
df_995.to_csv(output_path, index=False)
print(f"Cleaned the newline tabs spaces; data saved to {output_path}")

Pandas Apply: 100%|██████████| 250/250 [00:00<00:00, 297.40it/s]

Cleaned the newline tabs spaces; data saved to data/995,000_row_cleaned_spaces.csv





In [37]:
# Reload the checkpoint written by the clean_space stage (replaces the in-memory `df_995`).
df_995 = pd.read_csv('data/995,000_row_cleaned_spaces.csv')

In [38]:
# Regexes for date-like expressions and ordinal/plural number suffixes.
# Keys are the pattern sources; each value is the replacement text (the empty string for all).
# NOTE(review): several patterns are broader than the example in their comment -- e.g. every
# separator in "method 3" is optional, so any run of three or more digits matches it too.
patterns = {
    r"[\w]+ [\d]+, [\d]+": "",                     # method 1, e.g. "January 15, 2023"
    r"[\d]+[\w]+ [\w]+ [\d]+": "",                 # method 2, e.g. "15th January 2023"
    r"[\d]+\/?-?\.?[\d]+\/?-?\.?[\d]+": "",        # method 3, e.g. "01/15/2023", "01-15-2023", "01.15.2023"
    r"[\w]+ \d\d?[\w]?[\w]?,? [\d]{2,4}": "",      # method 4, e.g. "January 15th, 2023", "Jan 1, 2023"
    r"([\d]{1,2}[\w]*) ([\w]*),? ([\d]{2,4})": "", # method 5, e.g. "15 January, 2023" (day, month, year groups)
    r"\b(\d+(st|nd|rd|th|s))\b": ""                # digits followed by st/nd/rd/th/s, e.g. "1st", "4th"
}

# Compile once up front: a list of (compiled_regex, replacement_string) tuples in dict order
compiled_patterns = list(zip(map(re.compile, patterns), patterns.values()))


def remove_dates(txt):
    """Apply every entry of ``compiled_patterns`` to ``txt``, one after another.

    Each regex is substituted with its paired replacement (currently always "").
    Matched spans are deleted without touching surrounding whitespace, so the
    result can contain doubled spaces.

    Args:
        txt: The text to process.

    Returns:
        The text after all substitutions have been applied in order.
    """
    cleaned = txt
    for regex, substitute in compiled_patterns:
        cleaned = regex.sub(substitute, cleaned)
    return cleaned

# Apply the date removal function to each row in the content column.
# Missing articles are filled with '' first: astype(str) alone would turn NaN into the
# literal text "nan", which would then be treated as real article content.
df_995['content'] = df_995['content'].fillna('').astype(str).swifter.progress_bar(True).apply(remove_dates)

# Checkpoint this stage to its own CSV so later stages can start from it
output_path = 'data/995,000_row_remove_dates.csv'
df_995.to_csv(output_path, index=False)
print(f"Remove dates; data saved to {output_path}")


Pandas Apply: 100%|██████████| 250/250 [00:00<00:00, 1136.16it/s]

Remove dates; data saved to data/995,000_row_remove_dates.csv





In [39]:
# Reload the checkpoint written by the date-removal stage (replaces the in-memory `df_995`).
df_995 = pd.read_csv('data/995,000_row_remove_dates.csv') 

In [40]:
def remove_urls(txt):
    """Run clean-text over one document to replace or strip noisy elements.

    Despite its name this handles more than URLs. Per the options below:
    URLs become "URL", e-mail addresses become "EMAIL", numbers become "NUM",
    digits become "0", and phone numbers, currency symbols and punctuation
    are replaced with the empty string. Case is left as-is (``lower=False``).

    NOTE(review): how clean-text orders these substitutions (e.g. numbers
    versus digits) is decided inside the library -- confirm in its docs.

    Args:
        txt: The text to clean; passed to ``clean`` unchanged.

    Returns:
        Whatever ``clean`` returns for the given text.
    """
    cleaning_options = {
        # which kinds of element to act on
        "no_urls": True,
        "no_emails": True,
        "no_phone_numbers": True,
        "no_numbers": True,
        "no_digits": True,
        "no_currency_symbols": True,
        "lower": False,
        "no_punct": True,
        # what each kind is replaced with
        "replace_with_punct": "",
        "replace_with_url": "URL",
        "replace_with_email": "EMAIL",
        "replace_with_phone_number": "",
        "replace_with_number": "NUM",
        "replace_with_digit": "0",
        "replace_with_currency_symbol": "",
        # language setting forwarded to clean-text
        "lang": "en",
    }
    return clean(txt, **cleaning_options)

# Apply remove_urls to every document (swifter drives the apply and shows a progress bar).
# Missing articles are filled with '' first: astype(str) alone would turn NaN into the
# literal text "nan", which would then be treated as real article content.
df_995['content'] = df_995['content'].fillna('').astype(str).swifter.progress_bar(True).apply(remove_urls)

# Save the processed data to a new CSV file, and only report success once the write has finished
output_path = 'data/995,000_row_removed_urls.csv'
df_995.to_csv(output_path, index=False)
print(f"Removed urls, emails, phone numbers, numbers, digits, currency symbols, punctuations; data saved to {output_path}")

Pandas Apply: 100%|██████████| 250/250 [00:00<00:00, 273.56it/s]

Removed urls, emails, phone numbers, numbers, digits, currency symbols, punctuations; data saved to data/995,000_row_removed_urls.csv





In [41]:
# Reload the checkpoint written by the remove_urls stage (replaces the in-memory `df_995`).
df_995 = pd.read_csv('data/995,000_row_removed_urls.csv')

In [42]:
# Break each article into a list of word tokens with NLTK's word_tokenize.
# - Missing articles are filled with '' first: astype(str) alone would turn NaN into the
#   literal text "nan", which would then come out of the tokenizer as a token
# - swifter drives the apply and shows a progress bar
# NOTE(review): word_tokenize needs NLTK tokenizer data to be installed; no nltk.download call
# is visible in this notebook -- confirm the environment provides it.
df_995['content'] = df_995['content'].fillna('').astype(str).swifter.progress_bar(True).apply(word_tokenize)

# Save the tokenized data to a CSV file (the token lists are written as their string form)
output_path = 'data/995,000_row_tokenized.csv'
df_995.to_csv(output_path, index=False)
print(f"Tokenized; data saved to {output_path}")

Pandas Apply: 100%|██████████| 250/250 [00:00<00:00, 652.67it/s]


Tokenized; data saved to data/995,000_row_tokenized.csv


In [43]:
# Reload the tokenized checkpoint (replaces the in-memory `df_995`).
# After the CSV round trip each 'content' value is the string form of a token list,
# which is why the next step parses it with ast.literal_eval.
df_995 = pd.read_csv('data/995,000_row_tokenized.csv')

In [44]:
# Build the English stop-word list from NLTK's stopwords corpus once, as a set,
# so the membership test in remove_stop_words is a fast lookup.
stop_words = set(stopwords.words('english'))

# Function to filter out stop words from a list of tokens
def remove_stop_words(lst):
    """Return the tokens of one article with stop words removed.

    A token is dropped when its lower-cased form is in the module-level
    ``stop_words`` set; kept tokens retain their original casing and order.

    Args:
        lst: Either a list of tokens or the string form of such a list
            (which is what the 'content' column holds after a CSV reload).

    Returns:
        A new list containing only the non-stop-word tokens.

    Raises:
        ValueError, SyntaxError: if a string argument is not a valid Python literal.
    """
    # Parse only when handed text. A real list (e.g. when this cell is re-run on data that is
    # already in memory) is used directly instead of making ast.literal_eval raise.
    tokens = ast.literal_eval(lst) if isinstance(lst, str) else lst
    return [word for word in tokens if word.lower() not in stop_words]

# Remove stop words from each article: every 'content' value is passed to remove_stop_words
# (swifter drives the apply and shows a progress bar) and replaced by the filtered token list.
df_995['content'] = df_995['content'].swifter.progress_bar(True).apply(remove_stop_words)

# Checkpoint this stage to its own CSV; the token lists are written as their string form
output_path = 'data/995,000_row_removed_stop_words.csv'
df_995.to_csv(output_path, index=False)
print(f"Row_removed_stop_words; data saved to {output_path}")


Pandas Apply: 100%|██████████| 250/250 [00:00<00:00, 737.54it/s]

Row_removed_stop_words; data saved to data/995,000_row_removed_stop_words.csv





In [45]:
# Reload the stop-word-filtered checkpoint (replaces the in-memory `df_995`).
# After the CSV round trip each 'content' value is again the string form of a token list.
df_995 = pd.read_csv('data/995,000_row_removed_stop_words.csv')

In [None]:
# Create one shared Porter stemmer instance; stemming() below calls its .stem() on every token.
stemmer = PorterStemmer()
def stemming(lst):
    """Stem every token of one article with the module-level ``stemmer``.

    Args:
        lst: Either a list of tokens or the string form of such a list
            (which is what the 'content' column holds after a CSV reload).

    Returns:
        A new list with ``stemmer.stem(word)`` applied to each token, in order.

    Raises:
        ValueError, SyntaxError: if a string argument is not a valid Python literal.
    """
    # Parse only when handed text. A real list (e.g. when this cell is re-run on data that is
    # already in memory) is used directly instead of making ast.literal_eval raise.
    tokens = ast.literal_eval(lst) if isinstance(lst, str) else lst
    return [stemmer.stem(word) for word in tokens]

# Apply stemming to each article: every 'content' value is passed to stemming()
# (swifter drives the apply and shows a progress bar) and replaced by the stemmed token list.
df_995['content'] = df_995['content'].swifter.progress_bar(True).apply(stemming)

# Save the final stage of the pipeline to a CSV file
# NOTE(review): the output recorded under this cell names 'data/995,000_row_cleaned1.csv', which
# does not match the path below -- the cell was probably edited after its last run; re-run to refresh.
output_path = 'data/995,000_row_cleaned.csv'
df_995.to_csv(output_path, index=False)
print(f"Fully cleaned; data saved to {output_path}")

Pandas Apply: 100%|██████████| 250/250 [00:01<00:00, 185.63it/s]

Fully cleaned; data saved to data/995,000_row_cleaned1.csv



