In [3]:
import pandas as pd
import nltk
from cleantext.clean import clean
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from tqdm import tqdm
import swifter
import ast


Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.
  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Register tqdm's `progress_apply` on pandas objects; the sample-cleaning cells call it.
tqdm.pandas()

In [5]:
# Load the sample dataset; the following cells repeatedly overwrite its `content` column.
df = pd.read_csv('data/news_sample.csv')

In [None]:
def clean_text_library(text):
    """Normalize one document with `cleantext.clean`.

    The input is coerced with `str()` and then cleaned with the
    `fix_unicode`, `to_ascii`, `lower` and `no_line_breaks` options switched
    on, using the English language setting.

    NOTE(review): a later cell defines another function with this same name;
    once that cell has run, it replaces this one in the kernel.
    """
    normalization_options = dict(
        fix_unicode=True,
        to_ascii=True,
        lower=True,
        no_line_breaks=True,
        lang="en",
    )
    as_string = str(text)  # Convert to string in case of non-string input
    return clean(as_string, **normalization_options)
# Normalize every article. Missing values are replaced with '' first, because
# `astype(str)` alone would turn NaN into the literal text "nan", which would
# then be cleaned, tokenized and kept as if it were article content.
df['content'] = df['content'].fillna('').astype(str).progress_apply(clean_text_library)


# Save cleaned data
output_path = 'data/news_sample_1.csv'
df.to_csv(output_path, index=False)
print(f"Cleaned data saved to {output_path}")


In [None]:
# Date-like and ordinal-number regexes. They are applied in the order written,
# and every match is replaced by the mapped value (the empty string for all of them).
patterns = {
    r"[\w]+ [\d]+, [\d]+": "",                    # word, number, comma, number
    r"[\d]+[\w]+ [\w]+ [\d]+": "",                # number+suffix, word, number
    r"[\d]+\/?-?\.?[\d]+\/?-?\.?[\d]+": "",       # digit groups optionally split by / - .
    r"[\w]+ \d\d?[\w]?[\w]?,? [\d]{2,4}": "",     # word, 1-2 digit day (+suffix), 2-4 digit number
    r"([\d]{1,2}[\w]*) ([\w]*),? ([\d]{2,4})": "",# 1-2 digit day, word, 2-4 digit number
    r"\b(\d+(st|nd|rd|th|s))\b": ""                # ordinals / plurals such as 3rd, 1990s
}

def clean_dates(df):
    """Apply every regex in `patterns` to the `content` column of `df`.

    The column is rewritten on the frame that was passed in, and that same
    frame object is returned.
    """
    content = df['content']
    for regex, substitute in patterns.items():
        content = content.str.replace(regex, substitute, regex=True)
    df['content'] = content
    return df

# Strip date-like substrings from `content`, then write this stage to its own CSV.
df = clean_dates(df)
df.to_csv('data/news_sample_2.csv', index=False)

In [None]:
def clean_text_library(text):
    """Run the second `cleantext.clean` pass over one document.

    Enables the `no_urls`, `no_emails`, `no_phone_numbers`, `no_numbers`,
    `no_digits`, `no_currency_symbols` and `no_punct` options, keeps the
    existing casing (`lower=False`), and passes the replacement strings
    listed below.

    NOTE(review): this reuses the name of the function defined in an earlier
    cell and therefore replaces it in the kernel once this cell runs.
    """
    removal_flags = dict(
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=True,
        no_digits=True,
        no_currency_symbols=True,
        no_punct=True,
    )
    replacement_tokens = dict(
        replace_with_punct="",
        replace_with_url="URL",
        replace_with_email="EMAIL",
        replace_with_phone_number="",
        replace_with_number="NUM",
        replace_with_digit="0",
        replace_with_currency_symbol="",
    )
    return clean(text, lower=False, lang="en", **removal_flags, **replacement_tokens)
# Run the second cleaning pass over the content column (with a progress bar),
# then write this stage to its own CSV.
df['content'] = df['content'].progress_apply(clean_text_library)
df.to_csv('data/news_sample_3.csv', index=False)


In [None]:
# Tokenize the content column with NLTK's `word_tokenize`; each cell becomes a list of tokens.
df['content'] = df['content'].apply(word_tokenize)
df.to_csv('data/news_sample_4.csv', index=False)

In [None]:
# English stop words from NLTK, held in a set for membership tests.
stop_words = set(stopwords.words('english'))

def rs(list):
    """Return the tokens of `list` whose lowercase form is not a stop word.

    Token order and original casing are preserved.
    Note: the parameter name shadows the built-in `list` inside this function.
    """
    kept = []
    for token in list:
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

# Drop stop words from every token list, then write this stage to its own CSV.
df['content'] = df['content'].apply(rs)
df.to_csv('data/news_sample_5.csv', index=False)

In [None]:
# One shared Porter stemmer instance, reused for every token.
stemmer = PorterStemmer()

def setmming(list):
    """Return a new list holding the Porter stem of each token in `list`.

    Note: the parameter name shadows the built-in `list` inside this function.
    """
    stems = []
    for token in list:
        stems.append(stemmer.stem(token))
    return stems

# Stem every token list, then write this stage to its own CSV.
df['content'] = df['content'].apply(setmming)
df.to_csv('data/news_sample_6.csv', index=False)

In [51]:
# Regex matching one word: a run of word characters between word boundaries
word_pattern = re.compile(r"\b\w+\b")


def _words_in(frame):
    """Join the `content` column (cast to str) with spaces and return every word match."""
    return word_pattern.findall(" ".join(frame["content"].astype(str)))


# Load the two stage outputs being compared:
# stage 5 is written after stop-word removal, stage 6 after stemming
df_5 = pd.read_csv('data/news_sample_5.csv')
df_6 = pd.read_csv('data/news_sample_6.csv')

# All words found in each stage's content column
df_5_content = _words_in(df_5)
df_6_content = _words_in(df_6)

# Vocabulary size (number of distinct words) of each stage
len_df_5_content = len(set(df_5_content))
len_df_6_content = len(set(df_6_content))

# Report both vocabulary sizes, then the stage-6 / stage-5 ratio
print(f"Number of unique words in the df_5_content text: {len_df_5_content}")
print(f"Number of unique words in the df_6_content text: {len_df_6_content}")

print(len_df_6_content/len_df_5_content)

Number of unique words in the df_5_content text: 16371
Number of unique words in the df_6_content text: 10958
0.6693543460998106


In [6]:
# Load the full dataset.
# NOTE(review): the saved output of this cell echoes the read_csv line, which is how
# a Python warning is rendered — possibly a pandas mixed-dtype warning; confirm and,
# if so, consider passing explicit dtypes.
df_995 = pd.read_csv('data/995,000_rows.csv')

  df_995 = pd.read_csv('data/995,000_rows.csv')


In [None]:
def clean_space(txt):
    """Normalize one document with `cleantext.clean`.

    The input is coerced with `str()` and cleaned with the `fix_unicode`,
    `to_ascii`, `lower` and `no_line_breaks` options switched on, using the
    English language setting.
    """
    normalization_options = {
        "fix_unicode": True,
        "to_ascii": True,
        "lower": True,
        "no_line_breaks": True,
        "lang": "en",
    }
    return clean(str(txt), **normalization_options)

# Clean the text: remove newlines, tabs and extra spaces.
# Missing values are replaced with '' first, because `astype(str)` alone would
# turn NaN into the literal text "nan" and carry it through the whole pipeline.
df_995['content'] = df_995['content'].fillna('').astype(str).swifter.progress_bar(True).apply(clean_space)

# Save cleaned data
output_path = 'data/995,000_row_cleaned_spaces.csv'
df_995.to_csv(output_path, index=False)
print(f"Cleaned the newline tabs spaces; data saved to {output_path}")

In [None]:
# Reload the output of the whitespace-cleaning stage from disk.
df_995 = pd.read_csv('data/995,000_row_cleaned_spaces.csv')

In [None]:
# Date-like and ordinal-number regexes. They are applied in the order written,
# and every match is replaced by the mapped value (the empty string for all of them).
patterns = {
    r"[\w]+ [\d]+, [\d]+": "",                     # DATE method 1
    r"[\d]+[\w]+ [\w]+ [\d]+": "",                 # DATE method 2
    r"[\d]+\/?-?\.?[\d]+\/?-?\.?[\d]+": "",        # DATE method 3 (fixed brackets)
    r"[\w]+ \d\d?[\w]?[\w]?,? [\d]{2,4}": "",      # DATE method 4
    r"([\d]{1,2}[\w]*) ([\w]*),? ([\d]{2,4})": "", # DATE method 5 (fixed range)
    r"\b(\d+(st|nd|rd|th|s))\b": ""                # NUM
}
# Compile once up front so the per-row function does no pattern parsing.
compiled_patterns = [(re.compile(pattern), replacement) for pattern, replacement in patterns.items()]

def remove_dates(txt):
    """Return `txt` with every match of the compiled date/ordinal patterns replaced.

    The patterns are applied one after another, in `compiled_patterns` order,
    so a later pattern sees the text left behind by the earlier ones.

    Args:
        txt: the text to scrub (a `str`).

    Returns:
        The scrubbed text. Surrounding whitespace is left as-is, so removing a
        match in the middle of a sentence leaves two adjacent spaces.
    """
    for pattern, replacement in compiled_patterns:
        # Call the compiled pattern directly rather than routing it back through re.sub().
        txt = pattern.sub(replacement, txt)
    return txt

# Remove dates.
# Rows whose content was cleaned down to an empty string are read back from the
# CSV as NaN; replace those with '' first, otherwise `astype(str)` would turn
# them into the literal text "nan".
df_995['content'] = df_995['content'].fillna('').astype(str).swifter.progress_bar(True).apply(remove_dates)

# Save cleaned data
output_path = 'data/995,000_row_remove_dates.csv'
df_995.to_csv(output_path, index=False)
print(f"Remove dates; data saved to {output_path}")


Pandas Apply: 100%|██████████| 995000/995000 [04:58<00:00, 3337.23it/s]


Remove dates; data saved to data/995,000_row_remove_dates.csv


In [None]:
# Reload the output of the date-removal stage from disk.
df_995 = pd.read_csv('data/995,000_row_remove_dates.csv') 

In [None]:
def remove_urls(txt):
    """Run the second `cleantext.clean` pass over one document.

    Despite the name, this enables more than URL handling: the `no_urls`,
    `no_emails`, `no_phone_numbers`, `no_numbers`, `no_digits`,
    `no_currency_symbols` and `no_punct` options are all switched on, the
    existing casing is kept (`lower=False`), and the replacement strings
    listed below are passed through.
    """
    removal_flags = dict(
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=True,
        no_digits=True,
        no_currency_symbols=True,
        no_punct=True,
    )
    replacement_tokens = dict(
        replace_with_punct="",
        replace_with_url="URL",
        replace_with_email="EMAIL",
        replace_with_phone_number="",
        replace_with_number="NUM",
        replace_with_digit="0",
        replace_with_currency_symbol="",
    )
    return clean(txt, lower=False, lang="en", **removal_flags, **replacement_tokens)

# Remove urls, emails, phone numbers, numbers, digits, currency symbols, punctuations.
# Rows whose content is empty are read back from the CSV as NaN; replace those
# with '' first, otherwise `astype(str)` would turn them into the literal text "nan".
df_995['content'] = df_995['content'].fillna('').astype(str).swifter.progress_bar(True).apply(remove_urls)

# Save cleaned data, and report success only after the file has been written
output_path = 'data/995,000_row_removed_urls.csv'
df_995.to_csv(output_path, index=False)
print(f"Removed urls, emails, phone numbers, numbers, digits, currency symbols, punctuations; data saved to {output_path}")

In [None]:
# Reload the output of the url/number/punctuation-removal stage from disk.
df_995 = pd.read_csv('data/995,000_row_removed_urls.csv')

In [None]:

# Tokenize the text.
# Rows whose content is empty are read back from the CSV as NaN; replace those
# with '' first, otherwise `astype(str)` would turn them into the text "nan"
# and every such article would be tokenized to ['nan'] instead of [].
df_995['content'] = df_995['content'].fillna('').astype(str).swifter.progress_bar(True).apply(word_tokenize)

# Save cleaned data
output_path = 'data/995,000_row_tokenized.csv'
df_995.to_csv(output_path, index=False)
print(f"Tokenized; data saved to {output_path}")

In [None]:
# Reload the tokenized stage from disk; each token list comes back as its string representation.
df_995 = pd.read_csv('data/995,000_row_tokenized.csv')

In [None]:

# English stop words from NLTK, held in a set for membership tests.
stop_words = set(stopwords.words('english'))

def remove_stop_words(lst):
    """Parse a stringified token list and drop its stop words.

    `lst` is the text form of a Python list (as stored in the CSV); it is
    parsed with `ast.literal_eval`. Tokens whose lowercase form is in
    `stop_words` are removed; order and original casing are preserved.
    """
    kept = []
    for token in ast.literal_eval(lst):
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

# Remove stop words from every (stringified) token list, with a progress bar
df_995['content'] = df_995['content'].swifter.progress_bar(True).apply(remove_stop_words)

# Save this stage to its own CSV and report the path
output_path = 'data/995,000_row_removed_stop_words.csv'
df_995.to_csv(output_path, index=False)
print(f"Row_removed_stop_words; data saved to {output_path}")


In [None]:
# Reload the stop-word-filtered stage from disk; token lists come back as strings.
df_995 = pd.read_csv('data/995,000_row_removed_stop_words.csv')

In [None]:
# One shared Porter stemmer instance, reused for every token.
stemmer = PorterStemmer()

def stemming(lst):
    """Parse a stringified token list and return the Porter stem of each token.

    `lst` is the text form of a Python list (as stored in the CSV); it is
    parsed with `ast.literal_eval` before stemming. Order is preserved.
    """
    stems = []
    for token in ast.literal_eval(lst):
        stems.append(stemmer.stem(token))
    return stems

# Stem every (stringified) token list, with a progress bar
df_995['content'] = df_995['content'].swifter.progress_bar(True).apply(stemming)

# Save the final stage to its own CSV and report the path
output_path = 'data/995,000_row_cleaned.csv'
df_995.to_csv(output_path, index=False)
print(f"Fully cleaned; data saved to {output_path}")

In [4]:
# Reload the fully cleaned dataset from disk.
# NOTE(review): the saved output of this cell echoes the read_csv line, which is how
# a Python warning is rendered — possibly a pandas mixed-dtype warning; confirm.
df_995 = pd.read_csv('data/995,000_row_cleaned.csv')

  df_995 = pd.read_csv('data/995,000_row_cleaned.csv')


In [6]:
# List the distinct values found in the `type` column.
print(df_995['type'].unique())

['political' 'fake' 'satire' 'reliable' 'conspiracy' 'unreliable' 'bias'
 'rumor' 'unknown' nan 'clickbait' 'hate' 'junksci'
 '2018-02-10 13:43:39.521661']


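Grouping of the `type` values listed above:

Fake = fake, satire, conspiracy, unreliable, bias, rumor, hate, junksci
Reliable = reliable, clickbait, political
Useless = unknown, nan, 2018-02-10 13:43:39.521661 (a timestamp that appears as a `type` value in the output above)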