In [24]:
import pandas as pd
url = 'https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv'
df = pd.read_csv(url)
print(df.head)

<bound method NDFrame.head of      Unnamed: 0     id                domain        type  \
0             0    141               awm.com  unreliable   
1             1    256     beforeitsnews.com        fake   
2             2    700           cnnnext.com  unreliable   
3             3    768               awm.com  unreliable   
4             4    791  bipartisanreport.com   clickbait   
..          ...    ...                   ...         ...   
245         245  39259     beforeitsnews.com        fake   
246         246  39468     beforeitsnews.com        fake   
247         247  39477       www.newsmax.com         NaN   
248         248  39550       www.newsmax.com         NaN   
249         249  39558       www.newsmax.com         NaN   

                                                   url  \
0    http://awm.com/church-congregation-brings-gift...   
1    http://beforeitsnews.com/awakening-start-here/...   
2    http://www.cnnnext.com/video/18526/never-hike-...   
3    http://awm.c

In [None]:
import re
import pandas as pd
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Ensure required NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

def clean_text(text: str) -> str:
    """
    Cleans a text string by replacing dates, emails, URLs, and numbers
    with placeholder tokens.
    """
    # Replace dates in formats like YYYY-MM-DD and DD-MM-YYYY
    text = re.sub(r'\b\d{4}-\d{2}-\d{2}\b', '<DATE>', text)
    text = re.sub(r'\b\d{2}-\d{2}-\d{4}\b', '<DATE>', text)
    # Replace email addresses
    text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w+\b', '<EMAIL>', text)
    # Replace URLs (http/https)
    text = re.sub(r'(https?://\S+)', '<URL>', text)
    # Replace numbers (including decimals)
    text = re.sub(r'\b\d+(\.\d+)?\b', '<NUM>', text)
    
    return text.strip().lower()

def tokenize_text(text: str, stop_words: set) -> list:
    """
    Tokenizes the input text using NLTK and removes stopwords.
    """
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return filtered_tokens

def stem_tokens(tokens: list) -> list:
    """
    Applies Porter stemming to a list of tokens.
    """
    ps = PorterStemmer()
    return [ps.stem(token) for token in tokens]

def main():
    # Read the CSV file into a DataFrame and select the first 300 rows
    df = pd.read_csv("Datasets\\news_sample.csv", encoding="utf-8")
    df_sample = df.head(300)
    
    # Apply cleaning to every cell in the DataFrame
    df_clean = df_sample.applymap(lambda x: clean_text(str(x)))
    
    # Define the English stopwords set
    stop_words = set(stopwords.words('english'))
    
    # Tokenize each cell and remove stopwords, then join tokens back into a string
    df_tokens = df_clean.map(lambda x: " ".join(tokenize_text(x, stop_words)))
    
    # Save the tokenized data to CSV
    df_tokens.to_csv("block_stopwords_news_sample.csv", index=False)
    
    # Count word frequencies in the tokenized DataFrame
    all_tokens = []
    for cell in df_tokens.values.flatten():
        all_tokens.extend(cell.split())
    token_freq = Counter(all_tokens)
    
    # Count word frequencies in the original (uncleaned) sample
    all_original_words = []
    for cell in df_sample.map(lambda x: str(x).strip().lower()).values.flatten():
        all_original_words.extend(cell.split())
    original_freq = Counter(all_original_words)
    
    print(f"Word frequency in tokenized sample:\n{token_freq}\n")
    print(f"Word frequency in original sample:\n{original_freq}\n")
    
    # Apply stemming to each tokenized cell
    df_stem = df_tokens.map(lambda x: " ".join(stem_tokens(x.split())))
    df_stem.to_csv("stemmed_news_sample.csv", index=False)
    
    # Count word frequencies in the stemmed DataFrame
    all_stemmed_tokens = []
    for cell in df_stem.values.flatten():
        all_stemmed_tokens.extend(cell.split())
    stem_freq = Counter(all_stemmed_tokens)
    
    print(f"Word frequency in stemmed sample:\n{stem_freq}")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nikla\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nikla\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  df_clean = df_sample.applymap(lambda x: clean_text(str(x)))
  df_tokens = df_clean.applymap(lambda x: " ".join(tokenize_text(x, stop_words)))
  for cell in df_sample.applymap(lambda x: str(x).strip().lower()).values.flatten():
  df_stem = df_tokens.applymap(lambda x: " ".join(stem_tokens(x.split())))


Word frequency in tokenized sample:

Word frequency in original sample:

Word frequency in stemmed sample:


We're using Pandas and NLTK because Pandas offers a robust data handling and cleaning capabilities, which is essential for large datasets. While Provides a suite of NLP tools (tokenization, stopwords, stemming) that are standard in text preprocessing. These operations help normalize the data for downstream analysis and modeling.

Pandas DataFrame: It’s flexible, easy to filter and analyze, and supports integration with visualization libraries.
Rationale: A DataFrame structure is ideal for handling large datasets and performing exploratory data analysis (EDA).
