In [107]:
import pandas as pd
import nltk
from cleantext.clean import clean
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer



In [108]:
df = pd.read_csv('data/news_sample.csv')

In [109]:
def clean_text_library(text):
    return clean(str(text),  # Convert to string in case of non-string input
        fix_unicode=True,               
        to_ascii=True,                  
        lower=True,                     
        no_line_breaks=True,            
        lang="en"                       
    )
df['content'] = df['content'].astype(str).apply(clean_text_library)

# Save cleaned data
output_path = 'data/news_sample_1.csv'
df.to_csv(output_path, index=False)
print(f"Cleaned data saved to {output_path}")


Cleaned data saved to data/news_sample_1.csv


In [110]:
# Regex patterns 
patterns = {
    r"[\w]+ [\d]+, [\d]+": "",                    # DATE method 1
    r"[\d]+[\w]+ [\w]+ [\d]+": "",                # DATE method 2
    r"[\d]+\/?-?\.?[\d]+\/?-?\.?[\d]+": "",       # DATE method 3 (fixed brackets)
    r"[\w]+ \d\d?[\w]?[\w]?,? [\d]{2,4}": "",     # DATE method 4
    r"([\d]{1,2}[\w]*) ([\w]*),? ([\d]{2,4})": "",# DATE method 5 (fixed range)
    r"\b(\d+(st|nd|rd|th|s))\b": ""                # NUM
}

def clean_dates(df):
    for pattern, replacement in patterns.items():
        df['content'] = df['content'].str.replace(pattern, replacement, regex=True)
    return df

df = clean_dates(df)
df.to_csv('data/news_sample_2.csv', index=False)

In [111]:
def clean_text_library(text):
    return clean(text,
    no_urls=True,                  
    no_emails=True,               
    no_phone_numbers=True,        
    no_numbers=True,               
    no_digits=True,                
    no_currency_symbols=True,      
    no_punct=True,                
    replace_with_punct="",          
    replace_with_url="",
    replace_with_email="",
    replace_with_phone_number="",
    replace_with_number="",
    replace_with_digit="0",
    replace_with_currency_symbol="",
    lang="en"                       
)
# Cleaning the content column of the dataset
df['content'] = df['content'].apply(clean_text_library)
df.to_csv('data/news_sample_3.csv', index=False)



In [112]:
# Cleaning the content column of the dataset
df['content'] = df['content'].apply(word_tokenize)
df.to_csv('data/news_sample_4.csv', index=False)



In [113]:
stop_words = set(stopwords.words('english'))

def rs(list):
    return [word for word in list if word.lower() not in stop_words]

df['content'] = df['content'].apply(rs)
df.to_csv('data/news_sample_5.csv', index=False)


In [114]:
stemmer = PorterStemmer()
def setmming(list):
    return [stemmer.stem(word) for word in list]

df['content'] = df['content'].apply(setmming)
df.to_csv('data/news_sample_6.csv', index=False)


In [117]:
# Defines the regex for finding words  
word_pattern = re.compile(r"\b\w+\b")

# Load the cleaned dataset
df_5 = pd.read_csv('data/news_sample_5.csv')
df_6 = pd.read_csv('data/news_sample_6.csv')
# Extracting all words of the "news_sample.csv" and the cleaned version
df_5_content = " ".join(df_5["content"].astype(str))
df_6_content = " ".join(df_6["content"].astype(str))

df_5_content = word_pattern.findall(df_5_content)
df_6_content = word_pattern.findall(df_6_content)

# Extracting all unique words of the "news_sample.csv" and the cleaned version
len_df_5_content = len(set(df_5_content))
len_df_6_content = len(set(df_6_content))

# Printing the unique words of the raw version, and the unique words after it has been cleaned 
print(f"Number of unique words in the df_5_content text: {len_df_5_content}")
print(f"Number of unique words in the df_6_content text: {len_df_6_content}")

print(len_df_6_content/len_df_5_content)

Number of unique words in the df_5_content text: 16371
Number of unique words in the df_6_content text: 10958
0.6693543460998106
