In [1]:
import os 
import pandas as pd 
from bs4 import BeautifulSoup 
import html 

# Folder that is scanned for the raw news_*.csv files ("." = the current working directory).
input_folder = "." 
# Folder that receives the cleaned copies; each output keeps its input's file name.
output_folder = "cleaned_news"

# Create the output folder up front; exist_ok=True makes this a no-op when it already exists.
os.makedirs(output_folder, exist_ok=True)

# Define the HTML cleaning function
def clean_html(raw_html):
    """Convert an HTML fragment into single-spaced plain text.

    Character references are decoded before parsing, so markup that arrives
    entity-escaped (e.g. ``&lt;p&gt;``) is parsed as real tags and removed
    together with ordinary markup. Every run of whitespace, including line
    breaks, is collapsed to a single space.

    Args:
        raw_html: Cell value to clean. Missing values (``None``, ``NaN``,
            ``pd.NA``) are accepted; non-string scalars are converted with
            ``str()`` before cleaning.

    Returns:
        str: The visible text, or ``""`` when the value is missing.
    """
    if pd.isna(raw_html):
        return ""

    # html.unescape() raises TypeError on non-str input (e.g. a cell that
    # pandas parsed as a number), so coerce such values to text first.
    if not isinstance(raw_html, str):
        raw_html = str(raw_html)

    decoded = html.unescape(raw_html)
    soup = BeautifulSoup(decoded, "html.parser")
    # strip=True trims each text node and drops the empty ones; the
    # split/join below then normalises whitespace left inside the nodes.
    text = soup.get_text(separator=" ", strip=True)
    return " ".join(text.split())


In [2]:
# Clean every news_*.csv in input_folder and write the result, under the same
# file name, into output_folder. Files are handled in sorted name order.
for filename in sorted(os.listdir(input_folder)):
    # Guard clause: ignore anything that is not a news_*.csv file.
    if not (filename.startswith("news_") and filename.endswith(".csv")):
        continue

    input_path = os.path.join(input_folder, filename)
    output_path = os.path.join(output_folder, filename)
    # The CSV is written here first and only renamed to output_path once it is
    # complete, so an interrupted or failed run never leaves a truncated file
    # under the final name.
    tmp_path = output_path + ".tmp"

    print(f"Processing {filename}")

    try:
        df = pd.read_csv(input_path, encoding="utf-8")

        # Only the "originalContent" column is cleaned; files without it are skipped.
        if 'originalContent' not in df.columns:
            print(f"⚠️ Skipping {filename}: 'originalContent' column not found.")
            continue

        df['originalContent'] = df['originalContent'].apply(clean_html)

        df.to_csv(tmp_path, index=False, encoding="utf-8")
        # os.replace overwrites an existing destination in a single rename step.
        os.replace(tmp_path, output_path)
        print(f"✅ Saved cleaned file to: {output_path}\n")
    except Exception as e:
        # Best effort: report the failure and carry on with the next file.
        print(f"❌ Error processing {filename}: {e}\n")
    finally:
        # Runs on success, failure, `continue` and KeyboardInterrupt alike;
        # after a successful os.replace the temp file no longer exists.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

Processing news_2024-11.csv
✅ Saved cleaned file to: cleaned_news/news_2024-11.csv

Processing news_2024-12.csv


KeyboardInterrupt: 