## Import necessary libraries and load data

In [1]:
import pandas as pd 
import numpy as np 
import json

## Clean and Process Fireant News

In [2]:
news_demo = pd.read_csv('fireant_data/news_2021-10.csv')
news_demo.head(1)

Unnamed: 0,postID,date,userid,username,title,description,newsType,postGroupName,postSourceName,postSourceUrl,originalContent,link,sentiment,totalLikes,totalReplies,totalShares,totalImages,replyToPostID,referToPostID,taggedSymbols
0,3751931,2021-10-31T21:22:00+07:00,266ed7a4-0c22-4683-86d3-dfe616343731,Mister Mạnh,"Lâm sản, thủy sản nỗ lực hồi phục sau giãn cách",Ngành nông nghiệp trong tháng 10/2021 chứng ki...,,Kinh tế,{VnEconomy - Nguồn không hợp lệ},https://vneconomy.vn/,<p><strong>Ng&agrave;nh n&ocirc;ng nghiệp tron...,,0,7,6,0,1,,,[]


## Helper Functions

In [None]:
from bs4 import BeautifulSoup
import html 
import os



# Clean HTML content from text
def clean_html(text):
    if pd.isna(text):
        return text
    # Remove HTML tags
    soup = BeautifulSoup(text, "html.parser")
    cleaned_text = soup.get_text()
    # Unescape HTML entities
    cleaned_text = html.unescape(cleaned_text)
    return cleaned_text

# Extract symbols from taggedSymbols column
def extract_symbols(tagged_symbols_str):
    # if tagged_symbols_str is a list skip processing
    if isinstance(tagged_symbols_str, list):
        return tagged_symbols_str
    if pd.isna(tagged_symbols_str) or tagged_symbols_str.strip() == "":
        return []
    try:
        data = json.loads(tagged_symbols_str)
        # handle both single dict and list of dicts
        if isinstance(data, dict):
            return [data.get("symbol")]
        elif isinstance(data, list):
            return [item.get("symbol") for item in data if isinstance(item, dict) and "symbol" in item]
        else:
            return []
    except json.JSONDecodeError:
        # In case malformed JSON (e.g. missing brackets)
        return []

# Combine multiple fields into a single text field
def combine_content(row):
    # combine title, description, originalContent, sentiment, totalLikes, totalReplies, totalShares into on text field 
    """  
    format:
    Title: {title}
    Description: {description}
    Content: {originalContent}
    Sentiment: {sentiment} # -1 for negative, 0 for neutral, 1 for positive
    Likes: {totalLikes}
    Replies: {totalReplies}
    Shares: {totalShares}
    """
    content_parts = []
    if pd.notna(row['title']):
        content_parts.append(f"Title: {row['title']}")
    if pd.notna(row['description']):
        content_parts.append(f"Description: {row['description']}")
    if pd.notna(row['originalContent']):
        content_parts.append(f"Content: {row['originalContent']}")
        
    # sentiment mapping
    sentiment_map = {-1: "negative", 0: "neutral", 1: "positive"}
    sentiment_str = sentiment_map.get(row['sentiment'], "unknown")
    content_parts.append(f"Sentiment: {sentiment_str}")
    
    content_parts.append(f"Likes: {row['totalLikes']}")
    content_parts.append(f"Replies: {row['totalReplies']}")
    content_parts.append(f"Shares: {row['totalShares']}")
    return "\n".join(content_parts)



In [11]:
# function clean and preprocess fireant news data
def clean_fireant_news(news_paths):
    for path in news_paths:
        # Check if path exists else continue
        try:
            with open(path, 'r', encoding='utf-8', errors="replace") as file:
                news_df = pd.read_csv(
                    file,
                    header=0,
                    quotechar='"',
                    escapechar='\\',
                    encoding='utf-8',
                    on_bad_lines='skip')
                print(f"Processing file: {path} with {len(news_df)} records.")
                
                print(f"Initial number of records: {len(news_df)}")
                
                # Convert and clean numeric columns
                # --- 1. Drop invalid postID rows (anything not all digits) ---
                news_df["postID"] = news_df["postID"].astype(str)
                news_df = news_df[news_df["postID"].str.fullmatch(r"\d+")]

                # --- 2. Convert sentiment to numeric and keep only -1,0,1 ---
                news_df["sentiment"] = pd.to_numeric(news_df["sentiment"], errors="coerce")
                news_df = news_df[news_df["sentiment"].isin([-1,0,1])]
                
                # --- 3. Convert counts to integers ---
                for col in ["totalLikes", "totalReplies", "totalShares"]:
                    news_df[col] = pd.to_numeric(news_df[col], errors="coerce").fillna(0).astype(int)
                    news_df = news_df[news_df[col] >= 0]
                    
             
                # --- 4. Drop rows with missing essential text fields ---
                news_df = news_df.dropna(subset=["title", "description", "originalContent"])
                
                
                # --- 5. Process date column --- (ex: 2021-10-31T21:22:00+07:00 to 2021-10-31)
                news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce').dt.date
                news_df = news_df.dropna(subset=['date'])

                
                # --- 6. Clean HTML fields ---
                text_fields = ["title", "description", "originalContent"]
                for field in text_fields:
                    news_df[field] = news_df[field].apply(clean_html)
                
                # --- 7. Extract taggedSymbols ---
                news_df['taggedSymbols'] = news_df['taggedSymbols'].apply(extract_symbols)
                news_df["taggedSymbols"] = news_df["taggedSymbols"].apply(json.dumps)
                
                # --- 8. Combine content ---
                news_df['combinedContent'] = news_df.apply(combine_content, axis=1)
                
                # --- 9. Drop unnecessary columns ---
                columns_to_drop = ['userid', 'username', 'title', 'description',
                                   'originalContent', 'postGroupName', 'postSourceName', 'postSourceUrl',
                                   'link', 'sentiment', 'totalLikes', 'totalReplies', 'totalShares', 'totalImages', 
                                   'replyToPostID',	'referToPostID']
                news_df = news_df.drop(columns=columns_to_drop, errors='ignore')
                
                
                print(f"Final number of records after cleaning: {len(news_df)}")
                
                
                # --- 10. Save cleaned CSV ---
                folder_path = "fireant_data/cleaned_news"
                if not os.path.exists(folder_path):
                    os.makedirs(folder_path)
                # Save to cleaned_news folder with same filename, example: fireant_data/cleaned_news/news_2021-10.csv
                cleaned_path = os.path.join(folder_path, os.path.basename(path))
                news_df.to_csv(cleaned_path, index=False, encoding='utf-8')
                print(f"Cleaned data saved to: {cleaned_path}\n")
                print("--------------------------------------------------")
                                   
                
        except FileNotFoundError:
            print(f"File not found: {path}. Skipping.")
            continue



In [14]:
def clean_fireant_news(news_paths):
    combined_df = []  # list to store all cleaned monthly DataFrames

    for path in news_paths:
        try:
            with open(path, 'r', encoding='utf-8', errors="replace") as file:
                news_df = pd.read_csv(
                    file,
                    header=0,
                    quotechar='"',
                    escapechar='\\',
                    encoding='utf-8',
                    on_bad_lines='skip'
                )
                print(f"Processing file: {path} with {len(news_df)} records.")

                # --- 1. Drop invalid postID rows (anything not all digits) ---
                news_df["postID"] = news_df["postID"].astype(str)
                news_df = news_df[news_df["postID"].str.fullmatch(r"\d+")]

                # --- 2. Convert sentiment to numeric and keep only -1,0,1 ---
                news_df["sentiment"] = pd.to_numeric(news_df["sentiment"], errors="coerce")
                news_df = news_df[news_df["sentiment"].isin([-1,0,1])]

                # --- 3. Convert counts to integers ---
                for col in ["totalLikes", "totalReplies", "totalShares"]:
                    news_df[col] = pd.to_numeric(news_df[col], errors="coerce").fillna(0).astype(int)
                    news_df = news_df[news_df[col] >= 0]

                # --- 4. Drop rows with missing essential text fields ---
                news_df = news_df.dropna(subset=["title", "description", "originalContent"])

                # --- 5. Process date column ---
                news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce').dt.date
                news_df = news_df.dropna(subset=['date'])

                # --- 6. Clean HTML fields ---
                text_fields = ["title", "description", "originalContent"]
                for field in text_fields:
                    news_df[field] = news_df[field].apply(clean_html)

                # --- 7. Extract taggedSymbols ---
                news_df['taggedSymbols'] = news_df['taggedSymbols'].apply(extract_symbols)
                news_df["taggedSymbols"] = news_df["taggedSymbols"].apply(json.dumps)

                # --- 8. Combine content ---
                news_df['combinedContent'] = news_df.apply(combine_content, axis=1)

                # --- 9. Drop unnecessary columns ---
                columns_to_drop = ['userid', 'username', 'title', 'description',
                                   'originalContent', 'postGroupName', 'postSourceName', 'postSourceUrl',
                                   'link', 'sentiment', 'totalLikes', 'totalReplies', 'totalShares', 'totalImages', 
                                   'replyToPostID', 'referToPostID']
                news_df = news_df.drop(columns=columns_to_drop, errors='ignore')

                print(f"Final number of records after cleaning: {len(news_df)}")

                # Append cleaned monthly DF to list
                combined_df.append(news_df)

        except FileNotFoundError:
            print(f"File not found: {path}. Skipping.")
            continue

    # --- Combine all months into one DataFrame ---
    if combined_df:
        final_df = pd.concat(combined_df, ignore_index=True)
        print(f"Total combined records: {len(final_df)}")

        # --- Save combined CSV ---
        folder_path = "fireant_data/cleaned_news"
        os.makedirs(folder_path, exist_ok=True)
        combined_path = os.path.join(folder_path, "all_news.csv")
        final_df.to_csv(combined_path, index=False, encoding='utf-8')
        print(f"Combined cleaned data saved to: {combined_path}")
    else:
        print("No data processed. Combined CSV not created.")


In [15]:
years = [2021, 2022, 2023, 2024, 2025]
news_paths = [f"fireant_data/news_{year}-{month:02d}.csv" for year in years for month in range(1, 13)]
clean_fireant_news(news_paths)

File not found: fireant_data/news_2021-01.csv. Skipping.
File not found: fireant_data/news_2021-02.csv. Skipping.
File not found: fireant_data/news_2021-03.csv. Skipping.
File not found: fireant_data/news_2021-04.csv. Skipping.
File not found: fireant_data/news_2021-05.csv. Skipping.
File not found: fireant_data/news_2021-06.csv. Skipping.
File not found: fireant_data/news_2021-07.csv. Skipping.
File not found: fireant_data/news_2021-08.csv. Skipping.
Processing file: fireant_data/news_2021-09.csv with 3063 records.
Final number of records after cleaning: 2957
Processing file: fireant_data/news_2021-10.csv with 3093 records.
Final number of records after cleaning: 3029
Processing file: fireant_data/news_2021-11.csv with 3190 records.
Final number of records after cleaning: 2967
Processing file: fireant_data/news_2021-12.csv with 2994 records.
Final number of records after cleaning: 2653
Processing file: fireant_data/news_2022-01.csv with 2648 records.
Final number of records after clea


If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  soup = BeautifulSoup(text, "html.parser")


Final number of records after cleaning: 2594
Processing file: fireant_data/news_2023-01.csv with 3927 records.
Final number of records after cleaning: 3556
Processing file: fireant_data/news_2023-02.csv with 4231 records.
Final number of records after cleaning: 3671
Processing file: fireant_data/news_2023-03.csv with 4281 records.
Final number of records after cleaning: 3669
Processing file: fireant_data/news_2023-04.csv with 4893 records.
Final number of records after cleaning: 4415
Processing file: fireant_data/news_2023-05.csv with 5133 records.
Final number of records after cleaning: 4796
Processing file: fireant_data/news_2023-06.csv with 4447 records.
Final number of records after cleaning: 3813
Processing file: fireant_data/news_2023-07.csv with 4459 records.
Final number of records after cleaning: 3600
Processing file: fireant_data/news_2023-08.csv with 5416 records.
Final number of records after cleaning: 4709
Processing file: fireant_data/news_2023-09.csv with 5162 records.
F