## Import necessary libraries and load data

In [1]:
import pandas as pd 
import numpy as np 
import json

## Clean and Process Fireant News

In [2]:
news_demo = pd.read_csv('fireant_data/news_2021-10.csv')
news_demo.head(1)

Unnamed: 0,postID,date,userid,username,title,description,newsType,postGroupName,postSourceName,postSourceUrl,originalContent,link,sentiment,totalLikes,totalReplies,totalShares,totalImages,replyToPostID,referToPostID,taggedSymbols
0,3751931,2021-10-31T21:22:00+07:00,266ed7a4-0c22-4683-86d3-dfe616343731,Mister M·∫°nh,"L√¢m s·∫£n, th·ªßy s·∫£n n·ªó l·ª±c h·ªìi ph·ª•c sau gi√£n c√°ch",Ng√†nh n√¥ng nghi·ªáp trong th√°ng 10/2021 ch·ª©ng ki...,,Kinh t·∫ø,{VnEconomy - Ngu·ªìn kh√¥ng h·ª£p l·ªá},https://vneconomy.vn/,<p><strong>Ng&agrave;nh n&ocirc;ng nghi·ªáp tron...,,0,7,6,0,1,,,[]


## Helper Functions

In [2]:
from bs4 import BeautifulSoup
import html 
import os



# Clean HTML content from text
def clean_html(text):
    if pd.isna(text):
        return text
    # Remove HTML tags
    soup = BeautifulSoup(text, "html.parser")
    cleaned_text = soup.get_text()
    # Unescape HTML entities
    cleaned_text = html.unescape(cleaned_text)
    return cleaned_text

# Extract symbols from taggedSymbols column
def extract_symbols(tagged_symbols_str):
    # if tagged_symbols_str is a list skip processing
    if isinstance(tagged_symbols_str, list):
        return tagged_symbols_str
    if pd.isna(tagged_symbols_str) or tagged_symbols_str.strip() == "":
        return []
    try:
        data = json.loads(tagged_symbols_str)
        # handle both single dict and list of dicts
        if isinstance(data, dict):
            return [data.get("symbol")]
        elif isinstance(data, list):
            return [item.get("symbol") for item in data if isinstance(item, dict) and "symbol" in item]
        else:
            return []
    except json.JSONDecodeError:
        # In case malformed JSON (e.g. missing brackets)
        return []

# Combine multiple fields into a single text field
def combine_content(row):
    # combine title, description, originalContent, sentiment, totalLikes, totalReplies, totalShares into on text field 
    """  
    format:
    Title: {title}
    Description: {description}
    Content: {originalContent}
    """
    content_parts = []
    if pd.notna(row['title']):
        content_parts.append(f"Title: {row['title']}")
    if pd.notna(row['description']):
        content_parts.append(f"Description: {row['description']}")
    if pd.notna(row['originalContent']):
        content_parts.append(f"Content: {row['originalContent']}")
        
    return "\n".join(content_parts)



In [3]:
def clean_fireant_news(news_paths):
    combined_df = []  # list to store all cleaned monthly DataFrames

    for path in news_paths:
        try:
            with open(path, 'r', encoding='utf-8', errors="replace") as file:
                news_df = pd.read_csv(
                    file,
                    header=0,
                    quotechar='"',
                    escapechar='\\',
                    encoding='utf-8',
                    on_bad_lines='skip'
                )
                print(f"Processing file: {path} with {len(news_df)} records.")

                # --- 1. Drop invalid postID rows (anything not all digits) ---
                news_df["postID"] = news_df["postID"].astype(str)
                news_df = news_df[news_df["postID"].str.fullmatch(r"\d+")]

                # --- 2. Convert sentiment to numeric and keep only -1,0,1 ---
                news_df["sentiment"] = pd.to_numeric(news_df["sentiment"], errors="coerce")
                news_df = news_df[news_df["sentiment"].isin([-1,0,1])]

                # --- 3. Convert counts to integers ---
                for col in ["totalLikes", "totalReplies", "totalShares"]:
                    news_df[col] = pd.to_numeric(news_df[col], errors="coerce").fillna(0).astype(int)
                    news_df = news_df[news_df[col] >= 0]

                # --- 4. Drop rows with missing essential text fields ---
                news_df = news_df.dropna(subset=["title", "description", "originalContent"])

                # --- 5. Process date column ---
                news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce').dt.date
                news_df = news_df.dropna(subset=['date'])

                # --- 6. Clean HTML fields ---
                text_fields = ["title", "description", "originalContent"]
                for field in text_fields:
                    news_df[field] = news_df[field].apply(clean_html)

                # --- 7. Extract taggedSymbols ---
                news_df['taggedSymbols'] = news_df['taggedSymbols'].apply(extract_symbols)
                news_df["taggedSymbols"] = news_df["taggedSymbols"].apply(json.dumps)

                # --- 8. Combine content ---
                news_df['combinedContent'] = news_df.apply(combine_content, axis=1)

                # --- 9. Drop unnecessary columns ---
                columns_to_drop = ['userid', 'username', 'title', 'description',
                                   'originalContent', 'postGroupName', 'postSourceName', 'postSourceUrl',
                                   'link', 'totalLikes', 'totalReplies', 'totalShares', 'totalImages', 
                                   'replyToPostID', 'referToPostID', 'newsType']
                news_df = news_df.drop(columns=columns_to_drop, errors='ignore')

                print(f"Final number of records after cleaning: {len(news_df)}")

                # Append cleaned monthly DF to list
                combined_df.append(news_df)

        except FileNotFoundError:
            print(f"File not found: {path}. Skipping.")
            continue

    # --- Combine all months into one DataFrame ---
    if combined_df:
        final_df = pd.concat(combined_df, ignore_index=True)
        print(f"Total combined records: {len(final_df)}")

        # --- Save combined CSV ---
        folder_path = "fireant_data/cleaned_news"
        os.makedirs(folder_path, exist_ok=True)
        combined_path = os.path.join(folder_path, "all_news.csv")
        final_df.to_csv(combined_path, index=False, encoding='utf-8')
        print(f"Combined cleaned data saved to: {combined_path}")
    else:
        print("No data processed. Combined CSV not created.")


In [4]:
years = [2021, 2022, 2023, 2024, 2025]
news_paths = [f"fireant_data/news_{year}-{month:02d}.csv" for year in years for month in range(1, 13)]
clean_fireant_news(news_paths)

File not found: fireant_data/news_2021-01.csv. Skipping.
File not found: fireant_data/news_2021-02.csv. Skipping.
File not found: fireant_data/news_2021-03.csv. Skipping.
File not found: fireant_data/news_2021-04.csv. Skipping.
File not found: fireant_data/news_2021-05.csv. Skipping.
File not found: fireant_data/news_2021-06.csv. Skipping.
File not found: fireant_data/news_2021-07.csv. Skipping.
File not found: fireant_data/news_2021-08.csv. Skipping.
Processing file: fireant_data/news_2021-09.csv with 3063 records.
Final number of records after cleaning: 2957
Processing file: fireant_data/news_2021-10.csv with 3093 records.
Final number of records after cleaning: 3029
Processing file: fireant_data/news_2021-11.csv with 3190 records.
Final number of records after cleaning: 2967
Processing file: fireant_data/news_2021-12.csv with 2994 records.
Final number of records after cleaning: 2653
Processing file: fireant_data/news_2022-01.csv with 2648 records.
Final number of records after clea


If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  soup = BeautifulSoup(text, "html.parser")


Final number of records after cleaning: 2594
Processing file: fireant_data/news_2023-01.csv with 3927 records.
Final number of records after cleaning: 3556
Processing file: fireant_data/news_2023-02.csv with 4231 records.
Final number of records after cleaning: 3671
Processing file: fireant_data/news_2023-03.csv with 4281 records.
Final number of records after cleaning: 3669
Processing file: fireant_data/news_2023-04.csv with 4893 records.
Final number of records after cleaning: 4415
Processing file: fireant_data/news_2023-05.csv with 5133 records.
Final number of records after cleaning: 4796
Processing file: fireant_data/news_2023-06.csv with 4447 records.
Final number of records after cleaning: 3813
Processing file: fireant_data/news_2023-07.csv with 4459 records.
Final number of records after cleaning: 3600
Processing file: fireant_data/news_2023-08.csv with 5416 records.
Final number of records after cleaning: 4709
Processing file: fireant_data/news_2023-09.csv with 5162 records.
F

## Clean and Process Fireant Posts

In [4]:
posts_demo = pd.read_csv('fireant_data/posts_2021-09.csv')
posts_demo.head(1)

Unnamed: 0,postID,originalContent,date,link,sentiment,totalLikes,totalReplies,replyToPostID,taggedSymbols,username,userid,totalImages,totalFiles,totalSymbols
0,3118126,V√†o NKG ng√†y mai ·ªïn kh√¥ng c√°c b√°c? D√†i h·∫°n v√†o...,2021-09-30T23:59:27.933+07:00,,0,1,7,,"[{""symb"": ""NKG"", ""price"": 44.25}]",Uy Lam,2546bf0a-a617-4198-a48b-95799149e3e9,0,0,1


In [3]:
# Extract symbols from taggedSymbols column
def extract_symbols(tagged_symbols_str):
    # if tagged_symbols_str is a list skip processing
    if isinstance(tagged_symbols_str, list):
        return tagged_symbols_str
    if pd.isna(tagged_symbols_str) or tagged_symbols_str.strip() == "":
        return []
    try:
        data = json.loads(tagged_symbols_str)
        # handle both single dict and list of dicts
        if isinstance(data, dict):
            return [data.get("symb")]
        elif isinstance(data, list):
            return [item.get("symb") for item in data if isinstance(item, dict) and "symb" in item]
        else:
            return []
    except json.JSONDecodeError:
        # In case malformed JSON (e.g. missing brackets)
        return []

def clean_fireant_posts(posts_paths):
    combined_df = [] 
    for path in posts_paths:
        try:
            with open(path, 'r', encoding='utf-8', errors="replace") as file:
                posts_df = pd.read_csv(
                    file,
                    header=0,
                    quotechar='"',
                    escapechar='\\',
                    encoding='utf-8',
                    on_bad_lines='skip'
                )
            print(f"Processing file: {path} with {len(posts_df)} records.")

            # --- 1. Drop invalid postID rows (anything not all digits) ---
            posts_df["postID"] = posts_df["postID"].astype(str)
            posts_df = posts_df[posts_df["postID"].str.fullmatch(r"\d+")]

            # --- 2. Convert Sentiment into Integers and Keep only -1, 0 and 1 ---
            posts_df["sentiment"] = pd.to_numeric(posts_df["sentiment"], errors="coerce")
            posts_df = posts_df[posts_df["sentiment"].isin([-1,0, 1])]
            
            # --- 3. Convert counts to integers ---
            for col in ["totalLikes", "totalReplies"]:
                posts_df[col] = pd.to_numeric(posts_df[col], errors="coerce").fillna(0).astype(int)
                posts_df = posts_df[posts_df[col] >= 0]
                
            # --- 4. Drop rows with missing essential text field ---
            posts_df = posts_df.dropna(subset=["originalContent"])
            
            # -- 5. Process date column ---
            posts_df['date'] = pd.to_datetime(posts_df['date'], errors='coerce').dt.date
            posts_df = posts_df.dropna(subset=['date']) 
            
            # --- 6. Clean HTML field ---
            posts_df["originalContent"] = posts_df["originalContent"].apply(clean_html)
            
            # --- 7. Extract taggedSymbols --- 
            posts_df['taggedSymbols'] = posts_df['taggedSymbols'].apply(extract_symbols) 
            
            # --- 8. Remove unnecessary columns ---
            columns_to_drop = ['userid', 'username', 'link','totalLikes', 'totalReplies',
                               'totalImages', 'totalFiles', 'totalSymbols','replyToPostID']
            posts_df = posts_df.drop(columns=columns_to_drop, errors='ignore')
            
            print(f"Final number of records after cleaning: {len(posts_df)}")
            combined_df.append(posts_df)

            
                    
        except FileNotFoundError:
            print(f"File not found: {path}. Skipping.")
            continue
               
    # --- Combine all months into one DataFrame ---
    if combined_df:
        final_df = pd.concat(combined_df, ignore_index=True)
        print(f"Total combined records: {len(final_df)}")

        # --- Save combined CSV ---
        folder_path = "fireant_data/cleaned_posts"
        os.makedirs(folder_path, exist_ok=True)
        combined_path = os.path.join(folder_path, "all_posts.csv")
        final_df.to_csv(combined_path, index=False, encoding='utf-8')
        print(f"Combined cleaned data saved to: {combined_path}")
    else:
        print("No data processed. Combined CSV not created.")


In [4]:
from bs4 import MarkupResemblesLocatorWarning
import warnings

warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
years = [2021, 2022, 2023, 2024, 2025]
posts_paths = [f"fireant_data/posts_{year}-{month:02d}.csv" for year in years for month in range(1, 13)]
clean_fireant_posts(posts_paths)

File not found: fireant_data/posts_2021-01.csv. Skipping.
File not found: fireant_data/posts_2021-02.csv. Skipping.
File not found: fireant_data/posts_2021-03.csv. Skipping.
File not found: fireant_data/posts_2021-04.csv. Skipping.
File not found: fireant_data/posts_2021-05.csv. Skipping.
File not found: fireant_data/posts_2021-06.csv. Skipping.
File not found: fireant_data/posts_2021-07.csv. Skipping.
File not found: fireant_data/posts_2021-08.csv. Skipping.


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2021-09.csv with 73692 records.
Final number of records after cleaning: 73436


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2021-10.csv with 86249 records.
Final number of records after cleaning: 85955


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2021-11.csv with 134665 records.
Final number of records after cleaning: 134190


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2021-12.csv with 151956 records.
Final number of records after cleaning: 151415


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2022-01.csv with 159575 records.
Final number of records after cleaning: 159049


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2022-02.csv with 86841 records.
Final number of records after cleaning: 86558
Processing file: fireant_data/posts_2022-03.csv with 184904 records.
Final number of records after cleaning: 184297


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2022-04.csv with 166609 records.
Final number of records after cleaning: 166068


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2022-05.csv with 105618 records.
Final number of records after cleaning: 105268
Processing file: fireant_data/posts_2022-06.csv with 124357 records.
Final number of records after cleaning: 123956


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2022-07.csv with 99499 records.
Final number of records after cleaning: 99169
Processing file: fireant_data/posts_2022-08.csv with 126177 records.
Final number of records after cleaning: 125776


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2022-09.csv with 107588 records.
Final number of records after cleaning: 107227


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2022-10.csv with 111901 records.
Final number of records after cleaning: 111492


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2022-11.csv with 123471 records.
Final number of records after cleaning: 123012


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2022-12.csv with 109689 records.
Final number of records after cleaning: 109320
Processing file: fireant_data/posts_2023-01.csv with 52104 records.
Final number of records after cleaning: 51913
Processing file: fireant_data/posts_2023-02.csv with 76275 records.
Final number of records after cleaning: 76026


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2023-03.csv with 90744 records.
Final number of records after cleaning: 90480
Processing file: fireant_data/posts_2023-04.csv with 100145 records.
Final number of records after cleaning: 99788
Processing file: fireant_data/posts_2023-05.csv with 106798 records.
Final number of records after cleaning: 106493


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2023-06.csv with 142500 records.
Final number of records after cleaning: 142015


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2023-07.csv with 145775 records.
Final number of records after cleaning: 145303


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2023-08.csv with 186690 records.
Final number of records after cleaning: 186067


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2023-09.csv with 155964 records.
Final number of records after cleaning: 155481


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2023-10.csv with 137400 records.
Final number of records after cleaning: 136921
Processing file: fireant_data/posts_2023-11.csv with 123678 records.
Final number of records after cleaning: 123251


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2023-12.csv with 115601 records.
Final number of records after cleaning: 115182


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2024-01.csv with 119242 records.
Final number of records after cleaning: 118856


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2024-02.csv with 86797 records.
Final number of records after cleaning: 86497


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2024-03.csv with 161364 records.
Final number of records after cleaning: 160840


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2024-04.csv with 142865 records.
Final number of records after cleaning: 142392


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2024-05.csv with 153532 records.
Final number of records after cleaning: 153035


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2024-06.csv with 150711 records.
Final number of records after cleaning: 150207
Processing file: fireant_data/posts_2024-07.csv with 161921 records.
Final number of records after cleaning: 161379


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2024-08.csv with 141940 records.
Final number of records after cleaning: 141495


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2024-09.csv with 101867 records.
Final number of records after cleaning: 101543


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2024-10.csv with 129297 records.
Final number of records after cleaning: 128869
Processing file: fireant_data/posts_2024-11.csv with 115747 records.
Final number of records after cleaning: 115389


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2024-12.csv with 116866 records.
Final number of records after cleaning: 116464
Processing file: fireant_data/posts_2025-01.csv with 83573 records.
Final number of records after cleaning: 83320
Processing file: fireant_data/posts_2025-02.csv with 108925 records.
Final number of records after cleaning: 108570


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2025-03.csv with 142666 records.
Final number of records after cleaning: 142180


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2025-04.csv with 154434 records.
Final number of records after cleaning: 153967


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2025-05.csv with 137546 records.
Final number of records after cleaning: 137070


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2025-06.csv with 144209 records.
Final number of records after cleaning: 143723


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2025-07.csv with 222259 records.
Final number of records after cleaning: 221497


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2025-08.csv with 235771 records.
Final number of records after cleaning: 234980


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2025-09.csv with 167774 records.
Final number of records after cleaning: 167211


  posts_df = pd.read_csv(


Processing file: fireant_data/posts_2025-10.csv with 214327 records.
Final number of records after cleaning: 213672
File not found: fireant_data/posts_2025-11.csv. Skipping.
File not found: fireant_data/posts_2025-12.csv. Skipping.
Total combined records: 6558264
Combined cleaned data saved to: fireant_data/cleaned_posts\all_posts.csv


## Clean and Process Fireant Replies

In [9]:
replies_demo = pd.read_csv('fireant_data/replies_2021-09.csv')
replies_demo.head(1)

Unnamed: 0,postID,originalContent,date,link,sentiment,totalLikes,totalReplies,replyToPostID,taggedSymbols,username,userid,totalImages,totalFiles,totalSymbols
0,3118042,√¥i\nth·∫ø mai b√°n h·∫øt th√¥i\nhihiü•≤,2021-09-30T23:36:32.243+07:00,,0,2,0,3117869,[],Ho√†ng Anh,eeb2fe9a-ada0-4333-96c5-599b00058a13,0,0,0


In [13]:
# functions for cleaning and processing replies 
def clean_replies(replies_paths):
    combined_df = [] 
    for path in replies_paths:
        try:
            with open(path, 'r', encoding='utf-8', errors="replace") as file:
                replies_df = pd.read_csv(
                    file,
                    header=0,
                    quotechar='"',
                    escapechar='\\',
                    encoding='utf-8',
                    on_bad_lines='skip'
                )
            print(f"Processing file: {path} with {len(replies_df)} records.")
            # --- 1. Drop invalid postID rows (anything not all digits) ---
            replies_df["postID"] = replies_df["postID"].astype(str)
            replies_df = replies_df[replies_df["postID"].str.fullmatch(r"\d+")]
            # --- 2. Clean HTML field ---
            replies_df["originalContent"] = replies_df["originalContent"].apply(clean_html)
            # --- 3. Remove unnecessary columns ---
            columns_to_drop = ['userid', 'username', 'link','totalLikes', 'totalReplies',
                               'totalImages', 'totalFiles', 'taggedSymbols', 'totalSymbols']
            replies_df = replies_df.drop(columns=columns_to_drop, errors='ignore')
            
            print(f"Final number of records after cleaning: {len(replies_df)}")
        except FileNotFoundError:
            print(f"File not found: {path}. Skipping.")
            continue
        
        combined_df.append(replies_df)
    # --- Combine all months into one DataFrame ---
    if combined_df:
        final_df = pd.concat(combined_df, ignore_index=True)
        print(f"Total combined records: {len(final_df)}")

        # --- Save combined CSV ---
        folder_path = "fireant_data/cleaned_replies"
        os.makedirs(folder_path, exist_ok=True)
        combined_path = os.path.join(folder_path, "all_replies.csv")
        final_df.to_csv(combined_path, index=False, encoding='utf-8')
        print(f"Combined cleaned data saved to: {combined_path}")
    else:
        print("No data processed. Combined CSV not created.")

In [15]:
# Replies paths 
years = [2021, 2022, 2023, 2024, 2025]
replies_paths = [f"fireant_data/replies_{year}-{month:02d}.csv" for year in years for month in range(1, 13)]
clean_replies(replies_paths)

File not found: fireant_data/replies_2021-01.csv. Skipping.
File not found: fireant_data/replies_2021-02.csv. Skipping.
File not found: fireant_data/replies_2021-03.csv. Skipping.
File not found: fireant_data/replies_2021-04.csv. Skipping.
File not found: fireant_data/replies_2021-05.csv. Skipping.
File not found: fireant_data/replies_2021-06.csv. Skipping.
File not found: fireant_data/replies_2021-07.csv. Skipping.
File not found: fireant_data/replies_2021-08.csv. Skipping.
Processing file: fireant_data/replies_2021-09.csv with 61516 records.
Final number of records after cleaning: 61515


  replies_df = pd.read_csv(


Processing file: fireant_data/replies_2021-10.csv with 71592 records.
Final number of records after cleaning: 71591
Processing file: fireant_data/replies_2021-11.csv with 88785 records.
Final number of records after cleaning: 88785


  replies_df = pd.read_csv(


Processing file: fireant_data/replies_2021-12.csv with 87947 records.
Final number of records after cleaning: 87946


  replies_df = pd.read_csv(


Processing file: fireant_data/replies_2022-01.csv with 103466 records.
Final number of records after cleaning: 103464
Processing file: fireant_data/replies_2022-02.csv with 45480 records.
Final number of records after cleaning: 45479


  replies_df = pd.read_csv(


Processing file: fireant_data/replies_2022-03.csv with 79876 records.
Final number of records after cleaning: 79875


  replies_df = pd.read_csv(


Processing file: fireant_data/replies_2022-04.csv with 75384 records.
Final number of records after cleaning: 75383
Processing file: fireant_data/replies_2022-05.csv with 40252 records.
Final number of records after cleaning: 40251
Processing file: fireant_data/replies_2022-06.csv with 46271 records.
Final number of records after cleaning: 46270
Processing file: fireant_data/replies_2022-07.csv with 41219 records.
Final number of records after cleaning: 41218
Processing file: fireant_data/replies_2022-08.csv with 47296 records.
Final number of records after cleaning: 47295
Processing file: fireant_data/replies_2022-09.csv with 43488 records.
Final number of records after cleaning: 43487
Processing file: fireant_data/replies_2022-10.csv with 42693 records.
Final number of records after cleaning: 42692
Processing file: fireant_data/replies_2022-11.csv with 38251 records.
Final number of records after cleaning: 38251
Processing file: fireant_data/replies_2022-12.csv with 37852 records.
Fi