In [1]:
import glob
import os

import pandas as pd

# ------------------------------------
# UPDATE THIS PATH TO YOUR FOLDER
# ------------------------------------
DATASET_PATH = r"C:\Users\Ayush Ahlawat\OneDrive\Documents\Public Comment Analysis\public-comment-analysis\dataset"   # folder where all CSVs are stored
OUTPUT_FILE = r"C:\Users\Ayush Ahlawat\OneDrive\Documents\Public Comment Analysis\public-comment-analysis\dataset\merged_raw_dataset.csv"

print(" Searching for CSV files...")

# OUTPUT_FILE is written into DATASET_PATH and ends in ".csv", so on any
# re-run the glob below would pick up the previous merged output and merge
# it back into itself. Compare normalized absolute paths (case-folded on
# Windows) so the output file is never treated as an input.
_output_key = os.path.normcase(os.path.abspath(OUTPUT_FILE))

# glob.escape keeps characters such as "[" in the folder name from being
# interpreted as wildcard syntax. Sorting makes the merge order (and
# therefore which duplicate is kept later) reproducible, since glob itself
# returns entries in arbitrary order.
csv_files = sorted(
    path
    for path in glob.glob(os.path.join(glob.escape(DATASET_PATH), "*.csv"))
    if os.path.normcase(os.path.abspath(path)) != _output_key
)

print(f" Found {len(csv_files)} CSV files:")
for f in csv_files:
    print(" -", f)

# Accumulator for the merged data; starts empty.
merged_df = pd.DataFrame()

# ------------------------------------
# STANDARDIZE COLUMN NAMES
# ------------------------------------
def normalize_columns(df):
    """Return a copy of *df* with standardized comment/stance column names.

    Known header aliases are mapped onto two canonical columns:

    * ``comment_text`` <- comment, comments, text, review_text
    * ``stance_label`` <- stance, label, sentiment

    Headers are matched ignoring case and surrounding whitespace, so
    ``"Text"`` or ``"comment "`` are recognized as well. Each canonical name
    is assigned at most once per frame: if the canonical column already
    exists, or an earlier column has already been mapped to it, later aliases
    keep their original names. This guarantees the rename never introduces
    duplicate column labels.

    If a canonical column is still missing after renaming, it is added and
    filled with ``None``.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame containing both ``comment_text`` and ``stance_label``.
    """
    aliases = {
        'comment': 'comment_text',
        'comments': 'comment_text',
        'text': 'comment_text',
        'review_text': 'comment_text',
        'stance': 'stance_label',
        'label': 'stance_label',
        'sentiment': 'stance_label',
    }
    canonical = set(aliases.values())

    # Canonical names that are already present verbatim take priority over
    # any alias or case variant, so they are marked as claimed up front.
    claimed = {name for name in canonical if name in df.columns}

    rename_map = {}
    for col in df.columns:
        key = str(col).strip().lower()
        target = key if key in canonical else aliases.get(key)
        if target is None or target in claimed:
            # Unknown header, or the canonical name is already taken:
            # leave this column untouched to avoid duplicate labels.
            continue
        claimed.add(target)
        rename_map[col] = target

    # rename() returns a new frame, so the caller's DataFrame is untouched.
    df = df.rename(columns=rename_map)

    # Ensure essential columns exist
    for required in ("comment_text", "stance_label"):
        if required not in df.columns:
            df[required] = None

    return df


# ------------------------------------
# READ AND MERGE ALL FILES
# ------------------------------------
print("\n Merging files...")

# Collect the per-file frames and concatenate once at the end: calling
# pd.concat inside the loop re-copies the accumulated data on every
# iteration and concatenates with an empty seed frame.
frames = []
for file in csv_files:
    try:
        df = normalize_columns(pd.read_csv(file))
        # Duplicate column labels would break the concat / de-duplication
        # below, so treat such a file like any other unreadable input.
        if not df.columns.is_unique:
            raise ValueError("duplicate column names after normalization")
    except Exception as e:
        # Best-effort merge: report the problem file and continue.
        print(f" Error reading {file}: {e}")
    else:
        frames.append(df)

if frames:
    merged_df = pd.concat(frames, ignore_index=True)
else:
    # Nothing could be read; keep a well-formed empty frame so the steps
    # below (and any later cell using merged_df) do not raise KeyError.
    merged_df = pd.DataFrame(columns=["comment_text", "stance_label"])

print("\n Removing duplicates...")
# NOTE: rows whose comment_text is missing compare equal to each other here,
# so at most one such row survives.
merged_df.drop_duplicates(subset=["comment_text"], inplace=True)

print(" Resetting index...")
merged_df.reset_index(drop=True, inplace=True)

# ------------------------------------
# SAVE MERGED FILE
# ------------------------------------
if frames:
    merged_df.to_csv(OUTPUT_FILE, index=False)

    print("\n Merging Completed!")
    print(f" Saved merged file as: {OUTPUT_FILE}")
else:
    # Do not overwrite an existing merged file with an empty one.
    print("\n No CSV files could be read; nothing was saved.")

print(f" Final Row Count: {len(merged_df)}")
print(" Columns:", list(merged_df.columns))


 Searching for CSV files...
 Found 7 CSV files:
 - C:\Users\Ayush Ahlawat\OneDrive\Documents\Public Comment Analysis\public-comment-analysis\dataset\pdf_extracted_comments.csv
 - C:\Users\Ayush Ahlawat\OneDrive\Documents\Public Comment Analysis\public-comment-analysis\dataset\pdf_filtered_200.csv
 - C:\Users\Ayush Ahlawat\OneDrive\Documents\Public Comment Analysis\public-comment-analysis\dataset\playstore_policy_reviews.csv
 - C:\Users\Ayush Ahlawat\OneDrive\Documents\Public Comment Analysis\public-comment-analysis\dataset\policy_comment_dataset_500.csv
 - C:\Users\Ayush Ahlawat\OneDrive\Documents\Public Comment Analysis\public-comment-analysis\dataset\policy_comment_dataset_500_cleaned.csv
 - C:\Users\Ayush Ahlawat\OneDrive\Documents\Public Comment Analysis\public-comment-analysis\dataset\short_comments_dataset.csv
 - C:\Users\Ayush Ahlawat\OneDrive\Documents\Public Comment Analysis\public-comment-analysis\dataset\synthetic_policy_comments_1200.csv

 Merging files...

 Removing duplic