In [None]:
import pandas as pd
import numpy as np
import re

# Absolute locations of the merged raw input CSV and of the two cleaned
# outputs (the full cleaned set and the fixed-size sample).
# NOTE(review): these are machine-specific Windows paths; adjust them before
# running this cell on another machine.
INPUT_PATH = r"C:\Users\Ayush Ahlawat\OneDrive\Documents\Public Comment Analysis\public-comment-analysis\dataset\merged_raw_dataset.csv"
OUTPUT_PATH_FULL = r"C:\Users\Ayush Ahlawat\OneDrive\Documents\Public Comment Analysis\public-comment-analysis\dataset\final_cleaned_dataset.csv"
OUTPUT_PATH_2000 = r"C:\Users\Ayush Ahlawat\OneDrive\Documents\Public Comment Analysis\public-comment-analysis\dataset\final_cleaned_dataset_2000.csv"

# Load the raw CSV with pandas defaults and report its shape and column names
# so the column detection that follows can be checked by eye.
print(" Loading dataset:", INPUT_PATH)
df = pd.read_csv(INPUT_PATH)
print("Initial shape:", df.shape)
print("Initial columns:", df.columns.tolist())


def _find_column(columns, candidates):
    """Return the column matching the highest-priority candidate name.

    Candidates are tried in list order and compared case-insensitively
    against the column names, so the result depends on the priority of the
    candidates rather than on the order of the columns in the file.

    Args:
        columns: iterable of column names (e.g. ``df.columns``).
        candidates: lower-case names to look for, most preferred first.

    Returns:
        The original column name of the best match, or ``None`` if no
        column matches any candidate.
    """
    by_lower = {}
    for col in columns:
        # Keep the first column seen for each lower-cased name.
        by_lower.setdefault(str(col).lower(), col)
    for candidate in candidates:
        if candidate in by_lower:
            return by_lower[candidate]
    return None


# Candidate names are listed most-preferred first; the canonical names
# ("comment_text", "stance_label") come first so that a file which already
# has them never gets a second column renamed on top of them.
possible_text_cols = ["comment_text", "comment", "text", "review", "review_text", "comments"]
possible_label_cols = ["stance_label", "sentiment", "label", "stance", "polarity"]

text_col = _find_column(df.columns, possible_text_cols)
label_col = _find_column(df.columns, possible_label_cols)

if text_col is None:
    # Fall back to the first column, but never to the label column itself:
    # using one column for both roles would make the rename below ambiguous.
    text_col = next((col for col in df.columns if col != label_col), df.columns[0])

print(" Using text column:", text_col)

if label_col is not None:
    print(" Using label column:", label_col)
else:
    print(" No label column found â€” creating empty one.")
    df["stance_label"] = np.nan
    label_col = "stance_label"

# Standardise on the canonical names used by the rest of the pipeline.
df = df.rename(columns={text_col: "comment_text", label_col: "stance_label"})


# Emoticons, symbols & pictographs, transport & map symbols, regional flags.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "]+",
    flags=re.UNICODE,
)

# URLs must start at a word boundary and look like a URL ("http://",
# "https://" or "www."), so ordinary words such as "awwww" or "httpd" are
# left alone.
_URL_PATTERN = re.compile(r"\bhttps?://\S+|\bwww\.\S+", flags=re.IGNORECASE)
_NON_ASCII_PATTERN = re.compile(r"[^\x00-\x7F]+")
_DISALLOWED_PATTERN = re.compile(r"[^a-zA-Z0-9@#'\s]")
_WHITESPACE_PATTERN = re.compile(r"\s+")
# Typographic single quotes -> ASCII apostrophe, so "don\u2019t" is kept as
# "don't" instead of being split into "don t".
_APOSTROPHE_TABLE = str.maketrans({"\u2018": "'", "\u2019": "'"})


def clean_text(x):
    """Normalise one raw comment into lower-case ASCII text.

    Steps, in order: drop URLs, fold curly apostrophes to ``'``, replace
    emoji and any other non-ASCII run with a space, replace every character
    other than letters, digits, ``@``, ``#``, ``'`` and whitespace with a
    space, collapse whitespace, strip, and lower-case.

    Args:
        x: the raw value from the comment column; any type is accepted and
            converted with ``str``.

    Returns:
        The cleaned string, or ``""`` when ``x`` is missing (``None``/NaN).
    """
    if pd.isna(x):
        return ""
    text = str(x)

    text = _URL_PATTERN.sub("", text)
    text = text.translate(_APOSTROPHE_TABLE)

    # Emoji become a space (not an empty string) so the words on either
    # side of them are not glued together.
    text = emoji_pattern.sub(" ", text)
    text = _NON_ASCII_PATTERN.sub(" ", text)

    text = _DISALLOWED_PATTERN.sub(" ", text)
    text = _WHITESPACE_PATTERN.sub(" ", text).strip()

    return text.lower()

# Clean every comment, then keep only non-empty, first-seen comment texts.
print(" Cleaning text...")
cleaned_comments = df["comment_text"].apply(clean_text)
df["comment_text"] = cleaned_comments

# Rows whose comment is empty after cleaning carry no usable text.
has_text = df["comment_text"].str.strip().ne("")
df = df.loc[has_text]

# Identical cleaned comments are collapsed onto their first occurrence and
# the index is renumbered from zero.
df = df.drop_duplicates(subset=["comment_text"])
df = df.reset_index(drop=True)

print("After cleaning:", df.shape)


def normalize_label(x):
    """Map a raw stance/sentiment value onto "for", "against" or "neutral".

    The value is lower-cased and stripped, then matched in two passes:

    1. exact match against a list of known synonyms for each class;
    2. substring search for a few hint words ("for" hints are checked
       before "against" hints). Because this is a plain substring test, a
       hint also matches inside longer words (e.g. "support" inside
       "unsupported").

    Returns ``np.nan`` for missing input or when nothing matches.
    """
    if pd.isna(x):
        return np.nan
    token = str(x).lower().strip()

    exact_synonyms = {
        "for": ("for", "yes", "pro", "support", "positive", "agree"),
        "against": ("against", "no", "anti", "oppose", "negative", "disagree"),
        "neutral": ("neutral", "none", "mixed", "undecided"),
    }
    for canonical, synonyms in exact_synonyms.items():
        if token in synonyms:
            return canonical

    # keyword inference
    keyword_hints = (
        ("for", ("support", "good", "benefit", "positive")),
        ("against", ("oppose", "bad", "concern", "negative")),
    )
    for canonical, keywords in keyword_hints:
        for keyword in keywords:
            if keyword in token:
                return canonical

    return np.nan

# Bring every label onto the canonical vocabulary, then keep only rows that
# ended up with one of the three recognised classes.
print(" Normalizing labels...")
df["stance_label"] = df["stance_label"].apply(normalize_label)

# Remove unlabeled rows
valid_labels = ["for", "against", "neutral"]
labelled = df.dropna(subset=["stance_label"])
df = labelled.loc[labelled["stance_label"].isin(valid_labels)]

print("After label cleaning:", df.shape)
label_counts = df["stance_label"].value_counts()
print("Label distribution:\n", label_counts)


TARGET = 2000
current = len(df)


def _stratified_downsample(frame, target, label_col="stance_label", seed=42):
    """Draw exactly ``target`` rows from ``frame``, keeping label proportions.

    Each label group is sampled with the same fraction ``target / len(frame)``.
    Per-group rounding can leave the total slightly above or below
    ``target``; a shortfall is topped up from rows that were not selected,
    and a surplus is removed by the final fixed-size sample (which also
    shuffles the rows).

    Args:
        frame: DataFrame with more than ``target`` rows.
        target: number of rows to return.
        label_col: column to stratify on.
        seed: random_state used for every draw.

    Returns:
        A new DataFrame with ``target`` rows and a fresh 0..n-1 index.
    """
    # A unique positional index lets the top-up step identify unused rows;
    # sampling is positional, so this does not change which rows are drawn.
    frame = frame.reset_index(drop=True)
    frac = target / len(frame)

    # Sample each group explicitly instead of groupby(...).apply(...):
    # apply() on the grouping column is deprecated in recent pandas and
    # drops the label column once the grouping columns are excluded.
    parts = [
        group.sample(frac=frac, random_state=seed)
        for _, group in frame.groupby(label_col)
    ]
    picked = pd.concat(parts)

    shortfall = target - len(picked)
    if shortfall > 0:
        unused = frame.loc[~frame.index.isin(picked.index)]
        picked = pd.concat([picked, unused.sample(n=shortfall, random_state=seed)])

    return picked.sample(n=target, random_state=seed).reset_index(drop=True)


def _upsample(frame, target, seed=42):
    """Pad ``frame`` to ``target`` rows with rows re-drawn with replacement.

    Raises:
        ValueError: if ``frame`` is empty, since there is nothing to draw from.
    """
    if frame.empty:
        raise ValueError("Cannot upsample an empty dataset")
    extra = frame.sample(n=target - len(frame), replace=True, random_state=seed)
    return pd.concat([frame, extra]).reset_index(drop=True)


if current >= 2100:
    print(f"Sampling down from {current} â†’ {TARGET} (stratified)")
    df_2000 = _stratified_downsample(df, TARGET)

elif current < 1800:
    print(f"Dataset too small ({current}), upsampling â†’ {TARGET}")
    df_2000 = _upsample(df, TARGET)

else:
    print(f"Dataset acceptable ({current}), sampling to exactly {TARGET}")
    if current >= TARGET:
        df_2000 = df.sample(n=TARGET, random_state=42).reset_index(drop=True)
    else:
        # With 1800-1999 rows a sample of TARGET without replacement is
        # impossible, so pad with replacement instead of raising.
        df_2000 = _upsample(df, TARGET)

print("Final 2000 shape:", df_2000.shape)
print("Final distribution:\n", df_2000["stance_label"].value_counts())


# Write the full cleaned dataset and the fixed-size sample, both without the
# index column, then report where each file went.
for frame, destination in ((df, OUTPUT_PATH_FULL), (df_2000, OUTPUT_PATH_2000)):
    frame.to_csv(destination, index=False)

print("\nðŸŽ‰ DONE!")
print("Full cleaned dataset saved as:", OUTPUT_PATH_FULL)
print("2000-row cleaned dataset saved as:", OUTPUT_PATH_2000)


 Loading dataset: C:\Users\Ayush Ahlawat\OneDrive\Documents\Public Comment Analysis\public-comment-analysis\dataset\merged_raw_dataset.csv
Initial shape: (5373, 18)
Initial columns: ['file_name', 'comment_text', 'stance_label', 'word_count', 'policy_domain', 'app_name', 'package_name', 'rating', 'policy_id', 'policy_title', 'domain', 'state', 'district', 'source_type', 'source_name', 'respondent_type', 'sentiment_label', 'cleaned_comment_text']
 Using text column: comment_text
 Using label column: stance_label
 Cleaning text...
After cleaning: (5267, 18)
 Normalizing labels...
After label cleaning: (4774, 18)
Label distribution:
 stance_label
neutral    3725
against     539
for         510
Name: count, dtype: int64
Sampling down from 4774 â†’ 2000 (stratified)
Final 2000 shape: (2000, 18)
Final distribution:
 stance_label
neutral    1560
against     226
for         214
Name: count, dtype: int64

ðŸŽ‰ DONE!
Full cleaned dataset saved as: C:\Users\Ayush Ahlawat\OneDrive\Documents\Public 

  df_2000 = df.groupby("stance_label", group_keys=False).apply(
