In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.utils.class_weight import compute_class_weight

In [2]:
# 1) Read the original, class-imbalanced sentiment dataset from disk.
#    The path is relative to the notebook's working directory.
df = pd.read_csv(
    "../Data/NLP/cleaned_news_sentiment.csv",
)

In [3]:
# 2) Partition the rows into one frame per sentiment class.
#    Rows whose "sentiment" is anything other than these three values
#    (including missing values) end up in none of the frames.
df_neg = df.loc[df["sentiment"].eq("negative")]
df_pos = df.loc[df["sentiment"].eq("positive")]
df_neu = df.loc[df["sentiment"].eq("neutral")]

In [4]:
# 3) Down-sample 'neutral' to a target size (e.g. 80).
#    Sampling is done without replacement, so asking for more rows than
#    df_neu holds raises ValueError. Cap the request at the number of
#    neutral rows available; when there are at least TARGET_NEUTRAL rows
#    the sample is exactly the same as an uncapped request.
TARGET_NEUTRAL = 80
df_neu_down = df_neu.sample(
    n=min(TARGET_NEUTRAL, len(df_neu)),
    random_state=42,  # fixed seed so the selected subset is reproducible
)

In [5]:
# 4) Stack the three per-class frames back together and shuffle the rows
#    so the classes are interleaved rather than grouped.
df_bal = (
    pd.concat([df_neg, df_pos, df_neu_down], axis=0)
    .sample(frac=1, random_state=42)  # frac=1 -> every row, in random order
    .reset_index(drop=True)           # replace the old row labels with 0..n-1
)

In [6]:
# 5) Map each sentiment string to an integer code, stored in a new
#    "label" column. The fitted encoder is kept in `le` so later cells
#    can read the class order from `le.classes_`.
le = LabelEncoder().fit(df_bal["sentiment"])
df_bal["label"] = le.transform(df_bal["sentiment"])

In [7]:
# 6) Compute "balanced" class weights for use in model.fit(class_weight=...).
#    The weights come back in the same order as `le.classes_`, so position i
#    in `weights` belongs to the class the encoder maps to integer i.
weights = compute_class_weight(class_weight="balanced", classes=le.classes_, y=df_bal["sentiment"])

# Key each weight by its position, i.e. by the encoded integer label.
class_weight_dict = {idx: weight for idx, weight in enumerate(weights)}

In [8]:
# 7) Report the outcome: rows per class, the string -> integer label
#    mapping, and the class weights computed above.
sentiment_counts = df_bal["sentiment"].value_counts()
print("New class counts:\n", sentiment_counts, "\n")

label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label mapping:", label_mapping)

print("Class‐weight dict:", class_weight_dict)

New class counts:
 sentiment
neutral     80
negative    56
positive    29
Name: count, dtype: int64 

Label mapping: {'negative': 0, 'neutral': 1, 'positive': 2}
Class‐weight dict: {0: 0.9821428571428571, 1: 0.6875, 2: 1.896551724137931}


In [9]:
# 8) (Optional) persist the re-sampled frame, including the "label" column
#    if it has been added, for downstream steps.
df_bal.to_csv(
    "../Data/NLP/news_sentiment_balanced.csv",
    index=False,           # do not write the row index as a column
    encoding="utf-8-sig",  # UTF-8 with a leading byte-order mark
)