In [1]:
!pip install pandas scikit-learn accelerate datasets 



In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, classification_report  # <-- fixed here
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import warnings

In [30]:
warnings.filterwarnings("ignore")

In [31]:
df_orig = pd.read_csv(r"C:\Users\Educational Purpose\Downloads\posts_train.csv")
df_para = pd.read_csv(r"C:\Users\Educational Purpose\Downloads\posts_train_paraphrased_full.csv")

In [32]:
assert "post" in df_orig.columns and "class_name" in df_orig.columns
assert "post_paraphrased" in df_para.columns and "class_name" in df_para.columns

In [33]:
df_orig = df_orig.dropna(subset=["post", "class_name"]).reset_index(drop=True)
df_para = df_para.dropna(subset=["post_paraphrased", "class_name"]).reset_index(drop=True)

In [34]:
orig_counts = df_orig["class_name"].value_counts()
keep_orig = orig_counts[orig_counts > 1].index
df_orig = df_orig[df_orig["class_name"].isin(keep_orig)].reset_index(drop=True)

In [35]:
para_counts = df_para["class_name"].value_counts()
keep_para = para_counts[para_counts > 1].index
df_para = df_para[df_para["class_name"].isin(keep_para)].reset_index(drop=True)

In [36]:
print("Original shape after cleaning:", df_orig.shape)
print("Paraphrased shape after cleaning:", df_para.shape)

Original shape after cleaning: (13726, 26)
Paraphrased shape after cleaning: (13725, 27)


In [37]:
def get_model(name):
    if name == "Logistic Regression":
        return LogisticRegression(max_iter=1000)
    if name == "Naive Bayes":
        return MultinomialNB()
    if name == "SVM (LinearSVC)":
        return LinearSVC(max_iter=10000)
    if name == "Random Forest":
        return RandomForestClassifier(n_estimators=200, random_state=42)
    if name == "XGBoost":
        return XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42)
    raise ValueError(name)

model_names = ["Logistic Regression", "Naive Bayes", "SVM (LinearSVC)", "Random Forest", "XGBoost"]


In [38]:
def build_for_experiment(exp_num):
    if exp_num == 1:
        full = df_orig.copy()
        train_df, val_df = train_test_split(full, test_size=0.2, random_state=42, stratify=full["class_name"])
        return train_df.reset_index(drop=True), val_df.reset_index(drop=True)

    if exp_num in (2, 3, 4):
        orig_train, orig_val = train_test_split(df_orig, test_size=0.2, random_state=42, stratify=df_orig["class_name"])
        frac = {2: 0.2, 3: 0.5, 4: 1.0}[exp_num]
        para_sample = df_para.sample(frac=frac, random_state=42).reset_index(drop=True)
        df_combined = pd.concat([
            orig_train[["post", "class_name"]],
            para_sample[["post_paraphrased", "class_name"]].rename(columns={"post_paraphrased": "post"})
        ], ignore_index=True)
        df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)
        return df_combined, orig_val.reset_index(drop=True)

    if exp_num == 5:
        _, orig_val = train_test_split(df_orig, test_size=0.2, random_state=42, stratify=df_orig["class_name"])
        df_combined = pd.concat([
            df_orig[["post", "class_name"]],
            df_para[["post_paraphrased", "class_name"]].rename(columns={"post_paraphrased": "post"})
        ], ignore_index=True)
        df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)
        return df_combined, orig_val.reset_index(drop=True)

    raise ValueError("Unknown experiment number")

In [39]:
results = []
detailed_reports = {}

for exp in range(1, 6):
    print("\n" + "="*80)
    print(f" Experiment {exp} - building data...")
    train_df, val_df = build_for_experiment(exp)

    train_df = train_df.dropna(subset=["post", "class_name"]).reset_index(drop=True)
    val_df = val_df.dropna(subset=["post", "class_name"]).reset_index(drop=True)

    print(f" train shape: {train_df.shape} | val shape: {val_df.shape}")
    print(" sample class counts (train -> val):")
    print(train_df["class_name"].value_counts().head())

    # TF-IDF
    vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
    X_train = vectorizer.fit_transform(train_df["post"])
    X_val = vectorizer.transform(val_df["post"])

    # Encode labels numerically
    le = LabelEncoder()
    le.fit(pd.concat([train_df["class_name"], val_df["class_name"]]))
    y_train_enc = le.transform(train_df["class_name"])
    y_val_enc = le.transform(val_df["class_name"])

    for mname in model_names:
        print(f"\n--- Training model: {mname} (Experiment {exp}) ---")
        model = get_model(mname)
        model.fit(X_train, y_train_enc)
        y_pred_enc = model.predict(X_val)
        y_pred = le.inverse_transform(y_pred_enc)

        # Compute metrics
        acc = accuracy_score(val_df["class_name"], y_pred)
        f1 = f1_score(val_df["class_name"], y_pred, average="macro")

        print(f"Accuracy: {acc:.4f} | F1 (macro): {f1:.4f}")

        # Store results
        results.append({
            "Experiment": exp,
            "Model": mname,
            "Accuracy": acc,
            "F1_macro": f1
        })

        detailed_reports[f"exp{exp}_{mname}"] = classification_report(
            val_df["class_name"], y_pred, zero_division=0
        )


 Experiment 1 - building data...
 train shape: (10980, 26) | val shape: (2746, 26)
 sample class counts (train -> val):
class_name
adhd          1972
depression    1960
anxiety       1937
bipolar       1925
ptsd          1601
Name: count, dtype: int64

--- Training model: Logistic Regression (Experiment 1) ---
Accuracy: 0.7622 | F1 (macro): 0.7666

--- Training model: Naive Bayes (Experiment 1) ---
Accuracy: 0.7272 | F1 (macro): 0.7353

--- Training model: SVM (LinearSVC) (Experiment 1) ---
Accuracy: 0.7447 | F1 (macro): 0.7505

--- Training model: Random Forest (Experiment 1) ---
Accuracy: 0.7334 | F1 (macro): 0.7392

--- Training model: XGBoost (Experiment 1) ---
Accuracy: 0.7535 | F1 (macro): 0.7589

 Experiment 2 - building data...
 train shape: (13725, 2) | val shape: (2746, 26)
 sample class counts (train -> val):
class_name
adhd          2449
depression    2440
bipolar       2435
anxiety       2428
ptsd          2016
Name: count, dtype: int64

--- Training model: Logistic Regre

In [40]:
results_df = pd.DataFrame(results)
pivot = results_df.pivot(index="Model", columns="Experiment", values="F1_macro").round(4)

print("\n" + "="*80)
print("Final F1 (macro) table (Model x Experiment):")
print(pivot)

results_df.to_csv("experiment_results.csv", index=False)
pivot.to_csv("experiment_results_pivot.csv", index=False)

with open("detailed_classification_reports.txt", "w", encoding="utf-8") as f:
    for k, v in detailed_reports.items():
        f.write(f"===== {k} =====\n{v}\n\n")

print("\nSaved:")
print(" - experiment_results.csv")
print(" - experiment_results_pivot.csv")
print(" - detailed_classification_reports.txt")


Final F1 (macro) table (Model x Experiment):
Experiment                1       2       3       4       5
Model                                                      
Logistic Regression  0.7666  0.7755  0.7812  0.7939  0.8667
Naive Bayes          0.7353  0.7424  0.7398  0.7429  0.7945
Random Forest        0.7392  0.7465  0.7576  0.7616  0.9993
SVM (LinearSVC)      0.7505  0.7599  0.7697  0.7880  0.9251
XGBoost              0.7589  0.7585  0.7687  0.7756  0.9056

Saved:
 - experiment_results.csv
 - experiment_results_pivot.csv
 - detailed_classification_reports.txt
