In [1]:
!pip install pandas scikit-learn accelerate datasets



In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import warnings


In [10]:
warnings.filterwarnings("ignore")

In [18]:
# Read the labelled posts into a DataFrame.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
posts_csv_path = r"C:\Users\Educational Purpose\Downloads\posts_test.csv"
df = pd.read_csv(filepath_or_buffer=posts_csv_path)

In [19]:
# Fail fast when the CSV lacks either column that the following cells read.
required_columns = ("post", "class_name")
assert all(column in df.columns for column in required_columns), \
    "posts_test.csv must have 'post' and 'class_name' columns."


In [20]:
# Discard rows missing either the post text or its label, then renumber the
# remaining rows from 0 (the old index is not kept as a column).
complete_rows = df.dropna(subset=["post", "class_name"])
df = complete_rows.reset_index(drop=True)

print(f"Dataset shape: {df.shape}")


Dataset shape: (1488, 4)


In [22]:
# Hold out 20% of the rows for evaluation. The split is seeded (42) and
# stratified on the class_name column.
split_options = dict(test_size=0.2, random_state=42, stratify=df["class_name"])
train_df, test_df = train_test_split(df, **split_options)

In [24]:
# Build TF-IDF features (at most 5000 terms, English stop words removed).
# The vectorizer is fitted on the training posts only and then reused,
# unchanged, to transform the test posts.
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
train_posts, test_posts = train_df["post"], test_df["post"]
X_train = vectorizer.fit_transform(train_posts)
X_test = vectorizer.transform(test_posts)

# Fit the label encoder on every class_name value in df, then encode the
# labels of each split with that single fitted encoder.
le = LabelEncoder().fit(df["class_name"])
y_train_enc, y_test_enc = (
    le.transform(part["class_name"]) for part in (train_df, test_df)
)


In [25]:
def get_model(name, random_state=42):
    """Build a fresh, unfitted classifier for a display name.

    Parameters
    ----------
    name : str
        One of "Logistic Regression", "Naive Bayes", "SVM (LinearSVC)",
        "Random Forest" or "XGBoost".
    random_state : int, optional
        Seed handed to LinearSVC, RandomForestClassifier and XGBClassifier.
        Defaults to 42, the value previously hard-coded for Random Forest
        and XGBoost. LinearSVC used to be built without a seed; it now
        receives the same one so repeated runs are comparable.

    Returns
    -------
    estimator
        A new estimator instance configured for ``name``.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported display names.
    """
    # Zero-argument factories: only the requested estimator is constructed.
    factories = {
        "Logistic Regression": lambda: LogisticRegression(max_iter=1000),
        "Naive Bayes": lambda: MultinomialNB(),
        "SVM (LinearSVC)": lambda: LinearSVC(max_iter=10000, random_state=random_state),
        "Random Forest": lambda: RandomForestClassifier(
            n_estimators=200, random_state=random_state
        ),
        # NOTE(review): use_label_encoder is kept as in the original call;
        # confirm it is still meaningful for the installed xgboost version.
        "XGBoost": lambda: XGBClassifier(
            use_label_encoder=False, eval_metric="mlogloss", random_state=random_state
        ),
    }
    try:
        factory = factories[name]
    except (KeyError, TypeError):
        # TypeError covers unhashable names, which previously also ended in ValueError.
        raise ValueError(
            f"Unknown model name: {name!r}; expected one of {list(factories)}"
        ) from None
    return factory()

# Display names of the classifiers to evaluate, in training order.
model_names = [
    "Logistic Regression",
    "Naive Bayes",
    "SVM (LinearSVC)",
    "Random Forest",
    "XGBoost",
]


In [26]:
# One metrics row per model; turned into a DataFrame in a later cell.
results = []

# Encoded ids of every class known to the encoder, in le.classes_ order.
# Passing them as `labels` keeps the report rows aligned with target_names
# even when a class is absent from both y_test_enc and y_pred; without this,
# classification_report raises on the length mismatch.
label_ids = list(range(len(le.classes_)))

for mname in model_names:
    print(f"\n--- Training model: {mname} ---")
    # A fresh estimator per name, fitted on the TF-IDF training matrix.
    model = get_model(mname)
    model.fit(X_train, y_train_enc)
    y_pred = model.predict(X_test)

    # Headline metrics on the held-out split.
    acc = accuracy_score(y_test_enc, y_pred)
    f1 = f1_score(y_test_enc, y_pred, average="macro")

    print(f"Accuracy: {acc:.4f} | F1 (macro): {f1:.4f}")
    print(
        classification_report(
            y_test_enc, y_pred, labels=label_ids, target_names=le.classes_
        )
    )

    results.append({"Model": mname, "Accuracy": acc, "F1_macro": f1})



--- Training model: Logistic Regression ---
Accuracy: 0.6711 | F1 (macro): 0.6679
              precision    recall  f1-score   support

        adhd       0.80      0.67      0.73        49
     anxiety       0.72      0.69      0.71        49
     bipolar       0.62      0.62      0.62        50
  depression       0.51      0.46      0.48        50
        none       0.66      0.92      0.77        50
        ptsd       0.73      0.66      0.69        50

    accuracy                           0.67       298
   macro avg       0.67      0.67      0.67       298
weighted avg       0.67      0.67      0.67       298


--- Training model: Naive Bayes ---
Accuracy: 0.6275 | F1 (macro): 0.6312
              precision    recall  f1-score   support

        adhd       0.78      0.63      0.70        49
     anxiety       0.57      0.55      0.56        49
     bipolar       0.60      0.70      0.65        50
  depression       0.51      0.48      0.49        50
        none       0.94     

In [29]:
# Gather the per-model metric rows into one table and print it under a heading.
results_df = pd.DataFrame(data=results)
print("\nFinal Results:", results_df, sep="\n")

# Write the table to CSV without the index column.
# NOTE(review): absolute, machine-specific output path.
results_csv_path = r"C:\Users\Educational Purpose\Downloads\results_only_test.csv"
results_df.to_csv(results_csv_path, index=False)
print("\nSaved results to results_only_test.csv")



Final Results:
                 Model  Accuracy  F1_macro
0  Logistic Regression  0.671141  0.667880
1          Naive Bayes  0.627517  0.631163
2      SVM (LinearSVC)  0.674497  0.670725
3        Random Forest  0.644295  0.644478
4              XGBoost  0.590604  0.585720

Saved results to results_only_test.csv
