In [11]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report



# Load the raw posts. The rest of the cell reads the "target", "title" and
# "text" columns from this frame.
df = pd.read_csv("data_to_be_cleansed.csv")

# "Unnamed: 0" is the name pandas gives a header-less first column
# (presumably a saved row index — confirm). errors="ignore" keeps the cell
# runnable when the CSV was exported without that column.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")


# Integer class id -> human-readable label.
# NOTE(review): ids 2 and 4 both map to "anxiety", which merges two source
# classes into one (it ends up roughly twice the size of the others). Confirm
# this is intentional and not a typo for a different label.
label_map = {
    0: "neutral",
    1: "depression",
    2: "anxiety",
    3: "stress",
    4: "anxiety"
}

df["label"] = df["target"].map(label_map)

# Series.map yields NaN for any value missing from the dict; fail loudly here
# instead of letting NaN labels reach the stratified split further down.
unmapped_targets = df.loc[df["label"].isna(), "target"]
if not unmapped_targets.empty:
    raise ValueError(
        f"{len(unmapped_targets)} rows have target values missing from label_map: "
        f"{unmapped_targets.unique().tolist()}"
    )

print("Label distribution:")
print(df["label"].value_counts())


def clean(text):
    """Normalize raw text to lowercase, letters-only, single-spaced form.

    Steps, in order: lowercase; remove URLs (``http...`` and ``www.`` forms);
    replace every character other than ``a-z`` or whitespace with a space;
    collapse runs of whitespace and trim both ends.

    Parameters
    ----------
    text : object
        Raw text. Non-string values are converted with ``str``. ``None`` and
        float NaN are treated as missing and produce an empty string.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    # Missing values would otherwise be stringified into "none" / "nan" tokens.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # The dot is escaped and anchored with \b: the previous `www.\S+` let the
    # dot match any character, so "awww so cute" lost "www so".
    text = re.sub(r"http\S+|\bwww\.\S+", "", text)
    # Anything that is not a lowercase ASCII letter or whitespace becomes a
    # space (digits, punctuation, non-ASCII characters).
    text = re.sub(r"[^a-z\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Build one document per row from title + body.
# fillna("") comes first because astype(str) turns a missing value into the
# literal string "nan", which would then survive cleaning as a spurious token.
df["clean_text"] = (
    df["title"].fillna("").astype(str) + " " + df["text"].fillna("").astype(str)
).apply(clean)


# 80/20 hold-out split, stratified on the label so both parts keep the same
# class proportions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"],
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"]
)



# Unigram + bigram TF-IDF features, capped at 20,000 terms, with sklearn's
# built-in English stop-word list.
tfidf = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2),
    stop_words="english"
)

# Fit the vocabulary / IDF weights on the training split only, then reuse them
# for the test split so no test-set statistics leak into the features.
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)



# Candidate classifiers, all trained on the same TF-IDF matrix.
# LinearSVC gets an explicit seed: its dual coordinate-descent solver shuffles
# the data, so without random_state the reported scores can drift between runs
# even though the train/test split is seeded.
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Linear SVM": LinearSVC(random_state=42),
    "Naive Bayes": MultinomialNB()
}

# Fit each model on the training features and print a per-class
# precision / recall / F1 report for the held-out split.
for name, model in models.items():
    print(f"\n===== {name} =====")
    model.fit(X_train_vec, y_train)
    preds = model.predict(X_test_vec)
    print(classification_report(y_test, preds))


Label distribution:
label
anxiety       2373
depression    1202
stress        1201
neutral       1181
Name: count, dtype: int64

===== Logistic Regression =====
              precision    recall  f1-score   support

     anxiety       0.73      0.91      0.81       475
  depression       0.74      0.68      0.71       241
     neutral       0.93      0.69      0.79       236
      stress       0.85      0.73      0.78       240

    accuracy                           0.78      1192
   macro avg       0.81      0.75      0.77      1192
weighted avg       0.80      0.78      0.78      1192


===== Linear SVM =====
              precision    recall  f1-score   support

     anxiety       0.82      0.86      0.84       475
  depression       0.76      0.77      0.77       241
     neutral       0.86      0.80      0.83       236
      stress       0.81      0.78      0.80       240

    accuracy                           0.81      1192
   macro avg       0.81      0.80      0.81      1192
