In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report,f1_score,confusion_matrix

In [None]:
# Location of the raw CSV, named once so it is easy to find and repoint.
DATA_PATH = "/csv/Suicide_Detection.csv"

# Read the whole file into a DataFrame.
data = pd.read_csv(DATA_PATH)

In [None]:
# Inspect the frame before cleaning. Bare expressions in the middle of a cell
# are never rendered, so the diagnostics are printed explicitly.
print("Shape before cleaning:", data.shape)
print("Missing values per column:")
print(data.isnull().sum())

# Keep only the text and its label (the "class" column is renamed to "label")
# and drop rows where either value is missing.
data = data.rename(columns={"class": "label"})[["text", "label"]].dropna()

# Strip whitespace BEFORE de-duplicating: otherwise texts that differ only in
# leading/trailing whitespace survive as "distinct" rows and can end up on both
# sides of the train/test split. `assign` returns a new frame, which also
# avoids pandas' SettingWithCopyWarning.
data = data.assign(text=data["text"].str.strip())

# Rows that are empty once stripped carry no signal for a text model.
data = data[data["text"] != ""]

# Remove exact duplicates (after normalising whitespace).
data = data.drop_duplicates(subset=["text"])

print("Shape after cleaning:", data.shape)
print(data["label"].value_counts())

# Last expression of the cell, so the notebook renders a preview of the result.
data.head()

In [None]:
# Bar chart of the number of rows per class.
fig, ax = plt.subplots()

# seaborn >= 0.13 deprecates passing `palette` without `hue`, so the x variable
# is mapped to hue as well; each class keeps its own Set2 colour.
# dodge=False keeps the bars full width on older seaborn releases.
sns.countplot(x="label", hue="label", data=data, palette="Set2", dodge=False, ax=ax)

# A legend would only repeat the x tick labels, so remove it if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

ax.set_title("Class Distribution")
ax.set_xlabel("Class")
ax.set_ylabel("Count")
plt.show()

In [None]:
from wordcloud import WordCloud

# Draw one word cloud per class, built from all of that class's text joined
# into a single string.
for label in data["label"].unique():
    class_texts = data.loc[data["label"] == label, "text"]
    cloud = WordCloud(width=800, height=400, background_color="white")
    cloud.generate(" ".join(class_texts))

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation="bilinear")
    ax.axis("off")  # the axes carry no information for an image
    ax.set_title(f"Word Cloud for class: {label}")
    plt.show()

In [None]:
# Character count of every text, stored as a new column on the frame.
data["text_length"] = data["text"].str.len()

# Histogram of those lengths.
fig, ax = plt.subplots()
ax.hist(data["text_length"], bins=50, edgecolor="black")
ax.set_title("Text Length Distribution")
ax.set_xlabel("Length of text")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
from collections import Counter

# Count whitespace-separated tokens over every text. Tokens are counted as-is:
# no lower-casing and no stop-word filtering.
counter = Counter(token for text in data["text"] for token in text.split())
common_words = counter.most_common(20)

# Split the (word, count) pairs into two parallel sequences for plotting.
words, counts = zip(*common_words)

fig, ax = plt.subplots()
ax.bar(words, counts)
plt.setp(ax.get_xticklabels(), rotation=75)  # tilt labels so they do not overlap
ax.set_title("Top 20 Most Frequent Words")
plt.show()

In [None]:
# Number of distinct whitespace-separated tokens seen in each class.
vocab_per_class = {}
for label in data["label"].unique():
    class_texts = data.loc[data["label"] == label, "text"]
    vocab_per_class[label] = len(set(" ".join(class_texts).split()))

fig, ax = plt.subplots()
ax.bar(list(vocab_per_class.keys()), list(vocab_per_class.values()))
ax.set_title("Vocabulary Size per Class")
plt.show()

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions alike in both parts; the fixed seed makes the split
# repeatable.
train_df, test_df = train_test_split(
    data,
    test_size=0.2,
    stratify=data["label"],
    random_state=42,
)

# Print the class proportions of each part to confirm the stratification.
for split in (train_df, test_df):
    print(split["label"].value_counts(normalize=True))

In [None]:
def evaluate(y_true, y_pred, title=""):
    """Print classification metrics and draw a labelled confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, in the same order as ``y_true``.
    title : str, optional
        Heading printed above the metrics and shown as the plot title.

    Returns
    -------
    None
        Results are printed and plotted, not returned.
    """
    print(title)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Macro F1:", f1_score(y_true, y_pred, average="macro"))
    print(classification_report(y_true, y_pred))

    # Fix the class order explicitly so the matrix rows/columns and the tick
    # labels are guaranteed to agree.
    labels = sorted(set(y_true) | set(y_pred))
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    fig, ax = plt.subplots()
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",  # sequential map: darker cell = larger count
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_title(title)
    plt.show()

In [None]:
# TF-IDF features feeding a linear SVM, wrapped in one pipeline so the
# vectoriser is fitted on the training texts only.
tfidf_svm = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1, 2),       # unigrams and bigrams
        min_df=2,                 # ignore rare tokens
        max_df=0.95,              # ignore overly common tokens
        sublinear_tf=True,        # log-scale term frequency
        strip_accents="unicode"   # normalize accents
    )),
    # LinearSVC shuffles the data with a pseudo-random generator when solving
    # the dual problem; pin the seed (same value as the train/test split) so
    # the reported scores and the fitted model are reproducible across runs.
    ("clf", LinearSVC(C=1.0, random_state=42))
])

tfidf_svm.fit(train_df["text"], train_df["label"])
pred_svm = tfidf_svm.predict(test_df["text"])
evaluate(test_df["label"], pred_svm, title="TF-IDF + LinearSVM")

In [None]:
# TF-IDF + logistic regression; class_weight="balanced" reweights the classes
# inversely to their frequency.
tfidf_lr = Pipeline(steps=[
    (
        "tfidf",
        TfidfVectorizer(
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.95,
            sublinear_tf=True,
            strip_accents="unicode",
        ),
    ),
    ("clf", LogisticRegression(C=2.0, max_iter=3000, class_weight="balanced")),
])

# TF-IDF + multinomial naive Bayes. This vectoriser uses neither sublinear
# term-frequency scaling nor accent stripping.
tfidf_nb = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_df=0.95)),
    ("clf", MultinomialNB()),
])

# Fit each pipeline on the training split, then score it on the held-out split.
pred_lr = tfidf_lr.fit(train_df["text"], train_df["label"]).predict(test_df["text"])
evaluate(test_df["label"], pred_lr, title="TF-IDF + LogisticRegression")

pred_nb = tfidf_nb.fit(train_df["text"], train_df["label"]).predict(test_df["text"])
evaluate(test_df["label"], pred_nb, title="TF-IDF + MultinomialNB")

In [None]:
import joblib

# Persist the whole fitted pipeline (TF-IDF vectoriser + LinearSVM) so the
# vectoriser and the classifier are saved together in one file.
MODEL_PATH = "tfidf_svm.pkl"
joblib.dump(tfidf_svm, MODEL_PATH)
print(f"Model saved as {MODEL_PATH}")