<a href="https://colab.research.google.com/github/DasBytes/three-stage-banglish-depression-classifier/blob/main/Banglish_Depression_classifier_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Upload Code

In [None]:
from google.colab import files
uploaded = files.upload()

Saving Banglish depression dataset.csv to Banglish depression dataset.csv


#Logistic regression

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.utils import shuffle, resample
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack


file_name = list(uploaded.keys())[0]
df = pd.read_csv(file_name, header=None, names=["Category", "Sentence"])
df.dropna(subset=["Sentence", "Category"], inplace=True)
df["Category"] = df["Category"].str.strip()
df = shuffle(df, random_state=42)

extra_data = [
    {'Sentence': 'ami ajke bajare jabo', 'Category': 'No Depression'},
    {'Sentence': 'ami office e jabo', 'Category': 'No Depression'},
    {'Sentence': 'ajke weather ta nice', 'Category': 'No Depression'},
    {'Sentence': 'ami valo achi', 'Category': 'No Depression'},
    {'Sentence': 'shobai kemon acho', 'Category': 'No Depression'},
    {'Sentence': 'alhamdulillah bhalo achi', 'Category': 'No Depression'},
    {'Sentence': 'ami suicide korbo', 'Category': 'Severe'},
    {'Sentence': 'ami ar bachbo na', 'Category': 'Severe'},
    {'Sentence': 'goodbye earth', 'Category': 'Severe'},
    {'Sentence': 'amar life ta khub kharap', 'Category': 'Mild'},
    {'Sentence': 'valolage na kichu', 'Category': 'Mild'},
    {'Sentence': 'sobai keno chole jay', 'Category': 'Mild'},
    {'Sentence': 'ami ar kichu korte parbo na', 'Category': 'No Depression'},
    {'Sentence': 'ami khub stressed feel kortesi', 'Category': 'Mild'}
]

df = pd.concat([df, pd.DataFrame(extra_data)], ignore_index=True)

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^a-zA-Zআ-হ0-9\s]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

df["Cleaned"] = df["Sentence"].apply(clean_text)

positive_words = ['valo', 'bhalo', 'happy', 'alhamdulillah', 'nice']
negative_words = ['kharap', 'na', 'tired', 'stress', 'sad', 'suicide', 'khub']

def count_words(text, word_list):
    return sum(text.count(w) for w in word_list)

df["sent_len"] = df["Cleaned"].apply(lambda x: len(x.split()))
df["pos_count"] = df["Cleaned"].apply(lambda x: count_words(x, positive_words))
df["neg_count"] = df["Cleaned"].apply(lambda x: count_words(x, negative_words))

classes = df["Category"].unique()
max_size = df["Category"].value_counts().max()

df_balanced = pd.concat([
    resample(df[df["Category"] == cls], replace=True, n_samples=max_size, random_state=42)
    for cls in classes
])

df_balanced = shuffle(df_balanced, random_state=42)

X_text = df_balanced["Cleaned"]
X_num = df_balanced[["sent_len", "pos_count", "neg_count"]].values
y = df_balanced["Category"]

scaler = StandardScaler()
X_num = scaler.fit_transform(X_num)

X_train_text, X_test_text, y_train, y_test, X_train_num, X_test_num = train_test_split(
    X_text, y, X_num, test_size=0.2, random_state=42, stratify=y
)

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,4))
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)

X_train_combined = hstack([X_train_tfidf, X_train_num])
X_test_combined = hstack([X_test_tfidf, X_test_num])

model = LogisticRegression(max_iter=2000, multi_class='multinomial', solver='sag', C=30, random_state=42)
model.fit(X_train_combined, y_train)

y_pred = model.predict(X_test_combined)
accuracy  = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="macro")
recall    = recall_score(y_test, y_pred, average="macro")
f1        = f1_score(y_test, y_pred, average="macro")

print("Accuracy :", accuracy)
print("Precision:", precision)
print("Recall   :", recall)
print("F1-score :", f1)



def predict_live(text):
    clean = clean_text(text)
    tfidf_vec = vectorizer.transform([clean])
    sent_len = len(clean.split())
    pos_count = count_words(clean, positive_words)
    neg_count = count_words(clean, negative_words)
    num_feat = scaler.transform([[sent_len, pos_count, neg_count]])
    combined = hstack([tfidf_vec, num_feat])
    pred = model.predict(combined)[0]
    prob = np.max(model.predict_proba(combined)) * 100
    return pred, prob

while True:
    txt = input("Enter text for prediction (or type 'exit' to quit): ").strip()
    if txt.lower() == 'exit':
        break
    if txt:
        pred, conf = predict_live(txt)
        print(f"Prediction: {pred} | Confidence: {conf:.2f}%")




Accuracy : 0.939639079029247
Precision: 0.9395592511554691
Recall   : 0.9396006253024156
F1-score : 0.9395654216734438
Enter text for prediction (or type 'exit' to quit): exit


#LSTM

In [None]:
!pip install emoji
!pip install gensim
!pip install tensorflow

Collecting emoji
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.15.0-py3-none-any.whl (608 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/608.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m604.2/608.4 kB[0m [31m18.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.15.0
Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m64.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0
Collecting

In [None]:
import pandas as pd
import numpy as np
import re
import emoji
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.utils import to_categorical
import gensim.downloader as api
import nltk
nltk.download('stopwords')

file_name = "Banglish depression dataset.csv"
df = pd.read_csv(file_name)
df.columns = ["Category", "Sentence"]
df = df.dropna(subset=['Sentence', 'Category'])

eng_stop = set(stopwords.words("english"))
bn_stop = {"ami","tumi","amra","valo","kharap","ache","achhi","kintu","na","ar","shob",
           "ekta","kore","shudhu","amar","tumar","jibone","mone","kotha","ki","kemon",
           "tome","tomar","tara","tarao","taraor","je","sei","ei","oka","ora"}
stop_words = eng_stop.union(bn_stop)

positive_words = {'moja','happy','bhalo','fun','sundor','friend','party','mojar'}
negative_words = {'dukho','kharaap','niras','lonely','stress','dukhi','depressed'}

tokenizer = RegexpTokenizer(r'\w+')

def clean_text(text):
    text = str(text).lower()
    text = emoji.replace_emoji(text, replace='')
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^a-zA-Z\u0980-\u09FF ]+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    tokens = tokenizer.tokenize(text)
    tokens = [t for t in tokens if t not in stop_words and len(t) > 1]
    return tokens

def augment_text(tokens):
    if len(tokens) > 1:
        idx = np.random.randint(0, len(tokens))
        tokens.insert(idx, tokens[idx])
    return tokens

aug_sentences = []
aug_labels = []
for sentence, label in zip(df['Sentence'], df['Category']):
    tokens = clean_text(sentence)
    aug_sentences.append(tokens)
    aug_labels.append(label)
    aug_sentences.append(augment_text(tokens.copy()))
    aug_labels.append(label)

df_aug = pd.DataFrame({"Category": aug_labels, "tokens": aug_sentences})
df_aug = shuffle(df_aug, random_state=42)

ft_model = api.load("fasttext-wiki-news-subwords-300")
embedding_dim = ft_model.vector_size

word_index = {word: idx+1 for idx, word in enumerate(ft_model.key_to_index)}
vocab_size = len(word_index) + 1

def tokens_to_sequence(tokens):
    return [word_index[t] for t in tokens if t in word_index]

df_aug['seq'] = df_aug['tokens'].apply(tokens_to_sequence)
max_len = 50
X = pad_sequences(df_aug['seq'], maxlen=max_len)

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, idx in word_index.items():
    try:
        embedding_matrix[idx] = ft_model.get_vector(word)
    except KeyError:
        continue

encoder = LabelEncoder()
y = encoder.fit_transform(df_aug['Category'])
y_cat = to_categorical(y)
num_classes = y_cat.shape[1]

X_train, X_test, y_train, y_test = train_test_split(
    X, y_cat, test_size=0.2, random_state=42, stratify=y
)

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                    weights=[embedding_matrix], input_length=max_len, trainable=False))
model.add(LSTM(128))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=15, batch_size=32, validation_split=0.1, verbose=1)

y_pred_probs = model.predict(X_test)
y_pred_classes = np.argmax(y_pred_probs, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

accuracy = accuracy_score(y_true_classes, y_pred_classes)
precision = precision_score(y_true_classes, y_pred_classes, average='weighted', zero_division=0)
recall = recall_score(y_true_classes, y_pred_classes, average='weighted', zero_division=0)
f1 = f1_score(y_true_classes, y_pred_classes, average='weighted', zero_division=0)

print("✨ Model Evaluation Results ✨")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

def predict_sentence_with_confidence(sentence):
    tokens = clean_text(sentence)
    if len(set(tokens) & positive_words) > 0:
        return "Non-depression", 1.0
    seq = tokens_to_sequence(tokens)
    padded = pad_sequences([seq], maxlen=max_len)
    pred = model.predict(padded)
    class_idx = np.argmax(pred)
    class_label = encoder.inverse_transform([class_idx])[0]
    confidence = pred[0][class_idx]
    return class_label, confidence

while True:
    sentence = input("Enter a Banglish sentence (or type 'exit' to quit): ")
    if sentence.lower() == 'exit':
        break
    prediction, conf = predict_sentence_with_confidence(sentence)
    print(f"Prediction: {prediction} | Confidence: {conf:.2f}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.






Epoch 1/15
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 38ms/step - accuracy: 0.6056 - loss: 0.7901 - val_accuracy: 0.7992 - val_loss: 0.4690
Epoch 2/15
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 37ms/step - accuracy: 0.8007 - loss: 0.4937 - val_accuracy: 0.8210 - val_loss: 0.4286
Epoch 3/15
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 37ms/step - accuracy: 0.8144 - loss: 0.4404 - val_accuracy: 0.8325 - val_loss: 0.4387
Epoch 4/15
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 37ms/step - accuracy: 0.8282 - loss: 0.4167 - val_accuracy: 0.8325 - val_loss: 0.3917
Epoch 5/15
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 37ms/step - accuracy: 0.8422 - loss: 0.3797 - val_accuracy: 0.8450 - val_loss: 0.3737
Epoch 6/15
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 37ms/step - accuracy: 0.8532 - loss: 0.3528 - val_accuracy: 0.8574 - val_loss: 0.3495
Epoch 7/15
[1m2

#ANN MLP

In [None]:
import pandas as pd
import numpy as np
import re
import emoji
import gensim.downloader as api
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

df = pd.read_csv("Banglish depression dataset.csv")
df.dropna(subset=['Sentence', 'Category'], inplace=True)

stopwords_eng = set(stopwords.words('english'))
stopwords_bangla = {'ami','tumi','shei','amra','eto','kemon','achho','aschi','na'}
all_stopwords = stopwords_eng.union(stopwords_bangla)

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = emoji.demojize(text, delimiters=(" ", " "))
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    tokens = text.split()
    tokens = [w for w in tokens if w not in all_stopwords]
    return tokens

df['Tokens'] = df['Sentence'].apply(clean_text)

le = LabelEncoder()
df['Label'] = le.fit_transform(df['Category'])
num_classes = len(le.classes_)
y = to_categorical(df['Label'], num_classes=num_classes)

ft_model = api.load('fasttext-wiki-news-subwords-300')
embedding_dim = ft_model.vector_size

def sentence_to_vec(tokens, model, dim):
    vecs = []
    for word in tokens:
        if word in model:
            vecs.append(model[word])
    if len(vecs) > 0:
        return np.mean(vecs, axis=0)
    else:
        return np.zeros(dim)

X = np.array([sentence_to_vec(tokens, ft_model, embedding_dim) for tokens in df['Tokens']])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

model = Sequential()
model.add(Dense(128, input_dim=embedding_dim, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, validation_split=0.1, epochs=30, batch_size=32, verbose=2)

# --- Evaluation ---
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

accuracy = accuracy_score(y_true_classes, y_pred_classes)
precision = precision_score(y_true_classes, y_pred_classes, average='weighted', zero_division=0)
recall = recall_score(y_true_classes, y_pred_classes, average='weighted', zero_division=0)
f1 = f1_score(y_true_classes, y_pred_classes, average='weighted', zero_division=0)

print("\n✨ Model Evaluation Results ✨")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

def predict_depression(text):
    tokens = clean_text(text)
    vec = sentence_to_vec(tokens, ft_model, embedding_dim).reshape(1, -1)
    pred = model.predict(vec)
    pred_class = np.argmax(pred, axis=1)[0]
    confidence = np.max(pred)
    return le.inverse_transform([pred_class])[0], confidence

while True:
    sentence = input("\nEnter text (or 'exit'): ")
    if sentence.lower() == 'exit':
        break
    category, conf = predict_depression(sentence)
    print(f"Predicted Category: {category} | Confidence: {conf:.2f}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


136/136 - 1s - 7ms/step - accuracy: 0.5691 - loss: 0.8477 - val_accuracy: 0.6923 - val_loss: 0.7005
Epoch 2/30
136/136 - 0s - 2ms/step - accuracy: 0.6950 - loss: 0.6539 - val_accuracy: 0.7360 - val_loss: 0.6037
Epoch 3/30
136/136 - 0s - 2ms/step - accuracy: 0.7531 - loss: 0.5678 - val_accuracy: 0.7734 - val_loss: 0.5460
Epoch 4/30
136/136 - 0s - 2ms/step - accuracy: 0.7767 - loss: 0.5289 - val_accuracy: 0.7672 - val_loss: 0.5456
Epoch 5/30
136/136 - 0s - 2ms/step - accuracy: 0.7936 - loss: 0.4937 - val_accuracy: 0.7796 - val_loss: 0.5279
Epoch 6/30
136/136 - 0s - 2ms/step - accuracy: 0.7894 - loss: 0.4815 - val_accuracy: 0.7817 - val_loss: 0.5107
Epoch 7/30
136/136 - 0s - 2ms/step - accuracy: 0.8054 - loss: 0.4583 - val_accuracy: 0.7755 - val_loss: 0.5136
Epoch 8/30
136/136 - 0s - 2ms/step - accuracy: 0.8112 - loss: 0.4566 - val_accuracy: 0.7775 - val_loss: 0.5420
Epoch 9/30
136/136 - 0s - 2ms/step - accuracy: 0.8070 - loss: 0.4592 - val_accuracy: 0.7879 - val_loss: 0.5084
Epoch 10/30


#Random forest


In [None]:
import pandas as pd
import numpy as np
import re
import emoji
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils import shuffle

file_path = 'Banglish depression dataset.csv'
df = pd.read_csv(file_path)
df.columns = ["Category", "Sentence"]
df = df.dropna(subset=['Sentence', 'Category'])

eng_stop = {"i","me","my","myself","we","our","ours","ourselves","you","your","yours","yourself","yourselves","he","him","his","himself","she","her","hers","herself","it","its","itself","they","them","their","theirs","themselves","what","which","who","whom","this","that","these","those","am","is","are","was","were","be","been","being","have","has","had","having","do","does","did","doing","a","an","the","and","but","if","or","because","as","until","while","of","at","by","for","with","about","against","between","into","through","during","before","after","above","below","to","from","up","down","in","out","on","off","over","under","again","further","then","once","here","there","when","where","why","how","all","any","both","each","few","more","most","other","some","such","no","nor","not","only","own","same","so","than","too","very","s","t","can","will","just","don","should","now"}
bn_stop = {"ami","tumi","amra","valo","kharap","ache","achhi","kintu","na","ar","shob","ekta","kore","shudhu","amar","tumar","jibone","mone","kotha","ki","kemon","tome","tomar","tara","tarao","taraor","je","sei","ei","oka","ora"}
stop_words = eng_stop.union(bn_stop)

def preprocess_text(text):
    text = str(text).lower()
    text = emoji.demojize(text, delimiters=(" "," "))
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"[^a-zA-Z\u0980-\u09FF ]+", " ", text)
    text = re.sub(r"(.)\1{2,}", r"\1\1", text)
    text = re.sub(r"\s+", " ", text).strip()
    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words and len(t) > 1]
    return " ".join(tokens)

df['Cleaned_Sentence'] = df['Sentence'].apply(preprocess_text)

positive_words = {'moja','happy','bhalo','fun','sundor','friend','party','mojar'}
negative_words = {'dukho','kharaap','niras','lonely','stress','dukhi','depressed','chinta','udasin'}

def augment_text(text):
    tokens = text.split()
    new_tokens = tokens.copy()
    if len(tokens) > 1:
        idx = np.random.randint(0, len(tokens))
        new_tokens.insert(idx, tokens[idx])
    return " ".join(new_tokens)

aug_sentences = []
aug_labels = []
for sentence, label in zip(df['Cleaned_Sentence'], df['Category']):
    aug_sentences.append(sentence)
    aug_labels.append(label)
    for _ in range(1):
        aug_sentences.append(augment_text(sentence))
        aug_labels.append(label)

df_aug = pd.DataFrame({"Category": aug_labels, "Cleaned_Sentence": aug_sentences})
df_aug = shuffle(df_aug, random_state=42)

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X = vectorizer.fit_transform(df_aug['Cleaned_Sentence'])
y = df_aug['Category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

rf_model = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

def predict_text(text):
    cleaned = preprocess_text(text)
    vec = vectorizer.transform([cleaned])
    pred = rf_model.predict(vec)[0]
    pred_prob = max(rf_model.predict_proba(vec)[0])
    tokens = set(cleaned.split())
    if len(tokens & positive_words) > 0:
        pred = 'Non-depression'
    if len(tokens & negative_words) > 0 and pred != 'Non-depression':
        pred_prob = min(1.0, pred_prob + 0.1)
    return pred, pred_prob

while True:
    text_input = input("Enter a Banglish sentence (or type 'exit' to quit): ")
    if text_input.lower() == 'exit':
        break
    if text_input.strip() == "":
        continue
    prediction, confidence = predict_text(text_input)
    print(f"Predicted Category: {prediction}")
    print(f"Confidence Score:   {confidence:.2f}")
    print()


Accuracy:  0.9592
Precision: 0.9594
Recall:    0.9592
F1 Score:  0.9592
Enter a Banglish sentence (or type 'exit' to quit): exit


#SVM

In [None]:
import pandas as pd
import re
import emoji
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.sparse import hstack
import numpy as np

df = pd.read_csv("Banglish depression dataset.csv")
df.dropna(subset=['Sentence', 'Category'], inplace=True)

stopwords_eng = {...}
stopwords_bangla = {...}
all_stopwords = stopwords_eng.union(stopwords_bangla)

positive_words = {'moja','happy','bhalo','fun','sundor','friend','party','mojar'}
negative_words = {'dukho','kharaap','niras','lonely','stress','dukhi','depressed'}

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = emoji.demojize(text, delimiters=(" ", " "))
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    tokens = text.split()
    tokens = [w for w in tokens if w not in all_stopwords]
    return " ".join(tokens)

df['clean_text'] = df['Sentence'].apply(clean_text)

def sentiment_features(text):
    tokens = set(text.split())
    pos_count = len(tokens & positive_words)
    neg_count = len(tokens & negative_words)
    return [pos_count, neg_count]

sent_features = df['clean_text'].apply(sentiment_features).tolist()

vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(df['clean_text'])

X_numeric = np.array(sent_features)
X = hstack([X_tfidf, X_numeric])

le = LabelEncoder()
y = le.fit_transform(df['Category'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

svm_model = SVC(kernel='rbf', probability=True)
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("✨ Model Evaluation Results ✨")
print("Accuracy:", accuracy)
print("Precision (Weighted):", precision)
print("Recall (Weighted):", recall)
print("F1-score (Weighted):", f1)

def predict_depression(text):
    cleaned = clean_text(text)
    vec_tfidf = vectorizer.transform([cleaned])
    pos_count = len(set(cleaned.split()) & positive_words)
    neg_count = len(set(cleaned.split()) & negative_words)
    vec = hstack([vec_tfidf, [[pos_count, neg_count]]])
    pred_class = svm_model.predict(vec)[0]
    pred_prob = svm_model.predict_proba(vec).max()
    return le.inverse_transform([pred_class])[0], pred_prob

while True:
    text = input("\nEnter a sentence (or type 'exit'): ")
    if text.lower() == 'exit':
        break
    category, confidence = predict_depression(text)
    print(f"Predicted Category: {category} (Confidence: {confidence:.2f})")


✨ Model Evaluation Results ✨
Accuracy: 0.8534554537885096
Precision (Weighted): 0.8584598122233973
Recall (Weighted): 0.8534554537885096
F1-score (Weighted): 0.8553340934721483

Enter a sentence (or type 'exit'): exit


#Streamlit

In [None]:
import json

model_metrics = {
    "Logistic Regression": {
        "Accuracy": 0.939639079029247,
        "Precision": 0.9395592511554691,
        "Recall": 0.9396006253024156,
        "F1-score": 0.9395654216734438
    },
    "LSTM": {
        "Accuracy": 0.8776,
        "Precision": 0.8844,
        "Recall": 0.8776,
        "F1-score": 0.8764
    },
    "ANN MLP": {
        "Accuracy": 0.8193,
        "Precision": 0.8190,
        "Recall": 0.8193,
        "F1-score": 0.8186
    },
    "Random Forest": {
        "Accuracy": 0.9592,
        "Precision": 0.9594,
        "Recall": 0.9592,
        "F1-score": 0.9592
    },
    "SVM": {
        "Accuracy": 0.8534554537885096,
        "Precision": 0.8584598122233973,
        "Recall": 0.8534554537885096,
        "F1-score": 0.8553340934721483
    }
}

# Save to JSON
with open("model_metrics.json", "w") as f:
    json.dump(model_metrics, f)


In [None]:
from google.colab import files

files.download("model_metrics.json")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>