<a href="https://colab.research.google.com/github/DasBytes/three-stage-banglish-depression-classifier/blob/main/Banglish_Depression_classifier_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Upload Code

In [33]:
from google.colab import files
uploaded = files.upload()

KeyboardInterrupt: 

#Logistic regression

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.utils import shuffle, resample
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
import matplotlib.pyplot as plt
import seaborn as sns

file_name = list(uploaded.keys())[0]
df = pd.read_csv(file_name, header=None, names=["Category", "Sentence"])
df.dropna(subset=["Sentence", "Category"], inplace=True)
df["Category"] = df["Category"].str.strip()
df = shuffle(df, random_state=42)

extra_data = [
    {'Sentence': 'ami ajke bajare jabo', 'Category': 'No Depression'},
    {'Sentence': 'ami office e jabo', 'Category': 'No Depression'},
    {'Sentence': 'ajke weather ta nice', 'Category': 'No Depression'},
    {'Sentence': 'ami valo achi', 'Category': 'No Depression'},
    {'Sentence': 'shobai kemon acho', 'Category': 'No Depression'},
    {'Sentence': 'alhamdulillah bhalo achi', 'Category': 'No Depression'},
    {'Sentence': 'ami suicide korbo', 'Category': 'Severe'},
    {'Sentence': 'ami ar bachbo na', 'Category': 'Severe'},
    {'Sentence': 'goodbye earth', 'Category': 'Severe'},
    {'Sentence': 'amar life ta khub kharap', 'Category': 'Mild'},
    {'Sentence': 'valolage na kichu', 'Category': 'Mild'},
    {'Sentence': 'sobai keno chole jay', 'Category': 'Mild'},
    {'Sentence': 'ami ar kichu korte parbo na', 'Category': 'No Depression'},
    {'Sentence': 'ami khub stressed feel kortesi', 'Category': 'Mild'}
]

df = pd.concat([df, pd.DataFrame(extra_data)], ignore_index=True)

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^a-zA-Zআ-হ0-9\s]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

df["Cleaned"] = df["Sentence"].apply(clean_text)

positive_words = ['valo', 'bhalo', 'happy', 'alhamdulillah', 'nice']
negative_words = ['kharap', 'na', 'tired', 'stress', 'sad', 'suicide', 'khub']

def count_words(text, word_list):
    return sum(text.count(w) for w in word_list)

df["sent_len"] = df["Cleaned"].apply(lambda x: len(x.split()))
df["pos_count"] = df["Cleaned"].apply(lambda x: count_words(x, positive_words))
df["neg_count"] = df["Cleaned"].apply(lambda x: count_words(x, negative_words))

classes = df["Category"].unique()
max_size = df["Category"].value_counts().max()

df_balanced = pd.concat([
    resample(df[df["Category"] == cls], replace=True, n_samples=max_size, random_state=42)
    for cls in classes
])

df_balanced = shuffle(df_balanced, random_state=42)

X_text = df_balanced["Cleaned"]
X_num = df_balanced[["sent_len", "pos_count", "neg_count"]].values
y = df_balanced["Category"]

scaler = StandardScaler()
X_num = scaler.fit_transform(X_num)

X_train_text, X_test_text, y_train, y_test, X_train_num, X_test_num = train_test_split(
    X_text, y, X_num, test_size=0.2, random_state=42, stratify=y
)

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,4))
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)

X_train_combined = hstack([X_train_tfidf, X_train_num])
X_test_combined = hstack([X_test_tfidf, X_test_num])

model = LogisticRegression(max_iter=2000, multi_class='multinomial', solver='sag', C=30, random_state=42)
model.fit(X_train_combined, y_train)

y_pred = model.predict(X_test_combined)
accuracy  = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="macro")
recall    = recall_score(y_test, y_pred, average="macro")
f1        = f1_score(y_test, y_pred, average="macro")

print("Accuracy :", accuracy)
print("Precision:", precision)
print("Recall   :", recall)
print("F1-score :", f1)



def predict_live(text):
    clean = clean_text(text)
    tfidf_vec = vectorizer.transform([clean])
    sent_len = len(clean.split())
    pos_count = count_words(clean, positive_words)
    neg_count = count_words(clean, negative_words)
    num_feat = scaler.transform([[sent_len, pos_count, neg_count]])
    combined = hstack([tfidf_vec, num_feat])
    pred = model.predict(combined)[0]
    prob = np.max(model.predict_proba(combined)) * 100
    return pred, prob

while True:
    txt = input("Enter text for prediction (or type 'exit' to quit): ").strip()
    if txt.lower() == 'exit':
        break
    if txt:
        pred, conf = predict_live(txt)
        print(f"Prediction: {pred} | Confidence: {conf:.2f}%")


#LSTM

In [None]:
!pip install emoji
!pip install gensim

In [None]:
import pandas as pd
import numpy as np
import re
import emoji
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.utils import to_categorical
import nltk
nltk.download('stopwords')
import gensim.downloader as api

file_name = "Banglish depression dataset.csv"
df = pd.read_csv(file_name)
df.columns = ["Category", "Sentence"]
df = df.rename(columns={'Category': 'category', 'Sentence': 'sentence'})

eng_stop = set(stopwords.words("english"))
bn_stop = {"ami", "tumi", "amra", "valo", "kharap", "ache", "achhi",
           "kintu", "na", "ar", "shob", "ekta", "kore", "shudhu"}
stop_words = eng_stop.union(bn_stop)

tokenizer = RegexpTokenizer(r'\w+')

def clean_text(text):
    text = text.lower()
    text = emoji.replace_emoji(text, replace='')
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^a-zA-Z\u0980-\u09FF ]+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def preprocess(text):
    text = clean_text(str(text))
    tokens = tokenizer.tokenize(text)
    return [t for t in tokens if t not in stop_words and len(t) > 1]

df['tokens'] = df['sentence'].apply(preprocess)

ft_model = api.load("fasttext-wiki-news-subwords-300")
embedding_dim = ft_model.vector_size

word_index = {word: idx + 1 for idx, word in enumerate(ft_model.key_to_index)}
vocab_size = len(word_index) + 1

def tokens_to_sequence(tokens):
    return [word_index[t] for t in tokens if t in word_index]

df['seq'] = df['tokens'].apply(tokens_to_sequence)
max_len = 50
X = pad_sequences(df['seq'], maxlen=max_len)

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, idx in word_index.items():
    try:
        embedding_matrix[idx] = ft_model.get_vector(word)
    except KeyError:
        continue

encoder = LabelEncoder()
y = encoder.fit_transform(df['category'])
y_cat = to_categorical(y)
num_classes = y_cat.shape[1]

X_train, X_test, y_train, y_test = train_test_split(
    X, y_cat, test_size=0.2, random_state=42, stratify=y
)

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                    weights=[embedding_matrix], input_length=max_len, trainable=False))
model.add(LSTM(128))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1, verbose=1)

y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

accuracy = accuracy_score(y_test_classes, y_pred_classes)
precision = precision_score(y_test_classes, y_pred_classes, average='weighted', zero_division=0)
recall = recall_score(y_test_classes, y_pred_classes, average='weighted', zero_division=0)
f1 = f1_score(y_test_classes, y_pred_classes, average='weighted', zero_division=0)

print("✨ Model Evaluation Results ✨")
print("Accuracy:", accuracy)
print("Precision (Weighted):", precision)
print("Recall (Weighted):", recall)
print("F1-score (Weighted):", f1)


def predict_sentence_with_confidence(sentence):
    tokens = preprocess(sentence)
    seq = tokens_to_sequence(tokens)
    padded = pad_sequences([seq], maxlen=max_len)
    pred = model.predict(padded)
    class_idx = np.argmax(pred)
    class_label = encoder.inverse_transform([class_idx])[0]
    confidence = pred[0][class_idx] * 100
    return class_label, confidence

while True:
    sentence = input("Enter a Banglish sentence (or type 'exit' to quit): ")
    if sentence.lower() == 'exit':
        break
    prediction, conf = predict_sentence_with_confidence(sentence)
    print(f"Prediction: {prediction} | Confidence: {conf:.2f}%")

#ANN MLP

In [None]:
pip install pandas numpy emoji nltk gensim scikit-learn tensorflow


In [None]:
import pandas as pd
import numpy as np
import re
import emoji
import gensim.downloader as api
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

df = pd.read_csv("Banglish depression dataset.csv")
df.dropna(subset=['Sentence', 'Category'], inplace=True)

stopwords_eng = set(stopwords.words('english'))
stopwords_bangla = {'ami','tumi','shei','amra','eto','kemon','achho','aschi','na'}
all_stopwords = stopwords_eng.union(stopwords_bangla)

def clean_text(text):
    if not isinstance(text, str):
        return []
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = emoji.demojize(text, delimiters=(" ", " "))
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    tokens = text.split()
    tokens = [w for w in tokens if w not in all_stopwords]
    return tokens

df['Tokens'] = df['Sentence'].apply(clean_text)

le = LabelEncoder()
df['Label'] = le.fit_transform(df['Category'])
num_classes = len(le.classes_)
y = to_categorical(df['Label'], num_classes=num_classes)

ft_model = api.load('fasttext-wiki-news-subwords-300')
embedding_dim = ft_model.vector_size

def sentence_to_vec(tokens, model, dim):
    vecs = []
    for word in tokens:
        try:
            vecs.append(model.get_vector(word))
        except KeyError:
            continue
    if len(vecs) > 0:
        return np.mean(vecs, axis=0)
    else:
        return np.zeros(dim)

X = np.array([sentence_to_vec(tokens, ft_model, embedding_dim) for tokens in df['Tokens']])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

model = Sequential()
model.add(Dense(128, input_dim=embedding_dim, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_train, y_train, validation_split=0.1, epochs=30, batch_size=32, verbose=2)

loss, acc = model.evaluate(X_test, y_test, verbose=0)
print(f"\nTest Accuracy: {acc:.4f}")

y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test, axis=1)

from sklearn.metrics import classification_report
print("\nClassification Report:\n", classification_report(y_true, y_pred_classes, target_names=le.classes_))

def predict_depression(text):
    tokens = clean_text(text)
    vec = sentence_to_vec(tokens, ft_model, embedding_dim)
    vec = vec.reshape(1, -1)
    pred = model.predict(vec)
    pred_class = np.argmax(pred, axis=1)[0]
    confidence = np.max(pred) * 100
    return le.inverse_transform([pred_class])[0], confidence

while True:
    sentence = input("\nEnter text (or 'exit'): ").strip()
    if sentence.lower() == 'exit':
        break
    category, conf = predict_depression(sentence)
    print(f"Predicted Category: {category} | Confidence: {conf:.2f}%")


#Random forest


In [None]:
import pandas as pd
import numpy as np
import re
import string
import ipywidgets as widgets
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

file_path = 'Banglish depression dataset.csv'
df = pd.read_csv(file_path)

df = df.dropna(subset=['Sentence', 'Category'])

def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['Cleaned_Sentence'] = df['Sentence'].apply(preprocess_text)

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['Cleaned_Sentence'])
y = df['Category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("=== Model Performance Metrics ===")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

def predict_text(text):
    cleaned = preprocess_text(text)
    vec = vectorizer.transform([cleaned])
    pred = rf_model.predict(vec)[0]
    return pred

while True:
    text_input = input("Enter a Banglish sentence (or type 'exit' to quit): ")
    if text_input.lower() == 'exit':
        break
    prediction = predict_text(text_input)
    print("Prediction:", prediction)


#SVM

In [None]:
import pandas as pd
import re
import emoji
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.sparse import hstack


df = pd.read_csv("Banglish depression dataset.csv")
df.dropna(subset=['Sentence', 'Category'], inplace=True)


stopwords_eng = {...}  # same as before
stopwords_bangla = {...}  # same as before
all_stopwords = stopwords_eng.union(stopwords_bangla)


positive_words = {'moja','happy','bhalo','fun','sundor','friend','party','mojar'}
negative_words = {'dukho','kharaap','niras','lonely','stress','dukhi','depressed'}

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = emoji.demojize(text, delimiters=(" ", " "))
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    tokens = text.split()
    tokens = [w for w in tokens if w not in all_stopwords]
    return " ".join(tokens)

df['clean_text'] = df['Sentence'].apply(clean_text)


def sentiment_features(text):
    tokens = set(text.split())
    pos_count = len(tokens & positive_words)
    neg_count = len(tokens & negative_words)
    return [pos_count, neg_count]

sent_features = df['clean_text'].apply(sentiment_features).tolist()


vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(df['clean_text'])

import numpy as np
X_numeric = np.array(sent_features)
X = hstack([X_tfidf, X_numeric])


le = LabelEncoder()
y = le.fit_transform(df['Category'])


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

svm_model = SVC(kernel='rbf', probability=True)  # non-linear kernel
svm_model.fit(X_train, y_train)


y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("✨ Model Evaluation Results ✨")
print("Accuracy:", accuracy)
print("Precision (Weighted):", precision)
print("Recall (Weighted):", recall)
print("F1-score (Weighted):", f1)


def predict_depression(text):
    cleaned = clean_text(text)
    vec_tfidf = vectorizer.transform([cleaned])
    pos_count = len(set(cleaned.split()) & positive_words)
    neg_count = len(set(cleaned.split()) & negative_words)
    vec = hstack([vec_tfidf, [[pos_count, neg_count]]])
    pred_class = svm_model.predict(vec)[0]
    pred_prob = svm_model.predict_proba(vec).max()
    return le.inverse_transform([pred_class])[0], pred_prob

while True:
    text = input("\nEnter a sentence (or type 'exit'): ")
    if text.lower() == 'exit':
        break
    category, confidence = predict_depression(text)
    print(f"Predicted Category: {category} (Confidence: {confidence:.2f})")


✨ Model Evaluation Results ✨
Accuracy: 0.8534554537885096
Precision (Weighted): 0.8584598122233973
Recall (Weighted): 0.8534554537885096
F1-score (Weighted): 0.8553340934721483
