<a href="https://colab.research.google.com/github/DasBytes/three-stage-banglish-depression-classifier/blob/main/ANN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
!pip install emoji
!pip install gensim



Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [25]:
import re

import emoji
import gensim.downloader as api
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed, to_categorical

# Make sure the NLTK stopword corpus is available locally
nltk.download('stopwords')

# Read the dataset, discarding rows that lack a sentence or a category
df = pd.read_csv("Banglish depression dataset.csv").dropna(subset=['Sentence', 'Category'])

# Stopword vocabulary: the NLTK English list plus a small hand-picked extra set.
# NOTE(review): 'na' is commonly a negation word in romanized Bangla; removing it
# may discard signal that matters for this task — confirm this is intended.
stopwords_bangla = {'ami','tumi','shei','amra','eto','kemon','achho','aschi','na'}
stopwords_eng = set(stopwords.words('english'))
all_stopwords = stopwords_eng | stopwords_bangla

# Preprocessing
def clean_text(text):
    """Normalise a raw sentence and split it into filtered tokens.

    Steps, in order: lowercase; strip URLs (``http…``/``www…``); strip
    e-mail-like tokens; strip digits; replace emoji with their textual names
    (space-delimited); squeeze any character repeated three or more times down
    to two; split on whitespace; drop tokens found in ``all_stopwords``.

    Args:
        text: The raw sentence. Anything that is not a ``str`` (e.g. NaN)
            is treated as empty.

    Returns:
        list[str]: The surviving tokens. Always a list, empty for
        non-string input, so callers can iterate or vectorise uniformly.
    """
    if not isinstance(text, str):
        # Previously returned "" here, giving the function two return types.
        return []
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'\d+', '', text)
    # Space delimiters make each emoji name its own whitespace-separated token.
    text = emoji.demojize(text, delimiters=(" ", " "))
    # "soooo" -> "soo": keep at most two of any repeated character.
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    return [w for w in text.split() if w not in all_stopwords]

# Tokenise every sentence with clean_text
df['Tokens'] = df['Sentence'].map(clean_text)

# Labels: integer-encode the categories, then one-hot them for the softmax output
le = LabelEncoder().fit(df['Category'])
df['Label'] = le.transform(df['Category'])
num_classes = le.classes_.size
y = to_categorical(df['Label'], num_classes=num_classes)

# Pretrained word vectors (English FastText used as a placeholder; swap in
# Banglish embeddings if they become available)
print("Loading pretrained FastText embeddings...")
ft_model = api.load('fasttext-wiki-news-subwords-300')  # pretrained English embeddings
embedding_dim = ft_model.vector_size

# Sentence embedding = mean of the vectors of its in-vocabulary tokens
def sentence_to_vec(tokens, model, dim):
    """Average the embeddings of the tokens that ``model`` knows.

    Args:
        tokens: Iterable of token strings.
        model: Mapping-like embedding lookup supporting ``word in model``
            and ``model[word]``.
        dim: Length of the zero vector returned when no token is found.

    Returns:
        The element-wise mean of the found vectors, or ``np.zeros(dim)``
        when none of the tokens is present in ``model``.
    """
    known = [model[tok] for tok in tokens if tok in model]
    if not known:
        return np.zeros(dim)
    return np.mean(known, axis=0)

# One averaged embedding per row of the dataset
sentence_vectors = [
    sentence_to_vec(token_list, ft_model, embedding_dim)
    for token_list in df['Tokens']
]
X = np.array(sentence_vectors)

# Train/test split: 80/20, fixed seed, stratified on the labels
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

# Build MLP model
# Seed the Python, NumPy and TensorFlow RNGs first so that weight initialisation,
# dropout masks and batch shuffling repeat from run to run.
set_random_seed(42)

# An explicit Input layer declares the feature width; passing `input_dim` to the
# first Dense layer is a deprecated idiom that newer Keras versions warn about.
model = Sequential([
    Input(shape=(embedding_dim,)),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax'),  # one probability per category
])

# categorical_crossentropy matches the one-hot targets built by to_categorical
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train; 10% of the training rows are held out by Keras for validation
history = model.fit(X_train, y_train, validation_split=0.1, epochs=30, batch_size=32, verbose=2)

# Evaluate on the held-out test split
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print(f"\nTest Accuracy: {acc:.4f}")

# Predictions: collapse softmax outputs and one-hot targets to class indices
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test, axis=1)

# Pin the label set so report rows and matrix axes always line up with
# le.classes_, even if a class is absent from the test split and never predicted
# (otherwise target_names could disagree with the labels actually observed).
class_ids = np.arange(num_classes)

print("\nClassification Report:\n", classification_report(y_true, y_pred_classes, labels=class_ids, target_names=le.classes_))

conf_mat = confusion_matrix(y_true, y_pred_classes, labels=class_ids)
print("\nConfusion Matrix:\n", conf_mat)

# --- Prediction function ---
def predict_depression(text):
    """Classify one raw sentence and return its category label.

    The text is run through ``clean_text``, embedded with ``sentence_to_vec``
    using the module-level ``ft_model``, and scored by the trained ``model``;
    the highest-scoring class index is mapped back to its original label
    through ``le``.

    Args:
        text: The raw sentence to classify.

    Returns:
        The predicted category, as stored in ``le.classes_``.
    """
    features = sentence_to_vec(clean_text(text), ft_model, embedding_dim).reshape(1, -1)
    probabilities = model.predict(features)
    best_index = np.argmax(probabilities, axis=1)[0]
    return le.inverse_transform([best_index])[0]

# Example interactive
# NOTE: this loop blocks on input(); type 'exit' (or interrupt the kernel) to leave it.
while True:
    try:
        # Strip surrounding whitespace so inputs like "exit " are still recognised.
        sentence = input("\nEnter text (or 'exit'): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupt at the prompt: leave without a traceback.
        break
    if sentence.lower() == 'exit':
        break
    if not sentence:
        # Blank input would be classified from an all-zero vector; ask again instead.
        continue
    print("Predicted Category:", predict_depression(sentence))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loading pretrained FastText embeddings...


KeyboardInterrupt: 