<a href="https://colab.research.google.com/github/DasBytes/three-stage-banglish-depression-classifier/blob/main/ANN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
uploaded = files.upload()

Saving Banglish depression dataset.csv to Banglish depression dataset.csv


In [2]:
pip install pandas numpy scikit-learn tensorflow nltk emoji

Collecting emoji
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.15.0-py3-none-any.whl (608 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.15.0


In [7]:
import pandas as pd
import numpy as np
import re
import emoji
import nltk
import os
import sys
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.utils import to_categorical

# ==========================================
# 1. Setup & Resource Downloading
# ==========================================
def download_nltk_resources():
    resources = ['stopwords', 'wordnet', 'omw-1.4']
    for res in resources:
        try:
            nltk.data.find(f'corpora/{res}')
        except LookupError:
            print(f"Downloading {res}...")
            nltk.download(res, quiet=True)

download_nltk_resources()

# ==========================================
# 2. Preprocessing Logic
# ==========================================
def preprocess_text(text):
    """
    Custom preprocessing for Banglish Depression Data.
    - Preserves first-person pronouns (I, me, my).
    - Converts emojis to text.
    - Handles slang.
    """
    if not isinstance(text, str):
        return ""

    # 1. Lowercasing
    text = text.lower()

    # 2. Emoji Handling (Convert emojis to text descriptions)
    text = emoji.demojize(text, delimiters=(" ", " "))

    # 3. Noise Removal (Regex)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text) # URLs
    text = re.sub(r'u/\w+', '', text) # Reddit user mentions
    text = re.sub(r'@\w+', '', text)  # Twitter mentions
    text = re.sub(r'\d+', '', text)   # Numbers
    text = re.sub(r'[^\w\s]', '', text) # Punctuation

    # 4. Tokenization
    tokens = text.split()

    # 5. Stopword Removal (Customized)
    # Standard English stopwords remove "I", "me", "my". We MUST keep these.
    stop_words = set(stopwords.words('english'))
    critical_pronouns = {'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves'}
    final_stop_words = stop_words - critical_pronouns

    tokens = [word for word in tokens if word not in final_stop_words]

    # 6. Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # 7. Slang Mapping (Banglish/Net speak)
    slang_map = {
        "u": "you", "ur": "your", "r": "are", "thx": "thanks", "plz": "please",
        "brb": "be right back", "idk": "i do not know", "bhalo": "good", "kharap": "bad"
    }
    tokens = [slang_map[t] if t in slang_map else t for t in tokens]

    return " ".join(tokens)

# ==========================================
# 3. Model Training Pipeline
# ==========================================
def train_and_evaluate():
    # --- A. Load Data ---
    filename = 'Banglish depression dataset.csv'
    if not os.path.exists(filename):
        print(f"Error: {filename} not found. Please upload the file.")
        return None, None, None

    try:
        df = pd.read_csv(filename, encoding='utf-8')
    except UnicodeDecodeError:
        df = pd.read_csv(filename, encoding='latin1')

    df = df.dropna(subset=['Sentence', 'Category'])

    print("Data Loaded.")
    print(f"Total samples: {len(df)}")
    print(f"Classes: {df['Category'].unique()}")

    # --- B. Split Data (80% Train, 20% Test) ---
    # Note: We split raw text FIRST to ensure strictly no data leakage in preprocessing
    X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
        df['Sentence'].values,
        df['Category'].values,
        test_size=0.20,
        random_state=42,
        stratify=df['Category'].values
    )

    print("\nApplying Preprocessing (Post-Split)...")
    # --- C. Apply Preprocessing ---
    X_train_clean = [preprocess_text(text) for text in X_train_raw]
    X_test_clean = [preprocess_text(text) for text in X_test_raw]

    # --- D. Label Encoding ---
    encoder = LabelEncoder()
    y_train_encoded = encoder.fit_transform(y_train_raw)
    y_test_encoded = encoder.transform(y_test_raw)

    # Convert to One-Hot for ANN
    num_classes = len(encoder.classes_)
    y_train_cat = to_categorical(y_train_encoded, num_classes=num_classes)
    y_test_cat = to_categorical(y_test_encoded, num_classes=num_classes)

    # --- E. Vectorization (TF-IDF) ---
    # Fit only on Train, Transform on Test
    print("Vectorizing Text...")
    vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
    X_train_vec = vectorizer.fit_transform(X_train_clean).toarray()
    X_test_vec = vectorizer.transform(X_test_clean).toarray()

    input_dim = X_train_vec.shape[1]

    # --- F. Build ANN Model ---
    print(f"Building Model (Input features: {input_dim})...")
    model = Sequential([
        Input(shape=(input_dim,)),           # Explicit Input Layer
        Dense(128, activation='relu'),       # Hidden Layer 1
        Dropout(0.3),                        # Regularization
        Dense(64, activation='relu'),        # Hidden Layer 2
        Dropout(0.3),                        # Regularization
        Dense(num_classes, activation='softmax') # Output Layer
    ])

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # --- G. Train Model ---
    print("Training started...")
    history = model.fit(
        X_train_vec, y_train_cat,
        epochs=15,
        batch_size=32,
        validation_data=(X_test_vec, y_test_cat),
        verbose=1
    )

    # --- H. Evaluation ---
    print("\n--- Model Evaluation ---")
    y_pred_probs = model.predict(X_test_vec)
    y_pred = np.argmax(y_pred_probs, axis=1)

    acc = accuracy_score(y_test_encoded, y_pred)
    print(f"Accuracy: {acc:.4f}")
    print("\nClassification Report (Precision, Recall, F1-Score):")
    print(classification_report(y_test_encoded, y_pred, target_names=encoder.classes_))

    return model, vectorizer, encoder

# ==========================================
# 4. Console Prediction Interface
# ==========================================
def predict_console(model, vectorizer, encoder):
    """
    Console loop for user prediction.
    """
    print("\n" + "="*40)
    print("  PREDICTION MODE ENABLED")
    print("="*40)
    print("Type a sentence to check depression level.")
    print("Type 'exit' or 'quit' to stop.")
    print("-" * 40)

    while True:
        user_input = input("\nEnter text: ")

        if user_input.lower() in ['exit', 'quit']:
            print("Exiting prediction mode.")
            break

        if not user_input.strip():
            continue

        # 1. Preprocess
        clean_input = preprocess_text(user_input)

        # 2. Vectorize
        vec_input = vectorizer.transform([clean_input]).toarray()

        # 3. Predict
        probs = model.predict(vec_input, verbose=0)
        pred_idx = np.argmax(probs)
        pred_label = encoder.classes_[pred_idx]
        confidence = np.max(probs) * 100

        # 4. Output
        print(f"Prediction: {pred_label}")
        print(f"Confidence: {confidence:.2f}%")

# ==========================================
# 5. Main Execution
# ==========================================
if __name__ == "__main__":
    # 1. Train the model
    trained_model, trained_vectorizer, trained_encoder = train_and_evaluate()

    # 2. Start Prediction Loop
    if trained_model:
        predict_console(trained_model, trained_vectorizer, trained_encoder)

Downloading wordnet...
Downloading omw-1.4...
Data Loaded.
Total samples: 6003
Classes: ['No Depression' 'Mild' 'Severe']

Applying Preprocessing (Post-Split)...
Vectorizing Text...
Building Model (Input features: 5000)...
Training started...
Epoch 1/15
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.5805 - loss: 0.9231 - val_accuracy: 0.8535 - val_loss: 0.3715
Epoch 2/15
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.8904 - loss: 0.2879 - val_accuracy: 0.8626 - val_loss: 0.3251
Epoch 3/15
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9615 - loss: 0.1306 - val_accuracy: 0.8593 - val_loss: 0.3619
Epoch 4/15
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9845 - loss: 0.0620 - val_accuracy: 0.8518 - val_loss: 0.4308
Epoch 5/15
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9949

KeyboardInterrupt: Interrupted by user