<a href="https://colab.research.google.com/github/DasBytes/three-stage-banglish-depression-classifier/blob/main/ANN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install emoji


Collecting emoji
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.15.0-py3-none-any.whl (608 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 608.4/608.4 kB 9.9 MB/s eta 0:00:00
Installing collected packages: emoji
Successfully installed emoji-2.15.0


In [None]:
import pandas as pd
import re
import emoji
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stopword corpus is available before stopwords.words()
# is called further down. This stays best-effort: a failed download only
# prints a warning, because a previously downloaded copy may still exist.
try:
    # nltk.download() normally signals failure through a falsy return value
    # rather than an exception, so check it explicitly.
    if not nltk.download('stopwords'):
        print("Warning: NLTK stopwords download failed; falling back to any local copy.")
except Exception as exc:
    # Catch Exception (not a bare except) so Ctrl-C / SystemExit still work,
    # and report what actually went wrong.
    print(f"Warning: NLTK stopwords download raised an error: {exc}")

# Load the labelled dataset from the working directory. Without it nothing
# below can run, so stop the script with a non-zero exit status.
try:
    df = pd.read_csv("Banglish depression dataset.csv")
except FileNotFoundError:
    print("Error: 'Banglish depression dataset.csv' not found. Please ensure the file is in the correct directory.")
    # exit() is an interactive helper injected by the `site` module and
    # exits with status 0; SystemExit is always available and lets the
    # failure be reported through the exit code.
    raise SystemExit(1) from None

# Discard rows that are missing either the sentence text or its category.
df.dropna(inplace=True, subset=['Sentence', 'Category'])

# Stopword vocabulary used by clean_text(): NLTK's English list merged with
# a small hand-picked set of Latin-script Bangla words.
stopwords_eng = set(stopwords.words('english'))
stopwords_bangla = {
    'ami', 'tumi', 'shei', 'amra', 'eto',
    'kemon', 'achho', 'aschi', 'na',
}
all_stopwords = stopwords_eng | stopwords_bangla

def clean_text(text):
    """Normalise one raw sentence for TF-IDF vectorisation.

    Steps, in order: lowercase the text; delete URLs, e-mail-like tokens and
    digit runs; replace emoji with their textual names (space-delimited);
    shorten any character repeated three or more times to two occurrences;
    and drop every whitespace-separated token found in ``all_stopwords``.

    Args:
        text: The raw sentence. Anything that is not a ``str`` (for example
            a NaN cell) is treated as empty.

    Returns:
        The remaining tokens joined by single spaces, or ``""`` for
        non-string input.
    """
    if not isinstance(text, str):
        return ""

    cleaned = text.lower()

    # Strip URLs, e-mail-like tokens, then digits -- in that order.
    for pattern in (r'http\S+|www\S+|https\S+', r'\S+@\S+', r'\d+'):
        cleaned = re.sub(pattern, '', cleaned)

    # Emoji become words so they survive as features.
    cleaned = emoji.demojize(cleaned, delimiters=(" ", " "))

    # Collapse elongated spellings such as "sooooo" -> "soo".
    cleaned = re.sub(r'(.)\1{2,}', r'\1\1', cleaned)

    return " ".join(word for word in cleaned.split() if word not in all_stopwords)

# Pre-process every sentence once and keep the result next to the raw text.
df['Cleaned_Sentence'] = df['Sentence'].apply(clean_text)

# Encode the three category names as integer class ids.
label_map = {"No Depression": 0, "Mild": 1, "Severe": 2}
df['Label'] = df['Category'].map(label_map)

# Categories that are not keys of label_map map to NaN. Drop those rows, but
# say so instead of shrinking the dataset silently.
unmapped_count = int(df['Label'].isnull().sum())
if unmapped_count:
    print(f"Warning: dropping {unmapped_count} row(s) whose Category is not one of {list(label_map)}.")
    df.dropna(subset=['Label'], inplace=True)

# map() yields floats whenever NaN was present; cast back to int class ids.
df['Label'] = df['Label'].astype(int)

# Hold out 20% of the rows for testing, stratified on the label so both
# splits keep the same class proportions; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    df['Cleaned_Sentence'],
    df['Label'],
    test_size=0.2,
    stratify=df['Label'],
    random_state=42,
)

# TF-IDF over word 1- to 3-grams, capped at 5000 features. The vocabulary is
# fitted on the training split only and then reused for the test split.
tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Multi-layer perceptron with two hidden layers (128 and 64 units, ReLU),
# optimised with Adam for at most 300 iterations. Early stopping is enabled
# and verbose=True prints the per-iteration training log.
mlp = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    solver='adam',
    max_iter=300,
    early_stopping=True,
    random_state=42,
    verbose=True,
)

print("\n" + "=" * 30, "STARTING MLP TRAINING", "=" * 30, sep="\n")

mlp.fit(X_train_tfidf, y_train)

print("=" * 30, "MLP TRAINING COMPLETE", "=" * 30 + "\n", sep="\n")

# Score the trained model on the held-out test split.
y_pred = mlp.predict(X_test_tfidf)

# Weighted averages weight each class by its support; zero_division=0 keeps
# a class with no predictions from raising warnings.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
fail_count = (y_test != y_pred).sum()
fail_percent = (fail_count / len(y_test)) * 100

print("="*30)
print("EVALUATION METRICS")
print("="*30)
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Failed Predictions: {fail_count} ({fail_percent:.2f}%)")

# Pass the class ids explicitly alongside their names. Without `labels`, the
# report infers the classes from y_test/y_pred and raises a length-mismatch
# error as soon as one of the three classes is absent from both.
print(
    "\nClassification Report:\n",
    classification_report(
        y_test,
        y_pred,
        labels=list(label_map.values()),
        target_names=list(label_map.keys()),
        zero_division=0,
    ),
)

def predict(text):
    """Classify a single sentence and return its category name.

    The text is passed through ``clean_text``, vectorised with the fitted
    ``tfidf`` vectoriser and scored by the trained ``mlp`` model.

    Args:
        text: The raw sentence to classify.

    Returns:
        The key of ``label_map`` whose value equals the predicted class id.

    Raises:
        KeyError: If the predicted class id is not a value of ``label_map``.
    """
    features = tfidf.transform([clean_text(text)])
    class_id = mlp.predict(features)[0]
    # Reverse lookup table: class id -> category name.
    id_to_name = dict(zip(label_map.values(), label_map.keys()))
    return id_to_name[class_id]

# Interactive prompt: classify sentences typed by the user until they enter
# 'exit' or the input stream ends.
print("="*30)
print("PREDICTION BOX")
print("="*30)
print("Type a sentence to check depression level (or 'exit'):")

while True:
    try:
        # Strip first so that " exit " and whitespace-only input are
        # recognised instead of being sent to the model.
        sentence = input("\nEnter text: ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the user interrupted the prompt: leave the
        # loop cleanly rather than ending with a traceback.
        print()
        break

    if sentence.lower() == 'exit':
        break
    if not sentence:
        print("Please enter some text.")
        continue

    # One bad input should not end the session, so errors are reported and
    # the prompt continues.
    try:
        prediction_result = predict(sentence)
        print("Predicted Category:", prediction_result)
    except Exception as e:
        print(f"An error occurred during prediction: {e}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



STARTING MLP TRAINING
Iteration 1, loss = 1.02239330
Validation score: 0.762994
Iteration 2, loss = 0.69326283
Validation score: 0.844075
Iteration 3, loss = 0.36048172
Validation score: 0.856549
Iteration 4, loss = 0.20263392
Validation score: 0.848233
Iteration 5, loss = 0.12717117
Validation score: 0.856549
Iteration 6, loss = 0.07933547
Validation score: 0.848233
Iteration 7, loss = 0.04953441
Validation score: 0.846154
