<a href="https://colab.research.google.com/github/DasBytes/three-stage-banglish-depression-classifier/blob/main/Banglish_Depression_classifier_random_forest_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
uploaded = files.upload()


Saving Banglish depression dataset.csv to Banglish depression dataset.csv


In [4]:
import pandas as pd
import numpy as np
import re
import string
import ipywidgets as widgets
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# ---------------------------------------------------------
# 1. Load Dataset
# ---------------------------------------------------------
# Replace with your actual file path
file_path = 'Banglish depression dataset.csv'
df = pd.read_csv(file_path)

# Drop any accidental missing rows
df = df.dropna(subset=['Sentence', 'Category'])

# ---------------------------------------------------------
# 2. Preprocessing Function
# ---------------------------------------------------------
def preprocess_text(text):
    if not isinstance(text, str):
        return ""

    # 1. Lowercasing
    text = text.lower()

    # 2. URL Removal
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # 3. Punctuation & Emoji Removal
    # (Keeps alphanumeric characters and whitespace)
    text = re.sub(r'[^\w\s]', '', text)

    # 4. Extra Space Removal
    text = re.sub(r'\s+', ' ', text).strip()

    # 5. Handling Mixed Words & Tokenization (Implicit in TF-IDF)
    # The cleaning above prepares the text for effective tokenization

    return text

# Apply preprocessing
df['Cleaned_Sentence'] = df['Sentence'].apply(preprocess_text)

# ---------------------------------------------------------
# 3. Model Training
# ---------------------------------------------------------
# Vectorization (TF-IDF)
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['Cleaned_Sentence'])
y = df['Category']

# Train-Test Split (80% Train, 20% Test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# ---------------------------------------------------------
# 4. Evaluation Metrics
# ---------------------------------------------------------
y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("=== Model Performance Metrics ===")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")
print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))

# ---------------------------------------------------------
# 5. Live Prediction Box (Interactive)
# ---------------------------------------------------------
def predict_depression(text):
    cleaned_text = preprocess_text(text)
    vectorized_text = vectorizer.transform([cleaned_text])
    prediction = rf_model.predict(vectorized_text)[0]
    return prediction

# Create Widgets
text_input = widgets.Textarea(
    value='',
    placeholder='Type a sentence here (e.g., "Mone hocche more jai...")',
    description='Input Text:',
    disabled=False,
    layout=widgets.Layout(width='50%', height='80px')
)

button = widgets.Button(
    description='Predict Class',
    button_style='info', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Click to predict',
    icon='check'
)

output = widgets.Output()

def on_button_clicked(b):
    with output:
        output.clear_output()
        user_text = text_input.value
        if user_text.strip():
            result = predict_depression(user_text)
            print(f"üìù Text: {user_text}")
            print(f"üîç Predicted Class: {result}")
        else:
            print("Please enter some text first.")

button.on_click(on_button_clicked)

print("\n=== Live Prediction Box ===")
display(text_input, button, output)

=== Model Performance Metrics ===
Accuracy:  0.8485
Precision: 0.8512
Recall:    0.8485
F1 Score:  0.8496

=== Classification Report ===
               precision    recall  f1-score   support

         Mild       0.76      0.79      0.77       382
No Depression       0.82      0.81      0.81       437
       Severe       0.98      0.95      0.97       382

     accuracy                           0.85      1201
    macro avg       0.85      0.85      0.85      1201
 weighted avg       0.85      0.85      0.85      1201


=== Live Prediction Box ===


Textarea(value='', description='Input Text:', layout=Layout(height='80px', width='50%'), placeholder='Type a s‚Ä¶

Button(button_style='info', description='Predict Class', icon='check', style=ButtonStyle(), tooltip='Click to ‚Ä¶

Output()