<a href="https://colab.research.google.com/github/Avniiii2606/Email-Classification-using-BERT-LDA/blob/main/Multi_label_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the dataset path used later in this
# notebook ("/content/drive/MyDrive/...") lives under this mount point.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime — confirm before running elsewhere.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    hamming_loss,
    multilabel_confusion_matrix
)
import ast
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def load_and_prepare_data(path):
    """
    Load the dataset and build the train/test split for multi-label training.

    The CSV must contain the text columns ``ABSTRACT`` and ``TITLE`` plus one
    column per topic listed in ``label_columns`` below. The topic columns are
    used directly as the multi-label indicator matrix and are validated to
    contain only 0/1 values.

    Parameters:
    -----------
    path : str
        Location of the CSV file, passed to ``pd.read_csv``.

    Returns:
    --------
    tuple
        ``(X_train, X_test, y_train, y_test, label_names)``. The X values are
        pandas Series of text, the y values are 2-D integer arrays with one
        column per label, and ``label_names`` is a list of str giving the
        label for each column, in column order.

    Raises:
    -------
    KeyError
        If a required column is missing from the CSV.
    ValueError
        If a label column contains anything other than 0/1.
    """
    label_columns = [
        'Computer Science', 'Physics', 'Mathematics',
        'Statistics', 'Quantitative Biology', 'Quantitative Finance'
    ]

    df = pd.read_csv(path)

    missing = [col for col in ['ABSTRACT', 'TITLE'] + label_columns
               if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    # Join with a space so the last word of the abstract does not fuse with
    # the first word of the title; fill missing fields so one NaN does not
    # turn the whole combined text into NaN.
    text = (df['ABSTRACT'].fillna('') + ' ' + df['TITLE'].fillna('')).str.strip()
    text = text.rename('text')

    # The topic columns already form the indicator matrix. Feeding their rows
    # to MultiLabelBinarizer would treat the cell values (0 and 1) as the
    # label ids and collapse the six topics into two meaningless classes.
    labels = df[label_columns].to_numpy(dtype=int)
    if not np.isin(labels, (0, 1)).all():
        raise ValueError("Label columns must contain only 0/1 indicator values")

    X_train, X_test, y_train, y_test = train_test_split(
        text,
        labels,
        test_size=0.2,
        random_state=42
    )

    # Return plain strings: downstream reporting needs string label names.
    label_names = [str(col) for col in label_columns]
    return X_train, X_test, y_train, y_test, label_names

In [None]:
def evaluate_model(y_true, y_pred, label_names):
    """
    Comprehensive evaluation of the multi-label classification model.

    Prints an evaluation report, draws the per-label confusion matrices and
    the true-vs-predicted label distribution, and returns the computed metrics.

    Parameters:
    -----------
    y_true : array-like of shape (n_samples, n_labels)
        True labels as a binary indicator matrix
    y_pred : array-like of shape (n_samples, n_labels)
        Predicted labels as a binary indicator matrix
    label_names : array-like
        Names of the labels, one per column; entries are converted to str

    Returns:
    --------
    dict
        Dictionary containing all evaluation metrics under the keys
        'overall_metrics', 'per_label_metrics' and 'confusion_matrices'

    Raises:
    -------
    ValueError
        If the inputs are not 2-D, have different shapes, or the number of
        label names does not match the number of label columns
    """
    # Accept any array-like (lists, DataFrames): column slicing below needs ndarrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # classification_report measures len() of every target name, which fails
    # for non-string entries such as numpy integers.
    label_names = [str(name) for name in label_names]

    if y_true.ndim != 2 or y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must be 2-D with identical shapes, "
            f"got {y_true.shape} and {y_pred.shape}"
        )
    if len(label_names) != y_true.shape[1]:
        raise ValueError(
            f"Expected {y_true.shape[1]} label names, got {len(label_names)}"
        )

    # zero_division=0 yields the same values as the default but without a
    # warning for every label that has no predicted or no true samples.
    scorers = {
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }

    # Calculate overall metrics (key order matches the printed report)
    metrics = {
        'hamming_loss': hamming_loss(y_true, y_pred),
        'accuracy_score': accuracy_score(y_true, y_pred),
    }
    for score_name, scorer in scorers.items():
        for average in ('micro', 'macro', 'weighted'):
            metrics[f'{average}_{score_name}'] = scorer(
                y_true, y_pred, average=average, zero_division=0
            )

    # Calculate per-label metrics, treating each column as a binary problem
    per_label_metrics = {
        label: {
            score_name: scorer(y_true[:, i], y_pred[:, i], zero_division=0)
            for score_name, scorer in scorers.items()
        }
        for i, label in enumerate(label_names)
    }

    # Generate confusion matrices (one 2x2 matrix per label)
    confusion_matrices = multilabel_confusion_matrix(y_true, y_pred)

    # Print detailed evaluation report
    print("\n=== Model Evaluation Report ===")
    print("\nOverall Metrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

    print("\nPer-label Metrics:")
    print(classification_report(
        y_true, y_pred, target_names=label_names, zero_division=0
    ))

    # Visualize confusion matrices
    plot_confusion_matrices(confusion_matrices, label_names)

    # Plot label distribution
    plot_label_distribution(y_true, y_pred, label_names)

    return {
        'overall_metrics': metrics,
        'per_label_metrics': per_label_metrics,
        'confusion_matrices': confusion_matrices
    }

def plot_confusion_matrices(confusion_matrices, label_names):
    """
    Plot confusion matrices for each label.

    The matrices are laid out on a two-row grid, one heatmap per
    (matrix, label) pair. Grid cells that receive no heatmap are hidden, and
    nothing is drawn when there are no labels.

    Parameters:
    -----------
    confusion_matrices : sequence of 2-D arrays
        One confusion matrix per label, in the same order as ``label_names``
    label_names : sequence
        Names of the labels, used in the subplot titles
    """
    n_labels = len(label_names)
    if n_labels == 0:
        # plt.subplots cannot build a grid with zero columns.
        return

    n_cols = (n_labels + 1) // 2
    # squeeze=False keeps `axes` 2-D for every grid size, so ravel() is always valid.
    fig, axes = plt.subplots(2, n_cols, figsize=(15, 8), squeeze=False)
    axes = axes.ravel()

    n_drawn = 0
    for ax, matrix, label in zip(axes, confusion_matrices, label_names):
        sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
        ax.set_title(f'Confusion Matrix: {label}')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        n_drawn += 1

    # With an odd number of labels the 2-row grid has a spare cell; hide it
    # rather than showing an empty set of axes.
    for spare_ax in axes[n_drawn:]:
        spare_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

def plot_label_distribution(y_true, y_pred, label_names):
    """
    Draw a grouped bar chart comparing, for every label, how many samples
    carry it in the ground truth versus in the predictions.
    """
    bar_width = 0.35
    positions = np.arange(len(label_names))

    # Column sums give the per-label counts.
    counts_by_series = (
        ('True', y_true.sum(axis=0), -bar_width / 2),
        ('Predicted', y_pred.sum(axis=0), bar_width / 2),
    )

    fig, ax = plt.subplots(figsize=(12, 6))
    for series_name, counts, offset in counts_by_series:
        ax.bar(positions + offset, counts, bar_width, label=series_name)

    ax.set_xlabel('Labels')
    ax.set_ylabel('Count')
    ax.set_title('Label Distribution: True vs Predicted')
    ax.set_xticks(positions)
    ax.set_xticklabels(label_names, rotation=45, ha='right')
    ax.legend()

    fig.tight_layout()
    plt.show()

In [None]:
def train_and_evaluate_lda(X_train, X_test, y_train, y_test, label_names,
                           max_features=5000):
    """
    Train one LinearDiscriminantAnalysis classifier per label on TF-IDF
    features and evaluate the combined predictions.

    Parameters:
    -----------
    X_train, X_test : iterable of str
        Training and test documents
    y_train, y_test : array-like of shape (n_samples, n_labels)
        Label matrices with one column per label
    label_names : array-like
        Names of the labels, forwarded to ``evaluate_model``
    max_features : int, default 5000
        Vocabulary size limit passed to ``TfidfVectorizer``

    Returns:
    --------
    tuple
        ``(y_pred, evaluation_results)`` where ``y_pred`` has one column per
        label and ``evaluation_results`` is the value returned by
        ``evaluate_model``
    """
    # Accept any array-like: the per-column slicing below needs ndarrays.
    y_train = np.asarray(y_train)
    y_test = np.asarray(y_test)

    # Convert text to TF-IDF features
    vectorizer = TfidfVectorizer(max_features=max_features)
    # The classifiers are given dense arrays. Densify each matrix once here
    # instead of once per label inside the loop, which re-allocated the same
    # large arrays on every iteration.
    X_train_dense = vectorizer.fit_transform(X_train).toarray()
    X_test_dense = vectorizer.transform(X_test).toarray()

    # Train one LDA for each label and collect its test predictions
    predictions = []
    for i in range(y_train.shape[1]):
        lda = LinearDiscriminantAnalysis()
        lda.fit(X_train_dense, y_train[:, i])
        predictions.append(lda.predict(X_test_dense))

    # Combine predictions for all labels: rows are samples, columns are labels
    y_pred = np.array(predictions).T

    # Evaluate the model
    evaluation_results = evaluate_model(y_test, y_pred, label_names)

    return y_pred, evaluation_results

In [None]:
def main(path="/content/drive/MyDrive/multi_label_text.csv"):
    """
    Run the full pipeline: load the data, train the per-label LDA models and
    print/plot the evaluation.

    Parameters:
    -----------
    path : str
        Location of the dataset CSV; defaults to the file on the mounted Drive

    Returns:
    --------
    tuple
        ``(predictions, evaluation_results)`` as returned by
        ``train_and_evaluate_lda``, so the results stay available for
        inspection after the run
    """
    # Load and prepare data
    X_train, X_test, y_train, y_test, label_names = load_and_prepare_data(path)

    # Train model, get predictions and evaluation results
    predictions, evaluation_results = train_and_evaluate_lda(
        X_train, X_test, y_train, y_test, label_names
    )

    return predictions, evaluation_results

# Inside the `if` block the return value is not echoed as cell output.
if __name__ == "__main__":
    main()


=== Model Evaluation Report ===

Overall Metrics:
hamming_loss: 0.0000
accuracy_score: 1.0000
micro_precision: 1.0000
macro_precision: 1.0000
weighted_precision: 1.0000
micro_recall: 1.0000
macro_recall: 1.0000
weighted_recall: 1.0000
micro_f1: 1.0000
macro_f1: 1.0000
weighted_f1: 1.0000

Per-label Metrics:


TypeError: object of type 'numpy.int64' has no len()