In [77]:
# Data Manipulation and Numerical Operations
import pandas as pd
import numpy as np

# Text Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string

# Dataset Splitting and Feature Extraction
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Machine Learning Models
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Model Evaluation
from sklearn.metrics import classification_report, accuracy_score
from sklearn.calibration import CalibratedClassifierCV

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Word Embedding Models
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer

# BioBERT Model
from transformers import AutoTokenizer, AutoModel
import torch

# Cross validation
from sklearn.model_selection import cross_val_score

#NB top words
from pathlib import Path

#Lime
from lime.lime_text import LimeTextExplainer

In [2]:
# creating a preview of each csv
df1 = pd.read_csv('Medical-Abstracts-TC-Corpus/medical_tc_labels.csv')
df2 = pd.read_csv('Medical-Abstracts-TC-Corpus/medical_tc_test.csv')
df3 = pd.read_csv('Medical-Abstracts-TC-Corpus/medical_tc_train.csv')

# Preview the first few rows of each
print("\nLabels Preview:")
print(df1.head())

print("\nTest Preview:")
print(df2.head())

print("\nTrain Preview:")
print(df3.head())


Labels Preview:
   condition_label                   condition_name
0                1                        neoplasms
1                2        digestive system diseases
2                3          nervous system diseases
3                4          cardiovascular diseases
4                5  general pathological conditions

Test Preview:
   condition_label                                   medical_abstract
0                3  Obstructive sleep apnea following topical orop...
1                5  Neutrophil function and pyogenic infections in...
2                5  A phase II study of combined methotrexate and ...
3                1  Flow cytometric DNA analysis of parathyroid tu...
4                4  Paraneoplastic vasculitic neuropathy: a treata...

Train Preview:
   condition_label                                   medical_abstract
0                5  Tissue changes around loose prostheses. A cani...
1                1  Neuropeptide Y and neuron-specific enolase lev...
2         

## Text processing

First of all, a preprocessing fo the text is needed to work with more efficiency and have less issues and a better performance; in this case the operation
that are going to be implemented to the text are the following:

Lowercasing: Convert all text to lowercase to ensure uniformity.
Tokenization: Split the text into individual words or tokens.
Removing Stopwords: Eliminate common words like "and," "the," etc., that do nott add much value.
Stemming or Lemmatization: Reduce words to their root forms.
Removing Punctuation: Get rid of unnecessary symbols like periods, commas, etc.

However we need first to install the nltk library or in this case already installed in the machine

In [18]:
#!pip install nltk

In [3]:
# Downloading the necessary resources to tokenize and stopwords
nltk.download('punkt_tab', force=True)
nltk.download('stopwords')

# Initialize the stemmer and stop words
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Here is the processing function for the text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Tokenize the text into words
    tokens = word_tokenize(text)
    
    # Remove punctuation and stopwords
    tokens = [word for word in tokens if word not in string.punctuation and word not in stop_words]
    
    # Apply stemming to reduce words to their root form
    tokens = [stemmer.stem(word) for word in tokens]
    
    # Join tokens back into a string
    return ' '.join(tokens)

# Applying preprocessing to the 'medical_abstract' column in both training and test datasets
df3['cleaned_text'] = df3['medical_abstract'].apply(preprocess_text)  # Training set
df2['cleaned_text'] = df2['medical_abstract'].apply(preprocess_text)  # Test set

# Preview the cleaned data
print("\nTraining Set with Cleaned Text:")
print(df3[['medical_abstract', 'cleaned_text']].head())

print("\nTest Set with Cleaned Text:")
print(df2[['medical_abstract', 'cleaned_text']].head())


[nltk_data] Downloading package punkt_tab to /Users/Eleve/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /Users/Eleve/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Training Set with Cleaned Text:
                                    medical_abstract  \
0  Tissue changes around loose prostheses. A cani...   
1  Neuropeptide Y and neuron-specific enolase lev...   
2  Sexually transmitted diseases of the colon, re...   
3  Lipolytic factors associated with murine and h...   
4  Does carotid restenosis predict an increased r...   

                                        cleaned_text  
0  tissu chang around loos prosthes canin model i...  
1  neuropeptid neuron-specif enolas level benign ...  
2  sexual transmit diseas colon rectum anu challe...  
3  lipolyt factor associ murin human cancer cache...  
4  carotid restenosi predict increas risk late sy...  

Test Set with Cleaned Text:
                                    medical_abstract  \
0  Obstructive sleep apnea following topical orop...   
1  Neutrophil function and pyogenic infections in...   
2  A phase II study of combined methotrexate and ...   
3  Flow cytometric DNA analysis of parathyroid 

### Dataset splitting
slip the training file(df3) because the test file will be used only at the end, the training file will be divided into 80% training and 20% validation set

In [None]:
#!pip install scikit-learn

In [4]:
#Splitting the Data into Training and Validation Sets and in 2 parts each because each set has labels and input data, df3 is the training csv
X_train, X_val, y_train, y_val = train_test_split(df3['cleaned_text'], df3['condition_label'], test_size=0.2, random_state=42)

#checking if the split has worked as 80/20 
print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")


Training set size: 9240
Validation set size: 2310


# Bag-of-Words (TF-IDF) representation

TF-IDF (Term Frequency-Inverse Document Frequency) will convert the text into numerical vectors based on the importance of each word in the corpus.


In [41]:
# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit on the training set and transform both training and validation sets
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

# Show shape of the resulting matrices
print(X_train_tfidf.shape)
print(X_val_tfidf.shape)

(9240, 24928)
(2310, 24928)


## Naive Bayes (Training and Evaluation)

In [8]:
# Train Naive Bayes on the training set
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Predict on the validation set
y_pred = nb_model.predict(X_val_tfidf)

# Evaluate the model's performance
print("Naive Bayes (TF-IDF) Performance:")
print(classification_report(y_val, y_pred))


Naive Bayes (TF-IDF) Performance:
              precision    recall  f1-score   support

           1       0.72      0.61      0.66       479
           2       1.00      0.00      0.01       224
           3       0.30      0.01      0.02       295
           4       0.70      0.53      0.60       520
           5       0.41      0.78      0.54       792

    accuracy                           0.51      2310
   macro avg       0.63      0.39      0.37      2310
weighted avg       0.58      0.51      0.46      2310



## Top words

In [72]:
# ------------- config -------------
TOP_K = 10
OUTDIR = Path("figures/nb_tfidf")
OUTDIR.mkdir(parents=True, exist_ok=True)
CSV_PATH = OUTDIR / "nb_tfidf_topwords.csv"

# ------------- get names & scores -------------
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())
class_names = [str(c) for c in nb_model.classes_]  # e.g., ['1','2','3','4','5']

# log P(term | class)
scores = nb_model.feature_log_prob_.copy()  # shape [n_classes, n_features]


# ------------- build a tidy table (also saved as CSV) -------------
rows = []
for ci, cname in enumerate(class_names):
    ci_scores = scores[ci]
    top_idx = np.argsort(ci_scores)[-TOP_K:][::-1]
    for rank, j in enumerate(top_idx, start=1):
        rows.append({
            "class": cname,
            "rank": rank,
            "term": feature_names[j],
            "score": float(ci_scores[j])
        })

df = pd.DataFrame(rows)
df.to_csv(CSV_PATH, index=False)
print(f"Saved CSV -> {CSV_PATH}")

# ------------- make one plot per class -------------
for cname in class_names:
    sub = df[df["class"] == cname].sort_values("score")
    terms = sub["term"].values
    vals  = sub["score"].values

    plt.figure(figsize=(7, 4))
    plt.barh(terms, vals)
    plt.xlabel(r"$\log P(\mathrm{term}\mid \mathrm{class})$")
    plt.title(f"Naïve Bayes (TF-IDF): Top terms — class {cname}")
    plt.tight_layout()
    outpath = OUTDIR / f"nb_tfidf_topwords_{cname}.png"
    plt.savefig(outpath, dpi=220)
    plt.close()
    print(f"Saved figure -> {outpath}")

print("Done.")

Saved CSV -> figures/nb_tfidf/nb_tfidf_topwords.csv
Saved figure -> figures/nb_tfidf/nb_tfidf_topwords_1.png
Saved figure -> figures/nb_tfidf/nb_tfidf_topwords_2.png
Saved figure -> figures/nb_tfidf/nb_tfidf_topwords_3.png
Saved figure -> figures/nb_tfidf/nb_tfidf_topwords_4.png
Saved figure -> figures/nb_tfidf/nb_tfidf_topwords_5.png
Done.


## Decision Tree (Training and Evaluation)

In [17]:
# Train a Decision Tree classifier
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_tfidf, y_train)

# Predict on the validation set
y_pred = dt_model.predict(X_val_tfidf)

# Evaluate the model's performance
print("Decision Tree (TF-IDF) Performance:")
print(classification_report(y_val, y_pred))


Decision Tree (TF-IDF) Performance:
              precision    recall  f1-score   support

           1       0.57      0.62      0.59       479
           2       0.38      0.45      0.41       224
           3       0.30      0.36      0.33       295
           4       0.52      0.52      0.52       520
           5       0.34      0.28      0.31       792

    accuracy                           0.43      2310
   macro avg       0.42      0.45      0.43      2310
weighted avg       0.43      0.43      0.43      2310



### Small decision tree for better visualization in the plot tree

In [25]:
dt_small = DecisionTreeClassifier(max_depth=3, random_state=42)
dt_small.fit(X_train_tfidf, y_train)

In [29]:
plt.figure(figsize=(16, 8))
plot_tree(dt_small, feature_names=tfidf_vectorizer.get_feature_names_out(),
          class_names=[str(c) for c in dt_model.classes_],
          filled=True, fontsize=6)
plt.tight_layout()
plt.savefig("figures/dt_tfidf_tree.png", dpi=300)
plt.close()
print("Done.")

Done.


## Support Vector Machine (SVM) (Training and Evaluation)

In [53]:
# Train an SVM classifier
svm_model = LinearSVC(random_state=42)   # linear by design; has coef_
svm_model.fit(X_train_tfidf, y_train)

# Predict on the validation set
y_pred = svm_model.predict(X_val_tfidf)

# Evaluate the model's performance
print("SVM (TF-IDF) Performance:")
print(classification_report(y_val, y_pred))




SVM (TF-IDF) Performance:
              precision    recall  f1-score   support

           1       0.64      0.70      0.67       479
           2       0.45      0.46      0.46       224
           3       0.42      0.44      0.43       295
           4       0.61      0.61      0.61       520
           5       0.44      0.41      0.42       792

    accuracy                           0.52      2310
   macro avg       0.51      0.52      0.52      2310
weighted avg       0.52      0.52      0.52      2310



In [57]:
# feature names
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())

# Output folder
outdir = Path("figures/svm_tfidf")
outdir.mkdir(parents=True, exist_ok=True)

def plot_top_features_svm(class_idx, top_k=10):
    coefs = np.asarray(svm_model.coef_[class_idx]).ravel()   # ensure dense 1D array
    top_pos = np.argsort(coefs)[-top_k:]     # most positive
    top_neg = np.argsort(coefs)[:top_k]      # most negative
    top_features = np.hstack([top_neg, top_pos])

    plt.figure(figsize=(8, 6))
    colors = ["red" if w < 0 else "blue" for w in coefs[top_features]]
    plt.barh(feature_names[top_features], coefs[top_features], color=colors)
    plt.title(f"SVM (TF–IDF): Class {class_idx+1}")
    plt.xlabel("Coefficient value")
    plt.tight_layout()

    # save inside folder
    fname = outdir / f"svm_tfidf_class{class_idx+1}.png"
    plt.savefig(fname, dpi=300)
    plt.close()
    print(f"Saved {fname}")

# Generate plots for all classes
for c in range(len(svm_model.classes_)):
    plot_top_features_svm(c, top_k=10)

print(f"All done. Plots saved in {outdir}")

Saved figures/svm_tfidf/svm_tfidf_class1.png
Saved figures/svm_tfidf/svm_tfidf_class2.png
Saved figures/svm_tfidf/svm_tfidf_class3.png
Saved figures/svm_tfidf/svm_tfidf_class4.png
Saved figures/svm_tfidf/svm_tfidf_class5.png
All done. Plots saved in figures/svm_tfidf


## Logistic Regression (Training and Evaluation)

In [61]:
# Step 1: Train a Logistic Regression model
logreg_model = LogisticRegression(
    max_iter=1000, solver="liblinear", multi_class="ovr", random_state=42
)
logreg_model.fit(X_train_tfidf, y_train)

# Step 2: Make predictions on the validation set
y_pred_logreg = logreg_model.predict(X_val_tfidf)

# Step 3: Evaluate the model's performance
print("Logistic Regression (TF-IDF) Performance:")
logreg_report = classification_report(y_val, y_pred_logreg, output_dict=True)
print(classification_report(y_val, y_pred_logreg))

Logistic Regression (TF-IDF) Performance:
              precision    recall  f1-score   support

           1       0.69      0.77      0.73       479
           2       0.57      0.38      0.46       224
           3       0.52      0.38      0.44       295
           4       0.67      0.68      0.68       520
           5       0.49      0.55      0.52       792

    accuracy                           0.59      2310
   macro avg       0.59      0.55      0.57      2310
weighted avg       0.59      0.59      0.58      2310



In [63]:
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())
class_names   = [str(c) for c in logreg_model.classes_]
outdir = Path("figures/lr_tfidf"); outdir.mkdir(parents=True, exist_ok=True)

# 3) Plot top +/-K features per class and save PNGs
def plot_top_features_lr(class_idx, top_k=10):
    coefs = np.asarray(logreg_model.coef_[class_idx]).ravel()
    top_pos = np.argsort(coefs)[-top_k:]     # strongest positive words for this class
    top_neg = np.argsort(coefs)[:top_k]      # strongest negative (against this class)
    top_idx = np.hstack([top_neg, top_pos])

    plt.figure(figsize=(8, 6))
    colors = ["tab:red" if w < 0 else "tab:blue" for w in coefs[top_idx]]
    plt.barh(feature_names[top_idx], coefs[top_idx], color=colors)
    plt.title(f"Logistic Regression (TF–IDF): Class {class_names[class_idx]}")
    plt.xlabel("Coefficient")
    plt.tight_layout()
    fname = outdir / f"lr_tfidf_class_{class_names[class_idx]}.png"
    plt.savefig(fname, dpi=300); plt.close()
    print(f"Saved {fname}")

for c in range(len(class_names)):
    plot_top_features_lr(c, top_k=10)

print(f"All LR plots saved in: {outdir}")

Saved figures/lr_tfidf/lr_tfidf_class_1.png
Saved figures/lr_tfidf/lr_tfidf_class_2.png
Saved figures/lr_tfidf/lr_tfidf_class_3.png
Saved figures/lr_tfidf/lr_tfidf_class_4.png
Saved figures/lr_tfidf/lr_tfidf_class_5.png
All LR plots saved in: figures/lr_tfidf


## Cross validation

In [None]:
cv_scores_nb_tfidf = cross_val_score(nb_model, X_train_tfidf, y_train, cv=5, scoring='accuracy', n_jobs=-1)
cv_scores_dt_tfidf = cross_val_score(dt_model, X_train_tfidf, y_train, cv=5, scoring='accuracy', n_jobs=-1)
cv_scores_svm_tfidf = cross_val_score(svm_model, X_train_tfidf, y_train, cv=5, scoring='accuracy', n_jobs=-1)
cv_scores_logreg_tfidf = cross_val_score(logreg_model, X_train_tfidf, y_train, cv=5, scoring='accuracy', n_jobs=-1)
print("TF-IDF Cross-Validation Accuracy:\n"
      f"Naive Bayes: {cv_scores_nb_tfidf.mean():.4f}, "
      f"Decision Tree: {cv_scores_dt_tfidf.mean():.4f}, "
      f"SVM: {cv_scores_svm_tfidf.mean():.4f}, "
      f"Logistic Regression: {cv_scores_logreg_tfidf.mean():.4f}")

## Comparison of the result with TF-IDF

In [None]:
# Dictionary to store model results
results = {
    'Model': [],
    'Accuracy': [],
    'Precision (weighted avg)': [],
    'Recall (weighted avg)': [],
    'F1-Score (weighted avg)': []
}

# Helper function to evaluate and store model results
def evaluate_model(model_name, model, X_train, y_train, X_val, y_val):
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on validation set
    y_pred = model.predict(X_val)
    
    # Get classification report
    report = classification_report(y_val, y_pred, output_dict=True)
    
    # Store the results
    results['Model'].append(model_name)
    results['Accuracy'].append(accuracy_score(y_val, y_pred))
    results['Precision (weighted avg)'].append(report['weighted avg']['precision'])
    results['Recall (weighted avg)'].append(report['weighted avg']['recall'])
    results['F1-Score (weighted avg)'].append(report['weighted avg']['f1-score'])

# 1. Multinomial Naive Bayes
nb_model = MultinomialNB()
evaluate_model("Multinomial Naive Bayes", nb_model, X_train_tfidf, y_train, X_val_tfidf, y_val)

# 2. Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
evaluate_model("Decision Tree", dt_model, X_train_tfidf, y_train, X_val_tfidf, y_val)

# 3. Support Vector Machine (SVM)
svm_model = SVC(kernel='linear', random_state=42)
evaluate_model("Support Vector Machine", svm_model, X_train_tfidf, y_train, X_val_tfidf, y_val)

# 4. Logistic Regression
logreg_model = LogisticRegression(random_state=42)
evaluate_model("Logistic Regression", logreg_model, X_train_tfidf, y_train, X_val_tfidf, y_val)


In [None]:
# Convert the results dictionary into a DataFrame
results_df_tfidf = pd.DataFrame(results)

# Set up the figure for the heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(results_df_tfidf.set_index('Model').T, annot=True, cmap='viridis', linewidths=0.5, fmt=".2f")
plt.title('Comparison of Model Performance using TF-IDF Feature Representation')
plt.show()


# Word Embeddings (Word2Vec) representation

We will use the Gensim library to train a Word2Vec model and create word embeddings for the training and validation sets.
take out the comment if gensim is not already installed in your machine

check version and the inner acrchitecture of word2vec after

In [None]:
#!pip install gensim

In [65]:
df3['tokenized_text'] = df3['cleaned_text'].apply(lambda x: x.split())
df2['tokenized_text'] = df2['cleaned_text'].apply(lambda x: x.split())

# Train a Word2Vec model on the tokenized training data
word2vec_model = Word2Vec(sentences=df3['tokenized_text'], vector_size=100, window=5, min_count=1, workers=4)

Now that we have a trained Word2Vec model, we need to convert each document (list of words) into a single vector. This is typically done by averaging the word vectors for each word in the document, here is how to do so:

In [66]:
# Step 1: Split the Data
X_train, X_val, y_train, y_val = train_test_split(df3['cleaned_text'], df3['condition_label'], test_size=0.2, random_state=42)

# Step 2: Tokenize the Cleaned Text for Training and Validation Sets
X_train_tokenized = X_train.apply(lambda x: x.split())
X_val_tokenized = X_val.apply(lambda x: x.split())

# Step 3: Train a Word2Vec Model on the Tokenized Training Data
word2vec_model = Word2Vec(sentences=X_train_tokenized, vector_size=100, window=5, min_count=1, workers=4)

# Step 4: Generate Word2Vec Embeddings for Each Document
def get_word2vec_embeddings(text, model):
    vectors = [model.wv[word] for word in text if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(100)  # Return a vector of zeros if the document has no known words

# Apply the embedding function to the training and validation sets
X_train_word2vec = X_train_tokenized.apply(lambda x: get_word2vec_embeddings(x, word2vec_model))
X_val_word2vec = X_val_tokenized.apply(lambda x: get_word2vec_embeddings(x, word2vec_model))

# Convert the results to NumPy arrays for use in the models
X_train_word2vec = np.vstack(X_train_word2vec)
X_val_word2vec = np.vstack(X_val_word2vec)

# Print shapes to verify consistency
print("X_train_word2vec shape:", X_train_word2vec.shape)
print("y_train shape:", y_train.shape)
print("X_val_word2vec shape:", X_val_word2vec.shape)
print("y_val shape:", y_val.shape)


X_train_word2vec shape: (9240, 100)
y_train shape: (9240,)
X_val_word2vec shape: (2310, 100)
y_val shape: (2310,)


## Gaussian Naive Bayes with Word2Vec

To use Naive Bayes with Word2Vec, we will use Gaussian Naive Bayes, which is designed for continuous data like the dense embeddings from Word2Vec.

In [69]:
# Train a Gaussian Naive Bayes classifier using Word2Vec embeddings
gnb_model = GaussianNB()
gnb_model.fit(X_train_word2vec, y_train)

# Predict on the validation set
y_pred_gnb = gnb_model.predict(X_val_word2vec)

# Evaluate the model's performance
print("Gaussian Naive Bayes (Word2Vec) Performance:")
print(classification_report(y_val, y_pred_gnb))


Gaussian Naive Bayes (Word2Vec) Performance:
              precision    recall  f1-score   support

           1       0.62      0.70      0.66       479
           2       0.34      0.43      0.38       224
           3       0.31      0.59      0.41       295
           4       0.59      0.71      0.64       520
           5       0.45      0.18      0.25       792

    accuracy                           0.48      2310
   macro avg       0.46      0.52      0.47      2310
weighted avg       0.49      0.48      0.46      2310



In [46]:
OUTDIR = Path("figures/nb_word2vec_lime")
OUTDIR.mkdir(parents=True, exist_ok=True)

class_names = [str(c) for c in gnb_model.classes_]
label_to_idx = {c: i for i, c in enumerate(gnb_model.classes_)}  # if you ever need true->idx

explainer = LimeTextExplainer(class_names=class_names)

sample_indices = [0, 1, 2]  # pick any you like
for i in sample_indices:
    raw_text = X_val.iloc[i] if hasattr(X_val, "iloc") else X_val[i]

    # 1) get predicted label index for THIS text
    proba = predict_proba_word2vec([raw_text])[0]
    pred_idx = int(np.argmax(proba))

    # 2) ask LIME to explain that specific label
    exp = explainer.explain_instance(
        raw_text,
        predict_proba_word2vec,
        num_features=10,
        labels=[pred_idx]          # <-- key fix
    )

    # 3) save interactive HTML
    exp.save_to_file(str(OUTDIR / f"nb_word2vec_lime_{i+1}.html"))

    # 4) save static PNG for the same label
    token_weights = exp.as_list(label=pred_idx)  # use predicted label index
    if token_weights:
        terms, weights = zip(*token_weights)
        plt.figure(figsize=(8, 4))
        plt.barh(list(terms)[::-1], list(weights)[::-1])
        plt.xlabel("LIME weight (contribution to predicted class)")
        plt.title(f"LIME — Word2Vec + GaussianNB (sample {i+1})\nPredicted: {class_names[pred_idx]}")
        plt.tight_layout()
        plt.savefig(OUTDIR / f"nb_word2vec_lime_{i+1}.png", dpi=220)
        plt.close()

    print(f"Saved HTML/PNG for sample {i+1} (pred: {class_names[pred_idx]})")

Saved HTML/PNG for sample 1 (pred: 3)
Saved HTML/PNG for sample 2 (pred: 1)
Saved HTML/PNG for sample 3 (pred: 5)


## Decision Tree with Word2Vec

In [71]:
# Train a Decision Tree classifier using Word2Vec embeddings
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_word2vec, y_train)

# Predict on the validation set
y_pred_dt = dt_model.predict(X_val_word2vec)

# Evaluate the model's performance
print("Decision Tree (Word2Vec) Performance:")
print(classification_report(y_val, y_pred_dt))


Decision Tree (Word2Vec) Performance:
              precision    recall  f1-score   support

           1       0.50      0.55      0.52       479
           2       0.20      0.25      0.22       224
           3       0.21      0.24      0.23       295
           4       0.46      0.45      0.45       520
           5       0.27      0.23      0.25       792

    accuracy                           0.35      2310
   macro avg       0.33      0.34      0.33      2310
weighted avg       0.35      0.35      0.35      2310



In [73]:
OUTDIR = Path("figures/dt_word2vec")
OUTDIR.mkdir(parents=True, exist_ok=True)

# ----- train a shallow tree for visualization
dt_shallow = DecisionTreeClassifier(
    max_depth=3, min_samples_split=20, min_samples_leaf=10, random_state=42
)
dt_shallow.fit(X_train_word2vec, y_train)

# ----- feature & class names
d = X_train_word2vec.shape[1]                # e.g., 100
feat_names = [f"w2v_dim_{i}" for i in range(d)]
class_names = [str(c) for c in dt_shallow.classes_]

# ----- plot and save the tree
plt.figure(figsize=(14, 10))
plot_tree(
    dt_shallow,
    feature_names=feat_names,
    class_names=class_names,
    filled=True,
    rounded=True,
    impurity=True,
    proportion=True,
    fontsize=9,
)
plt.tight_layout()
plt.savefig(OUTDIR / "dt_w2v_tree.png", dpi=300)
plt.close()
print("Saved:", OUTDIR / "dt_w2v_tree.png")

Saved: figures/dt_word2vec/dt_w2v_tree.png


## Support Vector Machine (SVM) with Word2Vec

In [75]:
# Train an SVM classifier using Word2Vec embeddings
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_word2vec, y_train)

# Predict on the validation set
y_pred_svm = svm_model.predict(X_val_word2vec)

# Evaluate the model's performance
print("SVM (Word2Vec) Performance:")
print(classification_report(y_val, y_pred_svm))


SVM (Word2Vec) Performance:
              precision    recall  f1-score   support

           1       0.69      0.73      0.71       479
           2       0.52      0.29      0.37       224
           3       0.53      0.31      0.39       295
           4       0.68      0.67      0.67       520
           5       0.47      0.59      0.52       792

    accuracy                           0.57      2310
   macro avg       0.58      0.52      0.53      2310
weighted avg       0.58      0.57      0.56      2310



In [81]:
svm_cal = CalibratedClassifierCV(estimator=svm_model, cv=5)
svm_cal.fit(X_train_word2vec, y_train)
class_names = [str(c) for c in svm_cal.classes_]

# 1) Adapter that reuses YOUR embedding function (no new helper logic)
emb_dim = X_train_word2vec.shape[1]

def predict_proba_from_text(texts):
    # reuse your function: it expects tokens, so pass text.split()
    vecs = [get_word2vec_embeddings(t.split(), word2vec_model) for t in texts]
    X = np.vstack(vecs).reshape(len(vecs), emb_dim)
    return svm_cal.predict_proba(X)

# 2) Choose one representative validation sample per class (prefer model-predicted)
OUTDIR = Path("figures/svm_word2vec_lime"); OUTDIR.mkdir(parents=True, exist_ok=True)
PNG_PATH = OUTDIR / "svm_w2v_lime_5classes.png"
TOP_K = 10

explainer = LimeTextExplainer(class_names=class_names)

probas_all = predict_proba_from_text(list(X_val if hasattr(X_val, "__iter__") else X_val.values))
pred_idxs = np.argmax(probas_all, axis=1)

indices_by_class = {i: [] for i in range(len(class_names))}
for idx, p in enumerate(pred_idxs):
    indices_by_class[p].append(idx)

y_list = list(y_val)
chosen = []
for i in range(len(class_names)):
    if indices_by_class[i]:
        chosen.append(indices_by_class[i][0])   # predicted as class i
    else:
        # fallback: first ground-truth example of class i
        try:
            j = next(k for k, y in enumerate(y_list) if str(y) == class_names[i])
        except StopIteration:
            j = 0
        chosen.append(j)

# 3) Build one PNG with 5 token-importance subplots (readable words)
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 10))
axes = axes.ravel()

for i, ax in enumerate(axes[:5]):
    idx = chosen[i]
    text = X_val.iloc[idx] if hasattr(X_val, "iloc") else X_val[idx]
    pred_idx = int(np.argmax(predict_proba_from_text([text])[0]))

    exp = explainer.explain_instance(
        text, predict_proba_from_text, num_features=TOP_K, labels=[pred_idx]
    )
    pairs = exp.as_list(label=pred_idx)
    terms, weights = zip(*pairs) if pairs else (["(no tokens)"], [0.0])

    ax.barh(list(terms)[::-1], list(weights)[::-1])
    ax.set_title(f"Class {class_names[pred_idx]} — sample {idx}")
    ax.set_xlabel("LIME weight")

axes[-1].axis("off")
plt.tight_layout()
plt.savefig(PNG_PATH, dpi=300)
plt.close()
print(f"Saved SVM+Word2Vec LIME figure -> {PNG_PATH}")

Saved SVM+Word2Vec LIME figure -> figures/svm_word2vec_lime/svm_w2v_lime_5classes.png


## Logistic Regression with Word2Vec

In [83]:
# Train a Logistic Regression classifier using Word2vec embeddings
logreg_model = LogisticRegression(max_iter=1000, random_state=42)
logreg_model.fit(X_train_word2vec, y_train)

# Predict on the validatio set
y_pred_logreg = logreg_model.predict(X_val_word2vec)

# Evaluate the model's performance
print("Logistic Regression (Word2Vec) Performance:")
print(classification_report(y_val, y_pred_logreg))

Logistic Regression (Word2Vec) Performance:
              precision    recall  f1-score   support

           1       0.70      0.70      0.70       479
           2       0.54      0.31      0.40       224
           3       0.51      0.37      0.43       295
           4       0.68      0.66      0.67       520
           5       0.47      0.59      0.52       792

    accuracy                           0.57      2310
   macro avg       0.58      0.52      0.54      2310
weighted avg       0.58      0.57      0.57      2310



In [85]:
OUTDIR = Path("figures/lr_word2vec_lime"); OUTDIR.mkdir(parents=True, exist_ok=True)
PNG_PATH = OUTDIR / "lr_w2v_lime_5classes.png"
TOP_K = 10

class_names = [str(c) for c in logreg_model.classes_]
explainer = LimeTextExplainer(class_names=class_names)
emb_dim = X_train_word2vec.shape[1]

# --- Adapter: raw text -> your Word2Vec doc vector -> predict_proba
def predict_proba_from_text(texts):
    vecs = [get_word2vec_embeddings(t.split(), word2vec_model) for t in texts]
    X = np.vstack(vecs).reshape(len(vecs), emb_dim)
    return logreg_model.predict_proba(X)

# --- Pick one representative validation sample per class (prefer model-predicted)
probas_all = predict_proba_from_text(list(X_val if hasattr(X_val, "__iter__") else X_val.values))
pred_idxs = np.argmax(probas_all, axis=1)

indices_by_class = {i: [] for i in range(len(class_names))}
for idx, p in enumerate(pred_idxs):
    indices_by_class[p].append(idx)

y_list = list(y_val)
chosen = []
for i in range(len(class_names)):
    if indices_by_class[i]:
        chosen.append(indices_by_class[i][0])
    else:
        # fallback: first ground-truth example of that class
        try:
            j = next(k for k, y in enumerate(y_list) if str(y) == class_names[i])
        except StopIteration:
            j = 0
        chosen.append(j)

# --- Build and save the 5-subplot PNG
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 10))
axes = axes.ravel()

for i, ax in enumerate(axes[:5]):
    idx = chosen[i]
    text = X_val.iloc[idx] if hasattr(X_val, "iloc") else X_val[idx]
    pred_idx = int(np.argmax(predict_proba_from_text([text])[0]))

    exp = explainer.explain_instance(
        text, predict_proba_from_text, num_features=TOP_K, labels=[pred_idx]
    )
    items = exp.as_list(label=pred_idx)
    terms, weights = zip(*items) if items else (["(no tokens)"], [0.0])

    ax.barh(list(terms)[::-1], list(weights)[::-1])
    ax.set_title(f"Class {class_names[pred_idx]} — sample {idx}")
    ax.set_xlabel("LIME weight")

axes[-1].axis("off")
plt.tight_layout()
plt.savefig(PNG_PATH, dpi=300)
plt.close()
print(f"Saved LR+Word2Vec LIME figure -> {PNG_PATH}")


Saved LR+Word2Vec LIME figure -> figures/lr_word2vec_lime/lr_w2v_lime_5classes.png


## Cross validation

In [None]:
cv_scores_nb_word2vec = cross_val_score(gnb_model, X_train_word2vec, y_train, cv=5, scoring='accuracy', n_jobs=-1)
cv_scores_dt_word2vec = cross_val_score(dt_model, X_train_word2vec, y_train, cv=5, scoring='accuracy', n_jobs=-1)
cv_scores_svm_word2vec = cross_val_score(svm_model, X_train_word2vec, y_train, cv=5, scoring='accuracy', n_jobs=-1)
cv_scores_logreg_word2vec = cross_val_score(logreg_model, X_train_word2vec, y_train, cv=5, scoring='accuracy', n_jobs=-1)

print("Word2Vec Cross-Validation Accuracy:\n"
      f"Naive Bayes: {cv_scores_nb_word2vec.mean():.4f}, "
      f"Decision Tree: {cv_scores_dt_word2vec.mean():.4f}, "
      f"SVM: {cv_scores_svm_word2vec.mean():.4f}, "
      f"Logistic Regression: {cv_scores_logreg_word2vec.mean():.4f}")


## Comparison of the result with Word2vec

In [None]:
# Dictionary to store model results
results_word2vec = {
    'Model': [],
    'Accuracy': [],
    'Precision (weighted avg)': [],
    'Recall (weighted avg)': [],
    'F1-Score (weighted avg)': []
}

# Helper function to evaluate and store model results
def evaluate_model_word2vec(model_name, model, X_train, y_train, X_val, y_val):
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on validation set
    y_pred = model.predict(X_val)
    
    # Get classification report
    report = classification_report(y_val, y_pred, output_dict=True)
    
    # Store the results
    results_word2vec['Model'].append(model_name)
    results_word2vec['Accuracy'].append(accuracy_score(y_val, y_pred))
    results_word2vec['Precision (weighted avg)'].append(report['weighted avg']['precision'])
    results_word2vec['Recall (weighted avg)'].append(report['weighted avg']['recall'])
    results_word2vec['F1-Score (weighted avg)'].append(report['weighted avg']['f1-score'])

# 1. Gaussian Naive Bayes
nb_model = GaussianNB()
evaluate_model_word2vec("Gaussian Naive Bayes", nb_model, X_train_word2vec, y_train, X_val_word2vec, y_val)

# 2. Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
evaluate_model_word2vec("Decision Tree", dt_model, X_train_word2vec, y_train, X_val_word2vec, y_val)

# 3. Support Vector Machine (SVM)
svm_model = SVC(kernel='linear', random_state=42)
evaluate_model_word2vec("Support Vector Machine", svm_model, X_train_word2vec, y_train, X_val_word2vec, y_val)

# 4. Logistic Regression
logreg_model = LogisticRegression(max_iter=500, random_state=42)
evaluate_model_word2vec("Logistic Regression", logreg_model, X_train_word2vec, y_train, X_val_word2vec, y_val)

In [None]:
# Convert the results dictionary into a DataFrame
results_df_word2vec = pd.DataFrame(results_word2vec)

# Set up the figure for the heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(results_df_word2vec.set_index('Model').T, annot=True, cmap='viridis', linewidths=0.5, fmt=".2f")
plt.title('Comparison of Model Performance using Word2Vec Feature Representation')
plt.show()


# Word Embeddings (GloVe) representation

GloVe creates word embeddings by analyzing the global co-occurrence of words across a corpus, while Word2Vec uses a predictive approach that learns word representations based on their local context (neighboring words). GloVe focuses on capturing global relationships between words, whereas Word2Vec relies on learning patterns through context-based predictions.

## Load GloVe embeddings

They provide a way to represent words in a numerical form that can capture semantic meanings and relationships. These embeddings, which are pre-trained on large text corpora, help the machine learning models understand the context and relationships between words without needing to train embeddings from scratch. By loading GloVe embeddings, we can leverage this pre-trained knowledge to improve the performance of our models in text classification tasks like the one in your project.

In [9]:
def load_glove_vectors(glove_file_path):
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            embeddings_index[word] = vector
    return embeddings_index

glove_file_path = "GloVe-files/glove.6B.100d.txt" 
glove_embeddings = load_glove_vectors(glove_file_path)

The choice of usage of 100 dimensional GloVe vectors instead is due to the capacity to capture important information without requiring an excessive amount of computational power.
Higher-dimensional vectors (e.g., 200d or 300d) might offer more detailed word relationships, but they also come with increased memory and processing requirements.

## Convert cleaned text to GloVe embeddings

In [10]:
def get_glove_embeddings(text, glove_embeddings, embedding_dim=100):
    words = text.split()
    vectors = [glove_embeddings[word] for word in words if word in glove_embeddings]
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(embedding_dim)

## Apply the GloVe embeddings to training and validation datasets

In [13]:
X_train_glove = np.vstack(X_train.apply(lambda x: get_glove_embeddings(x, glove_embeddings)))
X_val_glove = np.vstack(X_val.apply(lambda x: get_glove_embeddings(x, glove_embeddings)))

## Train and evaluate the models

The text has been treated in order to be compatible with the models, Naives Bayes, Decision Tree and SVM

### Naive Bayes

In [15]:
gnb_model = GaussianNB()
gnb_model.fit(X_train_glove, y_train)
y_pred_gnb = gnb_model.predict(X_val_glove)
print("GloVe + Gaussian Naive Bayes Performance:")
print(classification_report(y_val, y_pred_gnb))

GloVe + Gaussian Naive Bayes Performance:
              precision    recall  f1-score   support

           1       0.53      0.59      0.56       479
           2       0.28      0.26      0.27       224
           3       0.28      0.47      0.35       295
           4       0.46      0.56      0.51       520
           5       0.41      0.22      0.29       792

    accuracy                           0.41      2310
   macro avg       0.39      0.42      0.40      2310
weighted avg       0.42      0.41      0.40      2310



In [17]:
OUTDIR = Path("figures/nb_glove_lime"); OUTDIR.mkdir(parents=True, exist_ok=True)
PNG_PATH = OUTDIR / "nb_glove_lime_5classes.png"

# -- embed raw texts to averaged GloVe vectors (same logic you used before)
def embed_glove_texts(texts, embedding_dim=100):
    vecs = []
    for t in texts:
        words = t.split()
        wvecs = [glove_embeddings[w] for w in words if w in glove_embeddings]
        vecs.append(np.mean(wvecs, axis=0) if wvecs else np.zeros(embedding_dim, dtype=np.float32))
    return np.vstack(vecs)

def predict_proba_glove(raw_texts):
    X = embed_glove_texts(raw_texts, embedding_dim=100)  # change dim if needed
    return gnb_model.predict_proba(X)

class_names = [str(c) for c in gnb_model.classes_]
explainer = LimeTextExplainer(class_names=class_names)

# -- choose one representative index per class
probas = predict_proba_glove(list(X_val if hasattr(X_val, "__iter__") else X_val.values))
pred_idxs = np.argmax(probas, axis=1)

indices_by_class = {i: [] for i in range(len(class_names))}
for idx, p in enumerate(pred_idxs):
    indices_by_class[p].append(idx)

chosen = []
for i in range(len(class_names)):
    if indices_by_class[i]:             # prefer a sample predicted as class i
        chosen.append(indices_by_class[i][0])
    else:
        # fallback: first ground-truth example of this class
        # (works if y_val is pandas Series or list)
        try:
            # if labels are like [1..5], map to index position
            true_indices = [k for k, y in enumerate(list(y_val)) if str(y) == class_names[i]]
        except Exception:
            true_indices = []
        chosen.append(true_indices[0] if true_indices else 0)

# -- build the 5-subplot figure
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 10))
axes = axes.ravel()

top_k = 10
for i, ax in enumerate(axes[:5]):
    idx = chosen[i]
    text = X_val.iloc[idx] if hasattr(X_val, "iloc") else X_val[idx]

    # explain for the predicted label of this sample
    pred_idx = int(np.argmax(predict_proba_glove([text])[0]))
    exp = explainer.explain_instance(text, predict_proba_glove, num_features=top_k, labels=[pred_idx])
    tok_w = exp.as_list(label=pred_idx)

    terms, weights = zip(*tok_w) if tok_w else (["(no tokens)"], [0.0])
    ax.barh(list(terms)[::-1], list(weights)[::-1])
    ax.set_title(f"Class {class_names[pred_idx]} — sample {idx}")
    ax.set_xlabel("LIME weight")

# hide the unused 6th subplot (bottom-right)
axes[-1].axis("off")

plt.tight_layout()
plt.savefig(PNG_PATH, dpi=300)
plt.close()
print(f"Saved 5-class LIME figure -> {PNG_PATH}")

Saved 5-class LIME figure -> figures/nb_glove_lime/nb_glove_lime_5classes.png


### Decision Tree

In [87]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_glove, y_train)
y_pred_dt = dt_model.predict(X_val_glove)
print("GloVe + Decision Tree Performance:")
print(classification_report(y_val, y_pred_dt))

GloVe + Decision Tree Performance:
              precision    recall  f1-score   support

           1       0.35      0.40      0.38       479
           2       0.06      0.08      0.07       224
           3       0.14      0.18      0.16       295
           4       0.31      0.27      0.29       520
           5       0.23      0.18      0.20       792

    accuracy                           0.24      2310
   macro avg       0.22      0.22      0.22      2310
weighted avg       0.25      0.24      0.24      2310



In [89]:
OUTDIR = Path("figures/dt_glove")
OUTDIR.mkdir(parents=True, exist_ok=True)

# Train a small interpretable tree
dt_shallow = DecisionTreeClassifier(
    max_depth=3, min_samples_split=20, min_samples_leaf=10, random_state=42
)
dt_shallow.fit(X_train_glove, y_train)

# Names
d = X_train_glove.shape[1]
feat_names = [f"glove_dim_{i}" for i in range(d)]
class_names = [str(c) for c in dt_shallow.classes_]

# Plot & save
plt.figure(figsize=(14, 10))
plot_tree(
    dt_shallow,
    feature_names=feat_names,
    class_names=class_names,
    filled=True,
    rounded=True,
    impurity=True,
    proportion=True,
    fontsize=9,
)
plt.tight_layout()
plt.savefig(OUTDIR / "dt_glove_tree.png", dpi=300)
plt.close()
print("Saved:", OUTDIR / "dt_glove_tree.png")

Saved: figures/dt_glove/dt_glove_tree.png


### Support Vector Machine (SVM)

In [91]:
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_glove, y_train)
y_pred_svm = svm_model.predict(X_val_glove)
print("GloVe + Support Vector Machine Performance:")
print(classification_report(y_val, y_pred_svm))

GloVe + Support Vector Machine Performance:
              precision    recall  f1-score   support

           1       0.61      0.68      0.64       479
           2       0.11      0.00      0.01       224
           3       0.56      0.20      0.30       295
           4       0.59      0.54      0.57       520
           5       0.43      0.65      0.52       792

    accuracy                           0.51      2310
   macro avg       0.46      0.41      0.41      2310
weighted avg       0.49      0.51      0.48      2310



In [93]:
svm_cal = CalibratedClassifierCV(estimator=svm_model, cv=5)  # use 'estimator' on sklearn >=1.2
svm_cal.fit(X_train_glove, y_train)
class_names = [str(c) for c in svm_cal.classes_]

# 1) Adapter: raw text -> your GloVe doc vector -> predict_proba
emb_dim = X_train_glove.shape[1]

def predict_proba_from_text(texts):
    vecs = [get_glove_embeddings(t, glove_embeddings, embedding_dim=emb_dim) for t in texts]
    X = np.vstack(vecs).reshape(len(vecs), emb_dim)
    return svm_cal.predict_proba(X)

# 2) Pick one representative validation sample per class (prefer ones predicted as that class)
OUTDIR = Path("figures/svm_glove_lime"); OUTDIR.mkdir(parents=True, exist_ok=True)
PNG_PATH = OUTDIR / "svm_glove_lime_5classes.png"
TOP_K = 10

explainer = LimeTextExplainer(class_names=class_names)

raw_val = list(X_val if hasattr(X_val, "__iter__") else X_val.values)
probas_all = predict_proba_from_text(raw_val)
pred_idxs = np.argmax(probas_all, axis=1)

indices_by_class = {i: [] for i in range(len(class_names))}
for idx, p in enumerate(pred_idxs):
    indices_by_class[p].append(idx)

y_list = list(y_val)
chosen = []
for i in range(len(class_names)):
    if indices_by_class[i]:
        chosen.append(indices_by_class[i][0])    # sample predicted as class i
    else:
        # fallback: first ground-truth example of class i
        try:
            j = next(k for k, y in enumerate(y_list) if str(y) == class_names[i])
        except StopIteration:
            j = 0
        chosen.append(j)

# 3) Build & save the 5-panel PNG
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=( Twelve := 12, Ten := 10 ))  # small trick to keep style
axes = axes.ravel()

for k, ax in enumerate(axes[:5]):
    idx = chosen[k]
    text = X_val.iloc[idx] if hasattr(X_val, "iloc") else X_val[idx]
    pred_idx = int(np.argmax(predict_proba_from_text([text])[0]))

    exp = explainer.explain_instance(text, predict_proba_from_text, num_features=TOP_K, labels=[pred_idx])
    pairs = exp.as_list(label=pred_idx)
    terms, weights = zip(*pairs) if pairs else (["(no tokens)"], [0.0])

    ax.barh(list(terms)[::-1], list(weights)[::-1])
    ax.set_title(f"Class {class_names[pred_idx]} — sample {idx}")
    ax.set_xlabel("LIME weight")

axes[-1].axis("off")
plt.tight_layout()
plt.savefig(PNG_PATH, dpi=300)
plt.close()
print(f"Saved SVM+GloVe LIME figure -> {PNG_PATH}")

Saved SVM+GloVe LIME figure -> figures/svm_glove_lime/svm_glove_lime_5classes.png


### Logistic Regression

In [94]:
logreg_model = LogisticRegression(max_iter=1000, random_state = 42)
logreg_model.fit(X_train_glove, y_train)
y_pred_logreg = logreg_model.predict(X_val_glove)
print("GloVe + Logistic Regression Performance:")
print(classification_report(y_val, y_pred_logreg))

GloVe + Logistic Regression Performance:
              precision    recall  f1-score   support

           1       0.63      0.66      0.64       479
           2       0.45      0.22      0.29       224
           3       0.49      0.30      0.37       295
           4       0.58      0.55      0.57       520
           5       0.44      0.57      0.49       792

    accuracy                           0.51      2310
   macro avg       0.52      0.46      0.47      2310
weighted avg       0.52      0.51      0.51      2310



In [97]:
OUTDIR = Path("figures/lr_glove_lime"); OUTDIR.mkdir(parents=True, exist_ok=True)
PNG_PATH = OUTDIR / "lr_glove_lime_5classes.png"
TOP_K = 10

class_names = [str(c) for c in logreg_model.classes_]
explainer = LimeTextExplainer(class_names=class_names)
emb_dim = X_train_glove.shape[1]

# Adapter: raw text -> your GloVe doc vector -> predict_proba
def predict_proba_from_text(texts):
    vecs = [get_glove_embeddings(t, glove_embeddings, embedding_dim=emb_dim) for t in texts]
    X = np.vstack(vecs).reshape(len(vecs), emb_dim)
    return logreg_model.predict_proba(X)

# Pick one representative validation sample per class (prefer model-predicted examples)
raw_val = list(X_val if hasattr(X_val, "__iter__") else X_val.values)
probas_all = predict_proba_from_text(raw_val)
pred_idxs = np.argmax(probas_all, axis=1)

indices_by_class = {i: [] for i in range(len(class_names))}
for idx, p in enumerate(pred_idxs):
    indices_by_class[p].append(idx)

y_list = list(y_val)
chosen = []
for i in range(len(class_names)):
    if indices_by_class[i]:
        chosen.append(indices_by_class[i][0])
    else:
        # fallback to first ground-truth example of that class
        try:
            j = next(k for k, y in enumerate(y_list) if str(y) == class_names[i])
        except StopIteration:
            j = 0
        chosen.append(j)

# Build & save the 5-panel PNG
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 10))
axes = axes.ravel()

for k, ax in enumerate(axes[:5]):
    idx = chosen[k]
    text = X_val.iloc[idx] if hasattr(X_val, "iloc") else X_val[idx]
    pred_idx = int(np.argmax(predict_proba_from_text([text])[0]))

    exp = explainer.explain_instance(text, predict_proba_from_text, num_features=TOP_K, labels=[pred_idx])
    items = exp.as_list(label=pred_idx)
    terms, weights = zip(*items) if items else (["(no tokens)"], [0.0])

    ax.barh(list(terms)[::-1], list(weights)[::-1])
    ax.set_title(f"Class {class_names[pred_idx]} — sample {idx}")
    ax.set_xlabel("LIME weight")

axes[-1].axis("off")
plt.tight_layout()
plt.savefig(PNG_PATH, dpi=300)
plt.close()
print(f"Saved LR+GloVe LIME figure -> {PNG_PATH}")

Saved LR+GloVe LIME figure -> figures/lr_glove_lime/lr_glove_lime_5classes.png


## Cross validation

In [None]:
cv_scores_nb_glove = cross_val_score(gnb_model, X_train_glove, y_train, cv=5, scoring='accuracy', n_jobs=-1)
cv_scores_dt_glove = cross_val_score(dt_model, X_train_glove, y_train, cv=5, scoring='accuracy', n_jobs=-1)
cv_scores_svm_glove = cross_val_score(svm_model, X_train_glove, y_train, cv=5, scoring='accuracy', n_jobs=-1)
cv_scores_logreg_glove = cross_val_score(logreg_model, X_train_glove, y_train, cv=5, scoring='accuracy', n_jobs=-1)

print("GloVe Cross-Validation Accuracy:\n"
      f"Naive Bayes: {cv_scores_nb_glove.mean():.4f}, "
      f"Decision Tree: {cv_scores_dt_glove.mean():.4f}, "
      f"SVM: {cv_scores_svm_glove.mean():.4f}, "
      f"Logistic Regression: {cv_scores_logreg_glove.mean():.4f}")


## Comparison of results with GloVe

In [None]:
# Dictionary to store GloVe model results
results_glove = {
    'Model': [],
    'Accuracy': [],
    'Precision (weighted avg)': [],
    'Recall (weighted avg)': [],
    'F1-Score (weighted avg)': []
}

# Helper function to evaluate and store GloVe model results
def evaluate_model_glove(model_name, model, X_train, y_train, X_val, y_val):
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on validation set
    y_pred = model.predict(X_val)
    
    # Get classification report
    report = classification_report(y_val, y_pred, output_dict=True)
    
    # Store the results
    results_glove['Model'].append(model_name)
    results_glove['Accuracy'].append(accuracy_score(y_val, y_pred))
    results_glove['Precision (weighted avg)'].append(report['weighted avg']['precision'])
    results_glove['Recall (weighted avg)'].append(report['weighted avg']['recall'])
    results_glove['F1-Score (weighted avg)'].append(report['weighted avg']['f1-score'])

# 1. Gaussian Naive Bayes
nb_model = GaussianNB()
evaluate_model_glove("Gaussian Naive Bayes", nb_model, X_train_glove, y_train, X_val_glove, y_val)

# 2. Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
evaluate_model_glove("Decision Tree", dt_model, X_train_glove, y_train, X_val_glove, y_val)

# 3. Support Vector Machine (SVM)
svm_model = SVC(kernel='linear', random_state=42)
evaluate_model_glove("Support Vector Machine", svm_model, X_train_glove, y_train, X_val_glove, y_val)

# 4. Logistic Regression
logreg_model = LogisticRegression(max_iter = 1000, random_state = 42)
evaluate_model_glove("Logistic Regression", logreg_model, X_train_glove, y_train, X_val_glove, y_val)

In [None]:
# Convert the results dictionary into a DataFrame
results_df_glove = pd.DataFrame(results_glove)

# Set up the figure for the heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(results_df_glove.set_index('Model').T, annot=True, cmap='viridis', linewidths=0.5, fmt=".2f")
plt.title('Comparison of Model Performance using GloVe Feature Representation')
plt.show()


# SBERT model implementation

In [None]:
# pip install sentence-transformers
# pip install ipywidgets

### Import SBERT and Load Pre-trained Model

In [19]:
# Load a pre-trained SBERT model
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

README.md: 0.00B [00:00, ?B/s]

## Generating Sentence Embeddings

In [20]:
# Generate sentence embeddings for the training and validation sets
X_train_embeddings = sbert_model.encode(X_train.tolist(), convert_to_tensor=True)
X_val_embeddings = sbert_model.encode(X_val.tolist(), convert_to_tensor=True)

# Convert embeddings to numpy arrays (optional)
X_train_embeddings = X_train_embeddings.cpu().numpy()
X_val_embeddings = X_val_embeddings.cpu().numpy()

## Train and Evaluate Models

### Naive Bayes

In [21]:
# Train Gaussian Naive Bayes using SBERT embeddings
gnb_model = GaussianNB()
gnb_model.fit(X_train_embeddings, y_train)

# Predict on validation set
y_pred_gnb = gnb_model.predict(X_val_embeddings)

# Evaluate performance
print("SBERT + Gaussian Naive Bayes Performance:")
print(classification_report(y_val, y_pred_gnb))


SBERT + Gaussian Naive Bayes Performance:
              precision    recall  f1-score   support

           1       0.70      0.79      0.74       479
           2       0.46      0.67      0.55       224
           3       0.48      0.62      0.54       295
           4       0.63      0.81      0.71       520
           5       0.58      0.29      0.38       792

    accuracy                           0.59      2310
   macro avg       0.57      0.64      0.59      2310
weighted avg       0.59      0.59      0.57      2310



In [25]:

OUTDIR = Path("figures/nb_sbert_lime"); OUTDIR.mkdir(parents=True, exist_ok=True)
PNG_PATH = OUTDIR / "nb_sbert_lime_5classes.png"
TOP_K = 10

# 1) Embed raw texts with your existing SBERT model
def embed_sbert(texts):
    # Use your same model; returns np.ndarray [n, d]
    return sbert_model.encode(list(texts), convert_to_numpy=True)

# 2) Predict-proba wrapper for LIME (raw text -> proba)
def predict_proba_sbert(raw_texts):
    X = embed_sbert(raw_texts)
    return gnb_model.predict_proba(X)

class_names = [str(c) for c in gnb_model.classes_]
explainer = LimeTextExplainer(class_names=class_names)

# 3) Choose one representative sample per class
probas = predict_proba_sbert(list(X_val if hasattr(X_val, "__iter__") else X_val.values))
pred_idxs = np.argmax(probas, axis=1)

indices_by_class = {i: [] for i in range(len(class_names))}
for idx, p in enumerate(pred_idxs):
    indices_by_class[p].append(idx)

# prefer a sample predicted as that class; fallback to first ground-truth of that class
y_list = list(y_val)
chosen = []
for i in range(len(class_names)):
    if indices_by_class[i]:
        chosen.append(indices_by_class[i][0])
    else:
        try:
            j = next(k for k, y in enumerate(y_list) if str(y) == class_names[i])
        except StopIteration:
            j = 0
        chosen.append(j)

# 4) Build the 5-subplot figure
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 10))
axes = axes.ravel()

for i, ax in enumerate(axes[:5]):
    idx = chosen[i]
    text = X_val.iloc[idx] if hasattr(X_val, "iloc") else X_val[idx]

    # explain for the predicted label of this sample
    pred_idx = int(np.argmax(predict_proba_sbert([text])[0]))
    exp = explainer.explain_instance(text, predict_proba_sbert, num_features=TOP_K, labels=[pred_idx])
    tok_w = exp.as_list(label=pred_idx)
    terms, weights = zip(*tok_w) if tok_w else (["(no tokens)"], [0.0])

    ax.barh(list(terms)[::-1], list(weights)[::-1])
    ax.set_title(f"Class {class_names[pred_idx]} — sample {idx}")
    ax.set_xlabel("LIME weight")

axes[-1].axis("off")  # hide empty 6th tile
plt.tight_layout()
plt.savefig(PNG_PATH, dpi=300)
plt.close()
print(f"Saved 5-class LIME figure -> {PNG_PATH}")

Saved 5-class LIME figure -> figures/nb_sbert_lime/nb_sbert_lime_5classes.png


### Logistic Regression

In [99]:
# Train Logistic Regression using SBERT embeddings
logreg_model = LogisticRegression(max_iter=1000, random_state=42)
logreg_model.fit(X_train_embeddings, y_train)

# Predict on validation set
y_pred_logreg = logreg_model.predict(X_val_embeddings)

# Evaluate performance
print("SBERT + Logistic Regression Performance:")
print(classification_report(y_val, y_pred_logreg))

SBERT + Logistic Regression Performance:
              precision    recall  f1-score   support

           1       0.72      0.79      0.75       479
           2       0.55      0.49      0.52       224
           3       0.57      0.48      0.52       295
           4       0.69      0.76      0.72       520
           5       0.56      0.54      0.55       792

    accuracy                           0.63      2310
   macro avg       0.62      0.61      0.61      2310
weighted avg       0.62      0.63      0.62      2310



In [101]:
OUTDIR = Path("figures/lr_sbert_lime"); OUTDIR.mkdir(parents=True, exist_ok=True)
PNG_PATH = OUTDIR / "lr_sbert_lime_5classes.png"
TOP_K = 10

class_names = [str(c) for c in logreg_model.classes_]
explainer = LimeTextExplainer(class_names=class_names)

emb_dim = X_train_embeddings.shape[1]

# Adapter: raw text -> SBERT embedding -> predict_proba
def predict_proba_from_text(texts):
    # SBERT encode returns numpy if convert_to_tensor=False
    X = sbert_model.encode(list(texts), convert_to_tensor=False, batch_size=64, show_progress_bar=False)
    X = np.asarray(X).reshape(len(texts), emb_dim)
    return logreg_model.predict_proba(X)

# Choose one representative validation sample per class (prefer model-predicted)
raw_val = list(X_val if hasattr(X_val, "__iter__") else X_val.values)
probas_all = predict_proba_from_text(raw_val)
pred_idxs = np.argmax(probas_all, axis=1)

indices_by_class = {i: [] for i in range(len(class_names))}
for idx, p in enumerate(pred_idxs):
    indices_by_class[p].append(idx)

y_list = list(y_val)
chosen = []
for i in range(len(class_names)):
    if indices_by_class[i]:
        chosen.append(indices_by_class[i][0])
    else:
        # fallback: first ground-truth example of that class
        try:
            j = next(k for k, y in enumerate(y_list) if str(y) == class_names[i])
        except StopIteration:
            j = 0
        chosen.append(j)

# Build & save the 5-panel PNG
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 10))
axes = axes.ravel()

for k, ax in enumerate(axes[:5]):
    idx = chosen[k]
    text = X_val.iloc[idx] if hasattr(X_val, "iloc") else X_val[idx]
    pred_idx = int(np.argmax(predict_proba_from_text([text])[0]))

    exp = explainer.explain_instance(
        text, predict_proba_from_text, num_features=TOP_K, labels=[pred_idx]
    )
    items = exp.as_list(label=pred_idx)
    terms, weights = zip(*items) if items else (["(no tokens)"], [0.0])

    ax.barh(list(terms)[::-1], list(weights)[::-1])
    ax.set_title(f"Class {class_names[pred_idx]} — sample {idx}")
    ax.set_xlabel("LIME weight")

axes[-1].axis("off")
plt.tight_layout()
plt.savefig(PNG_PATH, dpi=300)
plt.close()
print(f"Saved LR+SBERT LIME figure -> {PNG_PATH}")

Saved LR+SBERT LIME figure -> figures/lr_sbert_lime/lr_sbert_lime_5classes.png


### Support Vector Machine

In [102]:
# Train SVM using SBERT embeddings
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_embeddings, y_train)

# Predict on validation set
y_pred_svm = svm_model.predict(X_val_embeddings)

# Evaluate performance
print("SBERT + SVM Performance:")
print(classification_report(y_val, y_pred_svm))

SBERT + SVM Performance:
              precision    recall  f1-score   support

           1       0.71      0.82      0.76       479
           2       0.55      0.54      0.54       224
           3       0.55      0.47      0.51       295
           4       0.68      0.79      0.73       520
           5       0.57      0.49      0.53       792

    accuracy                           0.63      2310
   macro avg       0.61      0.62      0.61      2310
weighted avg       0.62      0.63      0.62      2310



In [107]:
svm_cal = CalibratedClassifierCV(estimator=svm_model, cv=5)  # use 'estimator' on sklearn >=1.2
svm_cal.fit(X_train_embeddings, y_train)
class_names = [str(c) for c in svm_cal.classes_]

OUTDIR = Path("figures/svm_sbert_lime"); OUTDIR.mkdir(parents=True, exist_ok=True)
PNG_PATH = OUTDIR / "svm_sbert_lime_5classes.png"
TOP_K = 10

explainer = LimeTextExplainer(class_names=class_names)
emb_dim = X_train_embeddings.shape[1]

# 1) Adapter: raw text -> SBERT embedding -> predict_proba
def predict_proba_from_text(texts):
    X = sbert_model.encode(list(texts), convert_to_tensor=False, batch_size=64, show_progress_bar=False)
    X = np.asarray(X).reshape(len(texts), emb_dim)
    return svm_cal.predict_proba(X)

# 2) Pick one representative validation sample per class (prefer model-predicted)
raw_val = list(X_val if hasattr(X_val, "__iter__") else X_val.values)
probas_all = predict_proba_from_text(raw_val)
pred_idxs = np.argmax(probas_all, axis=1)

indices_by_class = {i: [] for i in range(len(class_names))}
for idx, p in enumerate(pred_idxs):
    indices_by_class[p].append(idx)

y_list = list(y_val)
chosen = []
for i in range(len(class_names)):
    if indices_by_class[i]:
        chosen.append(indices_by_class[i][0])    # predicted as that class
    else:
        # fallback: first ground-truth example of that class
        try:
            j = next(k for k, y in enumerate(y_list) if str(y) == class_names[i])
        except StopIteration:
            j = 0
        chosen.append(j)

# 3) Build and save the 5-panel PNG
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 10))
axes = axes.ravel()

for k, ax in enumerate(axes[:5]):
    idx = chosen[k]
    text = X_val.iloc[idx] if hasattr(X_val, "iloc") else X_val[idx]
    pred_idx = int(np.argmax(predict_proba_from_text([text])[0]))

    exp = explainer.explain_instance(
        text, predict_proba_from_text, num_features=TOP_K, labels=[pred_idx]
    )
    items = exp.as_list(label=pred_idx)
    terms, weights = zip(*items) if items else (["(no tokens)"], [0.0])

    ax.barh(list(terms)[::-1], list(weights)[::-1])
    ax.set_title(f"Class {class_names[pred_idx]} — sample {idx}")
    ax.set_xlabel("LIME weight")

axes[-1].axis("off")
plt.tight_layout()
plt.savefig(PNG_PATH, dpi=300)
plt.close()
print(f"Saved SVM+SBERT LIME figure -> {PNG_PATH}")

Saved SVM+SBERT LIME figure -> figures/svm_sbert_lime/svm_sbert_lime_5classes.png


### Decision Tree

In [103]:
# Train Decision Tree using SBERT embeddings
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_embeddings, y_train)

# Predict on validation set
y_pred_dt = dt_model.predict(X_val_embeddings)

# Evaluate performance
print("SBERT + Decision Tree Performance:")
print(classification_report(y_val, y_pred_dt))

SBERT + Decision Tree Performance:
              precision    recall  f1-score   support

           1       0.48      0.55      0.51       479
           2       0.19      0.23      0.21       224
           3       0.18      0.20      0.19       295
           4       0.47      0.45      0.46       520
           5       0.29      0.24      0.26       792

    accuracy                           0.35      2310
   macro avg       0.32      0.34      0.33      2310
weighted avg       0.35      0.35      0.35      2310



In [109]:
OUTDIR = Path("figures/dt_sbert")
OUTDIR.mkdir(parents=True, exist_ok=True)

# Train a compact tree for visualization (keep your full model for metrics if you want)
dt_shallow = DecisionTreeClassifier(
    max_depth=3, min_samples_split=20, min_samples_leaf=10, random_state=42
)
dt_shallow.fit(X_train_embeddings, y_train)

feat_names  = [f"sbert_dim_{i}" for i in range(X_train_embeddings.shape[1])]
class_names = [str(c) for c in dt_shallow.classes_]

plt.figure(figsize=(14,10))
plot_tree(
    dt_shallow,
    feature_names=feat_names,
    class_names=class_names,
    filled=True, rounded=True,
    impurity=True, proportion=True,
    fontsize=9
)
plt.tight_layout()
plt.savefig(OUTDIR / "dt_sbert_tree.png", dpi=300)
plt.close()
print("Saved:", OUTDIR / "dt_sbert_tree.png")

Saved: figures/dt_sbert/dt_sbert_tree.png


## Cross validation

In [None]:
cv_scores_nb_sbert = cross_val_score(gnb_model, X_train_embeddings, y_train, cv=5, scoring='accuracy', n_jobs=-1)
cv_scores_dt_sbert = cross_val_score(dt_model, X_train_embeddings, y_train, cv=5, scoring='accuracy', n_jobs=-1)
cv_scores_svm_sbert = cross_val_score(svm_model, X_train_embeddings, y_train, cv=5, scoring='accuracy', n_jobs=-1)
cv_scores_logreg_sbert = cross_val_score(logreg_model, X_train_embeddings, y_train, cv=5, scoring='accuracy', n_jobs=-1)

print("SBERT Cross-Validation Accuracy:\n"
      f"Naive Bayes: {cv_scores_nb_sbert.mean():.4f}, "
      f"Decision Tree: {cv_scores_dt_sbert.mean():.4f}, "
      f"SVM: {cv_scores_svm_sbert.mean():.4f}, "
      f"Logistic Regression: {cv_scores_logreg_sbert.mean():.4f}")


## Comparison of result with sentence embedding (SBERT)

In [None]:
# Dictionary to store SBERT model results
results_sbert = {
    'Model': [],
    'Accuracy': [],
    'Precision (weighted avg)': [],
    'Recall (weighted avg)': [],
    'F1-Score (weighted avg)': []
}

# Helper function to evaluate and store SBERT model results
def evaluate_model_sbert(model_name, model, X_train, y_train, X_val, y_val):
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on validation set
    y_pred = model.predict(X_val)
    
    # Get classification report
    report = classification_report(y_val, y_pred, output_dict=True)
    
    # Store the results
    results_sbert['Model'].append(model_name)
    results_sbert['Accuracy'].append(accuracy_score(y_val, y_pred))
    results_sbert['Precision (weighted avg)'].append(report['weighted avg']['precision'])
    results_sbert['Recall (weighted avg)'].append(report['weighted avg']['recall'])
    results_sbert['F1-Score (weighted avg)'].append(report['weighted avg']['f1-score'])

# 1. Gaussian Naive Bayes
gnb_model = GaussianNB()
evaluate_model_sbert("Gaussian Naive Bayes", gnb_model, X_train_embeddings, y_train, X_val_embeddings, y_val)

# 2. Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
evaluate_model_sbert("Decision Tree", dt_model, X_train_embeddings, y_train, X_val_embeddings, y_val)

# 3. Support Vector Machine (SVM)
svm_model = SVC(kernel='linear', random_state=42)
evaluate_model_sbert("Support Vector Machine", svm_model, X_train_embeddings, y_train, X_val_embeddings, y_val)

# 4. Logistic Regression
logreg_model = LogisticRegression(max_iter=1000, random_state=42)
evaluate_model_sbert("Logistic Regression", logreg_model, X_train_embeddings, y_train, X_val_embeddings, y_val)

# Convert results dictionary to DataFrame
results_df_sbert = pd.DataFrame(results_sbert)

# Heatmap of results
plt.figure(figsize=(10, 6))
sns.heatmap(results_df_sbert.set_index('Model').T, annot=True, cmap='viridis', linewidths=0.5, fmt=".2f")
plt.title('Comparison of Model Performance with SBERT Embeddings')
plt.show()

# BioBERT model implementation

In [None]:
#pip install transformers

### Import BioBERT and Load Pre-trained Model

In [33]:
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

### Generating Sentence Embeddings and Tokenization

In [35]:
def get_biobert_embeddings(text_list, tokenizer, model):
    embeddings = []
    for text in text_list:
        # Tokenize the text and generate inputs
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        # Pass inputs through the BioBERT model
        with torch.no_grad():  # Disable gradient calculation
            outputs = model(**inputs)
        # Extract CLS token embeddings (first token)
        cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
        embeddings.append(cls_embedding)
    return np.array(embeddings)

# Generate embeddings for training and validation sets
X_train_biobert = get_biobert_embeddings(X_train.tolist(), tokenizer, model)
X_val_biobert = get_biobert_embeddings(X_val.tolist(), tokenizer, model)


## Train and Evaluate Models

### Naives Bayes

In [37]:
# Train Gaussian Naive Bayes using BioBERT embeddings
gnb_model = GaussianNB()
gnb_model.fit(X_train_biobert, y_train)

# Predict on validation set
y_pred_gnb = gnb_model.predict(X_val_biobert)

# Evaluate performance
print("BioBERT + Gaussian Naive Bayes Performance:")
print(classification_report(y_val, y_pred_gnb))

BioBERT + Gaussian Naive Bayes Performance:
              precision    recall  f1-score   support

           1       0.66      0.53      0.58       479
           2       0.36      0.44      0.40       224
           3       0.28      0.54      0.37       295
           4       0.41      0.67      0.51       520
           5       0.48      0.14      0.22       792

    accuracy                           0.42      2310
   macro avg       0.44      0.46      0.41      2310
weighted avg       0.46      0.42      0.40      2310



In [39]:
OUTDIR = Path("figures/nb_biobert_lime"); OUTDIR.mkdir(parents=True, exist_ok=True)
PNG_PATH = OUTDIR / "nb_biobert_lime_5classes.png"
TOP_K = 10

# 1) Predict-proba wrapper (raw text -> BioBERT embeddings -> NB proba)
def predict_proba_biobert(raw_texts):
    X = get_biobert_embeddings(list(raw_texts), tokenizer, model)  # uses your function
    return gnb_model.predict_proba(X)

class_names = [str(c) for c in gnb_model.classes_]
explainer = LimeTextExplainer(class_names=class_names)

# 2) Choose one representative validation sample per class
probas = predict_proba_biobert(list(X_val if hasattr(X_val, "__iter__") else X_val.values))
pred_idxs = np.argmax(probas, axis=1)

indices_by_class = {i: [] for i in range(len(class_names))}
for idx, p in enumerate(pred_idxs):
    indices_by_class[p].append(idx)

y_list = list(y_val)
chosen = []
for i in range(len(class_names)):
    if indices_by_class[i]:                     # prefer a sample predicted as class i
        chosen.append(indices_by_class[i][0])
    else:
        # fallback: first ground-truth example of this class (string compare is robust)
        try:
            j = next(k for k, y in enumerate(y_list) if str(y) == class_names[i])
        except StopIteration:
            j = 0
        chosen.append(j)

# 3) Build the 5-subplot figure
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 10))
axes = axes.ravel()

for i, ax in enumerate(axes[:5]):
    idx = chosen[i]
    text = X_val.iloc[idx] if hasattr(X_val, "iloc") else X_val[idx]

    pred_idx = int(np.argmax(predict_proba_biobert([text])[0]))
    exp = explainer.explain_instance(text, predict_proba_biobert, num_features=TOP_K, labels=[pred_idx])
    tok_w = exp.as_list(label=pred_idx)
    terms, weights = zip(*tok_w) if tok_w else (["(no tokens)"], [0.0])

    ax.barh(list(terms)[::-1], list(weights)[::-1])
    ax.set_title(f"Class {class_names[pred_idx]} — sample {idx}")
    ax.set_xlabel("LIME weight")

axes[-1].axis("off")  # hide empty 6th tile
plt.tight_layout()
plt.savefig(PNG_PATH, dpi=300)
plt.close()
print(f"Saved BioBERT LIME figure -> {PNG_PATH}")

Saved BioBERT LIME figure -> figures/nb_biobert_lime/nb_biobert_lime_5classes.png


### Decision Tree

In [111]:
# Train Decision Tree using BioBERT embeddings
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_biobert, y_train)

# Predict on validation set
y_pred_dt = dt_model.predict(X_val_biobert)

# Evaluate performance
print("BioBERT + Decision Tree Performance:")
print(classification_report(y_val, y_pred_dt))

BioBERT + Decision Tree Performance:
              precision    recall  f1-score   support

           1       0.34      0.38      0.36       479
           2       0.11      0.14      0.12       224
           3       0.11      0.13      0.12       295
           4       0.32      0.32      0.32       520
           5       0.20      0.16      0.18       792

    accuracy                           0.23      2310
   macro avg       0.22      0.22      0.22      2310
weighted avg       0.23      0.23      0.23      2310



In [115]:
OUTDIR = Path("figures/dt_biobert")
OUTDIR.mkdir(parents=True, exist_ok=True)

# Compact, interpretable tree (keep your full dt_model for metrics if you want)
dt_shallow = DecisionTreeClassifier(
    max_depth=3, min_samples_split=20, min_samples_leaf=10, random_state=42
)
dt_shallow.fit(X_train_biobert, y_train)

feat_names  = [f"biobert_dim_{i}" for i in range(X_train_biobert.shape[1])]
class_names = [str(c) for c in dt_shallow.classes_]

plt.figure(figsize=(14,10))
plot_tree(
    dt_shallow,
    feature_names=feat_names,
    class_names=class_names,
    filled=True, rounded=True,
    impurity=True, proportion=True,
    fontsize=9
)
plt.tight_layout()
plt.savefig(OUTDIR / "dt_biobert_tree.png", dpi=300)
plt.close()
print("Saved:", OUTDIR / "dt_biobert_tree.png")

Saved: figures/dt_biobert/dt_biobert_tree.png


### Logistic Regression

In [112]:
# Train Logistic Regression using BioBERT embeddings
logreg_model = LogisticRegression(max_iter=1500, random_state=42)
logreg_model.fit(X_train_biobert, y_train)

# Predict on validation set
y_pred_logreg = logreg_model.predict(X_val_biobert)

# Evaluate performance
print("BioBERT + Logistic Regression Performance:")
print(classification_report(y_val, y_pred_logreg))

BioBERT + Logistic Regression Performance:
              precision    recall  f1-score   support

           1       0.69      0.73      0.71       479
           2       0.48      0.45      0.46       224
           3       0.45      0.43      0.44       295
           4       0.66      0.65      0.65       520
           5       0.46      0.46      0.46       792

    accuracy                           0.56      2310
   macro avg       0.55      0.54      0.55      2310
weighted avg       0.55      0.56      0.55      2310



In [119]:
OUTDIR = Path("figures/logreg_biobert")
OUTDIR.mkdir(parents=True, exist_ok=True)

feature_names = [f"biobert_dim_{i}" for i in range(X_train_biobert.shape[1])]
class_names   = [str(c) for c in logreg_model.classes_]

def plot_logreg_class_weights(class_idx, top_k=10):
    coefs = logreg_model.coef_[class_idx]
    top_pos = np.argsort(coefs)[-top_k:]
    top_neg = np.argsort(coefs)[:top_k]
    top_features = np.hstack([top_neg, top_pos])
    
    plt.figure(figsize=(8,6))
    colors = ["red" if c < 0 else "blue" for c in coefs[top_features]]
    plt.barh(np.array(feature_names)[top_features], coefs[top_features], color=colors)
    plt.title(f"LogReg (BioBERT) – Class {class_names[class_idx]}")
    plt.xlabel("Coefficient")
    plt.tight_layout()
    plt.savefig(OUTDIR / f"logreg_biobert_class{class_names[class_idx]}.png", dpi=300)
    plt.close()

for c in range(len(class_names)):
    plot_logreg_class_weights(c)

print("Saved coefficient plots in:", OUTDIR)

Saved coefficient plots in: figures/logreg_biobert


### SVM (Support Vector Machine)

In [113]:
# Train SVM using BioBERT embeddings
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_biobert, y_train)

# Predict on validation set
y_pred_svm = svm_model.predict(X_val_biobert)

# Evaluate performance
print("BioBERT + SVM Performance:")
print(classification_report(y_val, y_pred_svm))

BioBERT + SVM Performance:
              precision    recall  f1-score   support

           1       0.66      0.75      0.70       479
           2       0.44      0.43      0.44       224
           3       0.44      0.43      0.43       295
           4       0.64      0.66      0.65       520
           5       0.45      0.41      0.43       792

    accuracy                           0.54      2310
   macro avg       0.53      0.54      0.53      2310
weighted avg       0.53      0.54      0.54      2310



In [121]:
svm_cal = CalibratedClassifierCV(estimator=svm_model, cv=5)   # use 'estimator' on sklearn >=1.2
svm_cal.fit(X_train_biobert, y_train)
class_names = [str(c) for c in svm_cal.classes_]

OUTDIR = Path("figures/svm_biobert_lime"); OUTDIR.mkdir(parents=True, exist_ok=True)
PNG_PATH = OUTDIR / "svm_biobert_lime_5classes.png"
TOP_K = 10

explainer = LimeTextExplainer(class_names=class_names)
emb_dim = X_train_biobert.shape[1]

# 1) Adapter: raw text -> BioBERT CLS embedding -> predict_proba
def predict_proba_from_text(texts):
    X = get_biobert_embeddings(list(texts), tokenizer, model)
    X = np.asarray(X).reshape(len(texts), emb_dim)
    return svm_cal.predict_proba(X)

# 2) Pick one representative validation sample per class (prefer model-predicted)
raw_val = list(X_val if hasattr(X_val, "__iter__") else X_val.values)
probas_all = predict_proba_from_text(raw_val)
pred_idxs = np.argmax(probas_all, axis=1)

indices_by_class = {i: [] for i in range(len(class_names))}
for idx, p in enumerate(pred_idxs):
    indices_by_class[p].append(idx)

y_list = list(y_val)
chosen = []
for i in range(len(class_names)):
    if indices_by_class[i]:
        chosen.append(indices_by_class[i][0])
    else:
        # fallback: first ground-truth example of that class
        try:
            j = next(k for k, y in enumerate(y_list) if str(y) == class_names[i])
        except StopIteration:
            j = 0
        chosen.append(j)

# 3) Build & save the 5-panel PNG
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 10))
axes = axes.ravel()

for k, ax in enumerate(axes[:5]):
    idx = chosen[k]
    text = X_val.iloc[idx] if hasattr(X_val, "iloc") else X_val[idx]
    pred_idx = int(np.argmax(predict_proba_from_text([text])[0]))

    exp = explainer.explain_instance(text, predict_proba_from_text, num_features=TOP_K,  labels=[pred_idx])
    items = exp.as_list(label=pred_idx)
    terms, weights = zip(*items) if items else (["(no tokens)"], [0.0])

    ax.barh(list(terms)[::-1], list(weights)[::-1])
    ax.set_title(f"Class {class_names[pred_idx]} — sample {idx}")
    ax.set_xlabel("LIME weight")

axes[-1].axis("off")
plt.tight_layout()
plt.savefig(PNG_PATH, dpi=300)
plt.close()
print(f"Saved SVM+BioBERT LIME figure -> {PNG_PATH}")

Saved SVM+BioBERT LIME figure -> figures/svm_biobert_lime/svm_biobert_lime_5classes.png


## Cross validation

In [None]:
cv_scores_nb_biobert = cross_val_score(gnb_model, X_train_biobert, y_train, cv=5, scoring='accuracy')
cv_scores_dt_biobert = cross_val_score(dt_model, X_train_biobert, y_train, cv=5, scoring='accuracy')
cv_scores_svm_biobert = cross_val_score(svm_model, X_train_biobert, y_train, cv=5, scoring='accuracy')
cv_scores_logreg_biobert = cross_val_score(logreg_model, X_train_biobert, y_train, cv=5, scoring='accuracy')
print("BioBERT Cross-Validation Accuracy:\n"
      f"Naive Bayes: {cv_scores_nb_biobert.mean():.4f}, "
      f"Decision Tree: {cv_scores_dt_biobert.mean():.4f}, "
      f"SVM: {cv_scores_svm_biobert.mean():.4f}, "
      f"Logistic Regression: {cv_scores_logreg_biobert.mean():.4f}")

## Comparison of result with sentence embedding (BioBERT)

In [None]:
# Define the model names and metrics
models = ["Gaussian Naive Bayes", "Decision Tree", "Support Vector Machine", "Logistic Regression"]
metrics = ["Accuracy", "Precision (weighted avg)", "Recall (weighted avg)", "F1-Score (weighted avg)"]


biobert_results = np.array([
    [0.66, 0.34, 0.69, 0.66],  # Accuracy
    [0.46, 0.22, 0.55, 0.53],  # Precision
    [0.40, 0.23, 0.54, 0.54],  # Recall
    [0.40, 0.23, 0.55, 0.54]   # F1-Score
])


plt.figure(figsize=(10, 5))
sns.heatmap(biobert_results, annot=True, cmap="viridis", xticklabels=models, yticklabels=metrics, cbar=True)
plt.title("Comparison of Model Performance with BioBERT Embeddings")
plt.xlabel("Model")
plt.ylabel("Metric")
plt.show()