In [1]:
# Bag of Words transformation of the partitioned abstracts
# This script uses the CountVectorizer from sklearn to transform the partitioned abstracts into a bag of words representation.

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import random

count_vect = CountVectorizer()

# Gets the data from Data/partitioned_abstracts.csv and puts it into a pandas dataframe
df = pd.read_csv('Data/Partitioned_Abstracts.csv', sep=',', header=0, encoding='utf-8')

# Add target column to the dataframe based on label
# This is used to reduce processing times
label_mapping = {
    'AI and Supply Chain Management': 0,
    'AI and Finance': 1,
    'AI and Marketing': 2,
    'AI and Economics': 3,
    'AI and Accounting': 4
}
df['target'] = df['Label'].map(label_mapping)

#randomize the order of the dataframe using random.shuffle
random.seed(42)  # For reproducibility
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

#remove partitions with less than 100 words
#This step can be added or removed to test accuracy with samples that are shorter than others.
df = df[df['Partitioned Abstract'].str.split().apply(len) >= 100].reset_index(drop=True)


#Transforming to Bag of Words.
x_train_bow = count_vect.fit_transform(df['Partitioned Abstract'])
df
#x_train_counts.shape

Unnamed: 0,Title,Year,Authors,Label,Partitioned Abstract,target
0,Enhancing Investment Analysis: Optimizing AI-A...,2024,"Han, XW; Wang, N; Che, SK; Yang, HY; Zhang, KP...",AI and Finance,recent years application generative artificial...,1
1,Artificial intelligence applications in supply...,2021,"Pournader, M; Ghaderi, H; Hassanzadegan, A; Fa...",AI and Supply Chain Management,paper presents systematic review studies relat...,0
2,Forecasting disruptions in global food value c...,2023,"Tamasiga, P; Ouassou, E; Onyeaka, H; Bakwena, ...",AI and Supply Chain Management,globalization interconnected supply chains led...,0
3,"AI-powered marketing: What, where, and how?*",2024,"Kumar, V; Ashraf, AR; Nadeem, W",AI and Marketing,artificial intelligence ai become disruptive f...,2
4,Artificial intelligence based decision-making ...,2022,"Lehner, OM; Ittonen, K; Silvola, H; Strm, E; ...",AI and Accounting,trustworthiness using rest's component model a...,4
...,...,...,...,...,...,...
738,"The Big Data, Artificial Intelligence, and Blo...",2022,"Gusc, J; Bosma, P; Jarka, S; Biernat-Jarka, A",AI and Accounting,current energy prices include environmental so...,4
739,The Artificial Intelligence Revolution in Digi...,2023,"Al-Baity, HH",AI and Finance,artificial intelligence ai proliferated last y...,1
740,ROLE OF GENETIC-VARIATION AT THE APO AI-CIII-A...,1993,"XU, CF; ANGELICO, F; DELBEN, M; HUMPHRIES, S",AI and Accounting,cholesterol diet however significant associati...,4
741,Exploring volatility interconnections between ...,2024,"Yousaf, I; Ijaz, MS; Umar, M; Li, YS",AI and Economics,energy artificial intelligence ai two top fiel...,3


In [2]:
#TF transformation (NO IDF)
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(x_train_bow)
X_train_tf = tf_transformer.transform(x_train_bow)
X_train_tf.shape


(743, 7810)

In [3]:
#TF-IDF Transformation
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(x_train_bow)
X_train_tfidf.shape

(743, 7810)

In [13]:
# Doc2vec transformation
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk

nltk.download("punkt_tab")

tagged_data = [
    TaggedDocument(
        words=word_tokenize(row["Partitioned Abstract"].lower()), tags=[str(i)]
    )
    for i, row in df.iterrows()
]
model = Doc2Vec(vector_size=100, min_count=2, epochs=100, workers=4, window=5)

model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Get the document vectors
X_train_doc_vectors = [model.dv[str(i)] for i in range(len(tagged_data))]

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\shepo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [14]:
from sentence_transformers import SentenceTransformer

#Embedding transformation using jina embeddings v3
model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)

task = "classification"

embeddings = model.encode(
    df["Partitioned Abstract"].tolist(),
    show_progress_bar=True,
    device="cuda",
    convert_to_tensor=True,
)
embeddings.shape


flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn i

torch.Size([743, 1024])

In [47]:
model2 = SentenceTransformer("all-MiniLM-L6-v2", trust_remote_code=True)


embeddings_2 = model2.encode(
    df["Partitioned Abstract"].tolist(),
    show_progress_bar=True,
    device="cuda",
    convert_to_tensor=True,
)
embeddings_2.shape

Batches: 100%|█████████████████████████████████████████████████████████████████████████| 24/24 [00:01<00:00, 19.09it/s]


torch.Size([743, 384])

In [35]:
model3 = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", trust_remote_code=True)
embeddings_3 = model3.encode(
    df["Partitioned Abstract"].tolist(),
    show_progress_bar=True,
    device="cuda",
    convert_to_tensor=True,
)
embeddings_3.shape

Batches: 100%|█████████████████████████████████████████████████████████████████████████| 24/24 [00:07<00:00,  3.28it/s]


torch.Size([743, 768])

In [44]:
model4 = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L3-v2", trust_remote_code=True)
embeddings_4 = model4.encode(
    df["Partitioned Abstract"].tolist(),
    show_progress_bar=True,
    device="cuda",
    convert_to_tensor=True,
)
embeddings_4.shape

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Batches: 100%|█████████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 32.25it/s]


torch.Size([743, 384])

In [50]:
model5 = SentenceTransformer("allenai/specter2_base", trust_remote_code=True)
embeddings_5 = model5.encode(
    df["Partitioned Abstract"].tolist(),
    show_progress_bar=True,
    device="cuda",
    convert_to_tensor=True,
)
embeddings_5.shape

No sentence-transformers model found with name allenai/specter2_base. Creating a new one with mean pooling.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Batches: 100%|█████████████████████████████████████████████████████████████████████████| 24/24 [00:06<00:00,  3.54it/s]


torch.Size([743, 768])

In [117]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.model_selection import ShuffleSplit
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.linear_model import LogisticRegression
import pprint
import torch
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
import os
import numpy as np
import datetime

# Train an SVM classifier and generate evaluation metrics and ROC curves
def trainSVM(partition, label, folds, model):
    # Build a pipeline with standard scaling and SVM with sigmoid kernel
    clf = make_pipeline(preprocessing.StandardScaler(with_mean=False), svm.SVC(kernel='sigmoid', C=1, probability=True))
    
    # Create k-fold cross-validator
    cv = KFold(n_splits=folds, shuffle=True, random_state=42)
    
    # Perform cross-validated prediction to get class probabilities
    label_pred_proba = cross_val_predict(clf, partition, label, cv=cv, method='predict_proba')
    
    # Convert probabilities to predicted class labels
    label_pred_test = np.argmax(label_pred_proba, axis=1)

    # Generate ROC curve for each class
    classes = np.unique(label)
    label_bin = label_binarize(label, classes=classes)
    plt.figure()
    for i, class_label in enumerate(classes):
        fpr, tpr, _ = roc_curve(label_bin[:, i], label_pred_proba[:, i])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=2, label=f'Class {class_label} (area = {roc_auc:.2f})')

    # Plot diagonal line for random classifier
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'OvR ROC Curve - SVM & {model}')
    plt.legend(loc='lower right')

    # Save the ROC curve image
    if not os.path.exists('roc_curves'):
        os.makedirs('roc_curves')
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'roc_curves/svm_{model}_{timestamp}.jpg'
    plt.savefig(filename)
    plt.close()

    # Return evaluation metrics
    results = {
        "Results": {
            "accuracy": accuracy_score(label, label_pred_test),
            "precision": precision_score(label, label_pred_test, average='weighted', zero_division=0),
            "recall": recall_score(label, label_pred_test, average='weighted', zero_division=0),
            "f1": f1_score(label, label_pred_test, average='weighted', zero_division=0),
            "roc_curve_file": filename
        }
    }
    return results

# Train a Multinomial Naive Bayes classifier and evaluate it
def trainNB(partition, label, folds, model):
    # Create pipeline with MultinomialNB (no scaling needed)
    clf = make_pipeline(MultinomialNB())
    cv = KFold(n_splits=folds, shuffle=True, random_state=42)

    label_pred_proba = cross_val_predict(clf, partition, label, cv=cv, method='predict_proba')
    label_pred_test = np.argmax(label_pred_proba, axis=1)

    classes = np.unique(label)
    label_bin = label_binarize(label, classes=classes)

    # Plot ROC curve
    plt.figure()
    for i, class_label in enumerate(classes):
        fpr, tpr, _ = roc_curve(label_bin[:, i], label_pred_proba[:, i])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=2, label=f'Class {class_label} (area = {roc_auc:.2f})')

    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'OvR ROC Curve - MultinomialNB & {model}')
    plt.legend(loc='lower right')

    if not os.path.exists('roc_curves'):
        os.makedirs('roc_curves')
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'roc_curves/nb_{model}_{timestamp}.jpg'
    plt.savefig(filename)
    plt.close()

    # Return evaluation metrics
    results = {
        "Results": {
            "accuracy": accuracy_score(label, label_pred_test),
            "precision": precision_score(label, label_pred_test, average='weighted', zero_division=0),
            "recall": recall_score(label, label_pred_test, average='weighted', zero_division=0),
            "f1": f1_score(label, label_pred_test, average='weighted', zero_division=0),
            "roc_curve_file": filename
        }
    }
    return results

# Train a Gaussian Naive Bayes classifier and evaluate it
def trainGNB(partition, label, folds, model):
    clf = make_pipeline(GaussianNB())
    cv = KFold(n_splits=folds, shuffle=True, random_state=42)

    label_pred_proba = cross_val_predict(clf, partition, label, cv=cv, method='predict_proba')
    label_pred_test = np.argmax(label_pred_proba, axis=1)

    classes = np.unique(label)
    label_bin = label_binarize(label, classes=classes)

    # Plot ROC curve
    plt.figure()
    for i, class_label in enumerate(classes):
        fpr, tpr, _ = roc_curve(label_bin[:, i], label_pred_proba[:, i])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=2, label=f'Class {class_label} (area = {roc_auc:.2f})')

    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'OvR ROC Curve - GaussianNB & {model}')
    plt.legend(loc='lower right')

    if not os.path.exists('roc_curves'):
        os.makedirs('roc_curves')
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'roc_curves/GNB_{model}_{timestamp}.jpg'
    plt.savefig(filename)
    plt.close()

    # Return evaluation metrics
    results = {
        "Results": {
            "accuracy": accuracy_score(label, label_pred_test),
            "precision": precision_score(label, label_pred_test, average='weighted', zero_division=0),
            "recall": recall_score(label, label_pred_test, average='weighted', zero_division=0),
            "f1": f1_score(label, label_pred_test, average='weighted', zero_division=0),
            "roc_curve_file": filename
        }
    }
    return results




# Call the function
print("---Testing using SVM---")
print("Bag of words:")
pprint.pp(trainSVM(x_train_bow, df["target"], 10, "BOW"))
print("TFIDF: ")
pprint.pp(trainSVM(X_train_tfidf, df["target"], 10, "TFIDF"))
print("Doc2Vec:")
pprint.pp(trainSVM(X_train_doc_vectors, df["target"], 10, "Doc2Vec"))
print("---Testing using NaiveBayes---")
print("Bag of words:")
pprint.pp(trainNB(x_train_bow, df["target"], 10, "BOW"))
print("TFIDF: ")
pprint.pp(trainNB(X_train_tfidf, df["target"], 10, "TFIDF"))
print("Doc2Vec:")
pprint.pp(trainGNB(X_train_doc_vectors, df["target"], 10, "Doc2Vec"))




---Testing using SVM---
Bag of words:
{'Results': {'accuracy': 0.6056527590847914,
             'precision': 0.6086747022927794,
             'recall': 0.6056527590847914,
             'f1': 0.6066801808574992,
             'roc_curve_file': 'roc_curves/svm_BOW_20250528_050310.jpg'}}
TFIDF: 
{'Results': {'accuracy': 0.6056527590847914,
             'precision': 0.6097571247856168,
             'recall': 0.6056527590847914,
             'f1': 0.6070027166979564,
             'roc_curve_file': 'roc_curves/svm_TFIDF_20250528_050335.jpg'}}
Doc2Vec:
{'Results': {'accuracy': 0.7362045760430687,
             'precision': 0.7363639463824524,
             'recall': 0.7362045760430687,
             'f1': 0.736054417961173,
             'roc_curve_file': 'roc_curves/svm_Doc2Vec_20250528_050336.jpg'}}
---Testing using NaiveBayes---
Bag of words:
{'Results': {'accuracy': 0.7442799461641992,
             'precision': 0.737457537898254,
             'recall': 0.7442799461641992,
             'f1': 0.

In [118]:
# Assuming these are your result dictionaries from previous runs:
res1 = train_embedding_classifier(embeddings.to(dtype=torch.float32).cpu().numpy(), df["target"], 10, "EmbeddingsLR_JinAI")
res2 = train_embedding_classifier(embeddings_2.to(dtype=torch.float32).cpu().numpy(), df["target"], 10, "EmbeddingsLR_all-miniLm-L6-v2")
res3 = train_embedding_classifier(embeddings_3.to(dtype=torch.float32).cpu().numpy(), df["target"], 10, "EmbeddingsLR_all-mpnet-base")
res4 = train_embedding_classifier(embeddings_4.to(dtype=torch.float32).cpu().numpy(), df["target"], 10, "EmbeddingsLR_paraphrase-MiniLM-L3-v2")
res5 = train_embedding_classifier(embeddings_5.to(dtype=torch.float32).cpu().numpy(), df["target"], 10, "EmbeddingsLR_specter2")

print("--Testing using embeddings (JinAI) and logistic regression---")
pprint.pp(res1)
print("--Testing using embeddings (all-miniLm-L6-v2) and logistic regression---")
pprint.pp(res2)
print("--Testing using embeddings (all-mpnet-base) and logistic regression---")
pprint.pp(res3)
print("--Testing using embeddings (paraphrase-MiniLM-L3-v2) and logistic regression---")
pprint.pp(res4)
print("--Testing using embeddings (allenai/specter2) and logistic regression---")
pprint.pp(res5)

import matplotlib.pyplot as plt



# Collect accuracy and F1 results
results_accuracy = [
    ("JinAI", res1["Results"]["accuracy"]),
    ("all-miniLm-L6-v2", res2["Results"]["accuracy"]),
    ("all-mpnet-base", res3["Results"]["accuracy"]),
    ("paraphrase-MiniLM-L3-v2", res4["Results"]["accuracy"]),
    ("specter2", res5["Results"]["accuracy"])
]

results_f1 = [
    ("JinAI", res1["Results"]["f1"]),
    ("all-miniLm-L6-v2", res2["Results"]["f1"]),
    ("all-mpnet-base", res3["Results"]["f1"]),
    ("paraphrase-MiniLM-L3-v2", res4["Results"]["f1"]),
    ("specter2", res5["Results"]["f1"])
]

# --- Plot Accuracy with Labels ---
labels, accuracies = zip(*results_accuracy)

plt.figure(figsize=(10,6), dpi=150)  # Higher DPI for sharper image
bars = plt.bar(labels, accuracies, color='teal')

# Add value labels (rounded to 3 decimal places)
plt.bar_label(bars, labels=[f"{acc:.3f}" for acc in accuracies], padding=5, fontsize=10)

plt.ylim(0, 1)
plt.ylabel('Accuracy')
plt.title('Accuracy of Logistic Regression Across Different Embedding Models')
plt.xticks(rotation=30, ha='right')
plt.tight_layout()

# Save the figure
if not os.path.exists('Embedding Models'):
    os.makedirs('Embedding Models')

timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f'Embedding Models/Accuracy_Embedding_Models_{timestamp}.jpg'
plt.savefig(filename, dpi=300)  # Higher DPI for sharper image
plt.close()




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

--Testing using embeddings (JinAI) and logistic regression---
{'Results': {'accuracy': 0.8250336473755047,
             'precision': 0.8316393717984946,
             'recall': 0.8250336473755047,
             'f1': 0.8258820743562051,
             'roc_curve_file': 'roc_curves/embeddings_EmbeddingsLR_JinAI_20250528_050634.jpg'}}
--Testing using embeddings (all-miniLm-L6-v2) and logistic regression---
{'Results': {'accuracy': 0.8048452220726783,
             'precision': 0.8081797632323277,
             'recall': 0.8048452220726783,
             'f1': 0.8051420716274219,
             'roc_curve_file': 'roc_curves/embeddings_EmbeddingsLR_all-miniLm-L6-v2_20250528_050634.jpg'}}
--Testing using embeddings (all-mpnet-base) and logistic regression---
{'Results': {'accuracy': 0.8088829071332436,
             'precision': 0.8139378552038442,
             'recall': 0.8088829071332436,
             'f1': 0.8096349418093295,
             'roc_curve_file': 'roc_curves/embeddings_EmbeddingsLR_all-m

In [113]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.preprocessing import label_binarize
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

def test_logistic_C(X, y, C_values,  model, folds=10):
    accuracy_scores = []
    
    for C_val in C_values:
        clf = LogisticRegression(C=C_val, solver='lbfgs', max_iter=1000)
        cv = KFold(n_splits=folds, shuffle=True, random_state=42)
        label_pred = cross_val_predict(clf, X, y, cv=cv)
        accuracy = accuracy_score(y, label_pred)
        accuracy_scores.append(accuracy)
        print(f"C={C_val}: Accuracy={accuracy:.4f}")

    # Bar chart
    plt.figure(figsize=(8, 6))
    plt.bar([str(C) for C in C_values], accuracy_scores, color='skyblue')
    plt.ylim(0, 1)
    plt.xlabel("C value")
    plt.ylabel("Accuracy")
    plt.title("Accuracy vs. C in Logistic Regression")
    if not os.path.exists('hyperParameter_tuning'):
        os.makedirs('hyperParameter_tuning')
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'hyperParameter_tuning/GNB_{model}_{timestamp}.jpg'
    plt.savefig(filename)
    plt.close()


In [99]:
def test_logistic_solver(X, y, solvers,  model,folds=10):
    accuracy_scores = []
    
    for solver in solvers:
        clf = LogisticRegression(solver=solver, max_iter=1000)
        cv = KFold(n_splits=folds, shuffle=True, random_state=42)
        label_pred = cross_val_predict(clf, X, y, cv=cv)
        accuracy = accuracy_score(y, label_pred)
        accuracy_scores.append(accuracy)
        print(f"Solver={solver}: Accuracy={accuracy:.7f}")

    # Bar chart
    plt.figure(figsize=(8, 6))
    plt.bar(solvers, accuracy_scores, color='lightgreen')
    plt.ylim(0, 1)
    plt.xlabel("Solver")
    plt.ylabel("Accuracy")
    plt.title("Accuracy vs. Solver in Logistic Regression")
    if not os.path.exists('hyperParameter_tuning'):
        os.makedirs('hyperParameter_tuning')
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'hyperParameter_tuning/GNB_{model}_{timestamp}.jpg'
    plt.savefig(filename)
    plt.close()


In [102]:
def test_logistic_penalty(X, y, penalties, model,folds=10):
    accuracies = []
    for penalty in penalties:
        clf = LogisticRegression(solver='saga', penalty=penalty, max_iter=1000)  # 'saga' supports all penalties
        cv = KFold(n_splits=folds, shuffle=True, random_state=42)
        y_pred = cross_val_predict(clf, X, y, cv=cv)
        acc = accuracy_score(y, y_pred)
        accuracies.append(acc)
        print(f"Penalty={penalty}: Accuracy={acc:.4f}")

    plt.bar(penalties, accuracies, color='orange')
    plt.ylim(0, 1)
    plt.title('Accuracy vs Penalty')
    plt.ylabel('Accuracy')
    if not os.path.exists('hyperParameter_tuning'):
        os.makedirs('hyperParameter_tuning')
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'hyperParameter_tuning/GNB_{model}_{timestamp}.jpg'
    plt.savefig(filename)
    plt.close()


In [105]:
c_values = [0.01, 0.1, 1, 10, 15, 20]
test_logistic_C(embeddings.to(dtype=torch.float32).cpu().numpy(), df["target"], c_values,"EmbeddingsLR_JinAI", 10)
solvers = ['liblinear', 'lbfgs', 'sag', 'saga', 'newton-cg']
test_logistic_solver(embeddings.to(dtype=torch.float32).cpu().numpy(), df["target"], solvers,"EmbeddingsLR_JinAI", 10)
penalties=['l1', 'l2']
test_logistic_penalty(embeddings.to(dtype=torch.float32).cpu().numpy(), df["target"], penalties,"EmbeddingsLR_JinAI", 10)

C=0.01: Accuracy=0.6285
C=0.1: Accuracy=0.7981
C=1: Accuracy=0.8250
C=10: Accuracy=0.8129
C=15: Accuracy=0.8048
C=20: Accuracy=0.8008
Solver=liblinear: Accuracy=0.8223419
Solver=lbfgs: Accuracy=0.8250336
Solver=sag: Accuracy=0.8250336
Solver=saga: Accuracy=0.8250336
Solver=newton-cg: Accuracy=0.8250336
Penalty=l1: Accuracy=0.7699
Penalty=l2: Accuracy=0.8250


In [115]:
import shap
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import datetime

def explain_embedding_classifier_with_shap(embeddings, labels):
    # Convert embeddings to numpy
    X = embeddings.to(dtype=torch.float32).cpu().numpy()
    y = labels

    # Train Logistic Regression on all data
    model = LogisticRegression(solver='lbfgs')
    model.fit(X, y)

    # Create SHAP explainer for linear model
    explainer = shap.Explainer(model, X, feature_perturbation="interventional")

    # Compute SHAP values
    shap_values = explainer(X)

    # Generate feature names: dim_0, dim_1, ..., dim_n
    feature_names = [f"dim_{i}" for i in range(X.shape[1])]

    # Plot and save SHAP summary plot
    shap.summary_plot(shap_values, X, feature_names=feature_names, show=False)

    # Save the plot
    if not os.path.exists("shap_outputs"):
        os.makedirs("shap_outputs")
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"shap_outputs/shap_summary_{timestamp}.jpg"
    plt.tight_layout(rect=[0, 0, 1, 0.95])  # Leaves space at top for title
    plt.title(f'SHAP - Champion Model')
    plt.savefig(filename)
    plt.close()

    print(f"SHAP summary plot saved to: {filename}")
    

    return shap_values

shap_values = explain_embedding_classifier_with_shap(embeddings, df["target"])




SHAP summary plot saved to: shap_outputs/shap_summary_20250528_050145.jpg
