In [87]:
# Bag of Words transformation of the partitioned abstracts
# This script uses the CountVectorizer from sklearn to transform the partitioned abstracts into a bag of words representation.

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import random

count_vect = CountVectorizer()

# Load the partitioned abstracts from Data/Partitioned_Abstracts.csv into a DataFrame.
df = pd.read_csv('Data/Partitioned_Abstracts.csv', sep=',', header=0, encoding='utf-8')

# Encode each category label as an integer class id in a new 'target' column.
label_mapping = {
    'AI and Supply Chain Management': 0,
    'AI and Finance': 1,
    'AI and Marketing': 2,
    'AI and Economics': 3,
    'AI and Accounting': 4
}
df['target'] = df['Label'].map(label_mapping)

# Shuffle the rows. The fixed random_state passed to DataFrame.sample is what makes
# the row order reproducible; random.seed only seeds Python's global `random` module
# and is kept so that any later code drawing from it stays deterministic.
random.seed(42)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Keep only partitions with at least 100 whitespace-separated tokens.
# .str.len() yields NaN for a missing abstract (and NaN >= 100 is False), so such
# rows are dropped instead of raising the TypeError that apply(len) would produce.
word_counts = df['Partitioned Abstract'].str.split().str.len()
df = df[word_counts >= 100].reset_index(drop=True)

# Fail fast on labels that are absent from label_mapping: .map() turns them into
# NaN targets, which would otherwise only surface further down the notebook.
unmapped_labels = df.loc[df['target'].isna(), 'Label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Labels missing from label_mapping: {sorted(map(str, unmapped_labels))}"
    )

# Bag-of-words document-term matrix over the retained partitions.
x_train_bow = count_vect.fit_transform(df['Partitioned Abstract'])
df

Unnamed: 0,Title,Year,Authors,Label,Partitioned Abstract,target
0,Enhancing Investment Analysis: Optimizing AI-A...,2024,"Han, XW; Wang, N; Che, SK; Yang, HY; Zhang, KP...",AI and Finance,recent years application generative artificial...,1
1,Artificial intelligence applications in supply...,2021,"Pournader, M; Ghaderi, H; Hassanzadegan, A; Fa...",AI and Supply Chain Management,paper presents systematic review studies relat...,0
2,Forecasting disruptions in global food value c...,2023,"Tamasiga, P; Ouassou, E; Onyeaka, H; Bakwena, ...",AI and Supply Chain Management,globalization interconnected supply chains led...,0
3,"AI-powered marketing: What, where, and how?*",2024,"Kumar, V; Ashraf, AR; Nadeem, W",AI and Marketing,artificial intelligence ai become disruptive f...,2
4,Artificial intelligence based decision-making ...,2022,"Lehner, OM; Ittonen, K; Silvola, H; Strm, E; ...",AI and Accounting,trustworthiness using rest's component model a...,4
...,...,...,...,...,...,...
738,"The Big Data, Artificial Intelligence, and Blo...",2022,"Gusc, J; Bosma, P; Jarka, S; Biernat-Jarka, A",AI and Accounting,current energy prices include environmental so...,4
739,The Artificial Intelligence Revolution in Digi...,2023,"Al-Baity, HH",AI and Finance,artificial intelligence ai proliferated last y...,1
740,ROLE OF GENETIC-VARIATION AT THE APO AI-CIII-A...,1993,"XU, CF; ANGELICO, F; DELBEN, M; HUMPHRIES, S",AI and Accounting,cholesterol diet however significant associati...,4
741,Exploring volatility interconnections between ...,2024,"Yousaf, I; Ijaz, MS; Umar, M; Li, YS",AI and Economics,energy artificial intelligence ai two top fiel...,3


In [88]:
# Column index that the fitted CountVectorizer assigned to the token "algorithm"
# (None when the token is not in the vocabulary).
count_vect.vocabulary_.get('algorithm')

543

In [89]:
# Term-frequency (TF) weighting only: the IDF factor is switched off with use_idf=False.

from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(x_train_bow)
X_train_tf.shape


(743, 7810)

In [90]:
# TF-IDF weighting of the bag-of-words counts, using the transformer's default settings.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(x_train_bow)
X_train_tfidf = tfidf_transformer.transform(x_train_bow)
X_train_tfidf.shape

(743, 7810)

In [91]:
# Doc2vec transformation
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk

# Fetch the "punkt_tab" resource (presumably what word_tokenize needs — confirm
# against the installed NLTK version).
nltk.download("punkt_tab")

# One TaggedDocument per abstract, tagged with its position in df.
# enumerate() is used instead of the index labels from iterrows() so the tags are
# always "0".."n-1", even if df's index is not a fresh RangeIndex.
tagged_data = [
    TaggedDocument(words=word_tokenize(abstract.lower()), tags=[str(position)])
    for position, abstract in enumerate(df["Partitioned Abstract"])
]

# NOTE(review): no seed is passed and workers=4; gensim documents that a fully
# reproducible run needs a fixed seed and a single worker — confirm whether
# run-to-run variation of these vectors is acceptable here.
model = Doc2Vec(vector_size=100, min_count=2, epochs=100, workers=4, window=5)

model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Read each vector back through the document's own tag, so the order of
# X_train_doc_vectors matches the row order of df by construction.
X_train_doc_vectors = [model.dv[doc.tags[0]] for doc in tagged_data]

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/etienne/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [92]:
import torch
from sentence_transformers import SentenceTransformer


model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)

# Task name forwarded to encode(); the model card for jina-embeddings-v3 passes it
# so the matching task-specific adapter is used. Previously this variable was
# assigned but never handed to encode().
task = "classification"

# Use the GPU when one is available, otherwise fall back to the CPU instead of
# failing on machines without CUDA.
encode_device = "cuda" if torch.cuda.is_available() else "cpu"

embeddings = model.encode(
    df["Partitioned Abstract"].tolist(),
    task=task,
    show_progress_bar=True,
    device=encode_device,
    convert_to_tensor=True,
)
embeddings.shape

flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn i

Batches:   0%|          | 0/24 [00:00<?, ?it/s]

torch.Size([743, 1024])

In [224]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.model_selection import ShuffleSplit
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.linear_model import LogisticRegression
import pprint
import torch
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
import os
import numpy as np
import datetime

def trainSVM(partition, label, folds, model):
    """Cross-validate a scaled sigmoid-kernel SVM and save a one-vs-rest ROC plot.

    Args:
        partition: Feature matrix accepted by scikit-learn (the notebook passes
            sparse count/TF-IDF matrices and a list of dense document vectors).
        label: Class label for each row of ``partition``.
        folds: Number of shuffled K-fold splits.
        model: Name of the feature representation; used in the plot title and
            in the output file name.

    Returns:
        dict: ``{"Results": {...}}`` holding accuracy and weighted
        precision/recall/F1 of the out-of-fold predictions, plus the path of
        the saved ROC image under ``"roc_curve_file"``.
    """
    # with_mean=False lets the scaler accept sparse input (centering would densify it).
    # random_state pins the shuffling SVC performs for its probability estimates,
    # so repeated runs give the same probabilities.
    clf = make_pipeline(
        preprocessing.StandardScaler(with_mean=False),
        svm.SVC(kernel='sigmoid', C=1, probability=True, random_state=42),
    )

    cv = KFold(n_splits=folds, shuffle=True, random_state=42)

    label_pred_proba = cross_val_predict(clf, partition, label, cv=cv, method='predict_proba')

    # Probability columns follow the sorted class order, so map the argmax column
    # index back to the actual class label instead of assuming labels are 0..k-1.
    classes = np.unique(label)
    label_pred_test = classes[np.argmax(label_pred_proba, axis=1)]

    label_bin = label_binarize(label, classes=classes)
    if len(classes) == 2 and label_bin.shape[1] == 1:
        # label_binarize returns a single column for binary problems (positive =
        # classes[1]); expand to one column per class to match the probabilities.
        label_bin = np.hstack([1 - label_bin, label_bin])

    fig, ax = plt.subplots()
    for i, class_label in enumerate(classes):
        fpr, tpr, _ = roc_curve(label_bin[:, i], label_pred_proba[:, i])
        roc_auc = auc(fpr, tpr)
        ax.plot(fpr, tpr, lw=2, label=f'Class {class_label} (area = {roc_auc:.2f})')

    # Diagonal = chance-level reference line.
    ax.plot([0, 1], [0, 1], 'k--', lw=2)
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'OvR ROC Curve - SVM & {model}')
    ax.legend(loc='lower right')

    os.makedirs('roc_curves', exist_ok=True)
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'roc_curves/svm_{model}_{timestamp}.jpg'
    fig.savefig(filename)
    plt.close(fig)

    # Metrics
    results = {
        "Results": {
            "accuracy": accuracy_score(label, label_pred_test),
            "precision": precision_score(label, label_pred_test, average='weighted', zero_division=0),
            "recall": recall_score(label, label_pred_test, average='weighted', zero_division=0),
            "f1": f1_score(label, label_pred_test, average='weighted', zero_division=0),
            "roc_curve_file": filename
        }
    }

    return results

def trainNB(partition, label, folds, model):
    """Cross-validate a multinomial Naive Bayes classifier and save a one-vs-rest ROC plot.

    Args:
        partition: Feature matrix accepted by scikit-learn (the notebook passes
            the bag-of-words and TF-IDF matrices).
        label: Class label for each row of ``partition``.
        folds: Number of shuffled K-fold splits.
        model: Name of the feature representation; used in the plot title and
            in the output file name.

    Returns:
        dict: ``{"Results": {...}}`` holding accuracy and weighted
        precision/recall/F1 of the out-of-fold predictions, plus the path of
        the saved ROC image under ``"roc_curve_file"``.
    """
    clf = make_pipeline(MultinomialNB())
    cv = KFold(n_splits=folds, shuffle=True, random_state=42)

    label_pred_proba = cross_val_predict(clf, partition, label, cv=cv, method='predict_proba')

    # Probability columns follow the sorted class order, so map the argmax column
    # index back to the actual class label instead of assuming labels are 0..k-1.
    classes = np.unique(label)
    label_pred_test = classes[np.argmax(label_pred_proba, axis=1)]

    label_bin = label_binarize(label, classes=classes)
    if len(classes) == 2 and label_bin.shape[1] == 1:
        # label_binarize returns a single column for binary problems (positive =
        # classes[1]); expand to one column per class to match the probabilities.
        label_bin = np.hstack([1 - label_bin, label_bin])

    fig, ax = plt.subplots()
    for i, class_label in enumerate(classes):
        fpr, tpr, _ = roc_curve(label_bin[:, i], label_pred_proba[:, i])
        roc_auc = auc(fpr, tpr)
        ax.plot(fpr, tpr, lw=2, label=f'Class {class_label} (area = {roc_auc:.2f})')

    # Diagonal = chance-level reference line.
    ax.plot([0, 1], [0, 1], 'k--', lw=2)
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'OvR ROC Curve - MultinomialNB & {model}')
    ax.legend(loc='lower right')

    os.makedirs('roc_curves', exist_ok=True)
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'roc_curves/nb_{model}_{timestamp}.jpg'
    fig.savefig(filename)
    plt.close(fig)

    # Metrics
    results = {
        "Results": {
            "accuracy": accuracy_score(label, label_pred_test),
            "precision": precision_score(label, label_pred_test, average='weighted', zero_division=0),
            "recall": recall_score(label, label_pred_test, average='weighted', zero_division=0),
            "f1": f1_score(label, label_pred_test, average='weighted', zero_division=0),
            "roc_curve_file": filename
        }
    }

    return results
def trainGNB(partition, label, folds, model):
    """Cross-validate a Gaussian Naive Bayes classifier and save a one-vs-rest ROC plot.

    Args:
        partition: Feature matrix accepted by scikit-learn (the notebook passes
            the list of Doc2Vec document vectors).
        label: Class label for each row of ``partition``.
        folds: Number of shuffled K-fold splits.
        model: Name of the feature representation; used in the plot title and
            in the output file name.

    Returns:
        dict: ``{"Results": {...}}`` holding accuracy and weighted
        precision/recall/F1 of the out-of-fold predictions, plus the path of
        the saved ROC image under ``"roc_curve_file"``.
    """
    clf = make_pipeline(GaussianNB())
    cv = KFold(n_splits=folds, shuffle=True, random_state=42)

    label_pred_proba = cross_val_predict(clf, partition, label, cv=cv, method='predict_proba')

    # Probability columns follow the sorted class order, so map the argmax column
    # index back to the actual class label instead of assuming labels are 0..k-1.
    classes = np.unique(label)
    label_pred_test = classes[np.argmax(label_pred_proba, axis=1)]

    label_bin = label_binarize(label, classes=classes)
    if len(classes) == 2 and label_bin.shape[1] == 1:
        # label_binarize returns a single column for binary problems (positive =
        # classes[1]); expand to one column per class to match the probabilities.
        label_bin = np.hstack([1 - label_bin, label_bin])

    fig, ax = plt.subplots()
    for i, class_label in enumerate(classes):
        fpr, tpr, _ = roc_curve(label_bin[:, i], label_pred_proba[:, i])
        roc_auc = auc(fpr, tpr)
        ax.plot(fpr, tpr, lw=2, label=f'Class {class_label} (area = {roc_auc:.2f})')

    # Diagonal = chance-level reference line.
    ax.plot([0, 1], [0, 1], 'k--', lw=2)
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'OvR ROC Curve - GaussianNB & {model}')
    ax.legend(loc='lower right')

    os.makedirs('roc_curves', exist_ok=True)
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'roc_curves/GNB_{model}_{timestamp}.jpg'
    fig.savefig(filename)
    plt.close(fig)

    # Metrics
    results = {
        "Results": {
            "accuracy": accuracy_score(label, label_pred_test),
            "precision": precision_score(label, label_pred_test, average='weighted', zero_division=0),
            "recall": recall_score(label, label_pred_test, average='weighted', zero_division=0),
            "f1": f1_score(label, label_pred_test, average='weighted', zero_division=0),
            "roc_curve_file": filename
        }
    }

    return results

def train_embedding_classifier(X, y, folds, model):
    """Cross-validate logistic regression on embeddings and save a one-vs-rest ROC plot.

    Args:
        X: Feature matrix accepted by scikit-learn (the notebook passes the
            sentence embeddings converted to a NumPy array).
        y: Class label for each row of ``X``.
        folds: Number of shuffled K-fold splits.
        model: Name of the feature representation; used in the output file name.

    Returns:
        dict: ``{"Results": {...}}`` holding accuracy and weighted
        precision/recall/F1 of the out-of-fold predictions, plus the path of
        the saved ROC image under ``"roc_curve_file"``.
    """
    # NOTE(review): newer scikit-learn releases reportedly deprecate liblinear for
    # multiclass targets — confirm against the installed version.
    clf = LogisticRegression(solver='liblinear')
    cv = KFold(n_splits=folds, shuffle=True, random_state=42)

    label_pred_proba = cross_val_predict(clf, X, y, cv=cv, method='predict_proba')

    # Probability columns follow the sorted class order, so map the argmax column
    # index back to the actual class label instead of assuming labels are 0..k-1.
    classes = np.unique(y)
    label_pred_test = classes[np.argmax(label_pred_proba, axis=1)]

    label_bin = label_binarize(y, classes=classes)
    if len(classes) == 2 and label_bin.shape[1] == 1:
        # label_binarize returns a single column for binary problems (positive =
        # classes[1]); expand to one column per class to match the probabilities.
        label_bin = np.hstack([1 - label_bin, label_bin])

    fig, ax = plt.subplots()
    for i, class_label in enumerate(classes):
        fpr, tpr, _ = roc_curve(label_bin[:, i], label_pred_proba[:, i])
        roc_auc = auc(fpr, tpr)
        ax.plot(fpr, tpr, lw=2, label=f'Class {class_label} (area = {roc_auc:.2f})')

    # Diagonal = chance-level reference line.
    ax.plot([0, 1], [0, 1], 'k--', lw=2)
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('OvR ROC Curve - Embeddings and LogisticRegression')
    ax.legend(loc='lower right')

    os.makedirs('roc_curves', exist_ok=True)
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'roc_curves/embeddings_{model}_{timestamp}.jpg'
    fig.savefig(filename)
    plt.close(fig)

    # Metrics
    results = {
        "Results": {
            "accuracy": accuracy_score(y, label_pred_test),
            "precision": precision_score(y, label_pred_test, average='weighted', zero_division=0),
            "recall": recall_score(y, label_pred_test, average='weighted', zero_division=0),
            "f1": f1_score(y, label_pred_test, average='weighted', zero_division=0),
            "roc_curve_file": filename
        }
    }

    return results


# Run every classifier / feature-representation pairing with 10-fold cross-validation
# and pretty-print each result dictionary under its heading.
targets = df["target"]
cv_folds = 10

experiment_groups = [
    ("---Training using SVM---", [
        ("Bag of words:", trainSVM, x_train_bow, "BOW"),
        ("TFIDF: ", trainSVM, X_train_tfidf, "TFIDF"),
        ("Doc2Vec:", trainSVM, X_train_doc_vectors, "Doc2Vec"),
    ]),
    ("---Training using NaiveBayes---", [
        ("Bag of words:", trainNB, x_train_bow, "BOW"),
        ("TFIDF: ", trainNB, X_train_tfidf, "TFIDF"),
        # Doc2Vec vectors go through the Gaussian variant.
        ("Doc2Vec:", trainGNB, X_train_doc_vectors, "Doc2Vec"),
    ]),
]

for banner, runs in experiment_groups:
    print(banner)
    for caption, trainer, features, tag in runs:
        print(caption)
        pprint.pp(trainer(features, targets, cv_folds, tag))

print("--Training using embeddings and logistic regression---")
# Cast the embedding tensor to float32 and move it to the CPU before handing it over as a NumPy array.
embedding_matrix = embeddings.to(dtype=torch.float32).cpu().numpy()
pprint.pp(train_embedding_classifier(embedding_matrix, targets, cv_folds, "EmbeddingsLR"))
    




---Training using SVM---
Bag of words:
{'Results': {'accuracy': 0.6069986541049798,
             'precision': 0.6103125827277036,
             'recall': 0.6069986541049798,
             'f1': 0.608062141234796,
             'roc_curve_file': 'roc_curves/svm_BOW_20250527_144923.jpg'}}
TFIDF: 
{'Results': {'accuracy': 0.6056527590847914,
             'precision': 0.6104897886282105,
             'recall': 0.6056527590847914,
             'f1': 0.6073613082564034,
             'roc_curve_file': 'roc_curves/svm_TFIDF_20250527_144941.jpg'}}
Doc2Vec:
{'Results': {'accuracy': 0.7187079407806191,
             'precision': 0.7153610915513793,
             'recall': 0.7187079407806191,
             'f1': 0.7167232981584807,
             'roc_curve_file': 'roc_curves/svm_Doc2Vec_20250527_144941.jpg'}}
---Training using NaiveBayes---
Bag of words:
{'Results': {'accuracy': 0.7442799461641992,
             'precision': 0.737457537898254,
             'recall': 0.7442799461641992,
             'f1': 

In [None]:
from sklearn.linear_model import LogisticRegression
import shap
import numpy as np
import matplotlib.pyplot as plt

# Final model: logistic regression fitted on every sentence embedding (no hold-out split).
X = embeddings.to(dtype=torch.float32).cpu().numpy()
y = df["target"]

model = LogisticRegression(solver='liblinear')
model.fit(X, y)

# shap.Explainer is SHAP's generic entry point; X is passed as the background data.
# NOTE(review): presumably SHAP dispatches to its linear explainer for this model
# and forwards feature_perturbation to it — confirm against the installed SHAP version.
explainer = shap.Explainer(model, X, feature_perturbation="interventional")

# Compute SHAP values for the same rows the model was fitted on.
shap_values = explainer(X)

# Draw the summary plot without showing it, so it can be saved to disk below.
shap.summary_plot(shap_values, X, show=False)

# Create the output directory if needed; exist_ok avoids the check-then-create race.
os.makedirs("shap_outputs", exist_ok=True)

timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
plt.savefig(f"shap_outputs/shap_summary_{timestamp}.jpg")
plt.close()