In [87]:
# Bag of Words transformation of the partitioned abstracts
# This script uses the CountVectorizer from sklearn to transform the partitioned abstracts into a bag of words representation.

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import random

count_vect = CountVectorizer()

# Minimum number of whitespace-separated tokens a partition must contain to be kept.
MIN_PARTITION_WORDS = 100

# Load the partitioned abstracts into a DataFrame (the first CSV row is the header).
df = pd.read_csv('Data/Partitioned_Abstracts.csv', sep=',', header=0, encoding='utf-8')

# Encode each topic label as an integer class id. Labels that are absent from
# the mapping become NaN in `target`; this is checked after filtering below.
label_mapping = {
    'AI and Supply Chain Management': 0,
    'AI and Finance': 1,
    'AI and Marketing': 2,
    'AI and Economics': 3,
    'AI and Accounting': 4
}
df['target'] = df['Label'].map(label_mapping)

# Shuffle the rows reproducibly. The shuffle itself is driven by `random_state`;
# the stdlib seed is kept so that any later use of `random` stays deterministic.
random.seed(42)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Drop partitions with fewer than MIN_PARTITION_WORDS words. `.str.len()` yields
# NaN for a missing abstract, which compares False and drops the row instead of
# raising a TypeError as `apply(len)` would.
word_counts = df['Partitioned Abstract'].str.split().str.len()
df = df[word_counts >= MIN_PARTITION_WORDS].reset_index(drop=True)

# Fail fast on labels missing from `label_mapping`: a NaN target would otherwise
# only surface later, inside the stratified train/test splits.
unmapped_labels = df.loc[df['target'].isna(), 'Label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Labels missing from label_mapping: {sorted(map(str, unmapped_labels))}"
    )

# Bag-of-words document-term matrix: one row per remaining partition.
x_train_bow = count_vect.fit_transform(df['Partitioned Abstract'])
df

Unnamed: 0,Title,Year,Authors,Label,Partitioned Abstract,target
0,Enhancing Investment Analysis: Optimizing AI-A...,2024,"Han, XW; Wang, N; Che, SK; Yang, HY; Zhang, KP...",AI and Finance,recent years application generative artificial...,1
1,Artificial intelligence applications in supply...,2021,"Pournader, M; Ghaderi, H; Hassanzadegan, A; Fa...",AI and Supply Chain Management,paper presents systematic review studies relat...,0
2,Forecasting disruptions in global food value c...,2023,"Tamasiga, P; Ouassou, E; Onyeaka, H; Bakwena, ...",AI and Supply Chain Management,globalization interconnected supply chains led...,0
3,"AI-powered marketing: What, where, and how?*",2024,"Kumar, V; Ashraf, AR; Nadeem, W",AI and Marketing,artificial intelligence ai become disruptive f...,2
4,Artificial intelligence based decision-making ...,2022,"Lehner, OM; Ittonen, K; Silvola, H; Strm, E; ...",AI and Accounting,trustworthiness using rest's component model a...,4
...,...,...,...,...,...,...
738,"The Big Data, Artificial Intelligence, and Blo...",2022,"Gusc, J; Bosma, P; Jarka, S; Biernat-Jarka, A",AI and Accounting,current energy prices include environmental so...,4
739,The Artificial Intelligence Revolution in Digi...,2023,"Al-Baity, HH",AI and Finance,artificial intelligence ai proliferated last y...,1
740,ROLE OF GENETIC-VARIATION AT THE APO AI-CIII-A...,1993,"XU, CF; ANGELICO, F; DELBEN, M; HUMPHRIES, S",AI and Accounting,cholesterol diet however significant associati...,4
741,Exploring volatility interconnections between ...,2024,"Yousaf, I; Ijaz, MS; Umar, M; Li, YS",AI and Economics,energy artificial intelligence ai two top fiel...,3


In [88]:
# Column index that the fitted vectorizer assigned to the token "algorithm"
# (None if the token is not in the vocabulary).
count_vect.vocabulary_.get('algorithm')

543

In [89]:
# Term-frequency (TF) transformation only: IDF weighting is disabled here (use_idf=False)

from sklearn.feature_extraction.text import TfidfTransformer
# Normalised term frequencies only: IDF re-weighting is switched off.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(x_train_bow)
X_train_tf = tf_transformer.transform(x_train_bow)
X_train_tf.shape


(743, 7810)

In [90]:
# Full TF-IDF weighting of the bag-of-words counts (default transformer settings).
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit(x_train_bow).transform(x_train_bow)
X_train_tfidf.shape

(743, 7810)

In [91]:
# Doc2vec transformation
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk

# Tokenizer data required by `word_tokenize`.
nltk.download("punkt_tab")

# Tag every document with its row *position* rather than its index label, so the
# lookup below stays aligned with `df` row order whatever index `df` carries.
tagged_data = [
    TaggedDocument(words=word_tokenize(text.lower()), tags=[str(position)])
    for position, text in enumerate(df["Partitioned Abstract"])
]

# `seed` fixes the model's random initialisation. Per the gensim documentation,
# a fully deterministic run additionally requires workers=1 and a fixed
# PYTHONHASHSEED; workers=4 is kept here for training speed.
model = Doc2Vec(
    vector_size=100, min_count=2, epochs=100, workers=4, window=5, seed=42
)

model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# One learned vector per document, in the same order as the rows of `df`.
X_train_doc_vectors = [model.dv[doc.tags[0]] for doc in tagged_data]

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/etienne/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [92]:
from sentence_transformers import SentenceTransformer


# NOTE: this rebinds `model`, replacing the Doc2Vec model created in the
# previous cell; the Doc2Vec vectors were already extracted there.
model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)

# jina-embeddings-v3 selects a task-specific adapter through the `task` argument
# of `encode`; the original cell defined `task` but never passed it on.
task = "classification"

# No explicit `device`: encoding runs on the device the model was loaded on,
# which SentenceTransformer picks automatically (GPU when one is available).
embeddings = model.encode(
    df["Partitioned Abstract"].tolist(),
    task=task,
    show_progress_bar=True,
    convert_to_tensor=True,
)
embeddings.shape

flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn i

Batches:   0%|          | 0/24 [00:00<?, ?it/s]

torch.Size([743, 1024])

In [186]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.model_selection import ShuffleSplit
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.linear_model import LogisticRegression
import pprint
import torch

def _weighted_scores(y_true, y_pred):
    """Return accuracy plus support-weighted precision, recall and F1.

    Undefined precision/recall values (a class with no predicted or no true
    samples) are reported as 0 instead of triggering a warning.
    """
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average='weighted', zero_division=0),
        "recall": recall_score(y_true, y_pred, average='weighted', zero_division=0),
        "f1": f1_score(y_true, y_pred, average='weighted', zero_division=0),
    }


def _evaluate_classifier(clf, features, labels, folds, test_size):
    """Cross-validate `clf` on a stratified training split, then score a held-out split.

    Args:
        clf: Unfitted scikit-learn estimator or pipeline; it is fitted in place.
        features: Feature matrix or list of feature vectors, one per sample.
        labels: Class labels aligned with `features`.
        folds: Number of shuffled K-fold splits used on the training portion.
        test_size: Fraction of the samples held out as the test set.

    Returns:
        dict with two entries, each a dict of accuracy/precision/recall/f1:
        "train" holds the scores of the out-of-fold cross-validation
        predictions on the training portion (not resubstitution scores);
        "test" holds the scores of the model refitted on the whole training
        portion and applied to the held-out portion.
    """
    feat_train, feat_test, lab_train, lab_test = train_test_split(
        features, labels, test_size=test_size, random_state=42, stratify=labels
    )
    cv = KFold(n_splits=folds, shuffle=True, random_state=42)

    # Out-of-fold predictions for every training sample.
    lab_pred_cv = cross_val_predict(clf, feat_train, lab_train, cv=cv)

    # Refit on the full training portion before touching the test split.
    clf.fit(feat_train, lab_train)
    lab_pred_test = clf.predict(feat_test)

    return {
        "train": _weighted_scores(lab_train, lab_pred_cv),
        "test": _weighted_scores(lab_test, lab_pred_test),
    }


def trainSVM(partition, label, folds):
    """Evaluate a sigmoid-kernel SVM (C=1) on a 60/40 stratified split.

    Features are scaled without centering (`with_mean=False`) before the SVM.

    Args:
        partition: Feature matrix or list of feature vectors.
        label: Class labels aligned with `partition`.
        folds: Number of cross-validation folds on the training portion.

    Returns:
        dict with "train" (cross-validated) and "test" metric dicts.
    """
    clf = make_pipeline(preprocessing.StandardScaler(with_mean=False), svm.SVC(kernel='sigmoid', C=1))
    return _evaluate_classifier(clf, partition, label, folds, test_size=0.4)


def trainNB(partition, label, folds):
    """Evaluate a multinomial Naive Bayes classifier on a 60/40 stratified split.

    Args:
        partition: Feature matrix; the call sites in this notebook pass
            bag-of-words counts and TF-IDF weights.
        label: Class labels aligned with `partition`.
        folds: Number of cross-validation folds on the training portion.

    Returns:
        dict with "train" (cross-validated) and "test" metric dicts.
    """
    clf = make_pipeline(MultinomialNB())
    return _evaluate_classifier(clf, partition, label, folds, test_size=0.4)


def trainGNB(partition, label, folds):
    """Evaluate a Gaussian Naive Bayes classifier on a 60/40 stratified split.

    Args:
        partition: Feature vectors; the call site in this notebook passes the
            Doc2Vec document vectors.
        label: Class labels aligned with `partition`.
        folds: Number of cross-validation folds on the training portion.

    Returns:
        dict with "train" (cross-validated) and "test" metric dicts.
    """
    clf = make_pipeline(GaussianNB())
    return _evaluate_classifier(clf, partition, label, folds, test_size=0.4)


def train_embedding_classifier(X, y, folds=10):
    """Evaluate liblinear logistic regression on an 80/20 stratified split.

    Args:
        X: Feature matrix, one row per sample.
        y: Class labels aligned with `X`.
        folds: Number of cross-validation folds on the training portion.

    Returns:
        dict with "train" (cross-validated) and "test" metric dicts.
    """
    clf = LogisticRegression(solver='liblinear')  # You can try other solvers too
    return _evaluate_classifier(clf, X, y, folds, test_size=0.2)


#Call the function
# Run every classifier/feature combination and pretty-print its metrics.
# Each entry is (section banner, [(caption, training function, features), ...]).
sections = [
    ("---Training using SVM---", [
        ("Bag of words:", trainSVM, x_train_bow),
        ("TFIDF: ", trainSVM, X_train_tfidf),
        ("Doc2Vec:", trainSVM, X_train_doc_vectors),
    ]),
    ("---Training using NaiveBayes---", [
        ("Bag of words:", trainNB, x_train_bow),
        ("TFIDF: ", trainNB, X_train_tfidf),
        # The Doc2Vec vectors go to the Gaussian variant, not the multinomial one.
        ("Doc2Vec:", trainGNB, X_train_doc_vectors),
    ]),
]
for banner, runs in sections:
    print(banner)
    for caption, trainer, features in runs:
        print(caption)
        pprint.pp(trainer(features, df["target"], 10))

print("--Training using embeddings and logistic regression---")
# Move the embedding tensor to host memory as a float32 NumPy array first.
dense_embeddings = embeddings.to(dtype=torch.float32).cpu().numpy()
pprint.pp(train_embedding_classifier(dense_embeddings, df["target"], 10))
    




---Training using SVM---
Bag of words:
{'train': {'accuracy': 0.5280898876404494,
           'precision': 0.5543246537930471,
           'recall': 0.5280898876404494,
           'f1': 0.5070830537544314},
 'test': {'accuracy': 0.5604026845637584,
          'precision': 0.6172395740585587,
          'recall': 0.5604026845637584,
          'f1': 0.5470042661266374}}
TFIDF: 
{'train': {'accuracy': 0.5258426966292135,
           'precision': 0.5474942855247366,
           'recall': 0.5258426966292135,
           'f1': 0.5091650500591278},
 'test': {'accuracy': 0.5671140939597316,
          'precision': 0.6000971220721266,
          'recall': 0.5671140939597316,
          'f1': 0.5543097423253789}}
Doc2Vec:
{'train': {'accuracy': 0.7191011235955056,
           'precision': 0.7168506642952999,
           'recall': 0.7191011235955056,
           'f1': 0.7169635524146218},
 'test': {'accuracy': 0.7214765100671141,
          'precision': 0.7188820490926718,
          'recall': 0.721476510067114

In [130]:
from sklearn.model_selection import train_test_split

# Stratified 90/10 hold-out split on the bag-of-words features.
X, y = x_train_bow, df["target"]

x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

# Sanity check: features and labels must have the same number of rows.
print(X.shape)
print(y.shape)

(743, 7810)
(743,)


In [123]:
from sklearn.model_selection import cross_val_score
from sklearn import svm

# Fit a linear SVM on the training split and report hold-out accuracy.
clf = svm.SVC(kernel="linear", C=1)
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

0.6933333333333334

In [124]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of a linear SVM on the training split.
clf = svm.SVC(kernel="linear", C=1)
scores = cross_val_score(clf, x_train, y_train, cv=10)
# Report the mean accuracy with a +/- two-standard-deviation spread.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.80 (+/- 0.06)


In [125]:
from sklearn.model_selection import ShuffleSplit

# Ten random 90/10 resamplings of the full data set, scored with the
# classifier currently bound to `clf`.
n_samples = X.shape[0]
cv = ShuffleSplit(
    n_splits=10,
    test_size=0.1,
    random_state=42,
)
cross_val_score(estimator=clf, X=X, y=y, cv=cv)

array([0.8       , 0.78666667, 0.69333333, 0.77333333, 0.78666667,
       0.82666667, 0.78666667, 0.8       , 0.8       , 0.78666667])

In [126]:
from sklearn import preprocessing

# NOTE(review): this rebinds y_train / y_test with a different, non-stratified
# split (random_state=0), while the lowercase x_train / x_test from the earlier
# split stay in scope. Re-running an earlier cell afterwards would pair features
# and labels from two different splits - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)

# Learn the scaling on the training rows only (no centering), then apply it
# to both splits.
scaler = preprocessing.StandardScaler(with_mean=False)
scaler.fit(X_train)
x_train_transformed = scaler.transform(X_train)
x_test_transformed = scaler.transform(X_test)

# SVC with its default kernel on the scaled features.
clf = svm.SVC(C=1)
clf.fit(x_train_transformed, y_train)

clf.score(x_test_transformed, y_test)

0.44

In [127]:
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict


# Scaling lives inside the pipeline so it is re-fitted on each training fold.
clf = make_pipeline(preprocessing.StandardScaler(with_mean=False), svm.SVC(C=1))
cv = StratifiedKFold(n_splits=10)

# Per-fold scores. The original cell discarded this result and then bound
# `scores` to the `cross_val_score` function object itself.
scores = cross_val_score(clf, X, y, cv=cv)

# Out-of-fold predictions for every sample, used for the aggregate metrics below.
y_pred = cross_val_predict(clf, X, y, cv=cv)

accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred, average='weighted', zero_division=0)
recall = recall_score(y, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y, y_pred, average='weighted', zero_division=0)

print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")



Accuracy:  0.3917
Precision: 0.5179
Recall:    0.3917
F1 Score:  0.3828
