In [None]:
import importlib
import Embeddings
importlib.reload(Embeddings)
from Embeddings import Embedders_Five
import numpy as np
from sklearn.preprocessing import LabelEncoder
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the user-story spreadsheet.  The "User Story" column is handed to the
# embedder below and the "Domain" column is the classification target.
user_stories = pd.read_excel("../CanWeTrustReFAIR/Dataset/Domain_Classification_Data/Synthetic User Stories.xlsx")

# Normalise the target labels.  Lower-casing alone is not enough: the sheet
# also contains misspelled variants of two domains, which would otherwise be
# encoded as their own classes and skew every per-class metric downstream.
# Series.replace with a dict only touches values that match a key exactly.
DOMAIN_SPELLING_FIXES = {
    "demograpy": "demography",
    "psycology": "psychology",
}
user_stories['Domain'] = (
    user_stories['Domain'].str.lower().replace(DOMAIN_SPELLING_FIXES)
)

# Create embedder instance (project-local class, fed the raw story texts)
embedder = Embedders_Five(user_stories["User Story"])

# Encode labels: map each domain string to an integer class id
label_encoder = LabelEncoder()
data_y = label_encoder.fit_transform(user_stories["Domain"])
print("Number of labels:", data_y.shape)
# Distinct domain names after normalisation, for a quick sanity check
domains_names = np.unique(user_stories["Domain"])
print("Unique domains:", domains_names)
#-----------------------------------------------------------------
# TFIDF features from the project embedder, benchmarked with LazyClassifier
print("\n=== TFIDF Results ===")
data_x = embedder.getTFIDFEmbeddings()
tfidf_shape = data_x.shape
print("Number of documents:", tfidf_shape[0])
print("Number of features:", tfidf_shape[1])

# Hold out 20% of the rows for evaluation; the seed is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data_x,
    data_y,
    random_state=42,
    test_size=0.2,
)
# These are plain aliases of the split arrays; nothing is converted here.
X_train_dense, X_test_dense = X_train, X_test

# Fit LazyClassifier's model suite on the train split and score it on the
# held-out split.
clf = LazyClassifier(custom_metric=None, ignore_warnings=True, verbose=0)
models_tfidf, predictions_tfidf = clf.fit(
    X_train_dense, X_test_dense, y_train, y_test
)
print("\nTFIDF Models Performance:")
print(models_tfidf)

#-----------------------------------------------------------------
# BERT features from the project embedder, benchmarked the same way as TFIDF
print("\n=== BERT Results ===")
bert_features = embedder.getBERTEmbeddings()
# Quick inspection of what the embedder returned
print("BERT features shape:", bert_features.shape)
print("BERT features type:", type(bert_features))
print("BERT features dtype:", bert_features.dtype)

# Cast to float32 so the features are in a float dtype before fitting
bert_features = bert_features.astype(np.float32)

# Same 80/20 split and seed as used for the TFIDF features
X_train_bert, X_test_bert, y_train_bert, y_test_bert = train_test_split(
    bert_features,
    data_y,
    random_state=42,
    test_size=0.2,
)

print("Training data shape:", X_train_bert.shape)
print("Test data shape:", X_test_bert.shape)

# Fit LazyClassifier's model suite on the BERT features
clf_bert = LazyClassifier(custom_metric=None, ignore_warnings=True, verbose=0)
models_bert, predictions_bert = clf_bert.fit(
    X_train_bert, X_test_bert, y_train_bert, y_test_bert
)
print("\nBERT Models Performance:")
print(models_bert)

# Side-by-side look at the first row of each results table.
# NOTE(review): presumably LazyClassifier returns the table sorted best-first
# (the recorded output lists scores in descending order) - confirm.
print("\n=== Performance Comparison ===")
top_tfidf_row = models_tfidf.iloc[0]
print("Best TFIDF Model:", top_tfidf_row)
top_bert_row = models_bert.iloc[0]
print("Best BERT Model:", top_bert_row)

Number of labels: (12401,)
Unique domains: ['biology' 'cardiology' 'computer networks' 'computer vision' 'demography'
 'demograpy' 'dermatology' 'economics' 'education' 'endocrinology'
 'finance & marketing' 'health' 'information systems' 'law' 'library'
 'linguistics' 'literature' 'medicine' 'movies' 'music' 'nephrology'
 'news' 'pediatrics' 'pharmacology' 'plant science' 'political science'
 'psychology' 'psycology' 'radiology' 'social media' 'social networks'
 'social work' 'sociology' 'sport' 'transportation' 'urban studies']

=== TFIDF Results ===
Number of documents: 12401
Number of features: 100


 97%|█████████▋| 28/29 [01:21<00:04,  4.97s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005811 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 20669
[LightGBM] [Info] Number of data points in the train set: 9920, number of used features: 100
[LightGBM] [Info] Start training from score -3.525554
[LightGBM] [Info] Start training from score -3.546316
[LightGBM] [Info] Start training from score -3.578291
[LightGBM] [Info] Start training from score -3.539348
[LightGBM] [Info] Start training from score -3.535882
[LightGBM] [Info] Start training from score -8.103696
[LightGBM] [Info] Start training from score -3.498526
[LightGBM] [Info] Start training from score -3.563954
[LightGBM] [Info] Start training from score -3.505215
[LightGBM] [Info] Start training from score -3.465736
[LightGBM] [Info] Start training from score -3.505215
[LightGBM] [Info] Start training from score -3.54631

100%|██████████| 29/29 [01:29<00:00,  3.09s/it]



TFIDF Models Performance:
                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
XGBClassifier                      0.80               0.80    None      0.80   
CalibratedClassifierCV             0.80               0.80    None      0.80   
ExtraTreesClassifier               0.79               0.80    None      0.79   
SVC                                0.79               0.80    None      0.79   
LinearSVC                          0.79               0.80    None      0.79   
RandomForestClassifier             0.79               0.80    None      0.79   
LogisticRegression                 0.79               0.79    None      0.79   
LinearDiscriminantAnalysis         0.79               0.79    None      0.78   
RidgeClassifierCV                  0.77               0.77    None      0.76   
RidgeClassifier                    0.77               0.77    None      0.76   
BaggingClassi

  7%|▋         | 2/29 [00:03<00:53,  1.97s/it]