In [None]:
import importlib
import Embeddings
importlib.reload(Embeddings)
from Embeddings import Embedders_Five
import numpy as np
from sklearn.preprocessing import LabelEncoder
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import urllib.request
import os
import zipfile
import gensim
import gensim.downloader as api
import fasttext
import fasttext.util
# ---------------------------------------------------------------------------
# Fetch the pretrained embedding files used by this notebook.
# Each step is guarded by an existence check so re-running the cell is cheap.
# ---------------------------------------------------------------------------
# Single place to change when the project lives somewhere else.
MODEL_DIR = "/Users/ahmed/Desktop/CanWeTrustReFAIR/CanWeTrustReFAIR"

# --- fastText ---------------------------------------------------------------
fastext_path = f"{MODEL_DIR}/cc.en.300.bin"

if not os.path.exists(fastext_path):
    # NOTE(review): download_model() is not given a target path here, so the
    # file may land somewhere other than fastext_path depending on the working
    # directory -- confirm the notebook is launched from MODEL_DIR.
    fasttext.util.download_model('en', if_exists='ignore')  # English


# --- Word2Vec ---------------------------------------------------------------
word2vec_path = f"{MODEL_DIR}/word2vec-google-news-300.bin"

if not os.path.exists(word2vec_path):
    word2vec_model = api.load('word2vec-google-news-300')
    # Save to the exact path the guard above checks. Writing a bare relative
    # filename only satisfied the guard when the working directory happened to
    # be MODEL_DIR; otherwise the model was reloaded and re-exported each run.
    word2vec_model.save_word2vec_format(word2vec_path, binary=True)

# --- GloVe ------------------------------------------------------------------
glove_url = "https://nlp.stanford.edu/data/glove.6B.zip"
zip_path = f"{MODEL_DIR}/glove.6B.zip"
glove_txt = f"{MODEL_DIR}/glove.6B.100d.txt"
glove_word2vec = f"{MODEL_DIR}/glove.6B.100d.word2vec"

# Remember whether anything had to be produced so the final message is accurate.
glove_ready = os.path.exists(glove_txt) and os.path.exists(glove_word2vec)

# Stage 1: make sure the raw 100d vectors are on disk. Keyed on the extracted
# file rather than the archive, so a leftover zip without extracted content
# (e.g. an interrupted earlier run) is still unpacked.
if not os.path.exists(glove_txt):
    if not os.path.exists(zip_path):
        print("Downloading GloVe vectors...")
        # Download under a temporary name and rename on success, so a partial
        # download can never be mistaken for a complete archive on the next run.
        partial_zip_path = zip_path + ".part"
        urllib.request.urlretrieve(glove_url, partial_zip_path)
        os.replace(partial_zip_path, zip_path)

    # Unzip the file
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(MODEL_DIR)
    print("Download and extraction complete")

# Stage 2: convert the raw vectors to Word2Vec text format if not done yet.
if not os.path.exists(glove_word2vec):
    print("Converting to Word2Vec format...")
    from gensim.scripts.glove2word2vec import glove2word2vec
    glove2word2vec(glove_txt, glove_word2vec)
    print("Conversion complete")

if glove_ready:
    print("Files already exist")



# Read the synthetic user stories and lower-case the domain names so that
# differently-cased spellings of one domain collapse into a single class.
stories_xlsx = "/Users/ahmed/Desktop/CanWeTrustReFAIR/CanWeTrustReFAIR/Dataset/Domain_Classification_Data/Synthetic User Stories.xlsx"
user_stories = pd.read_excel(stories_xlsx).assign(
    Domain=lambda frame: frame['Domain'].str.lower()
)

# The embedder is built once from the story texts and reused for every
# embedding type evaluated further down.
story_texts = user_stories["User Story"]
embedder = Embedders_Five(story_texts)

# Turn the textual domain names into integer class ids.
domain_column = user_stories["Domain"]
label_encoder = LabelEncoder()
data_y = label_encoder.fit_transform(domain_column)
print("Number of labels:", data_y.shape)

# Distinct domain names, for a quick sanity check of the class set.
domains_names = np.unique(domain_column)
print("Unique domains:", domains_names)

# Domains Test

In [None]:
def evaluate_embedding(name, features, labels, test_size=0.2, random_state=42):
    """Benchmark one embedding with LazyClassifier and print its leaderboard.

    The same split / fit / report sequence is applied to every embedding, so it
    lives here once instead of being repeated per embedding.

    Parameters
    ----------
    name : str
        Label used in the printed heading (e.g. "FastText").
    features : array-like
        One feature row per sample, aligned with ``labels``.
    labels : array-like
        Encoded class labels.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so every embedding is evaluated on
        the same partition of the samples.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, clf, models, predictions)`` where
        ``models`` and ``predictions`` are the two values returned by
        ``LazyClassifier.fit``.
    """
    # Objects exposing ``toarray`` (e.g. SciPy sparse matrices) are densified;
    # plain arrays pass through untouched. The TF-IDF variables in this cell
    # are named ``*_dense``, which suggests dense input is what is expected
    # downstream -- TODO confirm what getTFIDFEmbeddings() returns.
    if hasattr(features, "toarray"):
        features = features.toarray()

    X_train_part, X_test_part, y_train_part, y_test_part = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )
    classifier = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
    leaderboard, model_predictions = classifier.fit(
        X_train_part, X_test_part, y_train_part, y_test_part
    )
    print(f"\n{name} Models Performance:")
    print(leaderboard)
    return (X_train_part, X_test_part, y_train_part, y_test_part,
            classifier, leaderboard, model_predictions)


# Every variable name the original cell defined is still bound below, so any
# later cell that refers to the splits, classifiers or leaderboards keeps working.

#-----------------------------------------------------------------
# Test FastText
print("\n=== FastText Results ===")
fasttext_features = embedder.getFastTextEmbedding()
(X_train_fasttext, X_test_fasttext, y_train_fasttext, y_test_fasttext,
 clf_fasttext, models_fasttext, predictions_fasttext) = evaluate_embedding(
    "FastText", fasttext_features, data_y
)

#-----------------------------------------------------------------
# Test GloVe
print("\n=== GloVe Results ===")
glove_features = embedder.getGloVEEmbedding()
(X_train_glove, X_test_glove, y_train_glove, y_test_glove,
 clf_glove, models_glove, predictions_glove) = evaluate_embedding(
    "GloVe", glove_features, data_y
)

#-----------------------------------------------------------------
# Test TFIDF
print("\n=== TFIDF Results ===")
data_x = embedder.getTFIDFEmbeddings()
(X_train, X_test, y_train, y_test,
 clf, models_tfidf, predictions_tfidf) = evaluate_embedding(
    "TFIDF", data_x, data_y
)
X_train_dense = X_train
X_test_dense = X_test

#-----------------------------------------------------------------
# Test BERT tokenization approach
print("\n=== BERT Results ===")
# Convert to float for ML compatibility if needed
bert_features = embedder.getBERTEmbeddings().astype(np.float32)
(X_train_bert, X_test_bert, y_train_bert, y_test_bert,
 clf_bert, models_bert, predictions_bert) = evaluate_embedding(
    "BERT", bert_features, data_y
)

#-----------------------------------------------------------------
# Test Word2Vec
print("\n=== Word2Vec Results ===")
w2v_features = embedder.getWord2VecEmbedding()
(X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v,
 clf_w2v, models_w2v, predictions_w2v) = evaluate_embedding(
    "Word2Vec", w2v_features, data_y
)

# Compare best models: print the first row of each leaderboard, in the order
# LazyClassifier returned it.
print("\n=== Performance Comparison ===")
for embedding_name, leaderboard in [
    ("FastText", models_fasttext),
    ("TFIDF", models_tfidf),
    ("BERT", models_bert),
    ("Word2Vec", models_w2v),
    ("GloVe", models_glove),
]:
    print(f"Best {embedding_name} Model:", leaderboard.iloc[0])

# Sensitive Features Test

In [None]:
from SensitiveFeaturesMapping import SensitiveFeaturesMapper
import importlib
import Embeddings
importlib.reload(Embeddings)
from Embeddings import Embedders_Five
import numpy as np
from sklearn.preprocessing import LabelEncoder
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import urllib.request
import os
import zipfile
import gensim.downloader as api
import fasttext
import fasttext.util


# Load data
user_stories = pd.read_excel("/Users/ahmed/Desktop/CanWeTrustReFAIR/CanWeTrustReFAIR/Dataset/Domain_Classification_Data/Synthetic User Stories.xlsx")
user_stories['Domain'] = user_stories['Domain'].str.lower()
ontology = SensitiveFeaturesMapper("/Users/ahmed/Desktop/CanWeTrustReFAIR/CanWeTrustReFAIR/Dataset/Feature_Extraction/domains-features-mapping.csv",
                                   "/Users/ahmed/Desktop/CanWeTrustReFAIR/CanWeTrustReFAIR/Dataset/Feature_Extraction/tasks-features-mapping.csv")


# The sheet is read without a header row, so its columns are the integers 0..k.
# Column 2 holds the ML task name; string cells from position 3 onward are
# collected as that task's category labels.
labels = pd.read_excel("/Users/ahmed/Desktop/CanWeTrustReFAIR/CanWeTrustReFAIR/Dataset/ML_Tasks_Classification_Data/Keyword labelled.xlsx", header=None)
# .str.lower() leaves empty cells as NaN instead of raising AttributeError.
labels[2] = labels[2].str.lower()
labels["Categories array"] = [
    # Non-string cells (e.g. NaN padding in shorter rows) are skipped.
    [cell.lower() for cell in category_cells if isinstance(cell, str)]
    for category_cells in labels.iloc[:, 3:].itertuples(index=False, name=None)
]

# One lookup table built once, instead of filtering `labels` again for every
# user story. The first row seen for a task name wins when a name is repeated.
task_to_categories = {}
for task_name, task_categories in zip(labels[2], labels["Categories array"]):
    if isinstance(task_name, str):
        task_to_categories.setdefault(task_name, task_categories)

story_tasks = user_stories["Machine Learning Task"].str.lower()

# Fail early and name the offending tasks, rather than hitting a bare
# IndexError somewhere in the middle of the loop.
unmatched_tasks = story_tasks[~story_tasks.isin(list(task_to_categories))]
if not unmatched_tasks.empty:
    raise ValueError(
        "No category labels found for ML task(s) "
        f"{sorted(unmatched_tasks.dropna().unique())} "
        f"({len(unmatched_tasks)} user stories affected, including empty task cells)"
    )

user_stories["Target"] = [task_to_categories[task_name] for task_name in story_tasks]

# Spot-check a single story. The last expression of the cell is what the
# notebook displays.
SAMPLE_INDEX = 10000
ontology.get_sensitive_features(user_stories.loc[SAMPLE_INDEX, "Target"],
                                user_stories.loc[SAMPLE_INDEX, "Domain"])


{'domain': {'name': 'transportation', 'features': ['geography', 'race']},
 'tasks': {'classification': ['age',
   'geography',
   'race',
   'sex',
   'synthetic',
   'birthplace',
   'citizenship',
   'disability',
   'ethnicity',
   'family size',
   'family wealth',
   'gender',
   'other sensitive annotations may be present in synsets from the person subtree',
   "people's gender",
   'race (inferred)',
   'skin tone',
   'skin type',
   'textual reference to people and their demographics',
   'textual references to people and their demographics',
   'race/ethnicity',
   'religion',
   'sexual orientation'],
  'regression': ['age',
   'ethnicity',
   'financial status',
   'gender',
   'geography',
   'race',
   'sex'],
  'ranking': ['activity',
   'age',
   'and gender',
   'birth category',
   'ethnic group',
   'gender',
   'geography',
   'news provider',
   'ownership',
   'race',
   'sex',
   'tour availability'],
  'representation learning': ['age',
   'and gender',
   'ethn