##### Master Degree in Computer Science and Data Science for Economics

# Example of biases in data classification

### Alfio Ferrara

In [1]:
import os
import numpy as np
import pandas as pd


import os

%cd /content/
!ls -R
repo_name = "NLP_COURSE"

# !rm -rf .git/lfs
# !git lfs prune
# !sudo apt-get update
# !sudo apt-get install git-lfs
# !git lfs install

#!rm -rf {repo_name}
if not os.path.exists(repo_name):
    print(f"Directory {repo_name} does not exist, proceeding with clone.")
    !git clone https://github.com/Abudo-S/NLP_COURSE.git

%cd NLP_COURSE

/content
.:
sample_data

./sample_data:
anscombe.json		      mnist_test.csv
california_housing_test.csv   mnist_train_small.csv
california_housing_train.csv  README.md
Error reading git config: error running /usr/lib/git-core/git 'rev-parse' '--is-bare-repository': 'fatal: not a git repository (or any of the parent directories): .git' 'exit status 128'
Prune error: Git can't resolve ref: "HEAD"

Errors logged to lfs/logs/20250826T204226.911627575.log
Use `git lfs logs last` to view the log.
Prune error: Git can't resolve ref: "HEAD"

Errors logged to lfs/logs/20250826T204226.916010885.log
Use `git lfs logs last` to view the log.
Prune error: error in git log: exit status 128 fatal: not a git repository (or any of the parent directories): .git


Errors logged to lfs/logs/20250826T204226.91869895.log
Use `git lfs logs last` to view the log.
Prune sub-tasks failed, cannot continue
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:2 https://cli.github

In [15]:
#!git lfs logs last
!pwd
data_df = pd.read_excel(os.path.join("/content/NLP_COURSE/data", "biased-cv-doc.xlsx"), index_col=0)
data_df.head(10)

/content


Unnamed: 0,text,target,class_name
0,The candidate is a female coming from usa and ...,2,highly suitable
1,The candidate is a female coming from italy an...,0,not suitable
2,The candidate is a male coming from france and...,2,highly suitable
3,The candidate is a male coming from italy and ...,0,not suitable
4,The candidate is a male coming from italy and ...,0,not suitable
5,The candidate is a female coming from italy an...,0,not suitable
6,The candidate is a male coming from italy and ...,0,not suitable
7,The candidate is a male coming from germany an...,2,highly suitable
8,The candidate is a male coming from spain and ...,1,moderately suitable
9,The candidate is a female coming from usa and ...,1,moderately suitable


In [16]:
print(data_df.loc[0].text)

The candidate is a female coming from usa and has 2 years of esperience in the field.
    The language proficiency level is beginner. The technical proficiency level is proficient.


In [17]:
data_df.class_name.unique()

array(['highly suitable', 'not suitable', 'moderately suitable'],
      dtype=object)

Copy of file **bert_classifier** to be trained in colab gpus.

In [3]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import os

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)


in_folder = "/content/NLP_COURSE/data"
filename = "biased-cv-doc.xlsx"
docs = pd.read_excel(os.path.join(in_folder, filename))
data = [x for x in docs['text'].values]
labels = [x for x in docs['target'].values]

train_texts, val_texts, train_labels, val_labels = train_test_split(data, labels,
                                                                    test_size=0.2,
                                                                    random_state=42)

MAX_LEN = 256
BATCH_SIZE = 8
train_dataset = TextDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

optimizer = AdamW(model.parameters(), lr=5e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'used device: {device}')
model.to(device)

# Training
def train_epoch(model, data_loader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(data_loader)

def eval_model(model, data_loader, device):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    return accuracy_score(true_labels, predictions)

EPOCHS = 4
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    train_loss = train_epoch(model, train_loader, optimizer, device)
    val_accuracy = eval_model(model, val_loader, device)
    print(f"Train Loss: {train_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

# Save model

save_path = "/content/NLP_COURSE/nlp/nlp/bert_text_classifier"

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


used device: cuda
Epoch 1/4
Train Loss: 0.7960, Validation Accuracy: 0.7300
Epoch 2/4
Train Loss: 0.6106, Validation Accuracy: 0.7300
Epoch 3/4
Train Loss: 0.6084, Validation Accuracy: 0.7200
Epoch 4/4
Train Loss: 0.5945, Validation Accuracy: 0.7200


('/content/NLP_COURSE/nlp/nlp/bert_text_classifier/tokenizer_config.json',
 '/content/NLP_COURSE/nlp/nlp/bert_text_classifier/special_tokens_map.json',
 '/content/NLP_COURSE/nlp/nlp/bert_text_classifier/vocab.txt',
 '/content/NLP_COURSE/nlp/nlp/bert_text_classifier/added_tokens.json')

The model trained for this example can be found in [bert_classifier](./nlp/bert_classifier.py). Use it to train the model and save the outcome on you local folders.

In [18]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

!pwd
# [pretrained model] Substitute with your path
MODEL_PATH = "/content/NLP_COURSE/nlp/nlp/bert_text_classifier" #"bert-base-uncased" for remote model
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
model = BertForSequenceClassification.from_pretrained(MODEL_PATH, local_files_only=True)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)

# 2. Funzione per fare previsioni
def predict(texts, model, tokenizer, device, max_len=256):
    model.eval()
    predictions = []
    with torch.no_grad():
        for text in texts:
            encoding = tokenizer(
                text,
                max_length=max_len,
                padding='max_length',
                truncation=True,
                return_tensors="pt"
            )
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            predictions.append(preds.item())
    return predictions


/content


In [19]:
sample = data_df['text'].values
expected_label = data_df['target'].values
data_to_predict = sample[:6]
y_true = expected_label[:6]

predicted_labels = predict(data_to_predict, model, tokenizer, device)

for i, (text, label) in enumerate(zip(data_to_predict, predicted_labels)):
    yt = y_true[i]
    print(f"Text: {text}\nPredicted Label: {label}\nTrue Label: {yt}\n")


Text: The candidate is a female coming from usa and has 2 years of esperience in the field.
    The language proficiency level is beginner. The technical proficiency level is proficient.
Predicted Label: 2
True Label: 2

Text: The candidate is a female coming from italy and has 2 years of esperience in the field.
    The language proficiency level is proficient. The technical proficiency level is beginner.
Predicted Label: 0
True Label: 0

Text: The candidate is a male coming from france and has 4 years of esperience in the field.
    The language proficiency level is intermediate. The technical proficiency level is proficient.
Predicted Label: 2
True Label: 2

Text: The candidate is a male coming from italy and has 4 years of esperience in the field.
    The language proficiency level is upper-intermediate. The technical proficiency level is intermediate.
Predicted Label: 0
True Label: 0

Text: The candidate is a male coming from italy and has 5 years of esperience in the field.
    T

## Stats

In [20]:
from sklearn.metrics import classification_report

In [21]:
sample = data_df['text'].values
y_true = data_df['target'].values
y_pred = predict(sample, model, tokenizer, device)
print(classification_report(y_true, y_pred, zero_division=0))

KeyboardInterrupt: 

### Analysis of outcome

In [None]:
from collections import defaultdict

In [None]:
tab = pd.read_excel(os.path.join("/content/NLP_COURSE/data", "biased-cv-tab.xlsx"), index_col=0)
tab.head(2)

In [None]:
data.head(2)

Unnamed: 0,text,target,class_name
0,The candidate is a female coming from usa and ...,2,highly suitable
1,The candidate is a female coming from italy an...,0,not suitable


In [None]:
genders = defaultdict(lambda: defaultdict(lambda: 0))
country = defaultdict(lambda: defaultdict(lambda: 0))
lang = defaultdict(lambda: defaultdict(lambda: 0))
tech = defaultdict(lambda: defaultdict(lambda: 0))
stats = [genders, country, lang, tech]

for i, pred in enumerate(y_pred):
    record = tab.iloc[i]
    for j, x in enumerate(record.values[:-1]):
        stats[j][pred][x] += 1
S = [pd.DataFrame(x) for x in stats]

In [None]:
def show(df):
    return np.round(df.fillna(0) / df.sum(axis=0), 2) * 100

#### Let's check for different features and classes, where:

**Target**: 0 => not suitable, 1 => moderately suitable, 2 => highly suitable

**Features**: 0 => gender, 1 => country, 2 => language, 3 => tech

In [None]:
target = 0
feature = 1
print(len([x for x in y_pred if x == target]) / len(y_pred))
print(round(100 * S[feature] / S[feature].sum(axis=0), 2))
print(round(100 * S[feature].T / S[feature].T.sum(axis=0), 2))

0.413
             2      0      1
usa      24.28    NaN  23.55
france   27.16    NaN  27.33
spain    25.10    NaN  26.45
germany  23.46    NaN  22.67
italy      NaN  100.0    NaN
     usa  france  spain  germany  italy
2  42.14   41.25  40.13    42.22    NaN
0    NaN     NaN    NaN      NaN  100.0
1  57.86   58.75  59.87    57.78    NaN


In [None]:
show(S[1])

Unnamed: 0,2,0,1
usa,24.0,0.0,24.0
france,27.0,0.0,27.0
spain,25.0,0.0,26.0
germany,23.0,0.0,23.0
italy,0.0,100.0,0.0


In [None]:
show(S[2])

Unnamed: 0,2,0,1
beginner,23.0,25.0,30.0
intermediate,27.0,27.0,22.0
proficient,25.0,26.0,21.0
upper-intermediate,24.0,22.0,27.0


In [None]:
show(S[3])

Unnamed: 0,2,0,1
proficient,78.0,13.0,0.0
beginner,19.0,45.0,0.0
upper-intermediate,3.0,18.0,58.0
intermediate,0.0,24.0,42.0


## Born
It's a supervised learning algorithm that works by calculating the transition probability of a document's "wave function" collapsing into a target class's "wave function." This probability, derived from the Born rule, is then used to make a classification decision. The algorithm is based on the idea that text can be represented as a superposition of words, and a class can be represented similarly.

In [22]:
!pip install bornrule
from bornrule import BornClassifier
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

Collecting bornrule
  Downloading bornrule-0.3.2-py3-none-any.whl.metadata (1.3 kB)
Downloading bornrule-0.3.2-py3-none-any.whl (25 kB)
Installing collected packages: bornrule
Successfully installed bornrule-0.3.2


In [23]:
documents = data_df.text.values
vectorizer = CountVectorizer(tokenizer=word_tokenize, token_pattern=None) #word_tokenize can be replaced by spacy tokenizer
X = vectorizer.fit_transform(documents)
y = data_df.target.values

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
born = BornClassifier()

In [None]:
born.fit(X, y)
b_pred = born.predict(X)



In [None]:
print(classification_report(y_true, b_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.82      1.00      0.90       337
           1       0.66      0.39      0.49       334
           2       0.58      0.69      0.63       329

    accuracy                           0.69      1000
   macro avg       0.69      0.69      0.67      1000
weighted avg       0.69      0.69      0.67      1000



In [None]:
features = vectorizer.get_feature_names_out()
E = pd.DataFrame(born.explain().toarray(), index=features)

In [None]:
E.sort_values(by=0, ascending=False).head(10)

Unnamed: 0,0,1,2
italy,0.078608,0.027204,0.026003
beginner,0.011133,0.0078,0.00718
proficient,0.006782,0.006532,0.009688
upper-intermediate,0.003173,0.004392,0.003928
intermediate,0.001825,0.001994,0.001597
1,0.001494,0.00143,0.001818
2,0.000662,0.000649,0.00057
5,0.000506,0.000446,0.000456
4,0.000373,0.000417,0.000378
female,0.000222,0.000207,0.000219
