<a href="https://colab.research.google.com/github/AlessandraParziale/Fairness-Thesis/blob/main/COMPAS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **COMPAS**

[Una versione ridotta del dataset COMPAS con 8 feature (variabili) e 6907 righe, utilizzata per prevedere se un imputato sarà recidivo.]

## ***Classification***

### Loading the Data


In [None]:
import pandas as pd
import xgboost as xgb
import os
import requests

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score


In [None]:
# Directory where the downloaded dataset is cached
# (presumably the Colab working area, given the /content prefix).
DATA_DIR = "/content/COMPAS-Classification"

# exist_ok=True makes this cell idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(DATA_DIR, exist_ok=True)

def ensure_download(url, fname):
    """Download ``url`` to ``DATA_DIR/fname`` unless that file already exists.

    Args:
        url: Address of the file to fetch.
        fname: File name, relative to ``DATA_DIR``, under which the download is stored.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    fpath = f"{DATA_DIR}/{fname}"
    if os.path.isfile(fpath):
        return  # already cached, nothing to do

    response = requests.get(url, timeout=60)
    # Without this check an HTTP error page would be saved and then treated
    # as the cached dataset on every later run.
    response.raise_for_status()

    # Write to a temporary name first so an interrupted write never leaves a
    # truncated file at the final path (which would be mistaken for a valid cache).
    tmp_path = f"{fpath}.part"
    with open(tmp_path, 'wb') as file:
        file.write(response.content)
    os.replace(tmp_path, fpath)

def load_compas():
    """Download (if needed) and load the ProPublica COMPAS two-year CSV.

    Keeps eight feature columns plus the ``is_recid`` target, drops every row
    containing a missing value, and converts the target to ``category`` dtype.
    The cleaned frame and the number of feature columns are printed.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a DataFrame with the eight feature
        columns (an independent copy, safe to modify) and ``y`` is the
        ``is_recid`` Series.
    """
    url = "https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv"
    fname = "compas-scores-two-years.csv"
    ensure_download(url, fname)

    fpath = f"{DATA_DIR}/{fname}"
    data = pd.read_csv(fpath, delimiter=",")

    # Column selection
    feature_cols = ["sex", "age", "age_cat", "race", "c_charge_degree", "priors_count", "days_b_screening_arrest", "decile_score"]
    target_col = "is_recid"

    # .copy() detaches the result from the raw frame so the assignment below
    # operates on an independent object.
    data = data[feature_cols + [target_col]].dropna().copy()

    data[target_col] = data[target_col].astype("category")

    # Return a copy of the features: callers assign encoded columns into X, which
    # would otherwise raise pandas' SettingWithCopyWarning.
    X = data[feature_cols].copy()
    y = data[target_col]

    print(data)
    # Count the columns of X itself; `data` also contains the target column.
    num_colonne_X = X.shape[1]
    print("Numero di colonne nel DataFrame X:", num_colonne_X)

    return X, y

# Load the data: X holds the feature columns, y the `is_recid` target
X, y = load_compas()

         sex  age          age_cat              race c_charge_degree  \
0       Male   69  Greater than 45             Other               F   
1       Male   34          25 - 45  African-American               F   
2       Male   24     Less than 25  African-American               F   
5       Male   44          25 - 45             Other               M   
6       Male   41          25 - 45         Caucasian               F   
...      ...  ...              ...               ...             ...   
7209    Male   23     Less than 25  African-American               F   
7210    Male   23     Less than 25  African-American               F   
7211    Male   57  Greater than 45             Other               F   
7212  Female   33          25 - 45  African-American               M   
7213  Female   23     Less than 25          Hispanic               F   

      priors_count  days_b_screening_arrest  decile_score is_recid  
0                0                     -1.0             1        0

In [None]:
# Apply label encoding to every categorical (object-dtype) column of X.
# Work on an independent copy: assigning into a frame that was sliced from
# another one triggers pandas' SettingWithCopyWarning.
X = X.copy()
categorical_cols = X.select_dtypes(include=['object']).columns

# One fitted encoder per column. Reusing a single LabelEncoder keeps only the
# classes of the last column, so the integer codes of the others could not be
# mapped back to their original labels.
label_encoders = {}
label_encoder = LabelEncoder()  # name kept for compatibility; ends as the last fitted encoder

for col in categorical_cols:
    label_encoder = LabelEncoder()
    X[col] = label_encoder.fit_transform(X[col])
    label_encoders[col] = label_encoder

print(X.head())

   sex  age  age_cat  race  c_charge_degree  priors_count  \
0    1   69        1     5                0             0   
1    1   34        0     0                0             0   
2    1   24        2     0                0             4   
5    1   44        0     5                1             0   
6    1   41        0     2                0            14   

   days_b_screening_arrest  decile_score  
0                     -1.0             1  
1                     -1.0             3  
2                     -1.0             4  
5                      0.0             1  
6                     -1.0             6  


---
### **Random Forest Model**



In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can be chained.
random_forest = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predicted labels for the held-out rows
y_pred = random_forest.predict(X_test)

In [None]:
# Model evaluation: score the random forest predictions against the held-out labels.
# Every metric shares the (y_true, y_pred) signature, so they are computed in one pass
# in the order: confusion matrix, precision, recall, accuracy.
conf_matrix, precision, recall, accuracy = (
    metric(y_test, y_pred)
    for metric in (confusion_matrix, precision_score, recall_score, accuracy_score)
)

print("Confusion Matrix Random Forest:", conf_matrix)
print("Accuracy Random Forest:", accuracy)
print("Precision Random Forest:", precision)
print("Recall Random Forest:", recall)

Numero di elementi in y_test: 1382
Numero di elementi in y_pred: 1382
Confusion Matrix Random Forest: [[448 240]
 [238 456]]
Accuracy Random Forest: 0.6541244573082489
Precision Random Forest: 0.6551724137931034
Recall Random Forest: 0.6570605187319885




---


### **Logistic Regression Model**

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can be chained.
logistic_regression = LogisticRegression(
    random_state=42,
    max_iter=1000,
).fit(X_train, y_train)

# Predict on the test set
y_pred_lr = logistic_regression.predict(X_test)


In [None]:
# Model evaluation: score the logistic regression predictions against the held-out labels.
# Every metric shares the (y_true, y_pred) signature, so they are computed in one pass
# in the order: confusion matrix, precision, recall, accuracy.
conf_matrix_lr, precision_lr, recall_lr, accuracy_lr = (
    metric(y_test, y_pred_lr)
    for metric in (confusion_matrix, precision_score, recall_score, accuracy_score)
)

print("Confusion Matrix Logistic Regression [Classification]:", conf_matrix_lr)
print("Accuracy Logistic Regression [Classification]:", accuracy_lr)
print("Precision Logistic Regression [Classification]:", precision_lr)
print("Recall Logistic Regression [Classification]:", recall_lr)

Confusion Matrix Logistic Regression: [[498 190]
 [235 459]]
Accuracy Logistic Regression: 0.6924746743849494
Precision Logistic Regression: 0.7072419106317411
Recall Logistic Regression: 0.6613832853025937


---
### **eXtreme Gradient Boosting Model (XGBoost)**

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Gradient-boosted classifier with library defaults apart from the seed
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)

# Predicted labels for the held-out rows
y_pred_xgb = xgb_model.predict(X_test)


In [None]:
# Model evaluation: score the XGBoost predictions against the held-out labels.
# Every metric shares the (y_true, y_pred) signature, so they are computed in one pass
# in the order: confusion matrix, precision, recall, accuracy.
conf_matrix_xgb, precision_xgb, recall_xgb, accuracy_xgb = (
    metric(y_test, y_pred_xgb)
    for metric in (confusion_matrix, precision_score, recall_score, accuracy_score)
)

print("Confusion Matrix XGBoost:", conf_matrix_xgb)
print("Accuracy XGBoost:", accuracy_xgb)
print("Precision XGBoost:", precision_xgb)
print("Recall XGBoost:", recall_xgb)

Confusion Matrix XGBoost: [[472 216]
 [254 440]]
Accuracy XGBoost: 0.6599131693198264
Precision XGBoost: 0.6707317073170732
Recall XGBoost: 0.6340057636887608


## ***Risk Assessment***

### Loading the Data

In [None]:
import os

import pandas as pd
import requests

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Directory where the downloaded dataset is cached for the risk-assessment section
# (presumably the Colab working area, given the /content prefix).
DATA_DIR = "/content/COMPAS-RiskAssessment"

# exist_ok=True makes this cell idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(DATA_DIR, exist_ok=True)

def ensure_download(url, fname):
    """Download ``url`` to ``DATA_DIR/fname`` unless that file already exists.

    Args:
        url: Address of the file to fetch.
        fname: File name, relative to ``DATA_DIR``, under which the download is stored.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    fpath = f"{DATA_DIR}/{fname}"
    if os.path.isfile(fpath):
        return  # already cached, nothing to do

    response = requests.get(url, timeout=60)
    # Without this check an HTTP error page would be saved and then treated
    # as the cached dataset on every later run.
    response.raise_for_status()

    # Write to a temporary name first so an interrupted write never leaves a
    # truncated file at the final path (which would be mistaken for a valid cache).
    tmp_path = f"{fpath}.part"
    with open(tmp_path, 'wb') as file:
        file.write(response.content)
    os.replace(tmp_path, fpath)

def load_compas():
    """Download (if needed) and load the ProPublica COMPAS two-year CSV.

    Keeps eight feature columns plus the ``is_recid`` target, drops every row
    containing a missing value, and converts the target to ``category`` dtype.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a DataFrame with the eight feature
        columns (an independent copy, safe to modify) and ``y`` is the
        ``is_recid`` Series.
    """
    url = "https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv"
    fname = "compas-scores-two-years.csv"
    ensure_download(url, fname)

    fpath = f"{DATA_DIR}/{fname}"
    data = pd.read_csv(fpath, delimiter=",")

    feature_cols = ["sex", "age", "age_cat", "race", "c_charge_degree", "priors_count", "days_b_screening_arrest", "decile_score"]
    target_col = "is_recid"

    # .copy() detaches the result from the raw frame so the assignment below
    # operates on an independent object.
    data = data[feature_cols + [target_col]].dropna().copy()

    data[target_col] = data[target_col].astype("category")

    # Return a copy of the features: callers assign encoded columns into X, which
    # would otherwise raise pandas' SettingWithCopyWarning.
    X = data[feature_cols].copy()
    y = data[target_col]

    return X, y

# Load the features (X) and the `is_recid` target (y)
X, y = load_compas()

In [None]:
# Apply label encoding to every categorical (object-dtype) column of X.
# Work on an independent copy: assigning into a frame that was sliced from
# another one triggers pandas' SettingWithCopyWarning.
X = X.copy()
categorical_cols = X.select_dtypes(include=['object']).columns

# One fitted encoder per column. Reusing a single LabelEncoder keeps only the
# classes of the last column, so the integer codes of the others could not be
# mapped back to their original labels.
label_encoders = {}
label_encoder = LabelEncoder()  # name kept for compatibility; ends as the last fitted encoder

for col in categorical_cols:
    label_encoder = LabelEncoder()
    X[col] = label_encoder.fit_transform(X[col])
    label_encoders[col] = label_encoder

print(X.head())

   sex  age  age_cat  race  c_charge_degree  priors_count  \
0    1   69        1     5                0             0   
1    1   34        0     0                0             0   
2    1   24        2     0                0             4   
5    1   44        0     5                1             0   
6    1   41        0     2                0            14   

   days_b_screening_arrest  decile_score  
0                     -1.0             1  
1                     -1.0             3  
2                     -1.0             4  
5                      0.0             1  
6                     -1.0             6  



---


### **Logistic Regression Model**

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can be chained.
logistic_regression = LogisticRegression(
    random_state=42,
    max_iter=1000,
).fit(X_train, y_train)

# Predicted labels for the held-out rows
y_pred_lr_ra = logistic_regression.predict(X_test)


In [None]:
# Model evaluation: score the risk-assessment logistic regression predictions
# against the held-out labels. Every metric shares the (y_true, y_pred) signature,
# so they are computed in one pass in the order: precision, recall, accuracy.
precision_lr_ra, recall_lr_ra, accuracy_lr_ra = (
    metric(y_test, y_pred_lr_ra)
    for metric in (precision_score, recall_score, accuracy_score)
)

print("Accuracy Logistic Regression [Risk Assessment]:", accuracy_lr_ra)
print("Precision Logistic Regression [Risk Assessment]:", precision_lr_ra)
print("Recall Logistic Regression [Risk Assessment]:", recall_lr_ra)

Accuracy Logistic Regression [Risk Assessment]: 0.6924746743849494
Precision Logistic Regression [Risk Assessment]: 0.7072419106317411
Recall Logistic Regression [Risk Assessment]: 0.6613832853025937
