<a href="https://colab.research.google.com/github/AlessandraParziale/Fairness-Thesis/blob/main/GERMAN_CREDIT_DATASET.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **GERMAN CREDIT**

[Questo set di dati ha 20 attributi (feature) e 1000 righe. Gli attributi protetti sono gender_status ed age (età > 25 è il gruppo privilegiato). L'obiettivo è classificare ogni richiesta di credito come rischio di credito buono o cattivo.]

## ***Classification***

### Loading the Data

In [22]:
import pandas as pd
import os
import requests

from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_openml
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier

In [23]:
# Directory used to cache the downloaded dataset file.
DATA_DIR = "/content/GERMAN_CREDIT-Classification"

# exist_ok=True keeps the cell safe to re-run and avoids the race between a
# separate existence check and the directory creation.
os.makedirs(DATA_DIR, exist_ok=True)

def ensure_download(url, fname, timeout=30):
    """
    Make sure the dataset file is present in the data directory.

    If ``DATA_DIR/fname`` does not exist yet it is downloaded from ``url``;
    otherwise the function returns without touching the network.

    Args:
        url: Address to fetch the file from.
        fname: File name to store the download under, inside ``DATA_DIR``.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    fpath = f"{DATA_DIR}/{fname}"
    if os.path.isfile(fpath):
        return

    response = requests.get(url, timeout=timeout)
    # Without this check an HTTP error page would be written to disk and then
    # be accepted as the cached dataset on every later run.
    response.raise_for_status()

    # Write under a temporary name first so an interrupted write never leaves
    # a truncated file that the isfile() check above would accept.
    tmp_path = f"{fpath}.part"
    with open(tmp_path, 'wb') as file:
        file.write(response.content)
    os.replace(tmp_path, fpath)

def load_german():
    """
    Download (if needed) and load the Statlog German Credit dataset.

    The raw file is whitespace-separated with no header row, so column names
    are assigned here. Rows with missing values are dropped, the
    ``gender_status`` codes are replaced by readable names, and the label is
    remapped from {1, 2} to {0, 1}.

    Side effects:
        Prints the distinct label values, the full frame and the number of
        feature columns.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the DataFrame without the ``label``
        column and ``y`` is the Series of remapped labels.
    """
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
    fname = "german.csv"
    ensure_download(url, fname)

    fpath = f"{DATA_DIR}/{fname}"

    cols = [
        "status", "duration", "credit_history",
        "purpose", "credit_amount", "savings", "employment",
        "installment_rate", "gender_status",
        "other_debtors", "residence_since", "property", "age",
        "installment_plans", "housing", "existing_credits",
        "skill_level", "people_liable", "telephone",
        "foreign_worker", "label"
    ]
    # sep=r"\s+" is the supported spelling of the deprecated
    # delim_whitespace=True option.
    df = pd.read_csv(fpath, header=None, names=cols, sep=r"\s+")

    df = df.dropna()

    gender_status = {
        "A91": "male_divorced_separated",
        "A92": "female_divorced_separated_married",
        "A93": "male_single",
        "A94": "male_married_widowed",
        "A95": "female_single"
    }

    df['gender_status'] = df['gender_status'].map(gender_status)

    # Remap the label on the frame itself instead of an inplace replace on a
    # column slice, so the printed frame and y agree regardless of the pandas
    # copy semantics in use.
    df['label'] = df['label'].replace({1: 0, 2: 1})

    y = df['label']
    X = df.drop('label', axis=1)

    print(y.unique())

    print(df)
    # Count the columns of X (features only); df still contains 'label'.
    num_colonne_X = X.shape[1]
    print("Numero di colonne nel DataFrame X:", num_colonne_X)

    return X, y

# Load the feature frame (X) and the remapped 0/1 label (y) used by the cells below.
X, y = load_german()


[0 1]
    status  duration credit_history purpose  credit_amount savings employment  \
0      A11         6            A34     A43           1169     A65        A75   
1      A12        48            A32     A43           5951     A61        A73   
2      A14        12            A34     A46           2096     A61        A74   
3      A11        42            A32     A42           7882     A61        A74   
4      A11        24            A33     A40           4870     A61        A73   
..     ...       ...            ...     ...            ...     ...        ...   
995    A14        12            A32     A42           1736     A61        A74   
996    A11        30            A32     A41           3857     A61        A73   
997    A14        12            A32     A43            804     A61        A75   
998    A11        45            A32     A43           1845     A61        A73   
999    A12        45            A34     A41           4576     A62        A71   

     installment_rate

In [24]:
# Label-encode every categorical (object-dtype) column of X.

categorical_cols = X.select_dtypes(include=['object']).columns

# Kept defined even when there is nothing to encode (e.g. the cell is re-run
# after X has already been converted to integers).
label_encoder = LabelEncoder()

# One fitted encoder per column, so integer codes can later be translated
# back to the original categories with inverse_transform. A single shared
# encoder would only remember the last column it was fitted on.
label_encoders = {}

for col in categorical_cols:
    label_encoder = LabelEncoder()
    X[col] = label_encoder.fit_transform(X[col])
    label_encoders[col] = label_encoder

print(X.head())

   status  duration  credit_history  purpose  credit_amount  savings  \
0       0         6               4        4           1169        4   
1       1        48               2        4           5951        0   
2       3        12               4        7           2096        0   
3       0        42               2        3           7882        0   
4       0        24               3        0           4870        0   

   employment  installment_rate  gender_status  other_debtors  \
0           4                 4              3              0   
1           2                 2              0              0   
2           3                 2              3              0   
3           3                 2              3              2   
4           2                 3              3              0   

   residence_since  property  age  installment_plans  housing  \
0                4         0   67                  2        1   
1                2         0   22             

---
### **Random Forest Model**

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the 100-tree forest and train it in one expression
# (fit returns the estimator itself).
random_forest = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred = random_forest.predict(X_test)

In [None]:
# Model evaluation on the held-out test set

# Scalar scores
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Confusion matrix of true vs. predicted labels
conf_matrix = confusion_matrix(y_test, y_pred)

# Report
print("Confusion Matrix Random Forest:", conf_matrix)
print("Accuracy Random Forest:", accuracy)
print("Precision Random Forest:", precision)
print("Recall Random Forest:", recall)

Confusion Matrix Random Forest: [[132   9]
 [ 29  30]]
Accuracy Random Forest: 0.81
Precision Random Forest: 0.7692307692307693
Recall Random Forest: 0.5084745762711864


---

### **Logistic Regression Model**

In [None]:
# Same 80/20 split and seed as the other models, so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# max_iter is raised to 1000 iterations; training happens in the same
# expression (fit returns the estimator itself).
logistic_regression = LogisticRegression(
    random_state=42,
    max_iter=1000,
).fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred_lr = logistic_regression.predict(X_test)

In [None]:
# Model evaluation on the held-out test set

# Scalar scores
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr)
recall_lr = recall_score(y_test, y_pred_lr)

# Confusion matrix of true vs. predicted labels
conf_matrix_lr = confusion_matrix(y_test, y_pred_lr)

# Report
print("Confusion Matrix Logistic Regression:", conf_matrix_lr)
print("Accuracy Logistic Regression:", accuracy_lr)
print("Precision Logistic Regression:", precision_lr)
print("Recall Logistic Regression:", recall_lr)

Confusion Matrix Logistic Regression: [[127  14]
 [ 31  28]]
Accuracy Logistic Regression: 0.775
Precision Logistic Regression: 0.6666666666666666
Recall Logistic Regression: 0.4745762711864407


---
### **eXtreme Gradient Boosting Model (XGBoost)**

In [None]:
# Same 80/20 split and seed as the other models, so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Gradient-boosted classifier with library defaults apart from the seed.
xgb_model = xgb.XGBClassifier(
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred_xgb = xgb_model.predict(X_test)

In [None]:
# Model evaluation on the held-out test set

# Scalar scores
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)

# Confusion matrix of true vs. predicted labels
conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb)

# Report
print("Confusion Matrix XGBoost:", conf_matrix_xgb)
print("Accuracy XGBoost:", accuracy_xgb)
print("Precision XGBoost:", precision_xgb)
print("Recall XGBoost:", recall_xgb)

Confusion Matrix XGBoost: [[125  16]
 [ 24  35]]
Accuracy XGBoost: 0.8
Precision XGBoost: 0.6862745098039216
Recall XGBoost: 0.5932203389830508


---
### **Decision Tree**

In [None]:
# Same 80/20 split and seed as the other models, so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Single decision tree with a fixed seed, trained in the same expression
# (fit returns the estimator itself).
clf = DecisionTreeClassifier(
    random_state=42,
).fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred_dt = clf.predict(X_test)

In [None]:
# Model evaluation on the held-out test set

# Confusion matrix of true vs. predicted labels
conf_matrix_dt = confusion_matrix(y_test, y_pred_dt)

# Precision
precision_dt = precision_score(y_test, y_pred_dt)

# Recall
recall_dt = recall_score(y_test, y_pred_dt)

# Accuracy
accuracy_dt = accuracy_score(y_test, y_pred_dt)

# These are the Decision Tree scores; the labels previously said "XGBoost"
# (copy-paste from the XGBoost cell), which made the two reports
# indistinguishable in the output.
print("Confusion Matrix Decision Tree:", conf_matrix_dt)
print("Accuracy Decision Tree:", accuracy_dt)
print("Precision Decision Tree:", precision_dt)
print("Recall Decision Tree:", recall_dt)

Confusion Matrix XGBoost: [[120  21]
 [ 36  23]]
Accuracy XGBoost: 0.715
Precision XGBoost: 0.5227272727272727
Recall XGBoost: 0.3898305084745763
