In [1]:
# step 0 - import delle librerie necessarie
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [9]:
# step 1 - Load Data
# Fetch the "credit-g" dataset from OpenML as pandas objects
data = fetch_openml(
    name="credit-g",
    version=2,
    as_frame=True,
)
# Keep the full table (features plus the `class` column) as a DataFrame
df = data.frame
# Preview the first rows of the dataset
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4,male single,none,4,real estate,67,none,own,2,skilled,1,yes,yes,good
1,0<=X<200,48,existing paid,radio/tv,5951.0,<100,1<=X<4,2,female div/dep/mar,none,2,real estate,22,none,own,1,skilled,1,none,yes,bad
2,no checking,12,critical/other existing credit,education,2096.0,<100,4<=X<7,2,male single,none,3,real estate,49,none,own,1,unskilled resident,2,none,yes,good
3,<0,42,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2,male single,guarantor,4,life insurance,45,none,for free,1,skilled,2,none,yes,good
4,<0,24,delayed previously,new car,4870.0,<100,1<=X<4,3,male single,none,4,no known property,53,none,for free,2,skilled,2,none,yes,bad


In [12]:
# Inspect the dataset structure: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   checking_status         1000 non-null   category
 1   duration                1000 non-null   int64   
 2   credit_history          1000 non-null   category
 3   purpose                 1000 non-null   category
 4   credit_amount           1000 non-null   float64 
 5   savings_status          1000 non-null   category
 6   employment              1000 non-null   category
 7   installment_commitment  1000 non-null   int64   
 8   personal_status         1000 non-null   category
 9   other_parties           1000 non-null   category
 10  residence_since         1000 non-null   int64   
 11  property_magnitude      1000 non-null   category
 12  age                     1000 non-null   int64   
 13  other_payment_plans     1000 non-null   category
 14  housing                 1

**Attribute description**:
- Status of existing checking account, in Deutsche Mark.
- Duration in months
- Credit history (credits taken, paid back duly, delays, critical accounts)
- Purpose of the credit (car, television,...)
- Credit amount
- Status of savings account/bonds, in Deutsche Mark.
- Present employment, in number of years.
- Installment rate in percentage of disposable income
- Personal status (married, single,...) and sex
- Other debtors / guarantors
- Present residence since X years
- Property (e.g. real estate)
- Age in years
- Other installment plans (banks, stores)
- Housing (rent, own,...)
- Number of existing credits at this bank
- Job
- Number of people being liable to provide maintenance for
- Telephone (yes,no)
- Foreign worker (yes,no)

In [10]:
# step 2 - Data Cleaning
# Count the missing entries in every column
print("Missing values in each column:")
print(df.isna().sum())

Missing values in each column:
checking_status           0
duration                  0
credit_history            0
purpose                   0
credit_amount             0
savings_status            0
employment                0
installment_commitment    0
personal_status           0
other_parties             0
residence_since           0
property_magnitude        0
age                       0
other_payment_plans       0
housing                   0
existing_credits          0
job                       0
num_dependents            0
own_telephone             0
foreign_worker            0
class                     0
dtype: int64


In [None]:
# step 3 - Data Preprocessing
# Partition the columns by dtype: numeric ones on one side, categorical ones
# (which also include the target column `class`) on the other
numerical_features = df.select_dtypes(include=["int64", "float64"])
categorical_features = df.select_dtypes(include=["category"])

print("Numerical features:")
print(numerical_features.columns)
print("Categorical features:")
print(categorical_features.columns)

Numerical features:
Index(['duration', 'credit_amount', 'installment_commitment',
       'residence_since', 'age', 'existing_credits', 'num_dependents'],
      dtype='object')
Categorical features:
Index(['checking_status', 'credit_history', 'purpose', 'savings_status',
       'employment', 'personal_status', 'other_parties', 'property_magnitude',
       'other_payment_plans', 'housing', 'job', 'own_telephone',
       'foreign_worker', 'class'],
      dtype='object')


In [18]:
# step 5 - Feature Engineering
# Convert the categorical columns to integer codes. This is label (ordinal)
# encoding, not one-hot encoding: every column stays a single column.
def encode_categorical(cat):
    """Label-encode every column of a DataFrame.

    Each distinct value of a column is replaced by an integer code assigned
    in order of first appearance (the first value seen becomes 0, the next
    new value 1, and so on). Missing values are encoded as -1.

    Parameters
    ----------
    cat : pandas.DataFrame
        Frame whose columns should all be encoded. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and column names, holding plain
        integer codes.
    """
    # Work on a copy so the caller's frame keeps its original labels and the
    # cell can be re-run safely.
    encoded = cat.copy()
    for col in encoded.columns:
        # factorize numbers the values by first appearance (sort=False default)
        codes, _ = pd.factorize(encoded[col])
        encoded[col] = codes
    return encoded

# Encode every categorical column (the target `class` included) as integer codes
cat_encoded = encode_categorical(categorical_features)

In [19]:
# Preview the first rows of the encoded categorical columns
cat_encoded.head()

Unnamed: 0,checking_status,credit_history,purpose,savings_status,employment,personal_status,other_parties,property_magnitude,other_payment_plans,housing,job,own_telephone,foreign_worker,class
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,1,1,1,0,0,0,0,0,1,0,1
2,2,0,1,1,2,0,0,0,0,0,1,1,0,0
3,0,1,2,1,2,0,1,1,0,1,0,1,0,0
4,0,2,3,1,1,0,0,2,0,1,0,1,0,1


In [20]:
# Put the numeric and the encoded categorical columns back together,
# side by side, in a single DataFrame
df_proc = pd.concat(
    objs=(numerical_features, cat_encoded),
    axis="columns",
)
df_proc.head()

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,checking_status,credit_history,purpose,savings_status,employment,personal_status,other_parties,property_magnitude,other_payment_plans,housing,job,own_telephone,foreign_worker,class
0,6,1169.0,4,4,67,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,48,5951.0,2,2,22,1,1,1,1,0,1,1,1,0,0,0,0,0,1,0,1
2,12,2096.0,2,3,49,1,2,2,0,1,1,2,0,0,0,0,0,1,1,0,0
3,42,7882.0,2,4,45,1,2,0,1,2,1,2,0,1,1,0,1,0,1,0,0
4,24,4870.0,3,4,53,2,2,0,2,3,1,1,0,0,2,0,1,0,1,0,1


In [21]:
# Separate the target column from the feature matrix
y = df_proc['class']
X = df_proc.loc[:, df_proc.columns != 'class']

In [22]:
# Create the random over-sampler, the random under-sampler and SMOTE.
# A fixed random_state makes the resampled rows reproducible from run to run
# (same seed as the train/test split and the model).
ros = RandomOverSampler(random_state=42)
rus = RandomUnderSampler(random_state=42)
smote = SMOTE(random_state=42)

In [24]:
x1, y1 = ros.fit_resample(X, y)
print("After Random Over Sampling:", x1.shape, y1.shape)
x2, y2 = smote.fit_resample(x1, y1)
print("After SMOTE:", x2.shape, y2.shape)
x3, y3 = rus.fit_resample(x2, y2)
print("After Random Under Sampling:", x3.shape, y3.shape)


After Random Over Sampling: (1400, 20) (1400,)
After SMOTE: (1400, 20) (1400,)
After Random Under Sampling: (1400, 20) (1400,)


In [25]:
# standardizzo i dati prima di addestrare i modelli
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(x3)

In [28]:
# divido in train e test set
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y3, test_size=0.2, random_state=42)

In [30]:
# Model Training
# Random forest with 2000 trees and a fixed seed; n_jobs=-1 requests all cores
model = RandomForestClassifier(
    n_estimators=2000,
    random_state=42,
    n_jobs=-1,
)

In [31]:
# Train the model on the training split
model.fit(X_train, y_train)

In [32]:
# Predict the class of every row in the test split
y_pred = model.predict(X_test)

In [33]:
# Count how many test rows were predicted as class 0 and as class 1
print((y_pred == 0).sum(), (y_pred == 1).sum())

127 153


In [34]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the test-set predictions with each scalar metric in turn,
# then show the confusion matrix
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, metric(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.875
Precision: 0.8431372549019608
Recall: 0.9214285714285714
F1 Score: 0.8805460750853242
Confusion Matrix:
 [[116  24]
 [ 11 129]]
