In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from ucimlrepo import fetch_ucirepo 

In [9]:

  
# Download dataset id 144 from the UCI ML repository
# (requires network access).
statlog_german_credit_data = fetch_ucirepo(id=144)

# Feature table and target table (as pandas dataframes).
X, y = (
    statlog_german_credit_data.data.features,
    statlog_german_credit_data.data.targets,
)

# Dataset-level metadata, followed by the per-variable description table.
print(statlog_german_credit_data.metadata)
print(statlog_german_credit_data.variables)

{'uci_id': 144, 'name': 'Statlog (German Credit Data)', 'repository_url': 'https://archive.ics.uci.edu/dataset/144/statlog+german+credit+data', 'data_url': 'https://archive.ics.uci.edu/static/public/144/data.csv', 'abstract': 'This dataset classifies people described by a set of attributes as good or bad credit risks. Comes in two formats (one all numeric). Also comes with a cost matrix', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1000, 'num_features': 20, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Other', 'Marital Status', 'Age', 'Occupation'], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1994, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5NC77', 'creators': ['Hans Hofmann'], 'intro_paper': None, 'additional_info': {'summary': 'Two datasets are provided.  the original dataset, in the form provided by

In [10]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
categorical = list(X.select_dtypes(include='object').columns)
numerical = list(X.select_dtypes(exclude='object').columns)

# Numerical block: standardize each column.
scaler = StandardScaler()
X_num = scaler.fit_transform(X[numerical])

# Categorical block: dense one-hot encoding; drop='first' removes one level per
# feature to avoid the dummy-variable trap.
encoder = OneHotEncoder(drop='first', sparse_output=False)
X_cat = encoder.fit_transform(X[categorical])

# NOTE(review): both transformers are fit on every row of X, and the train/test
# split is performed later in the notebook, so held-out rows contribute to the
# fitted scaling statistics and category lists. Consider fitting on the
# training rows only.

# Design matrix: scaled numerical columns first, then the one-hot columns.
X_all = np.concatenate([X_num, X_cat], axis=1)


In [11]:
# Build the binary target from the dataset's original targets instead of from
# the current value of `y`, so this cell is idempotent: re-running
# `y = y.map({1: 0, 2: 1})` on an already-mapped `y` would turn every 0 into NaN
# and every 1 into 0 without raising.
y_raw = statlog_german_credit_data.data.targets

# Convert to a 1D Series (if needed)
y_raw = y_raw.iloc[:, 0] if isinstance(y_raw, pd.DataFrame) else y_raw

# Ensure binary classification format: 0 = good, 1 = bad
y = y_raw.map({1: 0, 2: 1})

# Series.map yields NaN for labels missing from the mapping; fail loudly here
# rather than passing NaN targets into the split and the model.
if y.isna().any():
    unexpected_labels = y_raw[y.isna()].unique().tolist()
    raise ValueError(
        f"Unexpected target labels (expected only 1 or 2): {unexpected_labels}"
    )

In [12]:
# Hold out 20% of the rows for evaluation. The split is stratified on y so both
# partitions keep the same class proportions, and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Logistic regression with class weights balanced against the label
# frequencies; max_iter is raised to 1000.
model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train, y_train)

In [13]:
# Score the held-out rows: the probability column at index 1 of predict_proba
# feeds the ROC AUC, the hard class predictions feed the classification report.
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Header line and report are emitted on separate lines.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

print("ROC AUC Score:", roc_auc_score(y_test, y_prob))


Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.73      0.80       140
           1       0.55      0.78      0.65        60

    accuracy                           0.74       200
   macro avg       0.72      0.76      0.72       200
weighted avg       0.79      0.74      0.75       200

ROC AUC Score: 0.8073809523809524


In [14]:
# Inspect the training design matrix produced by the split (bare expression,
# so the notebook shows the array's repr as the cell output).
X_train

array([[ 1.25257373,  1.79480979,  0.02414692, ...,  0.        ,
         0.        ,  0.        ],
       [-0.73866754, -0.87450324,  0.91847717, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.25257373,  0.74423963,  0.02414692, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.24085723, -0.26805064,  0.02414692, ...,  0.        ,
         1.        ,  0.        ],
       [-0.24085723,  1.84301409, -1.76451358, ...,  0.        ,
         1.        ,  0.        ],
       [-0.48976238, -0.08586674, -0.87018333, ...,  0.        ,
         0.        ,  0.        ]])