In [1]:
from ucimlrepo import fetch_ucirepo 
  
# Download the Statlog (German Credit Data) dataset (UCI repository id 144).
statlog_german_credit_data = fetch_ucirepo(id=144)

# Unpack the feature table and the target table (pandas dataframes) together.
X, y = (
    statlog_german_credit_data.data.features,
    statlog_german_credit_data.data.targets,
)

# Dataset-level metadata: name, DOI, task, number of instances, ...
print(statlog_german_credit_data.metadata)

# Per-variable information for the dataset.
print(statlog_german_credit_data.variables)

{'uci_id': 144, 'name': 'Statlog (German Credit Data)', 'repository_url': 'https://archive.ics.uci.edu/dataset/144/statlog+german+credit+data', 'data_url': 'https://archive.ics.uci.edu/static/public/144/data.csv', 'abstract': 'This dataset classifies people described by a set of attributes as good or bad credit risks. Comes in two formats (one all numeric). Also comes with a cost matrix', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1000, 'num_features': 20, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Other', 'Marital Status', 'Age', 'Occupation'], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1994, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5NC77', 'creators': ['Hans Hofmann'], 'intro_paper': None, 'additional_info': {'summary': 'Two datasets are provided.  the original dataset, in the form provided by

# Input

In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd
# Re-join features and target so the label column can be separated by name.
datasetdf = pd.concat([X, y], axis=1)
y = datasetdf['class']
X = datasetdf.drop(columns='class')

# Hold out 30% of the rows for testing; stratify on the label so both splits
# keep the same class proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

### Preprocessing

In [3]:
from sklearn.preprocessing import StandardScaler

# Encode categorical features (one-hot). The training frame defines the
# reference column layout: for every categorical attribute, the first level
# present in the training data is dropped and acts as the baseline.
X_train = pd.get_dummies(X_train, drop_first=True, dtype=int)

# Encode the test frame with ALL levels kept, then project it onto the training
# columns. Calling get_dummies(drop_first=True) on the test frame as well would
# drop whichever level sorts first *in the test data*; when that differs from
# the training baseline, the dropped column is later zero-filled by the reindex
# and those rows are silently read as the baseline level.
# The reindex discards baseline / unseen-level columns and zero-fills levels
# that occur only in the training data.
X_test = pd.get_dummies(X_test, dtype=int).reindex(columns=X_train.columns, fill_value=0)

# Standardize the numerical features. The scaler is fitted on the training
# split only and then applied to the test split, so no test statistics leak
# into the preprocessing.
num_cols = ['Attribute2', 'Attribute5', 'Attribute8', 'Attribute11',
            'Attribute13', 'Attribute16', 'Attribute18']  # these are the numerical features
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

print(X_train.head())
print("test")
print(X_test.head())  # both frames should share the same column layout

     Attribute2  Attribute5  Attribute8  Attribute11  Attribute13  \
10    -0.733512   -0.713001    0.054714    -1.660121    -0.956361   
82    -0.231598   -0.610869    0.054714     1.076342    -1.047717   
827   -0.231598    0.360687   -0.835976    -0.747967     0.048549   
410    0.270317   -0.461601    0.945404     1.076342    -1.413139   
48    -0.817165    1.506577   -1.726666     1.076342     0.322615   

     Attribute16  Attribute18  Attribute1_A12  Attribute1_A13  Attribute1_A14  \
10     -0.724565    -0.434114               1               0               0   
82     -0.724565    -0.434114               0               0               1   
827     1.074000     2.303542               0               0               1   
410    -0.724565    -0.434114               1               0               0   
48      1.074000    -0.434114               0               0               1   

     ...  Attribute12_A124  Attribute14_A142  Attribute14_A143  \
10   ...                 0      

### Train model

In [4]:
from sklearn.linear_model import LogisticRegression
# Logistic regression classifier; max_iter is raised to 1000 iterations.
lr = LogisticRegression(max_iter=1000)
# Fit on the preprocessed training split.
lr.fit(X=X_train, y=y_train)

In [5]:
from sklearn.metrics import accuracy_score

# Predict labels for the held-out split and score them against the true labels.
y_pred = lr.predict(X=X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.7800


### User constraints / Ontology constraints

In [6]:
# TODO: implement the user / ontology constraints here

# UFCE

### Feature Selection: Mutual Information (MI)

In [7]:
from itertools import combinations
from src.utilities.mutual_information import mi_score
from src.utilities.mutual_information import MI
# Enumerate every unordered pair of columns of the preprocessed training frame.
features = X_train.columns.tolist()
feature_pairs = list(combinations(features, 2))
print(feature_pairs[:5])

# MI is evaluated over all available features of the preprocessed X_train.
# NOTE(review): the original authors run this over the entire X, which is not
# preprocessed; preprocessing first seems necessary here - confirm.
# Presumably mi_list holds the feature pairs ranked by mutual information -
# verify against src.utilities.mutual_information.
mi_list = MI(feature_pairs, X_train)
# top 5
print(mi_list[:5])

[('Attribute2', 'Attribute5'), ('Attribute2', 'Attribute8'), ('Attribute2', 'Attribute11'), ('Attribute2', 'Attribute13'), ('Attribute2', 'Attribute16')]
[('Attribute9_A92', 'Attribute9_A93'), ('Attribute2', 'Attribute5'), ('Attribute3_A32', 'Attribute3_A34'), ('Attribute17_A172', 'Attribute17_A173'), ('Attribute12_A124', 'Attribute15_A153')]
