In [1]:

import requests
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
# General libraries
import numpy as np
import pandas as pd
import os
import joblib

# Preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split,GridSearchCV
# Evaluate the model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report



# Machine learning algorithms for classification

from sklearn.ensemble import  AdaBoostClassifier


In [2]:
!pip install ucimlrepo
from ucimlrepo import fetch_ucirepo

# fetch dataset
statlog_german_credit_data = fetch_ucirepo(id=144)

# data (as pandas dataframes)
X = statlog_german_credit_data.data.features
y = statlog_german_credit_data.data.targets

# metadata
print(statlog_german_credit_data.metadata)

# variable information
print(statlog_german_credit_data.variables)


Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
{'uci_id': 144, 'name': 'Statlog (German Credit Data)', 'repository_url': 'https://archive.ics.uci.edu/dataset/144/statlog+german+credit+data', 'data_url': 'https://archive.ics.uci.edu/static/public/144/data.csv', 'abstract': 'This dataset classifies people described by a set of attributes as good or bad credit risks. Comes in two formats (one all numeric). Also comes with a cost matrix', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1000, 'num_features': 20, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Other', 'Marital Status', 'Age', 'Occupation'], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1994, 'last

In [3]:

# Place the feature columns and the target column side by side in one frame,
# then preview the first rows.
data = pd.concat(objs=[X, y], axis="columns")
data.head()

Unnamed: 0,Attribute1,Attribute2,Attribute3,Attribute4,Attribute5,Attribute6,Attribute7,Attribute8,Attribute9,Attribute10,...,Attribute12,Attribute13,Attribute14,Attribute15,Attribute16,Attribute17,Attribute18,Attribute19,Attribute20,class
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [6]:
# (generic header, descriptive header) pairs for every column of the raw frame,
# in attribute order, with the target column last.
column_name_pairs = [
    ('Attribute1', 'Status of existing checking account'),
    ('Attribute2', 'Duration in month'),
    ('Attribute3', 'Credit history'),
    ('Attribute4', 'Purpose'),
    ('Attribute5', 'Credit amount'),
    ('Attribute6', 'Savings account/bonds'),
    ('Attribute7', 'Present employment since'),
    ('Attribute8', 'Installment rate in percentage of disposable income'),
    ('Attribute9', 'Personal status and sex'),
    ('Attribute10', 'Other debtors / guarantors'),
    ('Attribute11', 'Present residence since'),
    ('Attribute12', 'Property'),
    ('Attribute13', 'Age in years'),
    ('Attribute14', 'Other installment plans'),
    ('Attribute15', 'Housing'),
    ('Attribute16', 'Number of existing credits at this bank'),
    ('Attribute17', 'Job'),
    ('Attribute18', 'Number of people being liable to provide maintenance for'),
    ('Attribute19', 'Telephone'),
    ('Attribute20', 'Foreign worker'),
    ('class', 'Credit Risk'),
]
new_column_names = dict(column_name_pairs)

# Swap the generic headers for the descriptive ones and preview the result.
data = data.rename(mapper=new_column_names, axis="columns")
data.head()

Unnamed: 0,Status of existing checking account,Duration in month,Credit history,Purpose,Credit amount,Savings account/bonds,Present employment since,Installment rate in percentage of disposable income,Personal status and sex,Other debtors / guarantors,...,Property,Age in years,Other installment plans,Housing,Number of existing credits at this bank,Job,Number of people being liable to provide maintenance for,Telephone,Foreign worker,Credit Risk
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [7]:
# Print each column's name, non-null count and dtype for the renamed frame.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                                                    Non-Null Count  Dtype 
---  ------                                                    --------------  ----- 
 0   Status of existing checking account                       1000 non-null   object
 1   Duration in month                                         1000 non-null   int64 
 2   Credit history                                            1000 non-null   object
 3   Purpose                                                   1000 non-null   object
 4   Credit amount                                             1000 non-null   int64 
 5   Savings account/bonds                                     1000 non-null   object
 6   Present employment since                                  1000 non-null   object
 7   Installment rate in percentage of disposable income       1000 non-null   int64 
 8   Personal status and sex      

In [8]:

# Identify the object-typed (string) columns that still need a numeric encoding.
object_columns = data.select_dtypes(include=['object']).columns
print(object_columns)

# Encode every object column according to how many distinct values it holds.
for col in object_columns:
    if col == 'Credit Risk' or data[col].nunique() <= 5:
        # The target (only reached when it is object-typed) and features with
        # at most 5 distinct values are replaced in place by integer codes.
        # NOTE(review): integer codes impose an order on nominal categories;
        # confirm this is acceptable for the downstream model.
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
    else:
        # Features with more than 5 distinct values are expanded into one
        # indicator column per category; the original column is dropped.
        ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        encoded_features = ohe.fit_transform(data[[col]])
        encoded_df = pd.DataFrame(
            encoded_features,
            columns=ohe.get_feature_names_out([col]),
            # Reuse the frame's own index: concat(axis=1) aligns on index
            # labels, so a default RangeIndex here would misalign rows (and
            # introduce NaNs) whenever `data` is not indexed 0..n-1.
            index=data.index,
        )
        data = pd.concat([data, encoded_df], axis=1).drop(columns=col)

data.info()

Index(['Status of existing checking account', 'Credit history', 'Purpose',
       'Savings account/bonds', 'Present employment since',
       'Personal status and sex', 'Other debtors / guarantors', 'Property',
       'Other installment plans', 'Housing', 'Job', 'Telephone',
       'Foreign worker'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 30 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Status of existing checking account                       1000 non-null   int64  
 1   Duration in month                                         1000 non-null   int64  
 2   Credit history                                            1000 non-null   int64  
 3   Credit amount                                             1000 non-null   int64  
 4   Savings account/bonds                      

In [15]:

# Summary statistics (count, mean, std, min, quartiles, max) per column of the encoded frame.
data.describe()

Unnamed: 0,Status of existing checking account,Duration in month,Credit history,Credit amount,Savings account/bonds,Present employment since,Installment rate in percentage of disposable income,Personal status and sex,Other debtors / guarantors,Present residence since,...,Purpose_A40,Purpose_A41,Purpose_A410,Purpose_A42,Purpose_A43,Purpose_A44,Purpose_A45,Purpose_A46,Purpose_A48,Purpose_A49
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,1.577,20.903,2.545,3271.258,1.105,2.384,2.973,1.682,0.145,2.845,...,0.234,0.103,0.012,0.181,0.28,0.012,0.022,0.05,0.009,0.097
std,1.257638,12.058814,1.08312,2822.736876,1.580023,1.208306,1.118715,0.70808,0.477706,1.103718,...,0.423584,0.304111,0.10894,0.385211,0.449224,0.10894,0.146757,0.218054,0.094488,0.296106
min,0.0,4.0,0.0,250.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,12.0,2.0,1365.5,0.0,2.0,2.0,1.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,18.0,2.0,2319.5,0.0,2.0,3.0,2.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3.0,24.0,4.0,3972.25,2.0,4.0,4.0,2.0,0.0,4.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,3.0,72.0,4.0,18424.0,4.0,4.0,4.0,3.0,2.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:

# Continuous measurements. They are used for the pairplot below and are
# excluded from the countplots, where one bar per distinct value (e.g. per
# distinct credit amount) would be unreadable.
selected_numerical_features = ['Duration in month', 'Credit amount', 'Age in years']

# Histograms for numerical features
numerical_features = data.select_dtypes(include=['int64', 'float64']).columns
for col in numerical_features:
    plt.figure(figsize=(8, 6))
    sns.histplot(data[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()

# Box plots for numerical features
for col in numerical_features:
    plt.figure(figsize=(8, 6))
    sns.boxplot(x=data[col])
    plt.title(f'Boxplot of {col}')
    plt.show()

# Correlation matrix heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(data.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

# Countplots for categorical features.
# After encoding, the categorical columns are int64, but so are the continuous
# ones, so those are filtered out explicitly (column order is preserved).
integer_columns = data.select_dtypes(include=['int64']).columns
categorical_features = integer_columns[~integer_columns.isin(selected_numerical_features)]
for col in categorical_features:
    plt.figure(figsize=(10, 6))
    sns.countplot(x=data[col])
    plt.title(f'Countplot of {col}')
    plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
    plt.show()

# Pairplot restricted to the continuous features (to avoid excessive plotting),
# coloured by the target.
sns.pairplot(data, vars=selected_numerical_features, hue='Credit Risk')
plt.show()


# Relationship between Credit Risk and every other column
for col in data.columns:
    if col != 'Credit Risk':
        plt.figure(figsize=(8, 6))
        sns.boxplot(x='Credit Risk', y=col, data=data)
        plt.title(f'Credit Risk vs. {col}')
        plt.show()

Output hidden; open in https://colab.research.google.com to view.

In [19]:

# Separate the target column from the predictor columns.
y = data['Credit Risk']
X = data.drop(columns='Credit Risk')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


Accuracy: 0.795
Precision: 0.8378378378378378
Recall: 0.8794326241134752
F1 Score: 0.8581314878892734
Confusion Matrix:
[[124  17]
 [ 24  35]]
              precision    recall  f1-score   support

           1       0.84      0.88      0.86       141
           2       0.67      0.59      0.63        59

    accuracy                           0.80       200
   macro avg       0.76      0.74      0.74       200
weighted avg       0.79      0.80      0.79       200



In [20]:

# Build a 50-estimator AdaBoost model with a fixed seed and fit it on the
# training split (fit() returns the fitted estimator itself).
adaboost_classifier = AdaBoostClassifier(n_estimators=50, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = adaboost_classifier.predict(X_test)

# Score the predictions. pos_label=1 is the scikit-learn default, written out
# so it is clear that precision/recall/F1 describe the class labelled 1.
confusion_mat = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=1)
recall = recall_score(y_test, y_pred, pos_label=1)
f1 = f1_score(y_test, y_pred, pos_label=1)

# Report the headline metrics, one per line, followed by the confusion matrix.
print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    "Confusion Matrix:",
    confusion_mat,
    sep="\n",
)

# Per-class precision / recall / F1 / support table.
print(classification_report(y_test, y_pred))


Accuracy: 0.795
Precision: 0.8378378378378378
Recall: 0.8794326241134752
F1 Score: 0.8581314878892734
Confusion Matrix:
[[124  17]
 [ 24  35]]
              precision    recall  f1-score   support

           1       0.84      0.88      0.86       141
           2       0.67      0.59      0.63        59

    accuracy                           0.80       200
   macro avg       0.76      0.74      0.74       200
weighted avg       0.79      0.80      0.79       200

