In [6]:
import numpy as np
import pandas as pd

In [7]:
df = pd.read_csv('german_credit_data.csv')
df.head(10)


Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,0,67,male,2,own,,little,1169,6,radio/TV
1,1,22,female,2,own,little,moderate,5951,48,radio/TV
2,2,49,male,1,own,little,,2096,12,education
3,3,45,male,2,free,little,little,7882,42,furniture/equipment
4,4,53,male,2,free,little,little,4870,24,car
5,5,35,male,1,free,,,9055,36,education
6,6,53,male,2,own,quite rich,,2835,24,furniture/equipment
7,7,35,male,3,rent,little,moderate,6948,36,car
8,8,61,male,1,own,rich,,3059,12,radio/TV
9,9,28,male,3,own,little,moderate,5234,30,car


In [8]:
df.drop(columns = ['Unnamed: 0'], inplace=True)

In [9]:
df.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,67,male,2,own,,little,1169,6,radio/TV
1,22,female,2,own,little,moderate,5951,48,radio/TV
2,49,male,1,own,little,,2096,12,education
3,45,male,2,free,little,little,7882,42,furniture/equipment
4,53,male,2,free,little,little,4870,24,car


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   817 non-null    object
 5   Checking account  606 non-null    object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
dtypes: int64(4), object(5)
memory usage: 70.4+ KB


In [11]:
df.isnull().sum()

Age                   0
Sex                   0
Job                   0
Housing               0
Saving accounts     183
Checking account    394
Credit amount         0
Duration              0
Purpose               0
dtype: int64

In [12]:
df['Saving accounts'].fillna('Unknown',inplace= True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Saving accounts'].fillna('Unknown',inplace= True)


In [13]:
saving_map = {'none': 0, 'little': 1, 'moderate': 2, 'rich': 3, 'quite rich': 4}
checking_map = {'unknown': 0, 'little': 1, 'moderate': 2, 'rich': 3}
df['Saving accounts'] = df['Saving accounts'].map(saving_map).fillna(0)
df['Checking account'] = df['Checking account'].map(checking_map).fillna(0)


In [14]:
df.head(10)

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,67,male,2,own,0.0,1.0,1169,6,radio/TV
1,22,female,2,own,1.0,2.0,5951,48,radio/TV
2,49,male,1,own,1.0,0.0,2096,12,education
3,45,male,2,free,1.0,1.0,7882,42,furniture/equipment
4,53,male,2,free,1.0,1.0,4870,24,car
5,35,male,1,free,0.0,0.0,9055,36,education
6,53,male,2,own,4.0,0.0,2835,24,furniture/equipment
7,35,male,3,rent,1.0,2.0,6948,36,car
8,61,male,1,own,3.0,0.0,3059,12,radio/TV
9,28,male,3,own,1.0,2.0,5234,30,car


In [15]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['Sex', 'Housing', 'Purpose']

le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])
    

In [16]:
df.head(10)

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,67,1,2,1,0.0,1.0,1169,6,5
1,22,0,2,1,1.0,2.0,5951,48,5
2,49,1,1,1,1.0,0.0,2096,12,3
3,45,1,2,0,1.0,1.0,7882,42,4
4,53,1,2,0,1.0,1.0,4870,24,1
5,35,1,1,0,0.0,0.0,9055,36,3
6,53,1,2,1,4.0,0.0,2835,24,4
7,35,1,3,2,1.0,2.0,6948,36,1
8,61,1,1,1,3.0,0.0,3059,12,5
9,28,1,3,1,1.0,2.0,5234,30,1


In [17]:
def determine_risk(row, df): 
    risk_score = 0
    if row['Credit amount'] > df['Credit amount'].median():
        risk_score += 1
    if row['Job'] == 0 or row['Job'] == 1:
        risk_score += 1
    if row['Saving accounts'] <= 1:  
        risk_score += 1
    if row['Checking account'] < df['Checking account'].median():
        risk_score += 1
    if row['Housing'] in [1, 2]:  
        risk_score += 1
    if row['Age'] < 25 or row['Age'] > 65:
        risk_score += 1
    if row['Purpose'] in [6, 7]:  
        risk_score += 1
    if row['Duration'] > 36:
        risk_score += 1
    return 1 if risk_score >= 4 else 0

# Apply Risk determination
df['Risk'] = df.apply(lambda row: determine_risk(row, df), axis=1)

In [18]:
df.head(10)

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,1,2,1,0.0,1.0,1169,6,5,0
1,22,0,2,1,1.0,2.0,5951,48,5,1
2,49,1,1,1,1.0,0.0,2096,12,3,1
3,45,1,2,0,1.0,1.0,7882,42,4,0
4,53,1,2,0,1.0,1.0,4870,24,1,0
5,35,1,1,0,0.0,0.0,9055,36,3,1
6,53,1,2,1,4.0,0.0,2835,24,4,0
7,35,1,3,2,1.0,2.0,6948,36,1,0
8,61,1,1,1,3.0,0.0,3059,12,5,1
9,28,1,3,1,1.0,2.0,5234,30,1,0


In [19]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['Risk'])
y = df['Risk']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [20]:
X.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,67,1,2,1,0.0,1.0,1169,6,5
1,22,0,2,1,1.0,2.0,5951,48,5
2,49,1,1,1,1.0,0.0,2096,12,3
3,45,1,2,0,1.0,1.0,7882,42,4
4,53,1,2,0,1.0,1.0,4870,24,1


In [21]:
print(y.head())

0    0
1    1
2    1
3    0
4    0
Name: Risk, dtype: int64


In [22]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(700, 9)
(300, 9)
(700,)
(300,)


In [23]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [24]:
y_pred = model.predict(X_test)
y_pred

array([1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0])

In [25]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9733333333333334

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98       196
           1       0.98      0.94      0.96       104

    accuracy                           0.97       300
   macro avg       0.97      0.97      0.97       300
weighted avg       0.97      0.97      0.97       300


Confusion Matrix:
 [[194   2]
 [  6  98]]


In [26]:
import pickle

# Assuming 'model' is your trained model
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)