# Credit Scoring Model for a Financial Institution: German Credit Data
#### This dataset classifies people described by a set of attributes as good or bad credit risks.

In [70]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

In [71]:
# Source file: whitespace-separated values with no header row, so the column names are
# supplied explicitly below (20 attributes followed by the CreditRisk target).
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data'
columns = [
    "Status",
    "Duration",
    "CreditHistory",
    "Purpose",
    "CreditAmount",
    "SavingsAccount",
    "EmploymentSince",
    "InstallmentRate",
    "PersonalStatusSex",
    "OtherDebtors",
    "ResidenceSince",
    "Property",
    "Age",
    "OtherInstallmentPlans",
    "Housing",
    "ExistingCredits",
    "Job",
    "PeopleLiable",
    "Telephone",
    "ForeignWorker",
    "CreditRisk",
]
# sep=r"\s+" is the supported spelling of the deprecated delim_whitespace=True.
data = pd.read_csv(url, sep=r"\s+", header=None, names=columns)

In [72]:
# Preview the first rows of the raw data as loaded.
data.head()

Unnamed: 0,Status,Duration,CreditHistory,Purpose,CreditAmount,SavingsAccount,EmploymentSince,InstallmentRate,PersonalStatusSex,OtherDebtors,...,Property,Age,OtherInstallmentPlans,Housing,ExistingCredits,Job,PeopleLiable,Telephone,ForeignWorker,CreditRisk
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [73]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(1000, 21)

In [74]:
# Per-column dtype and non-null count; object-dtype columns are the ones encoded next.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Status                 1000 non-null   object
 1   Duration               1000 non-null   int64 
 2   CreditHistory          1000 non-null   object
 3   Purpose                1000 non-null   object
 4   CreditAmount           1000 non-null   int64 
 5   SavingsAccount         1000 non-null   object
 6   EmploymentSince        1000 non-null   object
 7   InstallmentRate        1000 non-null   int64 
 8   PersonalStatusSex      1000 non-null   object
 9   OtherDebtors           1000 non-null   object
 10  ResidenceSince         1000 non-null   int64 
 11  Property               1000 non-null   object
 12  Age                    1000 non-null   int64 
 13  OtherInstallmentPlans  1000 non-null   object
 14  Housing                1000 non-null   object
 15  ExistingCredits       

In [75]:
# Integer-encode every object-dtype (string) column in place.
# pd.factorize(sort=True) numbers each column's distinct values in sorted order, which is
# the same coding LabelEncoder produces, without using a target-label encoder on features.
for column in data.select_dtypes(include=["object"]).columns:
    data[column] = pd.factorize(data[column], sort=True)[0]

In [76]:
# Preview the frame after the string columns were replaced by integer codes.
data.head()

Unnamed: 0,Status,Duration,CreditHistory,Purpose,CreditAmount,SavingsAccount,EmploymentSince,InstallmentRate,PersonalStatusSex,OtherDebtors,...,Property,Age,OtherInstallmentPlans,Housing,ExistingCredits,Job,PeopleLiable,Telephone,ForeignWorker,CreditRisk
0,0,6,4,4,1169,4,4,4,2,0,...,0,67,2,1,2,2,1,1,0,1
1,1,48,2,4,5951,0,2,2,1,0,...,0,22,2,1,1,2,1,0,0,2
2,3,12,4,7,2096,0,3,2,2,0,...,0,49,2,1,1,1,2,0,0,1
3,0,42,2,3,7882,0,3,2,2,2,...,1,45,2,2,1,2,2,0,0,1
4,0,24,3,0,4870,0,2,3,2,0,...,3,53,2,2,2,2,2,0,0,2


In [77]:
# Recode the target as binary: a value of 1 stays 1, every other value becomes 0.
data['CreditRisk'] = data['CreditRisk'].eq(1).astype('int64')
data.head()

Unnamed: 0,Status,Duration,CreditHistory,Purpose,CreditAmount,SavingsAccount,EmploymentSince,InstallmentRate,PersonalStatusSex,OtherDebtors,...,Property,Age,OtherInstallmentPlans,Housing,ExistingCredits,Job,PeopleLiable,Telephone,ForeignWorker,CreditRisk
0,0,6,4,4,1169,4,4,4,2,0,...,0,67,2,1,2,2,1,1,0,1
1,1,48,2,4,5951,0,2,2,1,0,...,0,22,2,1,1,2,1,0,0,0
2,3,12,4,7,2096,0,3,2,2,0,...,0,49,2,1,1,1,2,0,0,1
3,0,42,2,3,7882,0,3,2,2,2,...,1,45,2,2,1,2,2,0,0,1
4,0,24,3,0,4870,0,2,3,2,0,...,3,53,2,2,2,2,2,0,0,0


In [78]:
# Separate predictors from the target by column name rather than by position.
# drop() returns a new DataFrame, so later column assignments on X do not write into a
# slice of `data` (which can raise SettingWithCopyWarning).
X = data.drop(columns=["CreditRisk"])
y = data["CreditRisk"].to_numpy()

In [79]:
# Standardise every feature column with StandardScaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split further down,
# so held-out rows contribute to the scaling statistics. Fit on the training rows only if a
# leakage-free estimate is required.
data_col = list(X.columns)
sc = StandardScaler()
# Build a fresh frame rather than assigning columns into X, which may be a slice of `data`.
X = pd.DataFrame(sc.fit_transform(data[data_col]), columns=data_col, index=data.index)

In [80]:
# Preview the standardised feature matrix.
X.head()

Unnamed: 0,Status,Duration,CreditHistory,Purpose,CreditAmount,SavingsAccount,EmploymentSince,InstallmentRate,PersonalStatusSex,OtherDebtors,ResidenceSince,Property,Age,OtherInstallmentPlans,Housing,ExistingCredits,Job,PeopleLiable,Telephone,ForeignWorker
0,-1.254566,-1.236478,1.344014,0.264068,-0.745131,1.833169,1.338078,0.918477,0.449326,-0.303686,1.046987,-1.293723,2.766456,0.460831,0.13371,1.027079,0.146949,-0.42829,1.214598,-0.196014
1,-0.459026,2.248194,-0.503428,0.264068,0.949817,-0.699707,-0.317959,-0.870183,-0.96365,-0.303686,-0.765977,-1.293723,-1.191404,0.460831,0.13371,-0.704926,0.146949,-0.42829,-0.823318,-0.196014
2,1.132053,-0.738668,1.344014,1.359785,-0.416562,-0.699707,0.51006,-0.870183,0.449326,-0.303686,0.140505,-1.293723,1.183312,0.460831,0.13371,-0.704926,-1.383771,2.334869,-0.823318,-0.196014
3,-1.254566,1.750384,-0.503428,-0.101171,1.634247,-0.699707,0.51006,-0.870183,0.449326,3.885083,1.046987,-0.341055,0.831502,0.460831,2.016956,-0.704926,0.146949,2.334869,-0.823318,-0.196014
4,-1.254566,0.256953,0.420293,-1.196889,0.566664,-0.699707,-0.317959,0.024147,0.449326,-0.303686,1.046987,1.564281,1.535122,0.460831,2.016956,1.027079,0.146949,2.334869,-0.823318,-0.196014


In [81]:
# Hold out 30% of the rows for testing; the fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
(X_train.shape, X_test.shape)

((700, 20), (300, 20))

In [82]:
# Baseline model: a random forest with default hyper-parameters and a fixed seed.
# RandomForestClassifier comes from the import cell at the top of the notebook, so the
# duplicate mid-notebook import is dropped.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [83]:
# Predict labels for the held-out test rows with the baseline model.
y_pred = model.predict(X_test)

In [84]:
# Accuracy of the baseline predictions against the held-out test labels.
accuracy_score(y_test, y_pred)

0.7533333333333333

In [85]:
class Accuracy_Class:
    """Tune several classifiers on one dataset and record their hold-out accuracy.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, X_data, y_data):
        """Store the features and labels that ``Data_Models`` will split and fit on.

        Args:
            X_data: Feature matrix accepted by ``train_test_split``.
            y_data: Labels aligned with ``X_data``.
        """
        self.X_data = X_data
        self.y_data = y_data
        # Filled by Data_Models(): model name -> accuracy on the internal hold-out split.
        self.accuracy = {}

    def Data_Models(self, test_size=0.2, random_state=42, cv=5):
        """Grid-search each candidate classifier and score it on a hold-out split.

        The stored data is split once; every classifier is wrapped in a
        ``StandardScaler`` + estimator pipeline, tuned with ``GridSearchCV`` on the
        training part, and the best estimator is scored on the hold-out part.

        Args:
            test_size: Fraction of the stored data held out for scoring.
            random_state: Seed for the split and for the tree-based models, so that
                repeated runs report the same numbers.
            cv: Number of cross-validation folds used by the grid search.

        Returns:
            dict: Model name -> hold-out accuracy (the same object as ``self.accuracy``).

        Raises:
            Exception: Any error raised while splitting, fitting or scoring propagates
                to the caller instead of being printed and replaced by ``None``.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            self.X_data, self.y_data, test_size=test_size, random_state=random_state
        )

        classification_algorithms = {
            "Logistic Regression": LogisticRegression(),
            # Seed the randomised estimators; unseeded they give run-to-run differences.
            "Decision Tree": DecisionTreeClassifier(random_state=random_state),
            "Random Forest": RandomForestClassifier(random_state=random_state),
            "Support Vector Machine (SVM)": SVC(),
            "K-Nearest Neighbors (KNN)": KNeighborsClassifier(),
            "Naive Bayes": GaussianNB(),
        }

        # Keys carry the "clf__" prefix because the estimator is the pipeline step "clf".
        param_grids = {
            "Logistic Regression": {"clf__C": [0.1, 1, 10]},
            "Decision Tree": {"clf__max_depth": [None, 10, 20, 30]},
            "Random Forest": {"clf__n_estimators": [10, 50, 100]},
            "Support Vector Machine (SVM)": {"clf__C": [0.1, 1, 10], "clf__kernel": ["linear", "rbf"]},
            "K-Nearest Neighbors (KNN)": {"clf__n_neighbors": [3, 5, 7]},
            # Empty grid: the default estimator is still cross-validated once.
            "Naive Bayes": {},
        }

        for name, clf in classification_algorithms.items():
            # Scaling lives inside the pipeline so it is re-fitted on each CV training fold.
            pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('clf', clf),
            ])

            grid_search = GridSearchCV(pipeline, param_grids[name], cv=cv, scoring='accuracy')
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_

            y_pred = best_model.predict(X_test)
            self.accuracy[name] = accuracy_score(y_test, y_pred)

        return self.accuracy

# Compare the tuned models. The training partition (X_train, y_train) is what gets passed
# in, and Data_Models() splits it again internally, so the accuracies printed here are
# measured on a subset of the training partition; X_test / y_test are not used.
Accuracy_Class_obj = Accuracy_Class(X_train, y_train)
best_accuracy = Accuracy_Class_obj.Data_Models()
print(best_accuracy)

{'Logistic Regression': 0.7785714285714286, 'Decision Tree': 0.75, 'Random Forest': 0.7642857142857142, 'Support Vector Machine (SVM)': 0.7714285714285715, 'K-Nearest Neighbors (KNN)': 0.7428571428571429, 'Naive Bayes': 0.7785714285714286}


In [86]:
# Tabulate the per-model accuracies: one row per model, in the order they were evaluated.
best_accuracy_df = pd.DataFrame(
    {
        'Model': list(best_accuracy.keys()),
        'Accuracy': list(best_accuracy.values()),
    }
)
best_accuracy_df

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.778571
1,Decision Tree,0.75
2,Random Forest,0.764286
3,Support Vector Machine (SVM),0.771429
4,K-Nearest Neighbors (KNN),0.742857
5,Naive Bayes,0.778571


## Thank You