In [60]:
#%run Credit_DataWrangling.ipynb

In [61]:
#%run Credit_DataEDA.ipynb

In [62]:
#%run Credit_DataPreprocessingandTrainingDataDevelopment.ipynb

In [21]:
#%run Credit_DataModeling.ipynb

In [22]:
#In order to use the train_test_split, first we must import it from sklearn's model_selection
#To use the support vector classifier, we must import it from sklearn's svm
#To get a view of the metric readings and proper prediction diagram display, we must import confusion_matrix and 
#classification_report from sklearn's metrics

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report

In [23]:
#Getting a quick glance of this dataset where the checking status of the credit applicants is the main focus to determine
#how much this feature factors into whether the applicant is classed as good or bad for a credit loan

checking_status_one_hot.head()

Unnamed: 0,class,checking_status_0<=X<200,checking_status_<0,checking_status_>=200,checking_status_no checking
0,good,0,1,0,0
1,bad,1,0,0,0
2,good,0,0,0,1
3,good,0,1,0,0
4,bad,0,1,0,0


In [24]:
#Making use of the info method to get a reading of the data within this dataframe such as its columns, the non-null counts,
#and the data types

checking_status_one_hot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   class                        1000 non-null   object
 1   checking_status_0<=X<200     1000 non-null   uint8 
 2   checking_status_<0           1000 non-null   uint8 
 3   checking_status_>=200        1000 non-null   uint8 
 4   checking_status_no checking  1000 non-null   uint8 
dtypes: object(1), uint8(4)
memory usage: 11.8+ KB


In [25]:
#Splitting the data into X and y variables to be used in the train_test_split function; the X variable holds the features
#with the numerical data to predict the categories in the y variable which holds the feature with the labels.
#Some preprocessing of the data set is required such as dropping unnecessary features(columns) and making sure to drop any
#null values as the SVM cannot use or recognize these values; it also interferes with the functioning of the classifier. 
#Preprocessing of the data is also useful to modify the dataset to both fit and address the hypothesis we wish to answer.

X = checking_status_one_hot.drop('class', axis=1)

y = checking_status_one_hot['class']

In [None]:
#I ultimately decided upon the Gaussian SVM model as the data I'm working with to address the problem identification is best
#suited for an algorithm built for complex and non-linear patterns within a dataset. Also, the gaussian kernel performed the
#best overall and produced the best results for the dataset.

In [26]:
#Splitting the X and y variables into train and test sets with the train_test_split function, and setting the test size to
#20% of data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [27]:
#Gaussian Kernel
#To use the Gaussian kernel, the kernel parameter in the SVC function must be set to 'rbf'
#Need to use the fit method to fit the SVM model according to the data

svclassifier2 = SVC(kernel='rbf')
svclassifier2.fit(X_train, y_train);

In [28]:
#Making the prediction based on the X_test set. The predict method of the SVM object is used to do this

ypred2 = svclassifier2.predict(X_test)

In [33]:
#Evaluations for the Gaussian kernel - The Gaussian kernel was overall the best in performance for the dataset as it 
#achieves the highest scores and best classifications throughout.
#The confusion_matrix function takes the y test and y prediction and outputs the mini-diagram below to illustrate the 
#prediction performance of the model.
#The classification_report also takes the y test and y prediction and outputs various metric scores of the model
#performance; most notably precision, recall, f1-score, and accuracy.
#Originally did not encounter the zero_division problem in the original run in the Credit_DataModeling file

print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2, zero_division=1))

[[  0  73]
 [  0 127]]
              precision    recall  f1-score   support

         bad       1.00      0.00      0.00        73
        good       0.64      1.00      0.78       127

    accuracy                           0.64       200
   macro avg       0.82      0.50      0.39       200
weighted avg       0.77      0.64      0.49       200



In [34]:
#The dataset where the credit_history of the credit loan applicants is the feature in focus to determine the influential
#factor of this feature on whether the applicant is classified as good or bad.

credit_history_one_hot.head()

Unnamed: 0,class,credit_history_all paid,credit_history_critical/other existing credit,credit_history_delayed previously,credit_history_existing paid,credit_history_no credits/all paid
0,good,0,1,0,0,0
1,bad,0,0,0,1,0
2,good,0,1,0,0,0
3,good,0,0,0,1,0
4,bad,0,0,1,0,0


In [35]:
credit_history_one_hot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column                                         Non-Null Count  Dtype 
---  ------                                         --------------  ----- 
 0   class                                          1000 non-null   object
 1   credit_history_all paid                        1000 non-null   uint8 
 2   credit_history_critical/other existing credit  1000 non-null   uint8 
 3   credit_history_delayed previously              1000 non-null   uint8 
 4   credit_history_existing paid                   1000 non-null   uint8 
 5   credit_history_no credits/all paid             1000 non-null   uint8 
dtypes: object(1), uint8(5)
memory usage: 12.8+ KB


In [36]:
X_2 = credit_history_one_hot.drop('class', axis=1)

y_2 = credit_history_one_hot['class']

In [37]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, test_size=0.20)

In [41]:
svclassifier_5 = SVC(kernel='rbf')
svclassifier_5.fit(X_train_2, y_train_2);

In [42]:
y_pred_5 = svclassifier_5.predict(X_test_2)

In [43]:
print(confusion_matrix(y_test_2, y_pred_5))
print(classification_report(y_test_2, y_pred_5))

[[  7  50]
 [  8 135]]
              precision    recall  f1-score   support

         bad       0.47      0.12      0.19        57
        good       0.73      0.94      0.82       143

    accuracy                           0.71       200
   macro avg       0.60      0.53      0.51       200
weighted avg       0.65      0.71      0.64       200



In [44]:
#The dataset where the employment history of the credit loan applicant is the focus to determine how much of an influential
#factor this feature plays in whether the applicant is classified as good or bad.

employment_data_one_hot.head()

Unnamed: 0,class,employment_1<=X<4,employment_4<=X<7,employment_<1,employment_>=7,employment_unemployed
0,good,0,0,0,1,0
1,bad,1,0,0,0,0
2,good,0,1,0,0,0
3,good,0,1,0,0,0
4,bad,1,0,0,0,0


In [45]:
employment_data_one_hot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   class                  1000 non-null   object
 1   employment_1<=X<4      1000 non-null   uint8 
 2   employment_4<=X<7      1000 non-null   uint8 
 3   employment_<1          1000 non-null   uint8 
 4   employment_>=7         1000 non-null   uint8 
 5   employment_unemployed  1000 non-null   uint8 
dtypes: object(1), uint8(5)
memory usage: 12.8+ KB


In [46]:
X_3 = employment_data_one_hot.drop('class', axis=1)

y_3 = employment_data_one_hot['class']

In [47]:
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X_3, y_3, test_size=0.20)

In [48]:
svclassifier_8 = SVC(kernel='rbf')

svclassifier_8.fit(X_train_3, y_train_3);

In [49]:
y_pred_8 = svclassifier_8.predict(X_test_3)

In [51]:
print(confusion_matrix(y_test_3, y_pred_8))
print(classification_report(y_test_3, y_pred_8, zero_division=1))

[[  0  62]
 [  0 138]]
              precision    recall  f1-score   support

         bad       1.00      0.00      0.00        62
        good       0.69      1.00      0.82       138

    accuracy                           0.69       200
   macro avg       0.84      0.50      0.41       200
weighted avg       0.79      0.69      0.56       200



In [52]:
#The dataset which takes into account all the features that were previously run through in the SVM model in the prior 
#datasets to determine overall the performance of the SVM's classifier to view how all of the features influence whether
#a credit loan applicant gets classified as good or bad

merged_credit_data.head()

Unnamed: 0,class,checking_status_0<=X<200,checking_status_<0,checking_status_>=200,checking_status_no checking,credit_history_all paid,credit_history_critical/other existing credit,credit_history_delayed previously,credit_history_existing paid,credit_history_no credits/all paid,employment_1<=X<4,employment_4<=X<7,employment_<1,employment_>=7,employment_unemployed
0,good,0,1,0,0,0,1,0,0,0,0,0,0,1,0
1,bad,1,0,0,0,0,0,0,1,0,1,0,0,0,0
2,good,0,0,0,1,0,1,0,0,0,0,1,0,0,0
3,good,0,1,0,0,0,0,0,1,0,0,1,0,0,0
4,bad,0,1,0,0,0,0,1,0,0,1,0,0,0,0


In [53]:
merged_credit_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column                                         Non-Null Count  Dtype 
---  ------                                         --------------  ----- 
 0   class                                          1000 non-null   object
 1   checking_status_0<=X<200                       1000 non-null   uint8 
 2   checking_status_<0                             1000 non-null   uint8 
 3   checking_status_>=200                          1000 non-null   uint8 
 4   checking_status_no checking                    1000 non-null   uint8 
 5   credit_history_all paid                        1000 non-null   uint8 
 6   credit_history_critical/other existing credit  1000 non-null   uint8 
 7   credit_history_delayed previously              1000 non-null   uint8 
 8   credit_history_existing paid                   1000 non-null   uint8 
 9   credit_history_no credits/all paid             1000 non-null   u

In [54]:
X_4 = merged_credit_data.drop('class', axis=1)

y_4 = merged_credit_data['class']

In [55]:
X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(X_4, y_4, test_size=0.20)

In [57]:
svclassifier_11 = SVC(kernel='rbf')

svclassifier_11.fit(X_train_4, y_train_4);

In [58]:
y_pred_11 = svclassifier_11.predict(X_test_4)

In [59]:
print(confusion_matrix(y_test_4, y_pred_11))

print(classification_report(y_test_4, y_pred_11))

[[ 23  37]
 [ 24 116]]
              precision    recall  f1-score   support

         bad       0.49      0.38      0.43        60
        good       0.76      0.83      0.79       140

    accuracy                           0.69       200
   macro avg       0.62      0.61      0.61       200
weighted avg       0.68      0.69      0.68       200

