In [1]:
import math 
import statistics

import pandas as pd
import numpy as np
np.random.seed(42)
# --------------------------------------
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV 
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
# --------------------------------------
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Display configuration: show every column of wide frames and print floats with 3 decimals.
# `pd.set_option` is the documented entry point (no detour through the `pd.pandas` attribute).
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

## Preprocessing
This section implements several pre-processing techniques on the "Loan Risk Factor" dataset.<br/>
<b><u>It includes the following steps</u></b>:
* 1.pre About Dataset - (review only - description of the dataset)
* 1.a. Load dataset                 
* 1.b. Duplicate removal
* 1.c. Decouple Dataset
* 1.d. Train Test Split
* 1.e. Missing Values Handling 
* 1.f. Outlier Removal  
* 1.g. Scaling   
* 1.h. Check Correlation  
* 1.i. Categorical Columns Handling 

#### 1.pre. About Dataset
Each person is classified as a **good** or **bad** credit risk according to the following set of attributes:
1. **age** (numeric)
2. **sex** (text: male, female)
3. **job** (numeric: 0 - unskilled and non-resident, 1 - unskilled and resident, 2 - skilled, 3 - highly skilled)
4. **housing** (text: own, rent, or free)
5. **years** (numeric, in years - in the sample rows below it equals duration / 12)
6. **saving_account** (text - little, moderate, quite rich, rich)
7. **checking_account** (text - categorical levels such as little, rich)
8. **credit_amount** (numeric, in DM - Deutsche Mark)
9. **duration** (numeric, in months)
10. **purpose** (text: car, furniture/equipment, radio/TV, domestic appliances, repairs, education, business, vacation/others)
11. **risk** (target) - (0 - risk, 1 - no risk)

In [2]:
file_name_csv = 'loans_risk_factor_dups.csv'


def load_csv(file_name):
    """Read the CSV file at ``file_name`` into a pandas DataFrame."""
    return pd.read_csv(file_name)


risk_factor_df = load_csv(file_name_csv)
risk_factor_df.head()

Unnamed: 0,age,sex,job,housing,years,saving_account,checking_account,credit_amount,duration,purpose,risk
0,26,male,2,own,1.0,little,rich,1330,12.0,car,1.0
1,43,male,2,own,2.0,,,2197,24.0,car,1.0
2,28,female,2,rent,1.25,little,little,1403,15.0,car,1.0
3,25,male,0,own,1.5,little,little,2473,18.0,furniture/equipment,0.0
4,29,male,2,rent,0.5,little,,2108,6.0,radio/TV,


In [3]:
def remove_duplicates(risk_factor_df):
    """Print how many rows are exact duplicates and return the frame without them.

    Parameters
    ----------
    risk_factor_df : pandas.DataFrame
        Frame to de-duplicate. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame that keeps the first occurrence of each row; the original
        index labels are preserved.
    """
    duplicate_count = risk_factor_df.duplicated().sum()
    print("Number of duplicated rows: ", duplicate_count)
    deduplicated = risk_factor_df.drop_duplicates()
    return deduplicated

In [4]:
# De-duplicate the raw frame, then list the remaining columns with their non-null counts.
risk_factor_df_no_dups = remove_duplicates(risk_factor_df)
print('\n')
risk_factor_df_no_dups.info()

Number of duplicated rows:  1039


<class 'pandas.core.frame.DataFrame'>
Int64Index: 551 entries, 0 to 1578
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               551 non-null    int64  
 1   sex               551 non-null    object 
 2   job               551 non-null    int64  
 3   housing           551 non-null    object 
 4   years             547 non-null    float64
 5   saving_account    463 non-null    object 
 6   checking_account  372 non-null    object 
 7   credit_amount     551 non-null    int64  
 8   duration          547 non-null    float64
 9   purpose           551 non-null    object 
 10  risk              543 non-null    float64
dtypes: float64(3), int64(3), object(5)
memory usage: 51.7+ KB


In [5]:
def decouple_data(risk_factor_df):
    """Split the frame into a feature matrix and the 'risk' target column.

    Parameters
    ----------
    risk_factor_df : pandas.DataFrame
        Frame that contains a 'risk' column.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is an independent copy of every column except 'risk',
        ``y`` is the 'risk' column as a Series.
    """
    target_column = 'risk'
    labels = risk_factor_df[target_column]
    features = risk_factor_df.drop(columns=[target_column])
    return features.copy(), labels

In [6]:
# Separate the feature columns (X) from the 'risk' target (y) and preview both.
X, y = decouple_data(risk_factor_df_no_dups)
X.head()
y.shape

Unnamed: 0,age,sex,job,housing,years,saving_account,checking_account,credit_amount,duration,purpose
0,26,male,2,own,1.0,little,rich,1330,12.0,car
1,43,male,2,own,2.0,,,2197,24.0,car
2,28,female,2,rent,1.25,little,little,1403,15.0,car
3,25,male,0,own,1.5,little,little,2473,18.0,furniture/equipment
4,29,male,2,rent,0.5,little,,2108,6.0,radio/TV


(551,)

In [7]:
def split_data(X, y):
    """Split features and labels into an 80% train part and a 20% test part.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Labels, one per row of ``X``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # shuffle=False: the rows are split in their existing order, without shuffling.
    # NOTE(review): with shuffling disabled, random_state presumably has no effect —
    # confirm against the scikit-learn documentation.
    split_parts = train_test_split(
        X,
        y,
        test_size=0.20,
        random_state=42,
        shuffle=False,
    )
    X_train, X_test, y_train, y_test = split_parts
    return X_train, X_test, y_train, y_test

In [8]:
# Hold out 20% of the rows as the test set and report the resulting shapes.
X_train, X_test, y_train, y_test = split_data(X, y)

print("train shape ",X_train.shape)
print("train label shape ",y_train.shape)
print("test shape ",X_test.shape)
print("test label shape ",y_test.shape)

train shape  (440, 10)
train label shape  (440,)
test shape  (111, 10)
test label shape  (111,)


In [9]:
def handle_missing_values(X_train, X_test, y_train, y_test):
    """Impute missing feature values and drop the rows whose label is missing.

    Every fill value is computed from the train features only (before any
    unlabeled row is removed) and is applied to both the train and the test
    features:

    * 'saving_account', 'checking_account' -> most frequent train value (mode)
    * 'credit_amount'                      -> train median
    * 'years', 'duration'                  -> train mean

    Afterwards, rows whose label is NaN are removed from the label series and
    from the matching feature frame (matched by index label).

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames containing the five columns listed above.
    y_train, y_test : pandas.Series
        Labels sharing the index labels of the corresponding feature frame.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as new objects; the arguments
        themselves are left unmodified.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    IndexError
        If a categorical column has no non-null train value (no mode exists).
    """
    # Train-derived statistics, computed once and reused for both splits.
    fill_values = {
        col: X_train[col].mode().iloc[0]
        for col in ['saving_account', 'checking_account']
    }
    fill_values['credit_amount'] = X_train['credit_amount'].median()
    for col in ['years', 'duration']:
        fill_values[col] = X_train[col].mean()

    # DataFrame.fillna with a per-column dict returns a new frame. This avoids
    # `df[col].fillna(..., inplace=True)`, which is chained assignment: it mutates
    # the caller's data on older pandas and has no effect under Copy-on-Write.
    X_train_filled = X_train.fillna(value=fill_values)
    X_test_filled = X_test.fillna(value=fill_values)

    # Index labels of the rows that have no target value.
    unlabeled_train = y_train.index[y_train.isna()]
    unlabeled_test = y_test.index[y_test.isna()]

    return (
        X_train_filled.drop(index=unlabeled_train),
        X_test_filled.drop(index=unlabeled_test),
        y_train.dropna(),
        y_test.dropna(),
    )

In [10]:
# Impute the feature gaps and drop the rows whose 'risk' label is missing,
# then check that no nulls remain and that features and labels still line up.
X_train_filled_val, X_test_filled_val, y_train_filled_val, y_test_filled_val = handle_missing_values(X_train, X_test, y_train, y_test)

X_train_filled_val.info()
y_train_filled_val.shape

X_test_filled_val.shape
y_test_filled_val.shape

<class 'pandas.core.frame.DataFrame'>
Int64Index: 432 entries, 0 to 824
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               432 non-null    int64  
 1   sex               432 non-null    object 
 2   job               432 non-null    int64  
 3   housing           432 non-null    object 
 4   years             432 non-null    float64
 5   saving_account    432 non-null    object 
 6   checking_account  432 non-null    object 
 7   credit_amount     432 non-null    int64  
 8   duration          432 non-null    float64
 9   purpose           432 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 37.1+ KB


(432,)

(111, 10)

(111,)

In [11]:
def remove_outliers(X_train, X_test, y_train, y_test):
    """Drop rows that are IQR outliers in any of the numeric columns.

    For each of 'age', 'years', 'credit_amount' and 'duration' the bounds
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` are computed from the train features
    only. A row (train or test) lying outside the bounds in at least one of the
    columns is removed, together with its label.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames containing the four numeric columns.
    y_train, y_test : pandas.Series
        Labels sharing the index labels of the corresponding feature frame.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as new objects; the arguments
        are not modified, so calling the function again on the same inputs
        yields the same result.
    """
    cols_for_iqr = ['age', 'years', 'credit_amount', 'duration']
    train_is_outlier = np.zeros(len(X_train), dtype=bool)
    test_is_outlier = np.zeros(len(X_test), dtype=bool)

    for col in cols_for_iqr:
        q1 = X_train[col].quantile(0.25)
        q3 = X_train[col].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr

        # Comparisons with NaN are False, so rows with a missing value are kept.
        train_is_outlier |= ((X_train[col] < lower) | (X_train[col] > upper)).to_numpy()
        # The test split is judged against the bounds learned from the train split.
        test_is_outlier |= ((X_test[col] < lower) | (X_test[col] > upper)).to_numpy()

    train_outlier_labels = X_train.index[train_is_outlier]
    test_outlier_labels = X_test.index[test_is_outlier]

    return (
        X_train.drop(index=train_outlier_labels),
        X_test.drop(index=test_outlier_labels),
        y_train.drop(index=train_outlier_labels),
        y_test.drop(index=test_outlier_labels),
    )
    

In [12]:
# Remove IQR outliers from both splits. Note that y_train / y_test are rebound here
# to the label series that match the outlier-free feature frames.
X_train_no_outliers, X_test_no_outliers, y_train, y_test = remove_outliers(X_train_filled_val, X_test_filled_val, y_train_filled_val, y_test_filled_val)

print("train shape ",X_train_no_outliers.shape)
print("train label shape ",y_train.shape)
print("test shape ",X_test_no_outliers.shape)
print("test label shape ",y_test.shape)

train shape  (377, 10)
train label shape  (377,)
test shape  (93, 10)
test label shape  (93,)


In [13]:
def scale_data(X_train, X_test):
    """Min-max scale the numeric columns to the range [0, 1].

    The scaling parameters are learned from the train features and then applied
    to both splits, so no information from the test split is used for fitting.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames containing 'age', 'years', 'credit_amount' and 'duration'.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled)``: copies of the inputs with the four
        columns replaced by their scaled values.
    """
    col_to_scale = ['age', 'years', 'credit_amount', 'duration']

    # MinMaxScaler scales every feature independently, so one fit on all four
    # columns learns the same per-column parameters as fitting column by column.
    scaler = MinMaxScaler(feature_range=(0, 1)).fit(X_train[col_to_scale])
    train_values = scaler.transform(X_train[col_to_scale])
    test_values = scaler.transform(X_test[col_to_scale])

    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    for position, col in enumerate(col_to_scale):
        X_train_scaled[col] = train_values[:, position]
        X_test_scaled[col] = test_values[:, position]

    return X_train_scaled, X_test_scaled

In [14]:
# Min-max scale the numeric columns; the scaler is fitted on the train split only.
X_train_scaled, X_test_scaled = scale_data(X_train_no_outliers, X_test_no_outliers)

In [15]:
def remove_correlated(X_train, X_test, threshold=0.95):
    """Drop numeric features that are highly correlated with an earlier feature.

    The absolute pairwise correlation is computed on the numeric (and boolean)
    columns of the train features. Looking only at the upper triangle of the
    matrix, a column is dropped when its correlation with any column to its left
    exceeds ``threshold``. The same columns are dropped from the test features.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames; non-numeric columns are ignored for the correlation and
        are always kept.
    threshold : float, default 0.95
        Absolute correlation above which the later column of a pair is dropped.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` without the dropped columns (new frames).
    """
    # Select the numeric columns explicitly: calling corr() on a frame that still
    # holds text columns raises on pandas >= 2.0 instead of silently skipping them.
    numeric_train = X_train.select_dtypes(include=['number', 'bool'])
    corr_matrix = numeric_train.corr().abs()

    # Keep only the strict upper triangle so every pair is inspected once and the
    # first column of a correlated pair survives.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)
    to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]

    return X_train.drop(columns=to_drop), X_test.drop(columns=to_drop)

In [16]:
# Drop features that are highly correlated with another feature, then compare the
# shapes before and after to see how many columns were removed.
X_train_indep, X_test_indep = remove_correlated(X_train_scaled, X_test_scaled)

X_train_scaled.shape
X_train_indep.shape
X_test_indep.shape

(377, 10)

(377, 9)

(93, 9)

In [17]:
def handle_categorical_columns(X_train, X_test):
    """One-hot encode the object-dtype columns of both feature frames.

    The columns to encode are taken from the train frame. After encoding, the
    test frame is re-indexed to the train frame's columns: dummy columns seen
    only in the test split are discarded, and dummy columns missing from the
    test split are added and filled with 0.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames sharing the same columns.

    Returns
    -------
    tuple
        ``(X_train_encoded, X_test_encoded)`` with identical column order.
    """
    col_to_dumm = list(X_train.select_dtypes(include=['object']).columns)
    X_train_encoded = pd.get_dummies(data=X_train, columns=col_to_dumm)
    X_test_encoded = pd.get_dummies(data=X_test, columns=col_to_dumm)
    # Align the test columns with the train columns (the models are fitted on the latter).
    X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)
    return X_train_encoded, X_test_encoded

In [18]:
# One-hot encode the text columns; both frames should end up with the same number of columns.
X_train_to_numeric, X_test_to_numeric = handle_categorical_columns(X_train_indep, X_test_indep)

X_train_to_numeric.shape
X_test_to_numeric.shape

(377, 24)

(93, 24)

In [19]:
def train_knn(X_train, y_train):
    """Fit a k-nearest-neighbours classifier with k = 3 and return it.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.

    Returns
    -------
    KNeighborsClassifier
        The fitted classifier.
    """
    n_neighbors = 3
    knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn_model.fit(X_train, y_train)
    return knn_model

In [20]:
# Fit the KNN baseline on the encoded train features and display the estimator.
knn_clf = train_knn(X_train_to_numeric, y_train)
knn_clf

KNeighborsClassifier(n_neighbors=3)

In [21]:
def train_naive_bayes(X_train, y_train):
    """Fit a Gaussian Naive Bayes classifier with default settings and return it.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.

    Returns
    -------
    GaussianNB
        The fitted classifier.
    """
    naive_bayes_model = GaussianNB()
    naive_bayes_model.fit(X_train, y_train)
    return naive_bayes_model

In [22]:
# Fit the Naive Bayes baseline on the encoded train features and display the estimator.
nb_clf = train_naive_bayes(X_train_to_numeric, y_train)
nb_clf

GaussianNB()

In [23]:
def train_decision_tree(X_train, y_train, random_state=42):
    """Fit a shallow decision tree (max_depth=3, min_samples_split=20) and return it.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    random_state : int, default 42
        Seed handed to the tree so the fit does not depend on the global NumPy
        random state (and therefore not on which cells ran before); 42 matches
        the seed used for the other decision trees in this notebook.

    Returns
    -------
    DecisionTreeClassifier
        The fitted classifier.
    """
    clf_dt = DecisionTreeClassifier(max_depth=3, min_samples_split=20, random_state=random_state)
    clf_dt.fit(X_train, y_train)
    return clf_dt

In [24]:
# Fit the decision-tree baseline on the encoded train features and display the estimator.
dt_clf = train_decision_tree(X_train_to_numeric, y_train)
dt_clf

DecisionTreeClassifier(max_depth=3, min_samples_split=20)

In [25]:
def evaluate_accuracy(clf, X_test, y_test):
    """Return the accuracy of a fitted classifier on the given test data.

    Parameters
    ----------
    clf : estimator
        Fitted classifier exposing ``predict``.
    X_test : array-like
        Test features.
    y_test : array-like
        True labels for ``X_test``.

    Returns
    -------
    float
        Value returned by ``accuracy_score(y_test, predictions)``.
    """
    predictions = clf.predict(X_test)
    return accuracy_score(y_test, predictions)

In [26]:
# Hold-out accuracy of the three baselines. The bare evaluate_accuracy(...) calls are
# displayed because ast_node_interactivity was set to "all" in the first cell.
print("The accuracy of the KNN model is:")
evaluate_accuracy(knn_clf, X_test_to_numeric, y_test)
print('\n')
print("The accuracy of the Naive Bayes model is:")
evaluate_accuracy(nb_clf, X_test_to_numeric, y_test)
print('\n')
print("The accuracy of the Decision Tree model is:")
evaluate_accuracy(dt_clf, X_test_to_numeric, y_test)


The accuracy of the KNN model is:


0.5053763440860215



The accuracy of the Naive Bayes model is:


0.6344086021505376



The accuracy of the Decision Tree model is:


0.5268817204301075

In [27]:
def evaluate_accuracy_cross_validation(clf, X_train_to_numeric, X_test_to_numeric, y_train, y_test):
    """Return the mean 10-fold cross-validated accuracy of ``clf``.

    The train and test splits are stacked back together (train rows first) and
    ``cross_val_score`` creates its own folds on the combined data.

    Parameters
    ----------
    clf : estimator
        Classifier to evaluate; ``cross_val_score`` fits clones of it.
    X_train_to_numeric, X_test_to_numeric : pandas.DataFrame
        Encoded feature frames with identical columns.
    y_train, y_test : pandas.Series
        Labels matching the two feature frames.

    Returns
    -------
    float
        Mean of the 10 per-fold accuracy scores.
    """
    # pd.concat replaces DataFrame.append / Series.append, which no longer exist
    # in pandas 2.x; with default arguments it stacks the rows in the same way.
    X = pd.concat([X_train_to_numeric, X_test_to_numeric])
    y = pd.concat([y_train, y_test])
    scores = cross_val_score(clf, X, y, scoring='accuracy', cv=10)
    mean_accuracy = scores.mean()
    return mean_accuracy

In [28]:
# 10-fold cross-validated accuracy for the same three model configurations.
# Each call receives a fresh, unfitted estimator; the folds are built by
# evaluate_accuracy_cross_validation on the train and test rows combined.
knn_cv_acc_score = evaluate_accuracy_cross_validation(KNeighborsClassifier(n_neighbors=3)
                                                      , X_train_to_numeric, X_test_to_numeric, y_train, y_test)
nb_cv_acc_score = evaluate_accuracy_cross_validation(GaussianNB(),
                                                     X_train_to_numeric, X_test_to_numeric, y_train, y_test)
dt_cv_acc_score = evaluate_accuracy_cross_validation(
                            DecisionTreeClassifier(max_depth=3, min_samples_split=20,random_state=42),
                            X_train_to_numeric, X_test_to_numeric, y_train, y_test)
print("The mean Cross Validation score of the KNN model is:", knn_cv_acc_score)
print("The mean Cross Validation score of the Naive Bayes model is:", nb_cv_acc_score)
print("The mean Cross Validation score of the Decision Tree model is:", dt_cv_acc_score)

The mean Cross Validation score of the KNN model is: 0.5276595744680852
The mean Cross Validation score of the Naive Bayes model is: 0.5829787234042554
The mean Cross Validation score of the Decision Tree model is: 0.5553191489361703


In [29]:
def grid_search_knn(clf, X_train, y_train):
    """Grid-search KNN hyperparameters with 5-fold cross-validation.

    The grid covers ``n_neighbors``, ``weights`` and ``metric``; the scoring is
    left at the GridSearchCV default.

    Parameters
    ----------
    clf : estimator
        Unfitted KNN classifier used as the template for every candidate.
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    param_grid = {'n_neighbors': [3, 5, 11, 20, 35, 50],
                  'weights': ['uniform', 'distance'],
                  'metric': ['euclidean', 'manhattan', 'minkowski']}
    optimal_params = GridSearchCV(
        clf,
        param_grid,
        cv=5)
    # Fit once: a single fit already evaluates every candidate on every fold.
    optimal_params.fit(X_train, y_train)
    return optimal_params.best_params_

In [30]:
# Search the KNN hyperparameter grid on the train split and print the best combination.
grid_search_knn_result = grid_search_knn(KNeighborsClassifier(), X_train_to_numeric, y_train)
print(grid_search_knn_result)

{'metric': 'euclidean', 'n_neighbors': 35, 'weights': 'uniform'}


In [31]:
def grid_search_decision_tree(clf, X_train, y_train):
    """Grid-search decision-tree hyperparameters with 5-fold cross-validation.

    The grid covers ``criterion``, ``max_depth``, ``min_samples_split`` and
    ``min_samples_leaf`` (2 * 10 * 5 * 10 = 1000 candidates); the scoring is left
    at the GridSearchCV default.

    Parameters
    ----------
    clf : estimator
        Unfitted decision-tree classifier used as the template for every candidate.
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    param_grid = {'criterion': ['gini', 'entropy'],
                  'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                  'min_samples_split': [5, 10, 15, 20, 25],
                  'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
    optimal_params = GridSearchCV(
        clf,
        param_grid,
        cv=5)
    # Fit once: a second fit would repeat all 1000 x 5 model fits.
    optimal_params.fit(X_train, y_train)
    return optimal_params.best_params_

In [32]:
# Search the decision-tree hyperparameter grid on the train split and print the best combination.
grid_search_dt_result = grid_search_decision_tree(DecisionTreeClassifier(random_state=42), X_train_to_numeric, y_train)
print(grid_search_dt_result)

{'criterion': 'gini', 'max_depth': 9, 'min_samples_leaf': 1, 'min_samples_split': 15}
