In [1]:

import math 
import statistics

import pandas as pd
import numpy as np
np.random.seed(42)
# --------------------------------------
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV 
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
# --------------------------------------
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.pandas.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)


<a id="preprocessing"></a>
### 3. Preprocessing

In this section, there will be implementation of various pre-processing techniques on the "Loan Risk Factor" dataset.<br/>
<b><u>It will include the following</u></b>:
* 3.pre About Dataset 
* 3.a. Load dataset  
* 3.b. Duplicate removal
* 3.c. Decouple Dataset
* 3.d. Train Test Split
* 3.e. Missing Values Handling 
* 3.f. Outlier Removal
* 3.g. Scaling 
* 3.h. Check Correlation
* 3.i. Categorical Columns Handling

#### 3.pre. About Dataset
In this section of the assignment you will get familiar with a dataset named "Loan Risk Factor" documenting banks' clients asking for a loan.<br/>
Each row in the dataset represents a person who takes a credit by a bank.<br/>
Each person is classified as **good** or **bad** credit risks according to the following set of attributes:    
1. **age** (numeric)
2. **sex** (text: male, female)
3. **job** (numeric: 0 - unskilled and non-resident, 1 - unskilled and resident, 2 - skilled, 3 - highly skilled)
4. **housing** (text: own, rent, or free)
5. **years** (numeric, in years)
5. **saving_account** (text - little, moderate, quite rich, rich)
6. **checking_account** (numeric, in DM - Deutsch Mark)
7. **credit_amount** (numeric, in DM)
8. **duration** (numeric, in month)
9. **purpose** (text: car, furniture/equipment, radio/TV, domestic appliances, repairs, education, business, vacation/others)
10. **risk** (target) - (0 - risk, 1 - no risk)

In [2]:
file_name_csv = 'data\loans_risk_factor_dups.csv'
load_csv = lambda file_name: pd.read_csv(file_name)
risk_factor_df = load_csv(file_name_csv)
risk_factor_df.head()

Unnamed: 0,age,sex,job,housing,years,saving_account,checking_account,credit_amount,duration,purpose,risk
0,26,male,2,own,1.0,little,rich,1330,12.0,car,1.0
1,43,male,2,own,2.0,,,2197,24.0,car,1.0
2,28,female,2,rent,1.25,little,little,1403,15.0,car,1.0
3,25,male,0,own,1.5,little,little,2473,18.0,furniture/equipment,0.0
4,29,male,2,rent,0.5,little,,2108,6.0,radio/TV,


In [3]:
risk_factor_df.shape

(1590, 11)

#### in this assignment i was handed with a dataset named "Loan Risk Factor" documenting banks' clients asking for a loan.
Each row in the dataset represents a person who takes a credit by a bank.
Each person is classified as good or bad credit risks according to a set of attributes:

In [4]:
print('number of duplicated rows is:', risk_factor_df.duplicated().sum())


number of duplicated rows is: 1039


In [5]:
#checking for duplications
def remove_duplicates(df):
    df = df.drop_duplicates()
    return df


In [6]:
no_duplicates_df = remove_duplicates(risk_factor_df)
no_duplicates_df.info()
no_duplicates_df.shape

<class 'pandas.core.frame.DataFrame'>
Int64Index: 551 entries, 0 to 1578
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               551 non-null    int64  
 1   sex               551 non-null    object 
 2   job               551 non-null    int64  
 3   housing           551 non-null    object 
 4   years             547 non-null    float64
 5   saving_account    463 non-null    object 
 6   checking_account  372 non-null    object 
 7   credit_amount     551 non-null    int64  
 8   duration          547 non-null    float64
 9   purpose           551 non-null    object 
 10  risk              543 non-null    float64
dtypes: float64(3), int64(3), object(5)
memory usage: 51.7+ KB


(551, 11)

In [7]:
#splitting to feature vectors (X) and labels (y)

def decouple_data(risk_factor_df):
    y = risk_factor_df['risk']
    X = risk_factor_df.drop(columns=['risk'])
    return X,y



In [8]:
#checked how many nulls are there in y column
X,y = decouple_data(no_duplicates_df)
print("inline", y.isnull().sum())

inline 8


##### splitting the data using sklearn train_test_split


In [9]:

def split_data(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42,shuffle=False) 
    return X_train, X_test, y_train, y_test


In [10]:
X_train, X_test, y_train, y_test =  split_data(X, y)
print("train shape ",X_train.shape)
print("train label shape ",y_train.shape)
print("test shape ",X_test.shape)
print("test label shape ",y_test.shape)

train shape  (440, 10)
train label shape  (440,)
test shape  (111, 10)
test label shape  (111,)


##### handling missing values of the DataFrame - various ways.    


In [11]:
# handling missing values of the DataFrame with various ways.    
def handle_missing_values(X_train, X_test, y_train, y_test):
    #been asked to Replace missing values in "saving_account" & "checking_account" columns with most frequent values
    for col in ["saving_account", "checking_account"]:
        X_train[col].fillna(X_train[col].value_counts().index[0], inplace = True)
        X_test[col].fillna(X_train[col].value_counts().index[0], inplace = True)
    # been asked to Replace missing values in "credit_amount" with the median
    X_train.credit_amount.fillna(X_train.median(), inplace=True)
    X_test.credit_amount.fillna(X_train.median(), inplace=True)
  
    # Replace missing values in "years" & "duration" with the average
    for col in ["years", "duration"]:
        X_train[col].fillna(X_train[col].mean(), inplace = True)
        X_test[col].fillna(X_train[col].mean(), inplace = True)

    # Delete Rows with missing values in "risk" column
    y_train.dropna(axis=0, how = all, inplace = True)
    y_test.dropna(axis=0, how = all, inplace = True)
 
    # the Return of the datasets after implementing these steps
    X_train = X_train.loc[(X_train.index.isin(y_train.index))]
    X_test = X_test.loc[(X_test.index.isin(y_test.index))]

    return X_train, X_test, y_train, y_test
    

In [12]:
X_train_filled, X_test_filled, y_train_filled, y_test_filled = handle_missing_values(X_train, X_test, y_train, y_test)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [13]:
X_train_filled.shape
X_test_filled.shape
y_train_filled.shape
y_test_filled.shape

(432, 10)

(111, 10)

(432,)

(111,)

##### Removing outliers in age, years, credit_amount & duration colomns using IQR


In [14]:
def remove_outliers(X_train, X_test, y_train, y_test):
    
    X_train_clean = X_train.copy()
    X_test_clean = X_test.copy()
    for col in ["age", "years", "credit_amount", "duration"]:
        Q1 = np.percentile(X_train[col], 25)
        Q3 = np.percentile(X_train[col], 75)
        IQR = Q3 - Q1
        X_train_clean.loc[(X_train_clean[col] < Q1 - 1.5*IQR) | (X_train_clean[col] > Q3 + 1.5*IQR ),col] = np.nan
        X_test_clean.loc[(X_test_clean[col] < Q1 - 1.5*IQR) | (X_test_clean[col] > Q3 + 1.5*IQR ),col] = np.nan

    X_train_clean.dropna(inplace = True)
    X_test_clean.dropna(inplace = True)
    y_train = y_train[list(X_train_clean.index)]
    y_test = y_test[list(X_test_clean.index)]

    return X_train_clean, X_test_clean, y_train, y_test

In [15]:
X_train_no_out , X_test_no_out, y_train, y_test = remove_outliers(X_train_filled, X_test_filled, y_train_filled, y_test_filled)

In [16]:
print('The new shapes after removing outliers are:\n')
print("train features",X_train_no_out.shape)
print("train label ",y_train.shape)
print("test features ",X_test_no_out.shape)
print("test label ",y_test.shape)

The new shapes after removing outliers are:

train features (377, 10)
train label  (377,)
test features  (93, 10)
test label  (93,)


##### Scaling the columns above in the range [0,1] using the Sklearn MinMaxScaler

In [17]:

def scale_data(X_train, X_test):
    scaler = MinMaxScaler(feature_range=(0,1))
    minmax_X_train = X_train.copy()
    minmax_X_test = X_test.copy()

    for col in ["age", "years", "credit_amount", "duration"]:
        minmax_X_train[col] = scaler.fit_transform(X_train[[col]])
        minmax_X_test[col] = scaler.transform(X_test[[col]])
        
    return minmax_X_train, minmax_X_test


In [18]:
minmax_X_train, minmax_X_test = scale_data(X_train_no_out, X_test_no_out)

##### Removing correlated columns when correlation value > 0.95, removing the last column

In [19]:

def remove_correlated(X_train, X_test):

    X_train_corr = X_train.copy()
    correlated_features = []
    correlation_matrix = X_train_corr.corr()
    for i in range(len(correlation_matrix.columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > 0.95:
                colname = correlation_matrix.columns[i]
                correlated_features.append(colname)
                
    X_train = X_train.drop(correlated_features, axis = 1)
    X_test = X_test.drop(correlated_features, axis = 1)            

    return X_train, X_test

In [20]:
X_train_no_corr, X_test_no_corr = remove_correlated(minmax_X_train, minmax_X_test)

In [21]:
X_train_no_corr.shape
X_test_no_corr.shape

(377, 9)

(93, 9)

In [22]:
# Handling categorical columns using dummies 

def handle_categorical_columns(X_train, X_test):
    X_train_object = list(X_train.select_dtypes(include=['object']))
    X_train_dum = pd.get_dummies(data = X_train,columns = X_train_object)
    X_test_dum = pd.get_dummies(data = X_test,columns = X_train_object)
    X_test_dum = X_test_dum.reindex(columns = X_train_dum.columns, fill_value=0)
    
    return X_train_dum, X_test_dum
    


In [23]:
X_train_numeric, X_test_numeric = handle_categorical_columns(X_train_no_corr, X_test_no_corr)

In [24]:
X_train_numeric.shape
X_test_numeric.shape

(377, 24)

(93, 24)

### Been asked to try 3 different classifiers: Knn, Naive Bayes & Decision tree

In [25]:
#Knn with 3 Neighbours

def train_knn(X_train, y_train):
    knn = KNeighborsClassifier(n_neighbors=3)
    knn_clf = knn.fit(X_train, y_train)
    return knn_clf

In [26]:
clf_knn = train_knn(X_train_numeric, y_train)

In [27]:
#Naive Bayes

def train_naive_bayes(X_train, y_train):
    gnb = GaussianNB()
    # Train classifier
    clf = gnb.fit(X_train, y_train)
    return clf

In [28]:
nb_clf = train_naive_bayes(X_train_numeric, y_train)

In [29]:
#Decision Tree Classifier with max depth of 3 and min split of 20 
def train_decision_tree(X_train, y_train):
    tree = DecisionTreeClassifier(max_depth =3,min_samples_split = 20)
    clf = tree.fit(X_train, y_train)
    return clf
    

In [30]:
dt_clf = train_decision_tree(X_train_numeric, y_train)

In [31]:
#Accuracy evaluation

def evaluate_accuracy(clf, X_test, y_test):
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [32]:
print('The accuracy evaluation for each classifier:\n')
print('Knn Classifier:\n', evaluate_accuracy(clf_knn, X_test_numeric, y_test))
print('Naive Bayes Classifier:\n', evaluate_accuracy(nb_clf, X_test_numeric, y_test))
print('Decision Tree Classifier:\n', evaluate_accuracy(dt_clf, X_test_numeric, y_test))

The accuracy evaluation for each classifier:

Knn Classifier:
 0.5053763440860215
Naive Bayes Classifier:
 0.6344086021505376
Decision Tree Classifier:
 0.5268817204301075


In [33]:
#Been asked to calculate 10-fold cross-validation mean accuracy score

def evaluate_accuracy_cross_validation(clf, X, y):
    mean_accuracy = cross_val_score(clf, X, y, cv = 10)
    return mean_accuracy.mean()

In [34]:
# HyperParameters Tuning - knn grid search with the following params:
#'n_neighbors' with values [3, 5, 11, 20, 35, 50] 
#'weights' with values ['uniform', 'distance']
#'metric' with values ['euclidean', 'manhattan', 'minkowski']


def grid_search_knn(clf, X_train, y_train):

    # Creating the hyperparameter grid
    param_grid = {'n_neighbors': [3, 5, 11, 20, 35, 50], 'weights':['uniform', 'distance'], 'metric':['euclidean', 'manhattan', 'minkowski']}

    # Instantiating the GridSearchCV object
    grid_knn = GridSearchCV(clf, param_grid, cv = 5)

    grid_knn.fit(X_train, y_train)
    
    Y = grid_knn.best_params_
    
    return Y


In [35]:
Knn_grid_result = grid_search_knn(KNeighborsClassifier(), X_train_numeric, y_train)
print(Knn_grid_result)

{'metric': 'euclidean', 'n_neighbors': 35, 'weights': 'uniform'}


In [36]:
def train_knn_by_Gs(X_train, y_train):
    knn = KNeighborsClassifier(n_neighbors=35,weights = 'uniform',metric = 'euclidean')
    knn_clf1 = knn.fit(X_train, y_train)
    return knn_clf1

In [37]:
clf_knn_gs = train_knn_by_Gs(X_train_numeric, y_train)
print('Knn Classifier:\n', evaluate_accuracy(clf_knn_gs, X_test_numeric, y_test))


Knn Classifier:
 0.5806451612903226


In [38]:
# HyperParameters Tuning - Decision Tree grid search with the following params
#'criterion' with the values ['gini', 'entropy']
#'max_depth' with the values [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
#'min_samples_split' with the values [5, 10, 15, 20, 25]
#'min_samples_leaf' with the values [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


def grid_search_decision_tree(clf, X_train, y_train):
    param_grid = {'criterion': ['gini', 'entropy'], 'max_depth':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 
                  'min_samples_split':[5, 10, 15, 20, 25],'min_samples_leaf':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
    
    # Instantiating the GridSearchCV object
    grid_dt = GridSearchCV(clf, param_grid, cv = 5)

    grid_dt.fit(X_train, y_train)
    
    Y = grid_dt.best_params_
    
    return Y

In [39]:
dt_grid_result = grid_search_decision_tree(DecisionTreeClassifier(), X_train_numeric, y_train)
print(dt_grid_result)

{'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 15}


In [40]:
def train_decision_tree_by_gs(X_train, y_train):
    tree = DecisionTreeClassifier(max_depth =10,min_samples_split = 15,min_samples_leaf = 1,criterion = 'gini')
    clf1 = tree.fit(X_train, y_train)
    return clf1

In [41]:
dt_clf_gs = train_decision_tree_by_gs(X_train_numeric, y_train)
print('Knn Classifier:\n', evaluate_accuracy(dt_clf_gs, X_test_numeric, y_test))


Knn Classifier:
 0.5698924731182796
