Huy \
Linear Regression \
Logistic Regression\
NaiveBayes Classification\

Hưng\
Decision Tree\
K-means\
SVM\

Thanh\
RandomForest\
XGBoost\

Trung\
CatBoost\
MLP


# Load Dataset

In [3]:
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt

In [4]:
df = pd.read_csv('Banking.csv')
df.head()

Unnamed: 0,Loan_ID,Loan_Amount_Requested,Length_Employed,Home_Owner,Annual_Income,Income_Verified,Purpose_Of_Loan,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender,Interest_Rate
0,10000001,7000,< 1 year,Rent,68000.0,not verified,car,18.37,0,,9,14,Female,1
1,10000002,30000,4 years,Mortgage,,VERIFIED - income,debt_consolidation,14.93,0,17.0,12,24,Female,3
2,10000003,24725,7 years,Mortgage,75566.4,VERIFIED - income source,debt_consolidation,15.88,0,,12,16,Male,3
3,10000004,16000,< 1 year,,56160.0,VERIFIED - income source,debt_consolidation,14.34,3,,16,22,Male,3
4,10000005,17000,8 years,Own,96000.0,VERIFIED - income source,debt_consolidation,22.17,1,,19,30,Female,1


# Data Analysis


Loan_ID: Unique identifier for each loan.

Loan_Amount_Requested: The amount of money requested by the customer for the loan.

Length_Employed: The duration of the customer's employment, measured in years.

Home_Owner: The home ownership status of the customer, which can be "Own", "Mortgage", "Rent", or "Other".

Annual_Income: The customer's annual income.

Income_Verified: Indicates whether the customer's income has been verified or not, which can be "Verified" or "Not Verified".

Purpose_Of_Loan: The purpose of the loan, such as "Car", "Credit Card", "debt_consolidation", or "Other".

Debt_To_Income: The customer's debt-to-income ratio.

Inquiries_Last_6Mo: The number of credit inquiries made by the customer in the last 6 months.

Months_Since_Deliquency: The number of months since the customer's last delinquency.

Number_Open_Accounts: The current number of open accounts for the customer.

Total_Accounts: The total number of accounts held by the customer.

Gender: The gender of the customer, which can be "Male" or "Female".

Interest_Rate: The interest rate applied to the loan, categorized as "1", "2", or "3".

In [5]:
ProfileReport(df)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [6]:
df['Interest_Rate'].value_counts()

2    70580
3    59923
1    33806
Name: Interest_Rate, dtype: int64

In [7]:
df['Interest_Rate'].hist(figsize = (15,10))

<AxesSubplot:>

In [8]:
sns.countplot(x = 'Home_Owner',hue = 'Interest_Rate', data = df)

<AxesSubplot:xlabel='Home_Owner', ylabel='count'>

In [9]:
plt.figure(figsize = (8,8))
df['Income_Verified'].value_counts().plot.pie(autopct='%1.1f%%')

<AxesSubplot:ylabel='Income_Verified'>

In [10]:
plt.figure(figsize = (15,10))
sns.countplot(x = df['Income_Verified'], hue = df['Interest_Rate'])
plt.xticks(rotation = 90)
plt.show()

  plt.show()


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164309 entries, 0 to 164308
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Loan_ID                  164309 non-null  int64  
 1   Loan_Amount_Requested    164309 non-null  object 
 2   Length_Employed          156938 non-null  object 
 3   Home_Owner               138960 non-null  object 
 4   Annual_Income            139207 non-null  float64
 5   Income_Verified          164309 non-null  object 
 6   Purpose_Of_Loan          164309 non-null  object 
 7   Debt_To_Income           164309 non-null  float64
 8   Inquiries_Last_6Mo       164309 non-null  int64  
 9   Months_Since_Deliquency  75930 non-null   float64
 10  Number_Open_Accounts     164309 non-null  int64  
 11  Total_Accounts           164309 non-null  int64  
 12  Gender                   164309 non-null  object 
 13  Interest_Rate            164309 non-null  int64  
dtypes: f

In [12]:
df.isnull().sum()

Loan_ID                        0
Loan_Amount_Requested          0
Length_Employed             7371
Home_Owner                 25349
Annual_Income              25102
Income_Verified                0
Purpose_Of_Loan                0
Debt_To_Income                 0
Inquiries_Last_6Mo             0
Months_Since_Deliquency    88379
Number_Open_Accounts           0
Total_Accounts                 0
Gender                         0
Interest_Rate                  0
dtype: int64

# Data Preprocessing

In [13]:
def preprocessing_data(df):
    df['Length_Employed'].fillna(df['Length_Employed'].mode()[0], inplace=True)
    df['Home_Owner'].fillna(df['Home_Owner'].mode()[0], inplace=True)
    df['Annual_Income'].fillna(df['Annual_Income'].mean(), inplace=True)
    df['Months_Since_Deliquency'].fillna(0, inplace=True)
    df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)
    df.drop('Loan_ID', axis=1, inplace=True)
    
    return None
preprocessing_data(df)

In [14]:
sns.boxplot(x=df['Debt_To_Income'])

<AxesSubplot:xlabel='Debt_To_Income', ylabel='count'>

In [15]:
sns.boxplot(x=df['Annual_Income'])

<AxesSubplot:xlabel='Annual_Income', ylabel='count'>

In [16]:
sns.boxplot(x=df['Inquiries_Last_6Mo'])

<AxesSubplot:xlabel='Inquiries_Last_6Mo', ylabel='count'>

In [17]:
sns.boxplot(x=df['Number_Open_Accounts'])

<AxesSubplot:xlabel='Number_Open_Accounts', ylabel='count'>

In [18]:
sns.boxplot(x=df['Total_Accounts'])

<AxesSubplot:xlabel='Total_Accounts', ylabel='count'>

# Feature Engineering 

In [19]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
cat_cols = [col for col in df.columns if df[col].dtypes=="O"] 
for col in cat_cols:
    df.loc[:, col] = encoder.fit_transform(df[col])

In [20]:
plt.figure(figsize=(20,20))
sns.heatmap(df.corr(), annot = True) # Show heatmap after normalized data

<AxesSubplot:>

In [21]:
def prepare_X_y(df):
    """
    Feature engineering and create X and y
    :param df: pandas dataframe
    :return: (X, y) output feature matrix (dataframe), target (series)
    """
    # Todo: Split data into X and y (using sklearn train_test_split). Return two dataframes
    feature_names = df.columns.tolist()
    feature_names.remove('Interest_Rate')
    
    X = df[feature_names].values
    y = df['Interest_Rate'].values
    return X, y

X, y = prepare_X_y(df)

In [22]:
df['Interest_Rate'].unique()

array([1, 3, 2], dtype=int64)

In [23]:
test = df.columns.tolist()

In [24]:
test

['Loan_Amount_Requested',
 'Length_Employed',
 'Home_Owner',
 'Annual_Income',
 'Income_Verified',
 'Purpose_Of_Loan',
 'Debt_To_Income',
 'Inquiries_Last_6Mo',
 'Months_Since_Deliquency',
 'Number_Open_Accounts',
 'Total_Accounts',
 'Gender',
 'Interest_Rate']

# Model Part

In [25]:
sc = StandardScaler()
X_sc = sc.fit_transform(X)

In [26]:
RANDOM_STATE = 79
TRAIN_SIZE = 0.7

trainX, testX ,trainY, testY = train_test_split(X_sc, y, train_size=TRAIN_SIZE, random_state=RANDOM_STATE)
print('Training:' + str(trainX.shape))
print('Test:' + str(testX.shape))

Training:(115016, 12)
Test:(49293, 12)


In [27]:
#Scaling
from sklearn.preprocessing import StandardScaler
Scaler = StandardScaler()
trainX = Scaler.fit_transform(trainX)
testX = Scaler.transform(testX)

### BUILD SK-LEARN

In [28]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, \
                        confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [31]:
def build_model(X, y):
    """
    Design your model and train it (including your best params)
    :param X: feature matrix
    :param y: target
    :return: a model
    """

    # Todo: Input your scaler and logistic model into pipeline
    model =LogisticRegression(penalty = 'l2', solver = 'liblinear', random_state = 42)
    model.fit(X,y)

    return model

def calculate_performance(y_true, y_pred):
    """

    :param y_true: ground truth values
    :param y_pred: predictions
    :return:
    """
    # Todo: return your error value like accuracy, f1score, ...
    print("precision", precision_score(y_true, y_pred))
    print("recall", recall_score(y_true, y_pred))
    print("accuracy", accuracy_score(y_true, y_pred))
    print("F1", f1_score(y_true, y_pred))
    print("ROC_AUC ", roc_auc_score(y_true, y_pred))

    # Todo: Only choose one of them as your score for the question 7
    main_score = f1_score(y_true, y_pred)
    return main_score

model = build_model(trainX, trainY)
# Compare on training dataset
train_pred = model.predict(trainX)
#print(#Todo: output your error)
test_pred = model.predict(testX)
#print(#Todo: output your error)

In [32]:
pred = model.predict(trainX)
calculate_performance(trainY, pred)

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [33]:
pred = model.predict(testX)
calculate_performance(testY, pred)

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

# Discussion and Conclusion