<a href="https://colab.research.google.com/github/ABDULRAFAY757/Bondara_raw/blob/main/pipeline_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import eli5
import pickle

sns.set()
warnings.filterwarnings('ignore')

  from pandas import MultiIndex, Int64Index


In [None]:
from sklearn.feature_selection import mutual_info_regression, SelectKBest, chi2
from sklearn.ensemble import ExtraTreesClassifier, RandomForestRegressor, RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, scale, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_absolute_percentage_error, mean_squared_error, roc_auc_score, log_loss, precision_recall_fscore_support, mean_absolute_error, plot_roc_curve
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from imblearn.over_sampling import RandomOverSampler, SMOTE
from eli5.sklearn import PermutationImportance
from pprint import pprint

from xgboost import XGBRegressor, XGBClassifier

---

In [None]:
# Lift the column-display limit so every column is shown when a DataFrame is rendered.
pd.set_option('display.max_columns', None)

---

### Loading Data, Creating Target variable, and Preprocessing:

**Loading data**

In [None]:
# Read the preprocessed Bondora loan table (path is relative to the working directory).
loan_data = pd.read_csv('Bondora_preprocessed.csv')

# Print the total number of missing cells in the frame, then display the column names.
print(loan_data.isna().to_numpy().sum())
loan_data.columns

0


Index(['BidsPortfolioManager', 'BidsApi', 'BidsManual', 'NewCreditCustomer',
       'VerificationType', 'LanguageCode', 'Age', 'Gender', 'Country',
       'AppliedAmount', 'Amount', 'Interest', 'LoanDuration', 'MonthlyPayment',
       'UseOfLoan', 'Education', 'MaritalStatus', 'EmploymentStatus',
       'EmploymentDurationCurrentEmployer', 'OccupationArea',
       'HomeOwnershipType', 'IncomeTotal', 'ExistingLiabilities',
       'LiabilitiesTotal', 'RefinanceLiabilities', 'DebtToIncome', 'FreeCash',
       'Rating', 'Restructured', 'CreditScoreEsMicroL',
       'PrincipalPaymentsMade', 'InterestAndPenaltyPaymentsMade',
       'PrincipalBalance', 'InterestAndPenaltyBalance',
       'NoOfPreviousLoansBeforeLoan', 'AmountOfPreviousLoansBeforeLoan',
       'PreviousRepaymentsBeforeLoan',
       'PreviousEarlyRepaymentsCountBeforeLoan', 'LoanStatus'],
      dtype='object')

#### **Creating Target Variables**

I. Equated Monthly Installment (EMI)

In [None]:
def cal_EMI(P, r, n):
  """Compute the equated monthly installment (EMI) for every loan.

  Uses the annuity formula, element-wise over the three inputs:

      EMI = P * r * (1 + r) ** n / ((1 + r) ** n - 1)

  Parameters
  ----------
  P : array-like (e.g. pandas Series)
      Principal of each loan.
  r : array-like
      Interest rate per installment period. It is used exactly as passed.
      NOTE(review): other cells in this notebook divide 'Interest' by 100
      before using it; confirm the units expected here.
  n : array-like
      Number of installments of each loan.

  Returns
  -------
  numpy.ndarray
      Float array with one EMI per input element.
  """
  P = np.asarray(P, dtype=float)
  r = np.asarray(r, dtype=float)
  n = np.asarray(n, dtype=float)

  # (1 + r) ** n is shared by the numerator and the denominator.
  growth = np.power(1 + r, n)

  # Both branches of np.where are evaluated for every element, so silence the
  # divide warnings raised by the branch that ends up unused.
  with np.errstate(divide='ignore', invalid='ignore'):
    annuity = P * r * growth / (growth - 1)
    # With a zero rate the formula is 0/0; the installment is simply P / n.
    interest_free = P / n

  return np.where(r == 0, interest_free, annuity)

In [None]:
# Store the EMI target computed from each loan's amount, interest and duration.
# NOTE(review): 'Interest' is divided by 100 in other cells of this notebook but is
# passed to cal_EMI undivided, where it is used as the per-period rate — confirm units.
loan_data['EMI'] = cal_EMI(loan_data['Amount'], loan_data['Interest'], loan_data['LoanDuration'])

II. Eligible Loan Amount (ELA)

In [None]:
# Step 1
# Available income: 30% of what remains of total income after total liabilities.
loan_data['Ava_Inc'] = loan_data['IncomeTotal'].sub(loan_data['LiabilitiesTotal']).mul(0.3)

# Total loan amount: applied amount plus its interest share (Interest / 100),
# multiplied by the loan duration and rounded to the nearest whole number.
loan_data['Total_Loan_Amnt'] = np.round(
    loan_data['AppliedAmount']
    .add(loan_data['AppliedAmount'].mul(loan_data['Interest']).div(100))
    .mul(loan_data['LoanDuration'])
)

In [None]:
# Step 2
def eligible_loan_amnt(df):
  """Return the eligible loan amount (ELA) for every row of ``df``.

  For each row the result is 'Total_Loan_Amnt' when it does not exceed
  'Ava_Inc', otherwise 'Ava_Inc'.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns 'Ava_Inc' and 'Total_Loan_Amnt'.

  Returns
  -------
  numpy.ndarray
      Float array with one value per row, in row order.
  """
  available = df['Ava_Inc'].to_numpy(dtype=float)
  requested = df['Total_Loan_Amnt'].to_numpy(dtype=float)
  # np.where keeps the original comparison semantics (a failed comparison,
  # e.g. against NaN, falls back to 'Ava_Inc') without growing an array
  # element by element.
  return np.where(requested <= available, requested, available)

In [None]:
# Store the eligible loan amount target; relies on the 'Ava_Inc' and
# 'Total_Loan_Amnt' columns created in Step 1.
loan_data['ELA'] = eligible_loan_amnt(loan_data)

III. Preferred ROI (PROI) ---> needs redefining ---> Redefined

In [None]:
def PROI(df):
    """Compute the preferred ROI (PROI) for every row of ``df``.

    Side effects: adds (or overwrites) the columns 'InterestAmount',
    'TotalAmount', 'ROI' and 'PROI' on ``df`` itself.

    PROI starts at the median ROI of the frame and is shifted in steps of 5:

    * LoanDuration <= 30                -> -5
    * 850 <= AppliedAmount <= 1175      -> -5
    * AppliedAmount > 2000              -> +5
    * IncomeTotal <= 1000               -> -5
    * DebtToIncome == 0                 -> -5, otherwise +5

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Amount', 'Interest', 'LoanDuration', 'AppliedAmount',
        'IncomeTotal' and 'DebtToIncome'.

    Returns
    -------
    pandas.Series
        The 'PROI' column of ``df``.
    """
    # Calculate ROI on the frame that was passed in (not on a global), so the
    # function works for any DataFrame and any index.
    df['InterestAmount'] = df['Amount'] * (df['Interest'] / 100)
    df['TotalAmount'] = df['InterestAmount'] + df['Amount']
    df['ROI'] = (df['InterestAmount'] / df['TotalAmount']) * 100

    applied = df['AppliedAmount']

    # Each rule contributes an independent per-row offset. Computing them as
    # arrays avoids chained-indexing writes (df['PROI'].loc[i] = ...), which
    # pandas does not guarantee to reach the frame.
    duration_adj = np.where(df['LoanDuration'] <= 30, -5, 0)
    amount_adj = np.select(
        [(applied >= 850) & (applied <= 1175), applied > 2000],
        [-5, 5],
        default=0,
    )
    income_adj = np.where(df['IncomeTotal'] <= 1000, -5, 0)
    debt_adj = np.where(df['DebtToIncome'] == 0, -5, 5)

    df['PROI'] = df['ROI'].median() + (duration_adj + amount_adj + income_adj + debt_adj)

    return df['PROI']

In [None]:
# Store the preferred-ROI target. PROI() also writes 'InterestAmount',
# 'TotalAmount' and 'ROI' columns as a side effect.
loan_data['PROI'] = PROI(loan_data)

IV. LoanStatus

In [None]:
# Binary-encode the classification target: 1 for 'NoDefault', 0 for anything else.
# Re-running this cell on the already encoded column maps every row to 0,
# because no value equals the string 'NoDefault' any more.
loan_data['LoanStatus'] = loan_data['LoanStatus'].eq('NoDefault').astype(int)

#### **Handling Outliers:**

In [None]:
# Inter-quartile range (Q3 - Q1) of every float/int column.
# These three Series are a summary only: the clipping cells that follow
# recompute their own per-column bounds.
df_IQR = loan_data.select_dtypes([float, int]).quantile(.75).sub(
    loan_data.select_dtypes([float, int]).quantile(.25)
)

# Tukey fences: Q3 + 1.5 * IQR (upper) and Q1 - 1.5 * IQR (lower).
df_Max = loan_data.select_dtypes([float, int]).quantile(.75).add(df_IQR.mul(1.5))
df_Min = loan_data.select_dtypes([float, int]).quantile(.25).sub(df_IQR.mul(1.5))

In [None]:
# Cap upper outliers: every value above Q3 + 1.5 * IQR is replaced by that bound.
# NOTE(review): the loop covers every float/int column, which includes the encoded
# 'LoanStatus' target; for a heavily imbalanced 0/1 column the IQR is 0 and the
# minority value would be overwritten — confirm whether targets should be excluded.
for column in loan_data.select_dtypes([float, int]).columns:
    col_IQR = loan_data[column].quantile(.75) - loan_data[column].quantile(.25)
    col_Max = loan_data[column].quantile(.75) + (1.5*col_IQR)
    # Assign the whole clipped column back instead of writing through chained
    # indexing (loan_data[column][mask] = ...), which pandas does not guarantee
    # to reach the original frame.
    loan_data[column] = loan_data[column].clip(upper=col_Max)

In [None]:
# Raise lower outliers: every value below Q1 - 1.5 * IQR is replaced by that bound.
# NOTE(review): the loop covers every float/int column, which includes the encoded
# 'LoanStatus' target; for a heavily imbalanced 0/1 column the IQR is 0 and the
# minority value would be overwritten — confirm whether targets should be excluded.
for column in loan_data.select_dtypes([float, int]).columns:
    col_IQR = loan_data[column].quantile(.75) - loan_data[column].quantile(.25)
    col_Min = loan_data[column].quantile(.25) - (1.5*col_IQR)
    # Assign the whole clipped column back instead of writing through chained
    # indexing (loan_data[column][mask] = ...), which pandas does not guarantee
    # to reach the original frame.
    loan_data[column] = loan_data[column].clip(lower=col_Min)

#### 3. **X, y split**

In [None]:
# Defining Independent variables Dataset
# Every column used as a target below is removed from the features. 'ROI' is
# selected into y_reg, so leaving it in X would hand the model its own target.
# NOTE(review): 'InterestAmount' and 'TotalAmount' stay in X here and together
# determine ROI — confirm whether they should be dropped as well.
X = loan_data.drop(['EMI', 'ELA', 'PROI', 'ROI', 'LoanStatus'], axis=1)

# Assigning target variables for both Models
# NOTE(review): the section above introduces the third regression target as
# PROI, but 'ROI' is what is selected here — confirm which one is intended.
y_reg = loan_data[['EMI', 'ELA', 'ROI']]
y_class = loan_data['LoanStatus']

#### 4. Feature Selection

In [None]:
# A function to select highly correlated features.
def Correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    A column is reported when the absolute Pearson correlation between it and
    at least one column that precedes it (in column order) is strictly greater
    than ``threshold``. The first column of each correlated pair is therefore
    never reported on account of that pair.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature table. Only numeric and boolean columns take part; other
        columns are ignored.
    threshold : float
        Absolute-correlation cut-off.

    Returns
    -------
    set
        Column names exceeding the threshold against an earlier column.
    """
    # Select numeric/bool columns explicitly: DataFrame.corr() on a frame that
    # still holds string columns raises on recent pandas versions, while older
    # versions silently skipped them.
    numeric_part = dataset.select_dtypes(include=['number', 'bool'])
    abs_corr = numeric_part.corr().abs()

    correlated_features = set()  # container of highly correlated features
    for position, column_name in enumerate(abs_corr.columns):
        # Row `position`, columns [0, position): the strictly-lower triangle.
        if (abs_corr.iloc[position, :position] > threshold).any():
            correlated_features.add(column_name)
    return correlated_features

In [None]:
# List the features whose absolute correlation with an earlier column exceeds 0.8.
Correlation(X, 0.8)

{'Amount',
 'AmountOfPreviousLoansBeforeLoan',
 'NoOfPreviousLoansBeforeLoan',
 'TotalAmount',
 'Total_Loan_Amnt'}

In [None]:
# Drop three of the highly correlated features listed above.
# NOTE(review): 'Amount' and 'Total_Loan_Amnt' are also in that list but are kept
# here — confirm this is intentional.
# Rebinding X (rather than inplace=True) with errors='ignore' keeps the cell
# re-runnable: a second run no longer raises KeyError for already-dropped columns.
X = X.drop(
    columns=['AmountOfPreviousLoansBeforeLoan', 'NoOfPreviousLoansBeforeLoan', 'TotalAmount'],
    errors='ignore',
)

---

#### **train, test split**

In [None]:
# Train/test split for classification: 25% of the rows are held out, seeded with random_state=0.
# NOTE(review): the split is not stratified on y_class — consider stratify=y_class
# if the classes are imbalanced.
X_class_train, X_class_test, y_class_train, y_class_test = train_test_split(X, y_class, test_size=0.25, random_state=0)

# Train/test split for regression, using the same X, test_size and random_state as above.
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(X, y_reg, test_size=0.25, random_state=0)

---

## data preprocessing

In [None]:
# Defining StandardScaler
# NOTE(review): this single instance (and the PCA below) is shared by every
# pipeline in the notebook, so fitting one pipeline refits the step objects
# held by the others.
stdscaler = StandardScaler()

# Defining PCA by explained variance rather than a fixed component count.
# The previous n_components=110 exceeds the number of feature columns this
# notebook builds (39 loaded columns plus a few derived ones, minus the dropped
# ones), and PCA.fit rejects a count larger than min(n_samples, n_features).
# A float in (0, 1) keeps as many components as are needed to explain that
# share of the variance. NOTE(review): 0.95 is a placeholder — tune as needed.
pca = PCA(n_components=0.95)

---

## classification

**ensemble learning**

In [None]:
# Base learners: used inside the stacking ensemble and also as the final step
# of their own standalone pipelines further down.
qda = QuadraticDiscriminantAnalysis()

# Seeded so the forest is reproducible across runs.
rf = RandomForestClassifier(random_state=0)

In [None]:
# Stacked ensemble: QDA and Random Forest are the base learners, and a gradient
# boosting model is trained on their predictions to produce the final output.
estimators = [('QDA', qda), ('RandomForest', rf)]
# Seed the meta-learner as well, consistent with rf(random_state=0), so
# repeated runs give the same model.
final_estimator = GradientBoostingClassifier(random_state=0)
clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator)

In [None]:
# Ensemble pipeline: standardise -> PCA -> stacked classifier.
model_ensemble = Pipeline(
    steps=[
        ('stdscaler', stdscaler),
        ('pca', pca),
        ('classifier', clf),
    ]
)

In [None]:
# Fit on the classification split created by the train/test split cell.
# X_train / yStatus_train / X_test / yStatus_test are not defined anywhere in
# this notebook, so the cell failed on a fresh kernel.
# NOTE(review): the stored output below this cell was produced with a different
# (4-class) target and should be regenerated.
model_ensemble.fit(X_class_train, y_class_train)
pred = model_ensemble.predict(X_class_test)
print('test accuracy = ', round(accuracy_score(y_class_test, pred)*100, 2), '%')

test accuracy =  90.02 %


In [None]:
# Per-class precision/recall/F1 against the held-out classification labels
# (yStatus_test is not defined in this notebook; y_class_test is).
print(classification_report(y_class_test, pred, digits=3))

              precision    recall  f1-score   support

           0      0.902     0.998     0.948     11465
           1      0.435     0.013     0.024       797
           2      0.000     0.000     0.000       343
           3      0.259     0.059     0.096       119

    accuracy                          0.900     12724
   macro avg      0.399     0.267     0.267     12724
weighted avg      0.843     0.900     0.856     12724



**QDA**

In [None]:
# QDA pipeline: standardise -> PCA -> quadratic discriminant analysis.
model_qda = Pipeline(
    steps=[
        ('stdscaler', stdscaler),
        ('pca', pca),
        ('classifier', qda),
    ]
)

In [None]:
# Fit on the classification split created by the train/test split cell.
# X_train / yClosed_train / X_test / yClosed_test are not defined anywhere in
# this notebook, so the cell failed on a fresh kernel.
model_qda.fit(X_class_train, y_class_train)
pred = model_qda.predict(X_class_test)
print('test accuracy = ', round(accuracy_score(y_class_test, pred)*100, 2), '%')

test accuracy =  89.84 %


In [None]:
# Per-class precision/recall/F1 against the held-out classification labels
# (yClosed_test is not defined in this notebook; y_class_test is).
print(classification_report(y_class_test, pred, digits=3))

              precision    recall  f1-score   support

           0      0.310     0.110     0.162      1140
           1      0.918     0.976     0.946     11584

    accuracy                          0.898     12724
   macro avg      0.614     0.543     0.554     12724
weighted avg      0.863     0.898     0.876     12724



**Random Forest**

In [None]:
# Random Forest pipeline: standardise -> PCA -> random forest classifier.
model_rf = Pipeline(
    steps=[
        ('stdscaler', stdscaler),
        ('pca', pca),
        ('classifier', rf),
    ]
)

In [None]:
# Fit on the classification split created by the train/test split cell.
# X_train / yStatus_train / X_test / yStatus_test are not defined anywhere in
# this notebook, so the cell failed on a fresh kernel.
model_rf.fit(X_class_train, y_class_train)
pred = model_rf.predict(X_class_test)
print('test accuracy = ', round(accuracy_score(y_class_test, pred)*100, 2), '%')

test accuracy =  88.68 %


In [None]:
# Score the Random Forest against the same labels it was fitted and evaluated on
# in the previous cell. The report used to be computed against a different
# target (yClosed_test) than the one the model was trained on (yStatus_train),
# which is why the stored report disagrees with the printed accuracy.
print(classification_report(y_class_test, pred, digits=3))

              precision    recall  f1-score   support

           0      0.082     0.883     0.150      1140
           1      0.648     0.015     0.030     11584
           2      0.000     0.000     0.000         0
           3      0.000     0.000     0.000         0

    accuracy                          0.093     12724
   macro avg      0.183     0.225     0.045     12724
weighted avg      0.597     0.093     0.040     12724



---

## saving model

In [None]:
# Persist the three fitted pipelines. Each file is opened in a `with` block so
# the handle is flushed and closed even if pickling fails.
# The '../models/' directory must already exist.
with open('../models/ensemble.pkl', 'wb') as model_file:
    pickle.dump(model_ensemble, model_file)

with open('../models/qda.pkl', 'wb') as model_file:
    pickle.dump(model_qda, model_file)

with open('../models/rf.pkl', 'wb') as model_file:
    pickle.dump(model_rf, model_file)

In [None]:
# To load a saved model (only unpickle files you trust):
# with open('../models/ensemble.pkl', 'rb') as model_file:
#     pickled_model = pickle.load(model_file)
# pickled_model.predict(X_class_test)

---