#### Necessary Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

#Used for creating pipeline and data preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder

#Used for model's prediction comparison
from sklearn.metrics import classification_report

#Models used to train
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier

#### Reading the Training and Final Test Data

In [2]:
# Load the training data and the final test data, using the Loan_ID
# column as the row index of both frames.
train, test = (
    pd.read_csv(csv_path, index_col='Loan_ID')
    for csv_path in ('train.csv', 'test.csv')
)

In [3]:
# Preview the first rows of the training frame (features plus Loan_Status).
train.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
# Count how many rows fall under each Loan_Status value (the prediction target).
train['Loan_Status'].value_counts()

Y    422
N    192
Name: Loan_Status, dtype: int64

In [5]:
# (number of rows, number of columns) of the training frame.
train.shape

(614, 12)

In [6]:
# Number of missing values in each column of the training frame.
train.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

#### Creating Train and Validation Dataset.
#### Selecting categorical and numerical dataset

In [7]:
# Separate the feature columns from the target column.
X_full = train.drop('Loan_Status', axis=1)
y = train.Loan_Status

# Hold out 20% of the rows for validation; random_state pins the split so
# the reported metrics are reproducible.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y,
                                                                train_size=0.8, test_size=0.2,
                                                                random_state=0)

# Select categorical columns: object dtype with fewer than 100 distinct
# values, so encoding them stays cheap.
categorical_cols = [cname for cname in X_train_full.columns if
                    X_train_full[cname].nunique() < 100 and
                    X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if
                  X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only.
# The selection was previously computed but never applied; the frames below
# were plain copies of the full split. Subsetting makes the code match the
# stated intent and drops any column that is neither categorical nor numerical.
my_cols = categorical_cols + numerical_cols

X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

#### Creating 2 pipelines with different feature handling.

#### 1: MinMax Scaling for numerical columns and Ordinal Encoding for Categorical Columns.

#### 2: MinMax Scaling for numerical columns and One Hot Encoding for Categorical Columns.

In [8]:
# Preprocessing for numerical data.
# The scaler runs before the imputer: MinMaxScaler disregards NaNs when
# fitting and leaves them in place when transforming, so the iterative
# imputer then fills the gaps on the already-scaled features.
numerical_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('imputer', IterativeImputer(initial_strategy='median', max_iter=50))])

# Preprocessing for categorical data performing ordinal encoding.
# Missing values are filled with the most frequent category first.
# Categories that were not seen during fit are encoded as -1 instead of
# raising, mirroring the handle_unknown='ignore' behaviour of the one-hot
# variant below. The step is named 'ordinal' (it was mislabelled 'onehot').
categorical_transformer_ordinal = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Bundle preprocessing for numerical and categorical data with ordinal encoding
preprocessor_ordinal = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer_ordinal, categorical_cols)
    ])

# Preprocessing for categorical data performing one hot encoding.
# Unknown categories at predict time become an all-zero row.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data with one hot encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

#### Creating a BaggingClassifier with an XGBClassifier base model, preceded by preprocessing that ordinal-encodes the categorical data.

In [9]:
# Bag 100 XGBoost classifiers. The base estimator is passed positionally
# because its keyword was renamed from `base_estimator` to `estimator` in
# newer scikit-learn releases; the positional form works with both.
xgb_bc_ordinal_model = BaggingClassifier(XGBClassifier(), n_estimators=100, random_state=0)

# Chain the ordinal-encoding preprocessor with the bagged model so the same
# transformations are applied at fit and predict time.
xgb_bc_ordinal_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_ordinal),
    ('model', xgb_bc_ordinal_model),
])
xgb_bc_ordinal_pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the held-out validation rows.
y_true = y_valid
y_pred = xgb_bc_ordinal_pipeline.predict(X_valid)
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           N       0.78      0.55      0.64        33
           Y       0.85      0.94      0.89        90

    accuracy                           0.84       123
   macro avg       0.82      0.74      0.77       123
weighted avg       0.83      0.84      0.83       123



#### Creating a BaggingClassifier with an XGBClassifier base model, preceded by preprocessing that one-hot-encodes the categorical data.

In [10]:

# Bag 100 XGBoost classifiers. The base estimator is passed positionally
# because its keyword was renamed from `base_estimator` to `estimator` in
# newer scikit-learn releases; the positional form works with both.
xgb_bc_onehot_model = BaggingClassifier(XGBClassifier(), n_estimators=100, random_state=0)

# Chain the one-hot-encoding preprocessor with the bagged model so the same
# transformations are applied at fit and predict time.
xgb_bc_onehot_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb_bc_onehot_model),
])
xgb_bc_onehot_pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the held-out validation rows.
y_true = y_valid
y_pred = xgb_bc_onehot_pipeline.predict(X_valid)
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           N       0.76      0.58      0.66        33
           Y       0.86      0.93      0.89        90

    accuracy                           0.84       123
   macro avg       0.81      0.75      0.77       123
weighted avg       0.83      0.84      0.83       123



#### Creating a Voting Classifier Model by combining the 2 BaggingClassifier models created above

In [11]:
# Named members of the ensemble. The names become the keys used by
# VotingClassifier (e.g. for set_params), so the 'oridnal' typo is fixed.
trying_estimators = [
    ('ordinal_pipeline', xgb_bc_ordinal_pipeline),
    ('onehot_pipeline', xgb_bc_onehot_pipeline),
]

# Soft voting averages the class probabilities predicted by the two pipelines.
vc_model = VotingClassifier(estimators=trying_estimators, voting='soft')

# VotingClassifier fits fresh clones of the supplied pipelines, so this
# retrains both members on the training split. fit() returns the estimator
# itself, so no reassignment is needed.
vc_model.fit(X_train, y_train)

# Score the ensemble on the held-out validation rows.
y_true = y_valid
y_pred = vc_model.predict(X_valid)
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           N       0.79      0.58      0.67        33
           Y       0.86      0.94      0.90        90

    accuracy                           0.85       123
   macro avg       0.83      0.76      0.78       123
weighted avg       0.84      0.85      0.84       123



#### Creating the submission file

In [12]:
# Predict a loan status for every row of the final test set with the
# voting ensemble.
preds_test = vc_model.predict(test)

# Build the two-column submission frame: the Loan_ID index values alongside
# the predictions, then write it without the positional row index.
output = (
    test.index
    .to_frame(index=False, name='Loan_ID')
    .assign(Loan_Status=preds_test)
)
output.to_csv('submission.csv', index=False)