##   New Model

In [55]:
# Importing libraries 
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [56]:
df = pd.read_csv('train_custom.csv')
df.head()

Unnamed: 0,ID,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income
0,74768,LP002231,1,1,0,1,0,8328,0.0,17,363,1,2,1,6000
1,79428,LP001448,1,1,0,0,0,150,3857.458782,188,370,1,1,0,6000
2,70497,LP002231,0,0,0,0,0,4989,314.472511,17,348,1,0,0,6000
3,87480,LP001385,1,1,0,0,0,150,0.0,232,359,1,1,1,3750
4,33964,LP002231,1,1,1,0,0,8059,0.0,17,372,1,0,1,3750


In [57]:
df.shape

(5902, 15)

In [58]:
df['Dependents'] = df['Dependents'].str.replace('3+', '3').astype('int64')
df['Loan_ID'] = df['Loan_ID'].str.replace('LP', '').astype('int64')


In [59]:
df.head()

Unnamed: 0,ID,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income
0,74768,2231,1,1,0,1,0,8328,0.0,17,363,1,2,1,6000
1,79428,1448,1,1,0,0,0,150,3857.458782,188,370,1,1,0,6000
2,70497,2231,0,0,0,0,0,4989,314.472511,17,348,1,0,0,6000
3,87480,1385,1,1,0,0,0,150,0.0,232,359,1,1,1,3750
4,33964,2231,1,1,1,0,0,8059,0.0,17,372,1,0,1,3750


In [60]:
# Create new features

# 1. Loan-to-Income Ratio: Measures the proportion of loan amount to the applicant's income.
df['Loan_to_Income_Ratio'] = df['LoanAmount'] / (df['ApplicantIncome'] + 1)  # Add 1 to avoid division by zero

# 2. Total Income: Sum of applicant and co-applicant incomes (if applicable).
df['Total_Income'] = df['ApplicantIncome'] + df['CoapplicantIncome']

# 3. Income Stability Indicator: Categorical variable based on income brackets (low, medium, high).
income_bins = [0, 2500, 4000, 6000, float('inf')]
income_labels = ['Low', 'Medium', 'High', 'Very High']
df['Income_Stability'] = pd.cut(df['ApplicantIncome'], bins=income_bins, labels=income_labels)

# Encoding categorical feature for Income_Stability
df['Income_Stability_Encoded'] = df['Income_Stability'].cat.codes

# Compute the dependents income ratio

# Replace NaN or 0 values in Dependents with 1 to avoid division by zero
df['Dependents'] = df['Dependents'].replace(0, 1)
df['Dependents_Income_Ratio'] = df['Total_Income'] / df['Dependents']

# Verify the new features
print(df[['Loan_to_Income_Ratio', 'Total_Income', 'Income_Stability', 'Income_Stability_Encoded']].head())


   Loan_to_Income_Ratio  Total_Income Income_Stability  \
0              0.002041   8328.000000        Very High   
1              1.245033   4007.458782              Low   
2              0.003407   5303.472511             High   
3              1.536424    150.000000              Low   
4              0.002109   8059.000000        Very High   

   Income_Stability_Encoded  
0                         3  
1                         0  
2                         2  
3                         0  
4                         3  


In [61]:
# Handle missing or zero values in 'LoanAmount' and 'ApplicantIncome' before calculation
df['ApplicantIncome'] = df['ApplicantIncome'].replace(0, 1)  # Avoid division by zero

# Compute Debt-to-Income Ratio (DTI)
df['Debt_to_Income_Ratio'] = df['LoanAmount'] / (df['ApplicantIncome'] + df['CoapplicantIncome'])

# Replace any NaN or inf values in the new feature (edge cases)
df['Debt_to_Income_Ratio'].replace([float('inf'), -float('inf')], 0, inplace=True)
df['Debt_to_Income_Ratio'].fillna(0, inplace=True)

In [112]:
df_now = df[['ApplicantIncome', 'Loan_to_Income_Ratio', 'Income_Stability', 'Married',
             'LoanAmount', 'Total_Income', 'Debt_to_Income_Ratio', 'Income_Stability_Encoded',
             'Self_Employed',
             'Dependents_Income_Ratio', 'Dependents']]
df_now

Unnamed: 0,ApplicantIncome,Loan_to_Income_Ratio,Income_Stability,Married,LoanAmount,Total_Income,Debt_to_Income_Ratio,Income_Stability_Encoded,Self_Employed,Dependents_Income_Ratio,Dependents
0,8328,0.002041,Very High,1,17,8328.000000,0.002041,3,0,8328.000000,1
1,150,1.245033,Low,1,188,4007.458782,0.046913,0,0,4007.458782,1
2,4989,0.003407,High,0,17,5303.472511,0.003205,2,0,5303.472511,1
3,150,1.536424,Low,1,232,150.000000,1.546667,0,0,150.000000,1
4,8059,0.002109,Very High,1,17,8059.000000,0.002109,3,0,8059.000000,1
...,...,...,...,...,...,...,...,...,...,...,...
5897,1297,0.013097,Low,1,17,4590.124489,0.003704,0,0,4590.124489,1
5898,7358,0.002310,Very High,1,17,7358.000000,0.002310,3,0,7358.000000,1
5899,150,1.125828,Low,1,170,150.000000,1.133333,0,0,75.000000,2
5900,2230,0.099507,Low,0,222,2230.000000,0.099552,0,0,2230.000000,1


In [113]:
df_now_cat = pd.get_dummies(df_now).astype(int)
df_now_cat.head()

Unnamed: 0,ApplicantIncome,Loan_to_Income_Ratio,Married,LoanAmount,Total_Income,Debt_to_Income_Ratio,Income_Stability_Encoded,Self_Employed,Dependents_Income_Ratio,Dependents,Income_Stability_Low,Income_Stability_Medium,Income_Stability_High,Income_Stability_Very High
0,8328,0,1,17,8328,0,3,0,8328,1,0,0,0,1
1,150,1,1,188,4007,0,0,0,4007,1,1,0,0,0
2,4989,0,0,17,5303,0,2,0,5303,1,0,0,1,0
3,150,1,1,232,150,1,0,0,150,1,1,0,0,0
4,8059,0,1,17,8059,0,3,0,8059,1,0,0,0,1


In [184]:
# importation of necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, f1_score, accuracy_score, recall_score, mean_absolute_error
import xgboost as xgb
from sklearn.model_selection import cross_val_score

In [115]:
# applying normalization
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_now_cat)
len(df_scaled)

5902

In [116]:
y = df['Loan_Status']

In [165]:
X_train, X_test, y_train, y_test = train_test_split(df_now_cat, y, 
                                                    test_size=0.2, random_state=42)

In [118]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance the minority class
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [119]:
X_train.shape

(4721, 14)

In [120]:
# Evaluate the models
def evaluate(y_test, y_pred):
    print('F1 Score: ', f1_score(y_test, y_pred))
    print('Recall Score: ', recall_score(y_test, y_pred))
    print('Accuracy score: ', accuracy_score(y_test, y_pred))
    print('Precision score', precision_score(y_test, y_pred))
    print('MAE: ', mean_absolute_error(y_test, y_pred))

In [166]:
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

In [167]:
# Make predictions
y_pred_rf = rf_model.predict(X_test)
evaluate(y_test, y_pred_rf)

F1 Score:  0.8981132075471697
Recall Score:  0.952
Accuracy score:  0.817104149026249
Precision score 0.85
MAE:  0.18289585097375105


In [168]:
y_pred_rf[:100]

array([1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [124]:

y_test[:100].values

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1], dtype=int64)

In [179]:
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)

In [182]:
xgb_pred = xgb_model.predict(X_test)
evaluate(y_test, xgb_pred)

F1 Score:  0.9061338289962826
Recall Score:  0.975
Accuracy score:  0.8289585097375106
Precision score 0.8463541666666666
MAE:  0.1710414902624894


### ANN

In [125]:
# simple neural network
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [169]:
# Build the Second ANN model
model1 = Sequential()
model1.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))  # Input layer with 64 neurons
model1.add(Dropout(0.2))                                              # Dropout layer
model1.add(Dense(32, activation='relu'))                              # Hidden layer with 32 neurons
model1.add(Dense(1,  activation='sigmoid'))                            # Output layer for binary classification

# Compile the model
model1.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model1.fit(X_train, y_train, epochs=200, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/200


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.5826 - loss: 495.9828 - val_accuracy: 0.8466 - val_loss: 22.8042
Epoch 2/200
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.7052 - loss: 61.8215 - val_accuracy: 0.8455 - val_loss: 24.9190
Epoch 3/200
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7372 - loss: 30.3446 - val_accuracy: 0.8455 - val_loss: 13.6625
Epoch 4/200
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7361 - loss: 21.9961 - val_accuracy: 0.8466 - val_loss: 7.3516
Epoch 5/200
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6955 - loss: 19.4380 - val_accuracy: 0.8095 - val_loss: 2.5317
Epoch 6/200
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7199 - loss: 10.7414 - val_accuracy: 0.8466 - val_loss: 7.0291
Epoch 7/200
[1m118/118

<keras.src.callbacks.history.History at 0x20689136f10>

In [170]:
# Prediction for the training and testing dataset
ann_pred_test = (model1.predict(X_test) > 0.5).astype(int).flatten()
# Evaluating the model
evaluate(y_test, ann_pred_test)

[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
F1 Score:  0.9170105456212747
Recall Score:  1.0
Accuracy score:  0.8467400508044031
Precision score 0.8467400508044031
MAE:  0.15325994919559696


In [171]:
ann_pred_test[:100]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

### Rotating Forest

In [172]:
# Using PCA


from sklearn.decomposition import PCA


# Define the number of subsets and PCA components per subset
num_subsets = 3
subset_size = X_train.shape[1] // num_subsets
rotated_features_train = []
rotated_features_test = []


for i in range(num_subsets):
    # Select a subset of features
    subset_indices = np.random.choice(X_train.shape[1], subset_size, replace=False)
    X_train_subset = X_train[:, subset_indices]
    X_test_subset = X_test[:, subset_indices]

    # Apply PCA to the subset of features
    pca = PCA(n_components=subset_size)
    rotated_train = pca.fit_transform(X_train_subset)
    rotated_test = pca.transform(X_test_subset)

    # Store the rotated features
    rotated_features_train.append(rotated_train)
    rotated_features_test.append(rotated_test)

    # Concatenate all rotated features
X_train_rotated = np.hstack(rotated_features_train)
X_test_rotated = np.hstack(rotated_features_test)

InvalidIndexError: (slice(None, None, None), array([ 0, 10, 11,  3]))

In [None]:
### Step 2: Train the Rotation Forest with Decision Trees (J48-like) as Base Models

# Use Random Forest as an ensemble of decision trees
rotation_forest_model = RandomForestClassifier(
    criterion='gini',
    n_estimators=100,  # Number of J48-like trees
    max_depth=5,      # Limit depth for simplicity (similar to J48 constraints)
    random_state=42
)
rotation_forest_model.fit(X_train_rotated, y_train)

In [131]:
# Step 3: Evaluate the Model
y_pred_rot = rotation_forest_model.predict(X_test_rotated)
evaluate(y_test, y_pred_rot)

F1 Score:  0.9170105456212747
Recall Score:  1.0
Accuracy score:  0.8467400508044031
Precision score 0.8467400508044031
MAE:  0.15325994919559696


In [132]:
y_pred_rot[:100]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

### Stacking classification

In [133]:

estimators = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42))
]

stack_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
stack_clf.fit(X_train, y_train)


In [134]:
stack_pred = stack_clf.predict(X_test)
evaluate(y_test, stack_pred)

F1 Score:  0.9170105456212747
Recall Score:  1.0
Accuracy score:  0.8467400508044031
Precision score 0.8467400508044031
MAE:  0.15325994919559696


In [135]:
stack_pred[:100]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [175]:
# Train Naive Bayes model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

In [176]:
nb_pred = nb_model.predict(X_test)
evaluate(y_test, nb_pred)

F1 Score:  0.9025304592314901
Recall Score:  0.963
Accuracy score:  0.8238780694326842
Precision score 0.8492063492063492
MAE:  0.17612193056731584


In [138]:
nb_pred[:100]

array([1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [177]:
# Calculating the features importance
features_names = df_now_cat.columns.unique()
features_names

importance = rf_model.feature_importances_
importance
imp_df = pd.DataFrame({
    'Features': features_names, 'Importance': importance}).sort_values(
    'Importance', ascending=False)
imp_df

Unnamed: 0,Features,Importance
4,Total_Income,0.256096
8,Dependents_Income_Ratio,0.255401
0,ApplicantIncome,0.236057
3,LoanAmount,0.193355
9,Dependents,0.016033
2,Married,0.015504
7,Self_Employed,0.013569
1,Loan_to_Income_Ratio,0.003809
6,Income_Stability_Encoded,0.002783
5,Debt_to_Income_Ratio,0.001871


### Real testing data

In [140]:
# Predicting the real test data
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,ID,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Total_Income
0,70607,LP002560,1,1,0,1,0,15890,871.075952,188,371,1,1,6000
1,58412,LP001379,1,1,0,0,1,6582,896.718887,17,373,0,1,6000
2,88755,LP002560,0,0,0,0,0,7869,572.900354,17,373,1,1,6000
3,97271,LP002560,1,1,0,0,0,150,0.0,247,349,1,2,6000
4,70478,LP002231,1,1,0,0,0,8362,0.0,17,12,1,2,3750


In [141]:
df_test['Dependents'] = df_test['Dependents'].str.replace('3+', '3').astype('int64')
df_test['Loan_ID'] = df_test['Loan_ID'].str.replace('LP', '').astype('int64')

In [142]:
# Create new features for the test data

# 1. Loan-to-Income Ratio: Measures the proportion of loan amount to the applicant's income.
df_test['Loan_to_Income_Ratio'] = df_test['LoanAmount'] / (df_test['ApplicantIncome'] + 1)  
# Add 1 to avoid division by zero

# 2. Total Income: Sum of applicant and co-applicant incomes (if applicable).
df_test['Total_Income'] = df_test['ApplicantIncome'] + df_test['CoapplicantIncome']

# 3. Income Stability Indicator: Categorical variable based on income brackets (low, medium, high).
income_bins = [0, 2500, 4000, 6000, float('inf')]
income_labels = ['Low', 'Medium', 'High', 'Very High']
df_test['Income_Stability'] = pd.cut(df_test['ApplicantIncome'], 
                                     bins=income_bins, labels=income_labels)

# Encoding categorical feature for Income_Stability
df_test['Income_Stability_Encoded'] = df_test['Income_Stability'].cat.codes

# Compute the dependents income ratio

# Replace NaN or 0 values in Dependents with 1 to avoid division by zero
df_test['Dependents'] = df_test['Dependents'].replace(0, 1)
df_test['Dependents_Income_Ratio'] = df_test['Total_Income'] / df_test['Dependents']

# Verify the new features
print(df_test[['Loan_to_Income_Ratio', 'Total_Income', 'Income_Stability', 
               'Income_Stability_Encoded']].head())


   Loan_to_Income_Ratio  Total_Income Income_Stability  \
0              0.011831  16761.075952        Very High   
1              0.002582   7478.718887        Very High   
2              0.002160   8441.900354        Very High   
3              1.635762    150.000000              Low   
4              0.002033   8362.000000        Very High   

   Income_Stability_Encoded  
0                         3  
1                         3  
2                         3  
3                         0  
4                         3  


In [144]:
# Handle missing or zero values in 'LoanAmount' and 'ApplicantIncome' before calculation
df_test['ApplicantIncome'] = df_test['ApplicantIncome'].replace(0, 1)  # Avoid division by zero

# Compute Debt-to-Income Ratio (DTI)
df_test['Debt_to_Income_Ratio'] = df_test['LoanAmount'] / (df_test['ApplicantIncome'] + df_test['CoapplicantIncome'])

# Replace any NaN or inf values in the new feature (edge cases)
df_test['Debt_to_Income_Ratio'].replace([float('inf'), -float('inf')], 0, inplace=True)
df_test['Debt_to_Income_Ratio'].fillna(0, inplace=True)

In [145]:
df_test_now = df_test[['ApplicantIncome', 'Loan_to_Income_Ratio', 'Income_Stability', 'Married',
             'LoanAmount', 'Total_Income', 'Debt_to_Income_Ratio', 'Income_Stability_Encoded',
             'Self_Employed',
             'Dependents_Income_Ratio', 'Dependents'
                      ]]
df_test_now

Unnamed: 0,ApplicantIncome,Loan_to_Income_Ratio,Income_Stability,Married,LoanAmount,Total_Income,Debt_to_Income_Ratio,Income_Stability_Encoded,Self_Employed,Dependents_Income_Ratio,Dependents
0,15890,0.011831,Very High,1,188,16761.075952,0.011216,3,0,16761.075952,1
1,6582,0.002582,Very High,1,17,7478.718887,0.002273,3,1,7478.718887,1
2,7869,0.002160,Very High,0,17,8441.900354,0.002014,3,0,8441.900354,1
3,150,1.635762,Low,1,247,150.000000,1.646667,0,0,150.000000,1
4,8362,0.002033,Very High,1,17,8362.000000,0.002033,3,0,8362.000000,1
...,...,...,...,...,...,...,...,...,...,...,...
2523,7276,0.023086,Very High,1,168,7276.000000,0.023090,3,0,7276.000000,1
2524,8153,0.002085,Very High,1,17,9738.948119,0.001746,3,1,9738.948119,1
2525,16382,0.001038,Very High,1,17,16982.813513,0.001001,3,0,16982.813513,1
2526,7791,0.010010,Very High,1,78,7791.000000,0.010012,3,0,7791.000000,1


In [146]:
df_test_now_cat = pd.get_dummies(df_test_now).astype(int)
df_test_now_cat.head()

Unnamed: 0,ApplicantIncome,Loan_to_Income_Ratio,Married,LoanAmount,Total_Income,Debt_to_Income_Ratio,Income_Stability_Encoded,Self_Employed,Dependents_Income_Ratio,Dependents,Income_Stability_Low,Income_Stability_Medium,Income_Stability_High,Income_Stability_Very High
0,15890,0,1,188,16761,0,3,0,16761,1,0,0,0,1
1,6582,0,1,17,7478,0,3,1,7478,1,0,0,0,1
2,7869,0,0,17,8441,0,3,0,8441,1,0,0,0,1
3,150,1,1,247,150,1,0,0,150,1,1,0,0,0
4,8362,0,1,17,8362,0,3,0,8362,1,0,0,0,1


In [147]:
# applying normalization
scaler = StandardScaler()
df_test_scaled = scaler.fit_transform(df_test_now_cat)
len(df_test_scaled)

2528

In [148]:
# RF prediction
rf_pred_test = rf_model.predict(df_test_scaled)

In [149]:
rf_pred_test[:100]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1], dtype=int64)

In [150]:
## ANN prediction

ann_pred_test = (model1.predict(df_test_scaled) > 0.5).astype(int).flatten()

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [151]:
ann_pred_test[:100]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

#### Rotation Forest Prediction

In [152]:
num_subsets = 3
subset_size = df_test_scaled.shape[1] // num_subsets
rotated_features_real_test = []

for i in range(num_subsets):
    # Select a subset of features
    subset_indices = np.random.choice(df_test_scaled.shape[1], subset_size, replace=False)
    X_real_test_subset = df_test_scaled[:, subset_indices]

    # Apply PCA to the subset of features
    rotated_real_test = pca.transform(X_real_test_subset)
    
    # Store the rotated features
    rotated_features_real_test.append(rotated_real_test)
# Concatenate all rotated features
X_real_test_rotated = np.hstack(rotated_features_real_test)

In [153]:
# Prediction the real testing dataset
pca_pred_test = rotation_forest_model.predict(X_real_test_rotated)

In [154]:
pca_pred_test[:100]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [155]:
stack_pred_test = stack_clf.predict(df_test_scaled)

In [156]:
stack_pred_test[:100]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [157]:
nb_pred_test = nb_model.predict(df_test_scaled)

In [158]:
nb_pred_test[:100]

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

### Saving the predictions csv file

In [173]:
predictions = pd.DataFrame({'ID': df_test['ID'], 
                                  'Predicted_Status': nb_pred_test})
predictions.head()

Unnamed: 0,ID,Predicted_Status
0,70607,1
1,58412,1
2,88755,1
3,97271,1
4,70478,1


In [174]:
# Saving the prediction in a csv file format
predictions.to_csv('predictions_nw_cln1.csv', index=False)