In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, accuracy_score, f1_score, recall_score, precision_score
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, f_classif

from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import StackingClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.svm import SVC, NuSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib

In [2]:
df=pd.read_csv(r"/content/Train.csv")
df_test=pd.read_csv(r"/content/Test.csv")

In [3]:
df.columns

Index(['ID', 'Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status',
       'Total_Income'],
      dtype='object')

In [4]:
df['Loan_Status'].value_counts()

Unnamed: 0_level_0,count
Loan_Status,Unnamed: 1_level_1
1,4913
0,985


In [5]:
ntrain=df.shape[0]
ntest=df_test.shape[0]
y=df.Loan_Status
df= pd.concat((df,df_test) ).reset_index(drop=True)
df.drop(["Loan_Status"],axis=1,inplace=True)

In [6]:
df.describe()

Unnamed: 0,ID,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Total_Income
count,8426.0,8426.0,8426.0,8426.0,8426.0,8426.0,8426.0,8426.0,8426.0,8426.0,8426.0,8426.0
mean,55079.633515,0.912414,0.848208,0.119748,0.106338,7761.107762,1251.066896,97.129718,359.291004,0.92262,1.192974,5224.093995
std,26122.137744,0.282709,0.358841,0.324686,0.308287,9026.499548,2344.336445,103.96457,48.732943,0.267209,0.725968,2287.255588
min,10001.0,0.0,0.0,0.0,0.0,150.0,0.0,17.0,12.0,0.0,0.0,1963.0
25%,32331.25,1.0,1.0,0.0,0.0,2367.25,0.0,17.0,357.0,1.0,1.0,3750.0
50%,55678.0,1.0,1.0,0.0,0.0,7001.0,132.726669,40.0,368.0,1.0,1.0,6000.0
75%,77741.5,1.0,1.0,0.0,0.0,8376.0,1723.638843,173.0,372.0,1.0,2.0,6000.0
max,99998.0,1.0,1.0,1.0,1.0,81000.0,20000.0,700.0,480.0,1.0,2.0,22500.0


In [7]:
import pandas as pd
import re




# Function to extract numbers using regular expressions
def extract_number(id_str):
    match = re.search(r'\d+', id_str)
    if match:
        return int(match.group())
    else:
        return None

# Apply the function to the 'ID' column and create a new column 'Number'
df['Loan_ID'] = df['Loan_ID'].apply(extract_number)

print(df)

         ID  Loan_ID  Gender  Married Dependents  Education  Self_Employed  \
0     74768     2231       1        1          0          1              0   
1     79428     1448       1        1          0          0              0   
2     70497     2231       0        0          0          0              0   
3     87480     1385       1        1          0          0              0   
4     33964     2231       1        1          1          0              0   
...     ...      ...     ...      ...        ...        ...            ...   
8421  15578     2175       1        1          0          0              0   
8422  87689     2582       1        1          0          0              1   
8423  42584     2231       1        1          0          0              0   
8424  44709     2224       1        1          0          0              0   
8425  75533     2139       0        1          0          1              0   

      ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amou

In [8]:
df['Dependents'].replace("3+", "3", inplace=True)
df["Dependents"]=df["Dependents"].astype(int)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Dependents'].replace("3+", "3", inplace=True)


In [9]:
df["ApplicantIncome"]=df["ApplicantIncome"].astype(int)
df["CoapplicantIncome"]=df["CoapplicantIncome"].astype(int)
df["LoanAmount"]=df["LoanAmount"].astype(int)
df["Loan_Amount_Term"]=df["Loan_Amount_Term"].astype(int)
df["Total_Income"]=df["Total_Income"].astype(int)

In [10]:
X=df[:ntrain]
df_test_C=df[ntrain:]

In [11]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,shuffle=True)
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
sc.fit(X_train)
X_train_std=sc.transform(X_train)
X_test_std=sc.transform(X_test)

In [12]:
df_test_a=pd.read_csv(r"/content/Test.csv")

In [13]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X.shape[1],)),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1, activation='sigmoid')  # Output layer with sigmoid for binary classification
])


model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
model.fit(X, y, epochs=100, batch_size=32,
           callbacks=[early_stopping])

# Make predictions on the test data (df_test_C)
dl_prediction = model.predict(df_test_C)

# Converting predictions to 0 and 1 using a threshold
dl_prediction = (dl_prediction > 0.5).astype(int)

# Creating submission DataFrame
submission = pd.DataFrame({
    'ID': df_test_a['ID'],
    'Loan_Status': dl_prediction.flatten()
})


submission.to_csv('AASSAUUUQDAAdl_pred_deepAAA.csv', index=False)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.6816 - loss: 1169.1322
Epoch 2/100
[1m 19/185[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 6ms/step - accuracy: 0.6815 - loss: 250.9127

  current = self.get_monitor_value(logs)


[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7143 - loss: 204.9512
Epoch 3/100
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7115 - loss: 56.4497
Epoch 4/100
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7502 - loss: 11.7546
Epoch 5/100
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.7811 - loss: 5.2023
Epoch 6/100
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.7776 - loss: 4.2435
Epoch 7/100
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.7921 - loss: 1.7208
Epoch 8/100
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8070 - loss: 2.0557
Epoch 9/100
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7963 - loss: 2.2056
Epoch 10/100
[1m185/185[0m [32m━━━━━━

In [None]:
# !pip install catboost

In [17]:
import catboost as cb
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [18]:
catboost_model = cb.CatBoostClassifier(
    iterations=100,
    learning_rate=0.8,
    depth=2,
    l2_leaf_reg=1,
    bagging_temperature=1,
    random_strength=0.9109,
    border_count=18,
    model_size_reg=0.01,
    od_type='Iter',
    od_wait=20,
    thread_count=1,
    metric_period=20,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
    verbose=300

)
catboost_model.fit(X_train, y_train)
catboost_predictions = catboost_model.predict(X_test)
catboost_accuracy = accuracy_score(y_test, catboost_predictions)
print("CatBoost Accuracy:", catboost_accuracy)

0:	total: 47.6ms	remaining: 4.71s
99:	total: 144ms	remaining: 0us
CatBoost Accuracy: 0.8398305084745763


In [19]:
# training with the whole X,y data
catboost_model.fit(X, y)
catboost_prediction = catboost_model.predict(df_test_C)

0:	total: 1.28ms	remaining: 127ms
99:	total: 130ms	remaining: 0us


In [20]:
df_test_a=pd.read_csv(r"/content/Test.csv")

In [21]:
submission = pd.DataFrame({
    'ID': df_test_a['ID'],
    'Loan_Status': catboost_prediction
})

# Saving the submission file in a csv format
submission.to_csv('AAANEWSmission_rf_tuned.csv', index=False)

In [22]:
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFE
from xgboost import XGBClassifier

In [23]:
# Initializing the model
rf_model = RandomForestClassifier(random_state=42)

# Training the model on the training data
rf_model.fit(X_train, y_train)


In [24]:
# Using the best parameters found from RandomizedSearchCV
rf_tuned = RandomForestClassifier(
    n_estimators=100,
    max_depth=7,
    min_samples_split=8,
    min_samples_leaf=9,
    max_features='sqrt',
    bootstrap=True,
    random_state=42
)


rf_tuned.fit(X_train, y_train)


y_pred_tuned_rf = rf_tuned.predict(X_test)


accuracy_rf_tuned = accuracy_score(y_test, y_pred_tuned_rf)
print("Tuned Random Forest Accuracy:", accuracy_rf_tuned)


Tuned Random Forest Accuracy: 0.8415254237288136


In [25]:
rf_tuned.fit(X, y)

In [26]:
# Making predictions on the test dataset
test_predictionss = rf_tuned.predict(df_test_C)

In [27]:
submission = pd.DataFrame({
    'ID': df_test_a['ID'],
    'Loan_Status': test_predictionss
})

# Saving the submission file in a csv format
submission.to_csv('Ax_mission.csv', index=False)

In [28]:
# Initializing the Logistic Regression model
log_model = LogisticRegression(max_iter=1000, random_state=42)


log_model.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
test_predict = log_model.predict(df_test_C)

In [30]:
submission = pd.DataFrame({
    'ID': df_test_a['ID'],
    'Loan_Status': test_predict
})


submission.to_csv('Log000_mission_rf_rfe.csv', index=False)

In [31]:
svm_model = SVC(random_state=42)

# Train the model
svm_model.fit(X, y)
test_predicts = svm_model.predict(df_test_C)

In [32]:
submission = pd.DataFrame({
    'ID': df_test_a['ID'],
    'Loan_Status': test_predicts
})


submission.to_csv('Svm_11_mission_rf_rfe.csv', index=False)

In [33]:
# Base models
rf_tuned
catboost_model

# Stacking the base models with Logistic Regression as the meta-model
stacking_model = StackingClassifier(
    estimators=[
        ('rf', rf_tuned),
        ('xgb', catboost_model)
    ],
    final_estimator=LogisticRegression(),
    passthrough=False  # If True, features will be concatenated with predictions
)

# Train the stacked model
stacking_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_stack = stacking_model.predict(X_test)
stacking_accuracy = accuracy_score(y_test, y_pred_stack)
print(f" {stacking_accuracy}")

0:	total: 997us	remaining: 98.7ms
99:	total: 87.7ms	remaining: 0us
0:	total: 684us	remaining: 67.7ms
99:	total: 50.7ms	remaining: 0us
0:	total: 559us	remaining: 55.4ms
99:	total: 49.4ms	remaining: 0us
0:	total: 547us	remaining: 54.2ms
99:	total: 51.6ms	remaining: 0us
0:	total: 479us	remaining: 47.4ms
99:	total: 49.4ms	remaining: 0us
0:	total: 528us	remaining: 52.3ms
99:	total: 49.5ms	remaining: 0us
 0.8415254237288136


In [34]:
stacking_model.fit(X, y)

0:	total: 1.28ms	remaining: 127ms
99:	total: 102ms	remaining: 0us
0:	total: 620us	remaining: 61.4ms
99:	total: 60.6ms	remaining: 0us
0:	total: 831us	remaining: 82.3ms
99:	total: 60.6ms	remaining: 0us
0:	total: 686us	remaining: 68ms
99:	total: 59.8ms	remaining: 0us
0:	total: 621us	remaining: 61.6ms
99:	total: 55.8ms	remaining: 0us
0:	total: 596us	remaining: 59ms
99:	total: 61.3ms	remaining: 0us


In [35]:
test_predictions_stacking = stacking_model.predict(df_test_C)

In [36]:
submission = pd.DataFrame({
    'ID': df_test_a['ID'],
    'Loan_Status': test_predictions_stacking
})


submission.to_csv('_stacking00_mission.csv', index=False)

In [37]:
# Base models
rf_tuned
catboost_model

# Stacking the base models with Logistic Regression as the meta-model
stacking_model = StackingClassifier(
    estimators=[
        ('rf', rf_tuned),
        ('xgb', svm_model)
    ],
    final_estimator= catboost_model,
    passthrough=False  # If True, features will be concatenated with predictions
)

# Train the stacked model
stacking_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_stack = stacking_model.predict(X_test)
stacking_accuracy = accuracy_score(y_test, y_pred_stack)
print(f"Accuracy: {stacking_accuracy}")

0:	total: 505us	remaining: 50.1ms
99:	total: 42.9ms	remaining: 0us
Accuracy: 0.8415254237288136


In [38]:
stacking_model.fit(X, y)
test_predictions_stackings = stacking_model.predict(df_test_C)
submission = pd.DataFrame({
    'ID': df_test_a['ID'],
    'Loan_Status': test_predictions_stackings
})


submission.to_csv('atry.csv', index=False)

0:	total: 750us	remaining: 74.3ms
99:	total: 54.6ms	remaining: 0us


In [39]:
# since all have similar answers let me try a new approach

In [40]:
selector = SelectKBest(mutual_info_classif, k=7)
x_new = selector.fit_transform(X, y)

In [41]:
selected_features = selector.get_support(indices=True)
feature_names = X.columns[selected_features]
print(feature_names)

Index(['ID', 'Married', 'Self_Employed', 'ApplicantIncome', 'Credit_History',
       'Property_Area', 'Total_Income'],
      dtype='object')


In [42]:
f_scores, p_values = f_classif(X, y)
f_scores_df = pd.DataFrame({"Feature_Name": X.columns, "f_score": f_scores, "p_valies": p_values})
f_scores_df.sort_values(by="f_score", ascending=False, inplace=True)
print(f_scores_df)

         Feature_Name   f_score  p_valies
1             Loan_ID  5.217386  0.022398
12      Property_Area  4.264161  0.038968
4          Dependents  2.818668  0.093227
9          LoanAmount  1.259294  0.261830
0                  ID  0.896849  0.343667
5           Education  0.664232  0.415103
7     ApplicantIncome  0.497916  0.480445
10   Loan_Amount_Term  0.406105  0.523978
3             Married  0.387906  0.533426
8   CoapplicantIncome  0.293646  0.587914
6       Self_Employed  0.207881  0.648450
13       Total_Income  0.130243  0.718193
2              Gender  0.121408  0.727525
11     Credit_History  0.007568  0.930678


In [43]:
p_values_df = pd.Series(p_values, index = X.columns)
p_values_df.sort_values(ascending=False, inplace=True)
print(p_values_df)

Credit_History       0.930678
Gender               0.727525
Total_Income         0.718193
Self_Employed        0.648450
CoapplicantIncome    0.587914
Married              0.533426
Loan_Amount_Term     0.523978
ApplicantIncome      0.480445
Education            0.415103
ID                   0.343667
LoanAmount           0.261830
Dependents           0.093227
Property_Area        0.038968
Loan_ID              0.022398
dtype: float64


In [45]:
! pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.13-py2.py3-none-any.whl.metadata (12 kB)
Downloading lazypredict-0.2.13-py2.py3-none-any.whl (12 kB)
Installing collected packages: lazypredict
Successfully installed lazypredict-0.2.13


In [46]:
from scipy import stats
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, accuracy_score, f1_score, recall_score, precision_score
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, f_classif
from lazypredict.Supervised import LazyClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import StackingClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.svm import SVC, NuSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib

In [47]:
classifier = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None, predictions=True)
models, predictions = classifier.fit(X_train, X_test, y_train, y_test)

 97%|█████████▋| 30/31 [00:19<00:00,  1.26it/s]

[LightGBM] [Info] Number of positive: 3920, number of negative: 798
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001858 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1342
[LightGBM] [Info] Number of data points in the train set: 4718, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.830861 -> initscore=1.591738
[LightGBM] [Info] Start training from score 1.591738


100%|██████████| 31/31 [00:20<00:00,  1.54it/s]


In [48]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LabelSpreading,0.76,0.52,0.52,0.75,3.27
NearestCentroid,0.52,0.52,0.52,0.58,0.09
LabelPropagation,0.75,0.52,0.52,0.75,2.56
KNeighborsClassifier,0.83,0.52,0.52,0.78,0.41
PassiveAggressiveClassifier,0.77,0.52,0.52,0.75,0.04
Perceptron,0.76,0.51,0.51,0.75,0.09
GaussianNB,0.84,0.5,0.5,0.77,0.05
QuadraticDiscriminantAnalysis,0.84,0.5,0.5,0.77,0.21
ExtraTreeClassifier,0.72,0.5,0.5,0.73,0.05
ExtraTreesClassifier,0.84,0.5,0.5,0.77,1.26


In [51]:
#Conclusion
# Most of the model result with high accuracy can be deploy for use in the financial sector for loan issues