In [1]:
import pandas as pd

In [2]:
# Read the raw loan-application CSV into a DataFrame.
# NOTE(review): the absolute "/content/..." path looks like a Colab upload location - confirm before running elsewhere.
loan_data = pd.read_csv(filepath_or_buffer="/content/train_loan (1).csv")

In [3]:
# Preview the first rows to sanity-check the load and the column layout.
loan_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
# Summarise column dtypes and non-null counts; gaps in the non-null counts
# reveal which columns need imputation.
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


**Loading and Understanding the Dataset**

1. The target variable is Loan_Status, which we aim to predict.
2. Several columns have missing values (Gender, Married, Dependents, Self_Employed, LoanAmount, Loan_Amount_Term, Credit_History).
3. The dataset includes both categorical (e.g., Gender, Married, Education) and numerical (e.g., ApplicantIncome, LoanAmount) features.

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [8]:
# Step 1: Handle missing values
# Categorical columns: fill each gap with the column's most frequent value (mode).
# Credit_History is stored as float64 but is treated as a category here, so it
# is imputed with the mode rather than a mean.
categorical_cols = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']
imputer_cat = SimpleImputer(strategy='most_frequent')
loan_data[categorical_cols] = imputer_cat.fit_transform(loan_data[categorical_cols])
# The imputer receives string and float columns together and returns a single
# object-dtype array, which would leave Credit_History as dtype object. That
# column is not label-encoded later, so restore its numeric dtype explicitly.
loan_data['Credit_History'] = loan_data['Credit_History'].astype(float)


In [9]:
# Numerical columns: replace missing entries with the column mean.
# NOTE(review): the imputer is fitted on the full frame, before the train/test
# split performed further down, so test rows contribute to the fill values.
numerical_cols = ['LoanAmount', 'Loan_Amount_Term']
imputer_num = SimpleImputer(strategy='mean')
imputer_num.fit(loan_data[numerical_cols])
loan_data[numerical_cols] = imputer_num.transform(loan_data[numerical_cols])



In [10]:
# Step 2: Encode categorical variables
# Each listed column is mapped from its string labels to integer codes.
categorical_features = ['Gender', 'Married', 'Dependents', 'Education',
                        'Self_Employed', 'Property_Area', 'Loan_Status']
# Keep one fitted encoder per column. Refitting a single shared encoder would
# discard every mapping except the last, making it impossible to decode the
# integer codes or apply the same encoding to new data.
label_encoders = {}
for col in categorical_features:
    label_encoders[col] = LabelEncoder()
    loan_data[col] = label_encoders[col].fit_transform(loan_data[col])
# Backward-compatible alias: `label_encoder` still ends up fitted on the last
# column in the list (Loan_Status), exactly as before.
label_encoder = label_encoders[categorical_features[-1]]

In [11]:
# Step 3: Feature Scaling (for numerical data)
# Standardise the continuous columns with StandardScaler.
# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split performed further down, so test rows influence the scaling statistics.
numerical_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
scaler = StandardScaler().fit(loan_data[numerical_features])
loan_data[numerical_features] = scaler.transform(loan_data[numerical_features])



In [12]:
# Step 4: Prepare data for modeling
# The target is the encoded Loan_Status column; the features are everything
# else except the Loan_ID identifier.
y = loan_data['Loan_Status']
X = loan_data.drop(['Loan_ID', 'Loan_Status'], axis=1)



In [13]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [14]:
# Display the processed dataset: a preview of the transformed frame together
# with the shapes of the training and test feature matrices.
loan_data.head(), X_train.shape, X_test.shape

(    Loan_ID  Gender  Married  Dependents  Education  Self_Employed  \
 0  LP001002       1        0           0          0              0   
 1  LP001003       1        1           1          0              0   
 2  LP001005       1        1           0          0              1   
 3  LP001006       1        1           0          1              0   
 4  LP001008       1        0           0          0              0   
 
    ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
 0         0.072991          -0.554487    0.000000          0.279851   
 1        -0.134412          -0.038732   -0.219273          0.279851   
 2        -0.393747          -0.554487   -0.957641          0.279851   
 3        -0.462062           0.251980   -0.314547          0.279851   
 4         0.097728          -0.554487   -0.064454          0.279851   
 
   Credit_History  Property_Area  Loan_Status  
 0            1.0              2            1  
 1            1.0              0          

In [15]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [16]:
# Instantiate the three ensemble classifiers with a shared seed so that
# repeated runs give the same fitted models.
rf_clf, gb_clf, ab_clf = (
    RandomForestClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
)

In [17]:
# Registry of display name -> estimator, plus an empty container that the
# training loop fills with per-model metrics.
models = dict([
    ('Random Forest', rf_clf),
    ('Gradient Boosting', gb_clf),
    ('AdaBoost', ab_clf),
])
results = dict()


In [19]:
# Fit every registered model on the training split, predict the test split,
# and record accuracy plus the full per-class report under the model's name.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[model_name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Classification Report': classification_report(y_test, y_pred, output_dict=True),
    }
results




{'Random Forest': {'Accuracy': 0.7723577235772358,
  'Classification Report': {'0': {'precision': 0.8571428571428571,
    'recall': 0.4186046511627907,
    'f1-score': 0.5625,
    'support': 43.0},
   '1': {'precision': 0.7549019607843137,
    'recall': 0.9625,
    'f1-score': 0.8461538461538461,
    'support': 80.0},
   'accuracy': 0.7723577235772358,
   'macro avg': {'precision': 0.8060224089635855,
    'recall': 0.6905523255813953,
    'f1-score': 0.7043269230769231,
    'support': 123.0},
   'weighted avg': {'precision': 0.7906447131698207,
    'recall': 0.7723577235772358,
    'f1-score': 0.746990306441526,
    'support': 123.0}}},
 'Gradient Boosting': {'Accuracy': 0.7642276422764228,
  'Classification Report': {'0': {'precision': 0.7916666666666666,
    'recall': 0.4418604651162791,
    'f1-score': 0.5671641791044776,
    'support': 43.0},
   '1': {'precision': 0.7575757575757576,
    'recall': 0.9375,
    'f1-score': 0.8379888268156425,
    'support': 80.0},
   'accuracy': 0.76

Observation: Random Forest and Gradient Boosting show the strongest overall performance of the three models. Hyperparameter tuning may improve these results further, so next we tune the Random Forest model with GridSearchCV to optimize its performance.




In [20]:
from sklearn.model_selection import GridSearchCV

In [21]:
# Candidate Random Forest settings; the grid search tries every combination.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)


In [22]:
# Configure an exhaustive search over param_grid for a seeded Random Forest:
# accuracy is the selection metric, scored with 5-fold cross-validation, with
# n_jobs=-1 to run the fits in parallel.
grid_search_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
)

In [23]:
# Fit GridSearchCV to the training data: only X_train / y_train are passed in,
# so the test split plays no part in choosing the hyperparameters.
grid_search_rf.fit(X_train, y_train)

In [24]:
# Read the winning parameter combination and its cross-validated score off the
# fitted search object.
best_params_rf, best_score_rf = (
    grid_search_rf.best_params_,
    grid_search_rf.best_score_,
)

In [25]:
# Score the best estimator found by the search on the held-out test split.
best_rf = grid_search_rf.best_estimator_
y_pred_best_rf = best_rf.predict(X_test)
test_accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_best_rf)

In [26]:
# Show the tuned parameters, their cross-validated accuracy, and the accuracy
# on the held-out test set side by side.
best_params_rf, best_score_rf, test_accuracy_rf

({'max_depth': 10,
  'min_samples_leaf': 2,
  'min_samples_split': 2,
  'n_estimators': 50},
 0.8145537002679859,
 0.7886178861788617)

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [28]:
# Search space for the Random Forest (same values as the grid defined earlier).
param_grid = dict(
    n_estimators=[50, 100, 200],      # how many trees to grow
    max_depth=[None, 10, 20, 30],     # depth cap per tree (None = unlimited)
    min_samples_split=[2, 5, 10],     # samples a node needs before it may split
    min_samples_leaf=[1, 2, 4],       # samples every leaf must keep
)


In [29]:
# Build the grid search: seeded Random Forest as the base model, accuracy as
# the metric, 5-fold cross-validation, all processors in use.
# NOTE(review): this repeats the search configured and fitted in the earlier
# cells with the same settings, so the whole grid is evaluated a second time.
grid_search_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
)

# Run the search on the training split only.
grid_search_rf.fit(X_train, y_train)


In [30]:
# Pull the winning configuration and its cross-validated accuracy off the
# fitted search, then report both.
best_params, best_score = grid_search_rf.best_params_, grid_search_rf.best_score_
print("Best Parameters:", best_params)
print("Best Cross-Validated Accuracy:", best_score)


Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}
Best Cross-Validated Accuracy: 0.8145537002679859


In [31]:
# Take the best estimator found by the grid search, predict the held-out test
# split, and report the resulting accuracy.
best_rf = grid_search_rf.best_estimator_
y_pred = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test Accuracy of Best Model:", test_accuracy)


Test Accuracy of Best Model: 0.7886178861788617
