In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('loan_data_set.csv')
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [2]:
# Handle missing values
df.fillna(method='ffill', inplace=True)

# Encode categorical variables
df = pd.get_dummies(df, columns=['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status'], drop_first=True)
df

Unnamed: 0,Loan_ID,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y
0,LP001002,5849,0.0,,360.0,1.0,1,0,0,0,0,0,0,0,1,1
1,LP001003,4583,1508.0,128.0,360.0,1.0,1,1,1,0,0,0,0,0,0,0
2,LP001005,3000,0.0,66.0,360.0,1.0,1,1,0,0,0,0,1,0,1,1
3,LP001006,2583,2358.0,120.0,360.0,1.0,1,1,0,0,0,1,0,0,1,1
4,LP001008,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,2900,0.0,71.0,360.0,1.0,0,0,0,0,0,0,0,0,0,1
610,LP002979,4106,0.0,40.0,180.0,1.0,1,1,0,0,1,0,0,0,0,1
611,LP002983,8072,240.0,253.0,360.0,1.0,1,1,1,0,0,0,0,0,1,1
612,LP002984,7583,0.0,187.0,360.0,1.0,1,1,0,1,0,0,0,0,1,1


In [3]:
# Split the data into features and target
X = df.drop('Loan_Status_Y', axis=1)
y = df['Loan_Status_Y']

In [4]:
# Splitting Data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Model Selection
from sklearn.ensemble import RandomForestClassifier

In [14]:
# Initialize the model
model = RandomForestClassifier()

In [15]:
from sklearn.impute import SimpleImputer

# Create an imputer object with strategy='mean'
imputer = SimpleImputer(strategy='mean')

# Fit the imputer on the training data and transform X_train
X_train_imputed = imputer.fit_transform(X_train)

# Fit the model on the imputed data
model.fit(X_train_imputed, y_train)

In [16]:
#Model Evaluation
from sklearn.metrics import accuracy_score, classification_report

In [19]:
# Perform one-hot encoding on categorical variables in X_train and X_test
X_train_encoded = pd.get_dummies(X_train)
X_test_encoded = pd.get_dummies(X_test)

# Align the columns of X_test_encoded with X_train_encoded
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

# Predict using the preprocessed test data
y_pred = model.predict(X_test_encoded)



In [20]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8048780487804879


In [21]:
# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.47      0.62        43
           1       0.77      0.99      0.87        80

    accuracy                           0.80       123
   macro avg       0.86      0.73      0.75       123
weighted avg       0.84      0.80      0.78       123

