In [34]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Read the train and test splits; both paths are relative to the working directory.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# --- Preprocessing ------------------------------------------------------------
# Every transformation is learned from the training frame only and then applied
# unchanged to the test frame, so a given raw value always becomes the same model
# input in both frames. Fitting encoders or medians on the test frame itself
# would let the two frames drift apart whenever a category or a missing value
# shows up in only one of them.

# Encode the target. `le` stays fitted on Loan_Status, so
# le.inverse_transform(...) maps predicted codes back to the original labels.
le = LabelEncoder()
train_data['Loan_Status'] = le.fit_transform(train_data['Loan_Status'])

categorical_columns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']
numeric_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
                   'Loan_Amount_Term', 'Credit_History', 'Dependents']

# Integer-encode categorical features with one train-derived mapping per column.
# Known categories are numbered in sorted order; missing values (and any category
# that never occurs in the training data) share one extra code placed after them.
category_codes = {}
for column in categorical_columns:
    known_values = sorted(train_data[column].dropna().unique())
    codes = {value: code for code, value in enumerate(known_values)}
    fallback_code = len(known_values)
    category_codes[column] = codes  # kept so the encoding can be inspected or reused
    for frame in (train_data, test_data):
        frame[column] = frame[column].map(codes).fillna(fallback_code).astype('int64')

# 'Dependents' holds numeric strings plus the open-ended bucket '3+'; treat the
# bucket as 3 and coerce anything unparseable to NaN so it is imputed below.
for frame in (train_data, test_data):
    frame['Dependents'] = pd.to_numeric(frame['Dependents'].replace('3+', 3), errors='coerce')

# Handle missing values: impute with the *training* median in both frames.
for column in numeric_columns:
    train_median = train_data[column].median()  # computed before any filling
    train_data[column] = train_data[column].fillna(train_median)
    test_data[column] = test_data[column].fillna(train_median)

# Model inputs, listed in the column order the classifier is trained on.
features = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
    'Property_Area',
]
X, y = train_data[features], train_data['Loan_Status']

# Hold out 30% of the labelled rows for validation; stratifying on y keeps the
# class proportions the same in both parts, and the fixed seed makes the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Random Forest hyperparameters, gathered in one place so they are easy to tweak.
rf_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
    'class_weight': 'balanced',
}

# Build the classifier and fit it on the training part of the split
# (fit() returns the estimator itself, so the calls can be chained).
rf_classifier = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Score the held-out validation rows with the model fitted on the training part.
y_val_pred = rf_classifier.predict(X_val)

# Compute every metric first, then print them together below.
conf_matrix = confusion_matrix(y_val, y_val_pred)
accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

# Pair each feature name with the importance the fitted forest assigned to it.
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': rf_classifier.feature_importances_,
})
ranked_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Report: confusion matrix, accuracy, per-class report, ranked importances.
print("Confusion Matrix (Validation Set):")
print(conf_matrix)

print(f"Accuracy of Random Forest classifier on validation set: {accuracy:.2f}")

print("Classification Report (Validation Set):")
print(val_report)

print("Feature Importance:")
print(ranked_importance)


# Refit the same estimator on all labelled rows (training + validation parts),
# then predict the unlabelled test rows using the same feature columns.
X_test = test_data[features]
y_test_pred = rf_classifier.fit(X, y).predict(X_test)

Confusion Matrix (Validation Set):
[[ 35  23]
 [ 13 114]]
Accuracy of Random Forest classifier on validation set: 0.81
Classification Report (Validation Set):
              precision    recall  f1-score   support

           0       0.73      0.60      0.66        58
           1       0.83      0.90      0.86       127

    accuracy                           0.81       185
   macro avg       0.78      0.75      0.76       185
weighted avg       0.80      0.81      0.80       185

Feature Importance:
              Feature  Importance
9      Credit_History    0.254722
5     ApplicantIncome    0.191430
7          LoanAmount    0.185343
6   CoapplicantIncome    0.118294
10      Property_Area    0.054710
2          Dependents    0.051390
8    Loan_Amount_Term    0.042105
1             Married    0.028862
3           Education    0.028586
4       Self_Employed    0.023714
0              Gender    0.020844
