In [1]:
# Loan Approval Prediction Model
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

In [2]:
# Seed NumPy's global (legacy) random generator so repeated runs of the
# notebook draw the same random numbers.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [4]:
# Read the three competition CSV files from the working directory:
# the labelled training rows, the unlabelled test rows, and the
# submission template.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Quick look at the first training rows
print(train.head())


   id  person_age  person_income person_home_ownership  person_emp_length  \
0   0          37          35000                  RENT                0.0   
1   1          22          56000                   OWN                6.0   
2   2          29          28800                   OWN                8.0   
3   3          30          70000                  RENT               14.0   
4   4          22          60000                  RENT                2.0   

  loan_intent loan_grade  loan_amnt  loan_int_rate  loan_percent_income  \
0   EDUCATION          B       6000          11.49                 0.17   
1     MEDICAL          C       4000          13.35                 0.07   
2    PERSONAL          A       6000           8.90                 0.21   
3     VENTURE          B      12000          11.11                 0.17   
4     MEDICAL          A       6000           6.92                 0.10   

  cb_person_default_on_file  cb_person_cred_hist_length  loan_status  
0              

In [5]:
# Count NaNs per column in each dataset, then report both tallies
train_missing_counts = train.isnull().sum()
test_missing_counts = test.isnull().sum()

print("Missing values in training data:\n", train_missing_counts)
print("\nMissing values in test data:\n", test_missing_counts)


Missing values in training data:
 id                            0
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64

Missing values in test data:
 id                            0
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64


In [6]:
# Impute missing values (using SimpleImputer).
# NOTE: the loan-amount column in this dataset is named 'loan_amnt' (see the
# missing-value report printed earlier). The previous check for 'loan_amount'
# never matched, so the imputer was never fitted and this step did nothing.
imputer = SimpleImputer(strategy='mean')
if 'loan_amnt' in train.columns:
    # Fit on the training data only; [:, 0] flattens the (n_rows, 1) result
    # back into a single column. The column becomes float after imputation.
    train['loan_amnt'] = imputer.fit_transform(train[['loan_amnt']])[:, 0]
    # Nested under the train check so transform() is never called on an
    # imputer that has not been fitted.
    if 'loan_amnt' in test.columns:
        test['loan_amnt'] = imputer.transform(test[['loan_amnt']])[:, 0]

# Label Encoding for categorical features.
# The placeholder name 'category' is not a column of this dataset, so the
# text columns are discovered by dtype instead. Selecting by dtype also keeps
# the cell safe to re-run: once encoded, the columns are numeric and skipped.
label_encoder = LabelEncoder()
for column in train.select_dtypes(include=['object']).columns:
    # Fresh encoder per column so each mapping is learned independently
    label_encoder = LabelEncoder()
    train[column] = label_encoder.fit_transform(train[column].astype(str))
    # Reuse the mapping learned on train; only when test has the column
    if column in test.columns:
        test[column] = label_encoder.transform(test[column].astype(str))


In [7]:
# Pull out the target column, then build the feature matrix by removing
# the target and the row identifier from the training frame
y = train['loan_status']
X = train.drop(columns=['loan_status', 'id'])


In [8]:
# Hold out 20% of the rows for validation; the fixed random_state keeps
# the split identical from run to run
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [9]:
# Check for missing values in both datasets
print(train.isnull().sum())
print(test.isnull().sum())

# Columns fed to the model: everything except the row id and the target
feature_columns = [col for col in train.columns if col not in ('id', 'loan_status')]

# Impute missing values (using SimpleImputer) for numerical columns.
# The columns are discovered from the data instead of being hard-coded: the
# former hard-coded name 'loan_amount' is not a column of this dataset (it is
# 'loan_amnt'), so the imputation step was silently skipped. Only columns that
# actually contain NaNs are touched, so complete columns keep their dtype.
imputer = SimpleImputer(strategy='mean')  # For numerical values
numeric_features = train[feature_columns].select_dtypes(include=['number']).columns
numerical_columns = [
    col for col in numeric_features
    if train[col].isnull().any() or test[col].isnull().any()
]
if numerical_columns:
    # Means are learned from the training data and reused for the test data
    train[numerical_columns] = imputer.fit_transform(train[numerical_columns])
    test[numerical_columns] = imputer.transform(test[numerical_columns])

# Handle categorical features with Label Encoding.
# Find the text-valued feature columns (id/target are excluded so that the
# loop never looks for the target column in the test frame).
categorical_columns = train[feature_columns].select_dtypes(include=['object']).columns

# One encoder per column, kept in a dict so every mapping stays available
# afterwards (a single shared encoder only remembers the last column fitted).
label_encoders = {}
for col in categorical_columns:
    label_encoder = LabelEncoder()
    train[col] = label_encoder.fit_transform(train[col].astype(str))
    test[col] = label_encoder.transform(test[col].astype(str))
    label_encoders[col] = label_encoder

# Feature selection: Separate target and features
X = train[feature_columns]
y = train['loan_status']

# Splitting data into training and validation sets (80% / 20%)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize RandomForestClassifier
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = model.predict(X_val)

# Evaluate model performance
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

id                            0
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64
id                            0
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64
Validation Accuracy: 94.94%


In [10]:
# Build the test feature matrix by removing the row identifier
X_test = test.drop(columns=['id'])

# Score the test rows with the fitted model
test_predictions = model.predict(X_test)

# Assemble the submission table: one predicted loan_status per test id
submission_columns = {
    'id': test['id'],
    'loan_status': test_predictions,
}
submission = pd.DataFrame(submission_columns)

# Preview the first few submission rows
submission_preview = submission.head()
print(submission_preview)


      id  loan_status
0  58645            1
1  58646            0
2  58647            1
3  58648            0
4  58649            0


In [11]:
# Write the predictions to disk without the DataFrame index
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)
# Confirm to the reader that the file was written
print(f"Final submission file '{submission_path}' has been created.")

Final submission file 'submission.csv' has been created.
