In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

In [5]:
# Load the datasets
train_data = pd.read_csv(r'dataset/TrainingDataset.csv')
test_data = pd.read_csv(r'dataset/TestDataset.csv')

In [6]:
# Handle missing values
# For categorical variables, fill missing values with the mode
for col in ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']:
    train_data[col].fillna(train_data[col].mode()[0], inplace=True)
    test_data[col].fillna(test_data[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data[col].fillna(train_data[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data[col].fillna(test_data[col].mode()[0], inplace=True)


In [7]:
# For numerical variables, fill missing values with the median
train_data['LoanAmount'].fillna(train_data['LoanAmount'].median(), inplace=True)
train_data['Loan_Amount_Term'].fillna(train_data['Loan_Amount_Term'].median(), inplace=True)
test_data['LoanAmount'].fillna(test_data['LoanAmount'].median(), inplace=True)
test_data['Loan_Amount_Term'].fillna(test_data['Loan_Amount_Term'].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['LoanAmount'].fillna(train_data['LoanAmount'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Loan_Amount_Term'].fillna(train_data['Loan_Amount_Term'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace metho

In [8]:
# Encode categorical variables
train_data_encoded = pd.get_dummies(train_data.drop(columns=['Loan_ID']), drop_first=True)
test_data_encoded = pd.get_dummies(test_data.drop(columns=['Loan_ID']), drop_first=True)

In [9]:
# Align the train and test data
X_train = train_data_encoded.drop(columns=['Loan_Status_Y'])
y_train = train_data_encoded['Loan_Status_Y']
X_train, test_data_encoded = X_train.align(test_data_encoded, join='inner', axis=1, fill_value=0)

In [10]:
# Split the data into training and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [11]:
# Define the model
model = RandomForestClassifier(random_state=42)

In [12]:
# Train the model
model.fit(X_train_split, y_train_split)

In [13]:
# Evaluate the model using cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=kf)
print(f'Cross-validated Accuracy: {cv_scores.mean()}')

Cross-validated Accuracy: 0.7883113421298147


In [14]:
# Evaluate the model on the validation set
val_preds = model.predict(X_val_split)
val_accuracy = accuracy_score(y_val_split, val_preds)
print(f'Validation Accuracy: {val_accuracy}')

Validation Accuracy: 0.7804878048780488


In [15]:
# Train the model on the full training set
model.fit(X_train, y_train)

In [16]:
# Predict on the test data
test_preds = model.predict(test_data_encoded)

In [18]:
# Prepare the submission
submission = pd.DataFrame({'Loan_ID': test_data['Loan_ID'], 'Loan_Status': np.where(test_preds == 1, 'Y', 'N')})
submission.to_csv('submission.csv', index=False)