In [1]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [2]:
# Load datasets
# The three CSV files are read in a fixed order and unpacked into the training
# frame, the test frame and the submission template respectively.
train_df, test_df, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/Training Dataset.csv',
        '/content/Test Dataset.csv',
        '/content/Sample_Submission.csv',
    )
)


In [3]:
# Data Preprocessing and Feature Engineering
#
# Imputation and scaling statistics are learned from the training rows only and
# then applied to training and test rows alike, so test-set values never
# influence the fitted preprocessing.
# NOTE(review): the validation split is taken later from these already
# preprocessed training rows, so validation rows still contribute to the
# statistics below; wrapping the steps in a Pipeline would give a strictly
# clean validation estimate.

# Separate target variable from training data.
# drop() returns a new frame, so the loaded train_df is not mutated in place.
y = train_df['Loan_Status']
train_features = train_df.drop(columns=['Loan_Status'])

# Concatenate train and test data so both go through identical transformations.
# The first n_train rows of `data` are the training rows.
data = pd.concat([train_features, test_df], sort=False)
n_train = len(y)

# NOTE(review): every object-dtype column is label-encoded below and fed to the
# model. If the CSVs contain a row-identifier column, it should be dropped
# before this point -- TODO confirm against the actual column list.
numerical_features = data.select_dtypes(include=[np.number]).columns
categorical_features = data.select_dtypes(include=[object]).columns

# Handle missing values: fit on training rows, transform every row.
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

num_imputer.fit(data.iloc[:n_train][numerical_features])
cat_imputer.fit(data.iloc[:n_train][categorical_features])

data[numerical_features] = num_imputer.transform(data[numerical_features])
data[categorical_features] = cat_imputer.transform(data[categorical_features])

# Encode categorical features.
# The encoders are deliberately fitted on train + test rows: LabelEncoder
# raises on labels it has not seen, and only the category vocabulary (no
# target or distribution statistic) is learned here.
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Feature scaling: mean/std come from the training rows only.
scaler = StandardScaler()
scaler.fit(data.iloc[:n_train][numerical_features])
data[numerical_features] = scaler.transform(data[numerical_features])

In [4]:
# Recover the two partitions from the combined frame: the training rows were
# concatenated first, so the first len(y) rows are train and the rest are test.
n_train_rows = len(y)
train_df = data.iloc[:n_train_rows]
test_df = data.iloc[n_train_rows:]

# Hold out 20% of the training rows for validation (fixed seed for repeatability).
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Hyperparameter grid: 3 x 3 x 3 = 27 candidate settings.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
)

# Base estimator whose hyperparameters are tuned.
model = GradientBoostingClassifier(random_state=42)

# Exhaustive search scored by accuracy with 5-fold cross-validation on the
# training split, using all available cores.
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Estimator refitted on X_train with the best-scoring parameter combination.
best_model = grid_search.best_estimator_

In [7]:
# Model evaluation on the held-out validation split.
y_pred = best_model.predict(X_valid)
accuracy = accuracy_score(y_valid, y_pred)
error_percentage = (1 - accuracy) * 100

print(f'Validation Accuracy: {accuracy * 100:.2f}%')
print(f'Validation Error Percentage: {error_percentage:.2f}%')

# Train on the full training set and predict on the test set.
# A fresh clone (same hyperparameters, unfitted) is trained instead of
# refitting best_model in place. Refitting in place would mean that re-running
# this cell scores a model that has already seen the validation rows, which
# silently inflates the reported accuracy.
final_model = clone(best_model)
final_model.fit(train_df, y)
test_predictions = final_model.predict(test_df)

# Create submission file: copy the template and fill in the predicted labels.
submission = sample_submission.copy()
submission['Loan_Status'] = test_predictions
submission.to_csv('submission.csv', index=False)

Validation Accuracy: 78.86%
Validation Error Percentage: 21.14%
