In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [2]:
# Load datasets
train_data = pd.read_csv("C:/projects/loan approval or rejection prediction/Assignment_Train.csv")
test_data = pd.read_csv("C:/projects/loan approval or rejection prediction/Assignment_Test.csv")


In [3]:
# Data Exploration
print("Training Data Information:")
print(train_data.info())
print("\nTraining Data Head:")
print(train_data.head())
print("\nTest Data Information:")
print(test_data.info())
print("\nTest Data Head:")
print(test_data.head())

Training Data Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 55 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   DEALER ID                            10000 non-null  int64  
 1   APPLICATION LOGIN DATE               10000 non-null  object 
 2   HDB BRANCH NAME                      9999 non-null   object 
 3   HDB BRANCH STATE                     9146 non-null   object 
 4   FIRST NAME                           10000 non-null  object 
 5   MIDDLE NAME                          2855 non-null   object 
 6   LAST NAME                            9319 non-null   object 
 7   mobile                               10000 non-null  int64  
 8   AADHAR VERIFIED                      10000 non-null  object 
 9   Cibil Score                          5703 non-null   object 
 10  MOBILE VERIFICATION                  10000 non-null  bool   
 11  DE

In [4]:
# ETL Process
# Extract: Features and Target Variable
X = train_data.drop(columns=['Application Status'])
y = train_data['Application Status']

In [5]:
# Transform: Preprocess data
# Separate numerical and categorical columns
numerical_cols = X.select_dtypes(include=['number']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

In [6]:
# Create transformers for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

In [7]:
# Apply transformations to training and test data
X_preprocessed = preprocessor.fit_transform(X)
test_data_preprocessed = preprocessor.transform(test_data)


In [8]:
# Load: Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

In [9]:
# Model 1: Random Forest Classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_val)

In [10]:
# Evaluate Random Forest Classifier
accuracy_rf = accuracy_score(y_val, y_pred_rf)
precision_rf = precision_score(y_val, y_pred_rf, average='weighted')
recall_rf = recall_score(y_val, y_pred_rf, average='weighted')
f1_rf = f1_score(y_val, y_pred_rf, average='weighted')

print(f"\nRandom Forest Accuracy: {accuracy_rf:.4f}")
print(f"Random Forest Precision: {precision_rf:.4f}")
print(f"Random Forest Recall: {recall_rf:.4f}")
print(f"Random Forest F1 Score: {f1_rf:.4f}")

# Model 2: Gradient Boosting Classifier
gb_clf = GradientBoostingClassifier(random_state=42)
gb_clf.fit(X_train, y_train)
y_pred_gb = gb_clf.predict(X_val)

# Evaluate Gradient Boosting Classifier
accuracy_gb = accuracy_score(y_val, y_pred_gb)
precision_gb = precision_score(y_val, y_pred_gb, average='weighted')
recall_gb = recall_score(y_val, y_pred_gb, average='weighted')
f1_gb = f1_score(y_val, y_pred_gb, average='weighted')

print(f"\nGradient Boosting Accuracy: {accuracy_gb:.4f}")
print(f"Gradient Boosting Precision: {precision_gb:.4f}")
print(f"Gradient Boosting Recall: {recall_gb:.4f}")
print(f"Gradient Boosting F1 Score: {f1_gb:.4f}")


Random Forest Accuracy: 0.8295
Random Forest Precision: 0.8777
Random Forest Recall: 0.8295
Random Forest F1 Score: 0.8342

Gradient Boosting Accuracy: 0.8365
Gradient Boosting Precision: 0.8671
Gradient Boosting Recall: 0.8365
Gradient Boosting F1 Score: 0.8407


In [11]:
# Compare Model Performance
if f1_rf > f1_gb:
    print("\nRandom Forest performs better based on F1 Score.")
else:
    print("\nGradient Boosting performs better based on F1 Score.")

# Predict on the test set
y_test_pred_rf = rf_clf.predict(test_data_preprocessed)
y_test_pred_gb = gb_clf.predict(test_data_preprocessed)

# Create submission files
submission_rf = pd.DataFrame({'UID': test_data.index, 'Predicted': y_test_pred_rf})
submission_rf.to_csv('submission_rf.csv', index=False)

submission_gb = pd.DataFrame({'UID': test_data.index, 'Predicted': y_test_pred_gb})
submission_gb.to_csv('submission_gb.csv', index=False)


Gradient Boosting performs better based on F1 Score.
