In [179]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

In [180]:
# Load the datasets
train_df = pd.read_csv('/kaggle/input/bureauassignment1/Assignment_Train.csv')
test_df = pd.read_csv('/kaggle/input/bureauassignment1/Assignment_Test.csv')
feature_dict = pd.read_excel('/kaggle/input/bureauassignment1/Assignment_FeatureDictionary.xlsx')

In [181]:
test_df.drop(columns=['UID'], inplace=True)

In [182]:
X = train_df.drop(columns=['Application Status'])
y = train_df['Application Status']

In [183]:
if 'UID' in X.columns:
    X.drop(columns=['UID'], inplace=True)

In [184]:
imputer = SimpleImputer(strategy='most_frequent')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
test_df = pd.DataFrame(imputer.transform(test_df), columns=test_df.columns)

In [185]:
combined_df = pd.concat([X, test_df], axis=0)

In [186]:
label_encoders = {}
for column in combined_df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    combined_df[column] = le.fit_transform(combined_df[column].astype(str))
    label_encoders[column] = le

In [187]:
X = combined_df.iloc[:len(X), :]
test_df = combined_df.iloc[len(X):, :]

In [188]:
scaler = StandardScaler()
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])
test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numerical_cols] = scaler.fit_transform(X[numerical_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])


In [189]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [190]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

In [191]:
y_pred_proba = rf.predict_proba(X_val)[:, 1]
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:\n", classification_report(y_val, y_pred))
print("ROC AUC Score:", roc_auc_score(y_val, y_pred_proba))

Accuracy: 0.8805
Classification Report:
               precision    recall  f1-score   support

    APPROVED       0.90      0.92      0.91      1327
    DECLINED       0.84      0.79      0.82       673

    accuracy                           0.88      2000
   macro avg       0.87      0.86      0.86      2000
weighted avg       0.88      0.88      0.88      2000

ROC AUC Score: 0.9574557901891338


In [192]:
test_df['Application Status'] = rf.predict(test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Application Status'] = rf.predict(test_df)


In [193]:
submission = pd.read_csv('/kaggle/input/bureauassignment1/Assignment_Test.csv')[['UID']]
submission['Application Status'] = test_df['Application Status']

In [194]:
submission.to_csv('submission.csv', index=False)

## 1. Hyperparameter Tuning with GridSearchCV

In [195]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [196]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt'],
    'bootstrap': [True, False]
}

In [197]:
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), 
                           param_grid=param_grid, 
                           cv=5, 
                           n_jobs=-1, 
                           verbose=2, 
                           scoring='roc_auc')

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.2s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   4.6s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   4.7s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   6.8s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   2.3s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   4.4s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estim

In [203]:
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

best_rf = RandomForestClassifier(**best_params, random_state=42)
best_rf.fit(X_train, y_train)

y_pred = best_rf.predict(X_val)
y_pred_proba = best_rf.predict_proba(X_val)[:, 1]

print("Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:\n", classification_report(y_val, y_pred))
print("ROC AUC Score:", roc_auc_score(y_val, y_pred_proba))

Best Parameters: {'bootstrap': False, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}
Accuracy: 0.891
Classification Report:
               precision    recall  f1-score   support

    APPROVED       0.90      0.94      0.92      1327
    DECLINED       0.87      0.79      0.83       673

    accuracy                           0.89      2000
   macro avg       0.89      0.87      0.88      2000
weighted avg       0.89      0.89      0.89      2000

ROC AUC Score: 0.9593850880837022


In [205]:
X_test = test_df[X_train.columns]

In [206]:
test_df['Application Status'] = best_rf.predict(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Application Status'] = best_rf.predict(X_test)


In [207]:
submission = pd.read_csv('/kaggle/input/bureauassignment1/Assignment_Test.csv')[['UID']]
submission['Application Status'] = test_df['Application Status']

In [208]:
submission.to_csv('submission.csv', index=False)