In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report

In [9]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [10]:
app_df = pd.read_csv('application_record.csv')
credit_df = pd.read_csv('credit_record.csv')
print(app_df.head())
print(credit_df.head())

        ID CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY  CNT_CHILDREN  \
0  5008804           M            Y               Y             0   
1  5008805           M            Y               Y             0   
2  5008806           M            Y               Y             0   
3  5008808           F            N               Y             0   
4  5008809           F            N               Y             0   

   AMT_INCOME_TOTAL      NAME_INCOME_TYPE            NAME_EDUCATION_TYPE  \
0          427500.0               Working               Higher education   
1          427500.0               Working               Higher education   
2          112500.0               Working  Secondary / secondary special   
3          270000.0  Commercial associate  Secondary / secondary special   
4          270000.0  Commercial associate  Secondary / secondary special   

     NAME_FAMILY_STATUS  NAME_HOUSING_TYPE  DAYS_BIRTH  DAYS_EMPLOYED  \
0        Civil marriage   Rented apartment      -12005 

In [11]:
# Clean Application Data
# Example: drop duplicates
app_df = app_df.drop_duplicates(subset='ID')
# Handle missing values (option: fill with median/mode OR drop)
app_df = app_df.fillna(app_df.median(numeric_only=True))
for col in app_df.select_dtypes('object'):
    app_df[col] = app_df[col].fillna(app_df[col].mode()[0])
# Encode categorical variables
app_df = pd.get_dummies(app_df, drop_first=True)

In [12]:
# Converting STATUS to numeric "bad debt" flag
credit_df['BAD_FLAG'] = credit_df['STATUS'].apply(
    lambda x: 1 if x in ['2','3','4','5'] else 0
)
# Aggregate for each customer: max DPD, number of bad months, etc.
credit_summary = credit_df.groupby('ID').agg(
    bad_months = ('BAD_FLAG', 'sum'),
    max_status = ('STATUS', lambda x: max(pd.to_numeric(x, errors='coerce').fillna(0))),
    months_record = ('MONTHS_BALANCE', 'count')
).reset_index()
print(credit_summary.head())

        ID  bad_months  max_status  months_record
0  5001711           0         0.0              4
1  5001712           0         0.0             19
2  5001713           0         0.0             22
3  5001714           0         0.0             15
4  5001715           0         0.0             60


In [13]:
merged_df = pd.merge(app_df, credit_summary, on='ID', how='inner')
merged_df['TARGET'] = merged_df['bad_months'].apply(lambda x: 1 if x > 0 else 0)

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Final encoding
print(merged_df.dtypes[merged_df.dtypes == 'object'])  # Check categorical cols
merged_df = pd.get_dummies(merged_df, drop_first=True)
# Features & target
X = merged_df.drop(['ID', 'TARGET'], axis=1)
y = merged_df['TARGET']
# Train-test split BEFORE scaling
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = StandardScaler()
scaler.fit(X_train)  # fit only on training data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Series([], dtype: object)


In [15]:
# Train Model
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

In [16]:
# Evaluate Model
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 1.0
ROC-AUC: 1.0
Confusion Matrix:
 [[7175    0]
 [   0  117]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7175
           1       1.00      1.00      1.00       117

    accuracy                           1.00      7292
   macro avg       1.00      1.00      1.00      7292
weighted avg       1.00      1.00      1.00      7292



In [42]:
scaler = StandardScaler()
scaler.fit(X_train)  # fit only on training data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

In [44]:
# Train on arrays (convert DataFrames to NumPy before fitting)
scaler = StandardScaler()
scaler.fit(X_train.values)  # values = array, no column names
X_train_scaled = scaler.transform(X_train.values)
X_test_scaled = scaler.transform(X_test.values)
model = LogisticRegression(max_iter=500)
model.fit(X_train_scaled, y_train)
# Predict for new applicant (array only)
sample_applicant = X.iloc[0].values.reshape(1, -1)  # returns array directly
sample_applicant_scaled = scaler.transform(sample_applicant)
prediction = model.predict(sample_applicant_scaled)[0]
prob = model.predict_proba(sample_applicant_scaled)[0][1]
print("New Applicant Prediction:", "High Risk" if prediction == 1 else "Low Risk")
print(f"Probability of High Risk: {prob:.2%}")

New Applicant Prediction: Low Risk
Probability of High Risk: 0.17%


In [48]:
# Scale training data with feature names
scaler = StandardScaler()
scaler.fit(X_train)  
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
new_applicant = pd.DataFrame([X.iloc[0].values], columns=X.columns)
# Scale applicant's data
new_applicant_scaled = scaler.transform(new_applicant)
# Predict probability for High Risk (class 1)
high_risk_prob = model.predict_proba(new_applicant_scaled)[0][1]
# Set a custom threshold for High Risk decision
threshold = 0.20  # e.g., approve if risk < 20%
# Decision rule
decision = "Reject (High Risk)" if high_risk_prob >= threshold else "Approve (Low Risk)"
# Output results
print(f"Decision @ {threshold*100:.0f}% threshold: {decision}")

Decision @ 20% threshold: Approve (Low Risk)


In [25]:
# Comparing model performance on the same test set
def evaluate_model(name, model, X_test_scaled, y_test):
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]
    print(f"\n{name}")
    print("-" * len(name))
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("ROC-AUC:", roc_auc_score(y_test, y_prob))
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
# Logistic Regression
log_reg_model = LogisticRegression(max_iter=500, random_state=42)
log_reg_model.fit(X_train_scaled, y_train)
# Random Forest
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train_scaled, y_train)
# XGBoost
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train_scaled, y_train)
# Call for each model you trained above
evaluate_model("Logistic Regression", log_reg_model, X_test_scaled, y_test)
evaluate_model("Random Forest", rf_model, X_test_scaled, y_test)
evaluate_model("XGBoost", xgb_model, X_test_scaled, y_test)


Logistic Regression
-------------------
Accuracy: 1.0
ROC-AUC: 1.0

Random Forest
-------------
Accuracy: 1.0
ROC-AUC: 1.0

XGBoost
-------
Accuracy: 1.0
ROC-AUC: 1.0


In [26]:
# Unified prediction with custom threshold
def predict_with_threshold(model, scaler, X_columns, applicant_data, threshold=0.20):
    """Predicts risk probability and decision for a new applicant."""
    applicant_df = pd.DataFrame([applicant_data], columns=X_columns)
    applicant_scaled = scaler.transform(applicant_df)
    prob = model.predict_proba(applicant_scaled)[0][1]
    decision = "Reject (High Risk)" if prob >= threshold else "Approve (Low Risk)"
    return decision, prob
# Example: use first row of X as test applicant
applicant_values = list(X.iloc[0].values)
# Choose which model to use
chosen_model = rf_model  # or log_reg_model / xgb_model
decision, prob = predict_with_threshold(chosen_model, scaler, X.columns, applicant_values, threshold=0.20)
print(f"\n📊 Chosen Model: {chosen_model.__class__.__name__}")
print(f"Probability of High Risk: {prob:.2%}")
print(f"Decision: {decision}")


📊 Chosen Model: RandomForestClassifier
Probability of High Risk: 0.50%
Decision: Approve (Low Risk)


In [52]:
# import pandas as pd
def predict_with_threshold(model, scaler, X_columns, threshold=0.20):
    print("\n=== New Applicant Risk Checker ===")
    applicant_data = []
    # Prompt user for each feature
    for col in X_columns:
        val = float(input(f"Enter value for {col}: "))
        applicant_data.append(val)
    # Create DataFrame
    applicant_df = pd.DataFrame([applicant_data], columns=X_columns)
    # Scale
    applicant_scaled = scaler.transform(applicant_df)
    # Predict
    prob = model.predict_proba(applicant_scaled)[0][1]
    decision = "Reject (High Risk)" if prob >= threshold else "Approve (Low Risk)"
    # Results
    print(f"\nProbability of High Risk: {prob:.2%}")
    print(f"Decision @ {threshold*100:.0f}% threshold: {decision}")
# Example usage for Random Forest
predict_with_threshold(rf_model, scaler, X.columns, threshold=0.20)


=== New Applicant Risk Checker ===


Enter value for CNT_CHILDREN:  1
Enter value for AMT_INCOME_TOTAL:  238500
Enter value for DAYS_BIRTH:  1194
Enter value for DAYS_EMPLOYED:  3500
Enter value for FLAG_MOBIL:  1
Enter value for FLAG_WORK_PHONE:  0
Enter value for FLAG_PHONE:  1
Enter value for FLAG_EMAIL:  1
Enter value for CNT_FAM_MEMBERS:  1
Enter value for CODE_GENDER_M:  1
Enter value for FLAG_OWN_CAR_Y:  1
Enter value for FLAG_OWN_REALTY_Y:  1
Enter value for NAME_INCOME_TYPE_Pensioner:  0
Enter value for NAME_INCOME_TYPE_State servant:  0
Enter value for NAME_INCOME_TYPE_Student:  1
Enter value for NAME_INCOME_TYPE_Working:  1
Enter value for NAME_EDUCATION_TYPE_Higher education:  1
Enter value for NAME_EDUCATION_TYPE_Incomplete higher:  0
Enter value for NAME_EDUCATION_TYPE_Lower secondary:  0
Enter value for NAME_EDUCATION_TYPE_Secondary / secondary special:  0
Enter value for NAME_FAMILY_STATUS_Married:  0
Enter value for NAME_FAMILY_STATUS_Separated:  0
Enter value for NAME_FAMILY_STATUS_Single / not married: 


Probability of High Risk: 92.00%
Decision @ 20% threshold: Reject (High Risk)


In [55]:
import joblib
# Save model, scaler, and feature column names
joblib.dump(model, "model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(list(X.columns), "feature_columns.pkl")

['feature_columns.pkl']