In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!pip install -q kaggle


In [None]:
import kagglehub

# Fetch the Lending Club dataset and remember where it was stored locally
DATASET_HANDLE = "wordsforthewise/lending-club"
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Downloaded to:", path)


In [None]:
import os

# Show what the download location contains
downloaded_files = os.listdir(path)
print("Files inside:", downloaded_files)


In [None]:
import pandas as pd

# Load the gzip-compressed "accepted loans" CSV into a DataFrame
ACCEPTED_FILE = 'accepted_2007_to_2018Q4.csv.gz'
csv_path = os.path.join(path, ACCEPTED_FILE)
df = pd.read_csv(
    csv_path,
    compression='gzip',
    low_memory=False,  # read in one pass instead of dtype-guessing chunk by chunk
)


In [None]:
# First look: dimensions, the first ten column labels, and the leading rows
frame_shape = df.shape
print("Shape:", frame_shape)
leading_columns = df.columns[:10]
print("Columns:", leading_columns)
df.head()


In [None]:
# Print pandas' structural summary of the freshly loaded frame
df.info()

In [None]:
# Descriptive statistics of the freshly loaded frame (displayed as the cell output)
df.describe()

In [None]:
# Remove every column whose fraction of missing values is above the threshold
null_threshold = 0.4
missing_ratios = df.isna().mean()
is_too_sparse = missing_ratios > null_threshold
cols_to_drop = list(missing_ratios.index[is_too_sparse])

print("Dropping columns:", cols_to_drop)
df.drop(columns=cols_to_drop, inplace=True)


In [None]:
# Convert the date-like text columns to datetimes.
date_cols = ['issue_d', 'earliest_cr_line', 'last_credit_pull_d']

# Some of these may already have been removed by the sparse-column drop, so
# work only with the ones that are still in the frame. The same filtered list
# is used for the dtype check below, which would otherwise raise KeyError
# when any listed column is absent.
available_date_cols = [col for col in date_cols if col in df.columns]

for col in available_date_cols:
    # errors='coerce' turns values that cannot be parsed into NaT instead of raising
    df[col] = pd.to_datetime(df[col], errors='coerce')

df[available_date_cols].dtypes  # Check new types


In [None]:
# One-hot encode the low-cardinality categorical columns that are still present.
cat_cols = ['grade', 'sub_grade', 'home_ownership', 'verification_status', 'purpose', 'application_type']

available_cat_cols = [col for col in cat_cols if col in df.columns]

# The dummy dtype is pinned to an integer type on purpose: pandas >= 2.0 emits
# boolean dummies by default, and the feature matrix is later built with
# select_dtypes(include=['number']), which does not match bool columns. Without
# this, every encoded category would silently be left out of the model.
# drop_first=True omits one level per column to avoid redundant indicators.
df = pd.get_dummies(df, columns=available_cat_cols, drop_first=True, dtype='uint8')


In [None]:
def cap_outliers(series, lower=0.01, upper=0.99):
    """Clip a Series to the band between two of its own quantiles.

    Args:
        series: pandas Series to cap.
        lower: quantile (0-1) used as the floor.
        upper: quantile (0-1) used as the ceiling.

    Returns:
        A new Series in which values below the lower quantile are raised to it
        and values above the upper quantile are lowered to it.
    """
    floor = series.quantile(lower)
    ceiling = series.quantile(upper)
    capped = series.clip(lower=floor, upper=ceiling)
    return capped

# Cap the extreme tails of income and debt-to-income, for whichever column exists
columns_to_cap = [name for name in ('annual_inc', 'dti') if name in df.columns]
for name in columns_to_cap:
    df[name] = cap_outliers(df[name])


In [None]:
# Preview the frame after the cleaning steps above
df.head()

In [None]:
# Re-check structure and summary statistics now that cleaning is done
df.info()
cleaned_stats = df.describe()
cleaned_stats



In [None]:
# Map the raw loan status text to a binary target: 1 = bad outcome, 0 = repaid.
# Statuses that are not listed here map to NaN and are filtered out below.
target_map = {
    'Charged Off': 1,
    'Default': 1,
    'Late (31-120 days)': 1,
    'Late (16-30 days)': 1,
    'In Grace Period': 1,
    'Does not meet the credit policy. Status:Charged Off': 1,
    'Fully Paid': 0,
    'Does not meet the credit policy. Status:Fully Paid': 0
}

df['loan_status_clean'] = df['loan_status'].map(target_map)

# Keep only rows with a mapped status. The explicit .copy() makes the result an
# independent frame, so the column assignments that follow (here and in later
# cells) do not write into a slice of the old frame and trigger
# SettingWithCopyWarning.
df = df[df['loan_status_clean'].notna()].copy()
df['loan_status_clean'] = df['loan_status_clean'].astype(int)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how many loans fall into each target class
count_ax = sns.countplot(data=df, x='loan_status_clean')
count_ax.set_title("Loan Status Distribution (0 = Good, 1 = Bad)")
plt.show()


In [None]:
# Correlation of every numeric column with the target, strongest positive first.
# corrwith() computes only the column-vs-target correlations; building the full
# column-by-column matrix with df.corr() just to keep one column does far more
# work on a frame this wide.
target_corr = (
    df.corrwith(df['loan_status_clean'], numeric_only=True)
    .to_frame('loan_status_clean')
    .sort_values(by='loan_status_clean', ascending=False)
)

plt.figure(figsize=(12, 8))
sns.heatmap(target_corr, annot=True)
plt.title("Correlation with Target")
plt.show()


In [None]:
# Compare the loan amount distribution between the two target classes
box_ax = sns.boxplot(data=df, x='loan_status_clean', y='loan_amnt')
box_ax.set_title("Loan Amount vs Loan Status")
plt.show()


In [None]:
# FICO distribution split by outcome. Both range bounds are read below, so both
# must be present (the previous guard checked only the upper bound).
if {'fico_range_low', 'fico_range_high'}.issubset(df.columns):
    # Midpoint of the reported FICO range
    df['fico_avg'] = (df['fico_range_low'] + df['fico_range_high']) / 2
    sns.histplot(data=df, x='fico_avg', hue='loan_status_clean', bins=30, kde=True, element='step')
    plt.title("FICO Score Distribution by Loan Status")
    plt.show()


In [None]:
# Default rate per loan purpose.
# By the time this cell runs, 'purpose' has normally been replaced by one-hot
# 'purpose_*' indicator columns, so checking only for the raw column would make
# this cell silently do nothing. Use the raw column when it exists; otherwise
# rebuild the per-purpose rates from the indicators.
status = df['loan_status_clean']
purpose_dummies = [col for col in df.columns if col.startswith('purpose_')]

if 'purpose' in df.columns:
    purpose_default = status.groupby(df['purpose']).mean()
elif purpose_dummies:
    rates = {
        col[len('purpose_'):]: status[df[col].astype(bool)].mean()
        for col in purpose_dummies
    }
    # Rows with no indicator set belong to the level that drop_first removed
    baseline_rows = ~df[purpose_dummies].astype(bool).any(axis=1)
    if baseline_rows.any():
        rates['(dropped baseline level)'] = status[baseline_rows].mean()
    purpose_default = pd.Series(rates, name='loan_status_clean')
else:
    purpose_default = None

if purpose_default is not None:
    purpose_default = purpose_default.sort_values(ascending=False)
    purpose_default.plot(kind='bar', figsize=(10, 4), title="Default Rate by Loan Purpose")
    plt.ylabel("Mean Default Rate")
    plt.show()


Feature Engineering

In [None]:
# Midpoint of the FICO range, added only when both bounds are in the frame
fico_bounds = ('fico_range_high', 'fico_range_low')
if all(bound in df.columns for bound in fico_bounds):
    df['fico_avg'] = df['fico_range_low'].add(df['fico_range_high']).div(2)


In [None]:
# Calendar year of the issue date
has_issue_date = 'issue_d' in df.columns
if has_issue_date:
    issue_dates = df['issue_d']
    df['issue_year'] = issue_dates.dt.year


In [None]:
# Length of credit history at issue time, counted in whole 30-day blocks
needed_dates = {'earliest_cr_line', 'issue_d'}
if needed_dates.issubset(df.columns):
    credit_history = df['issue_d'] - df['earliest_cr_line']
    df['credit_age_months'] = credit_history.dt.days // 30


In [None]:
# Annual income expressed as a multiple of the requested loan amount
ratio_inputs = ('annual_inc', 'loan_amnt')
if all(name in df.columns for name in ratio_inputs):
    df['income_loan_ratio'] = df['annual_inc'].div(df['loan_amnt'])


In [None]:
# Integer employment length: the first run of digits in the text, 0 when absent.
# The pattern is a raw string because '\d' inside a normal string literal is an
# invalid escape sequence, which newer Python versions warn about.
# expand=False makes extract() return a Series instead of a one-column frame.
if 'emp_length' in df.columns:
    emp_years = df['emp_length'].str.extract(r'(\d+)', expand=False)
    df['emp_length_clean'] = emp_years.fillna(0).astype(int)


In [None]:
# Remove the listed columns from the frame, skipping any that are not present
drop_cols = ['emp_title', 'title', 'url', 'zip_code', 'id', 'member_id', 'policy_code', 'application_type', 'addr_state']
present_drop_cols = [name for name in drop_cols if name in df.columns]
df.drop(columns=present_drop_cols, inplace=True)


In [None]:
# Keep only the number from the loan term text and store it as a float.
# - The pattern is a raw string: '\d' in a normal string literal is an invalid
#   escape sequence that newer Python versions warn about.
# - The dtype check lets the cell be re-run safely: once the column is numeric,
#   the .str accessor would raise.
if 'term' in df.columns and not pd.api.types.is_numeric_dtype(df['term']):
    df['term'] = df['term'].str.extract(r'(\d+)', expand=False).astype(float)


In [None]:
# Feature matrix: every column except the target, restricted to numeric dtypes
non_target = df.drop(columns=['loan_status_clean'])
X = non_target.select_dtypes(include=['number'])


In [None]:
def clean_emp_length(val):
    """Convert an employment-length value to a number of years.

    Args:
        val: raw value such as '< 1 year', '3 years', '10+ years', 'n/a',
            a missing value, or a number that has already been converted.

    Returns:
        0 for missing values, 'n/a' and unparseable text; 0.5 for text
        containing '<'; 10 for text containing '10+'; otherwise the leading
        number as a float. Values that are already numeric are returned as
        floats unchanged, so applying the function twice is harmless.
    """
    if pd.isnull(val):
        return 0
    # Already converted (e.g. the cell was re-run): keep the value instead of
    # failing on the substring checks below.
    if isinstance(val, (int, float)):
        return float(val)

    text = str(val).strip()
    if text == 'n/a':
        return 0
    if '<' in text:
        return 0.5
    if '10+' in text:
        return 10
    try:
        return float(text.split()[0])
    except (ValueError, IndexError):
        # Empty text or a non-numeric first token
        return 0

# Replace the employment-length column with its numeric form, when it exists
has_emp_length = 'emp_length' in df.columns
if has_emp_length:
    raw_emp_length = df['emp_length']
    df['emp_length'] = raw_emp_length.apply(clean_emp_length)


In [None]:
# Impute missing numeric values with the median of their own column
numeric_medians = df.median(numeric_only=True)
df.fillna(numeric_medians, inplace=True)


Model

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import joblib

# Columns describing payments, recoveries and refreshed credit scores. They are
# recorded after a loan has been issued, so they encode the outcome the model
# is supposed to predict and would inflate every metric below.
# NOTE(review): list taken from the LendingClub data dictionary — confirm it
# matches the intended prediction point (loan application time).
LEAKAGE_COLS = [
    'out_prncp', 'out_prncp_inv',
    'total_pymnt', 'total_pymnt_inv',
    'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee',
    'recoveries', 'collection_recovery_fee',
    'last_pymnt_amnt',
    'last_fico_range_high', 'last_fico_range_low',
]

# Features and target
X = df.drop(columns=['loan_status_clean'])
X = X.select_dtypes(include=['number'])  # Only numeric features
X = X.drop(columns=[col for col in LEAKAGE_COLS if col in X.columns])
y = df['loan_status_clean']

# Split; stratify so both partitions keep the same share of bad loans
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train XGBoost. `use_label_encoder` is no longer passed: it is deprecated and
# recent XGBoost releases warn that it is unused.
xgb_model = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    random_state=42,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
)

xgb_model.fit(X_train, y_train)

# Evaluate on the held-out split: hard labels for the report, positive-class
# probabilities (computed once) for ROC AUC
y_pred = xgb_model.predict(X_test)
y_proba = xgb_model.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_proba))


In [None]:
# Report the test-set ROC AUC from the predicted positive-class probabilities
positive_proba = xgb_model.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, positive_proba)
print("ROC AUC Score:", test_auc)

In [None]:
# Horizontal bar chart of the 15 largest model feature importances
importances = pd.Series(data=xgb_model.feature_importances_, index=X.columns)
top_features = importances.nlargest(15)
top_features.plot(kind='barh', title='Top 15 Important Features')
plt.show()


In [None]:
import shap
import matplotlib.pyplot as plt

# SHAP init
# Using the entire training set as background and explaining the entire test
# set makes this cell impractically slow on a dataset of this size, so both
# are limited to a fixed number of rows. The test rows are taken from the top
# of X_test (already shuffled by the split), so shap_values[0] still refers to
# the first test row.
SHAP_BACKGROUND_ROWS = 1000
SHAP_EXPLAIN_ROWS = 2000

shap_background = X_train.sample(
    n=min(SHAP_BACKGROUND_ROWS, len(X_train)), random_state=42
)
explainer = shap.Explainer(xgb_model, shap_background)
shap_values = explainer(X_test.iloc[:SHAP_EXPLAIN_ROWS])

# Global Feature Importance
shap.plots.bar(shap_values, max_display=15)


In [None]:
# Per-feature contribution breakdown for the first explained row
first_explanation = shap_values[0]
shap.plots.waterfall(first_explanation)


In [None]:
# Persist the fitted classifier to disk in the working directory
MODEL_FILE = "xgboost_credit_model.pkl"
joblib.dump(xgb_model, MODEL_FILE)
print("Boost model saved as 'xgboost_credit_model.pkl'")


In [None]:
from IPython.display import FileLink
# Render a clickable link to the saved model file as the cell output
model_link = FileLink(r'xgboost_credit_model.pkl')
model_link
