<a href="https://colab.research.google.com/github/Arshdeep-Yadav/credit-risk-analysis/blob/main/credit_risk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub

# Fetch the Lending Club loan dataset through kagglehub. The value returned by
# dataset_download is kept in `path` and reused by the cells that follow.
path = kagglehub.dataset_download(
    "adarshsng/lending-club-loan-data-csv"
)

print(f"Path to dataset files: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/adarshsng/lending-club-loan-data-csv?dataset_version_number=1...


100%|██████████| 339M/339M [00:02<00:00, 155MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/adarshsng/lending-club-loan-data-csv/versions/1


In [None]:
import os
import pandas as pd

# List the files in the dataset directory. os.listdir returns entries in
# arbitrary order, so sort them to keep the displayed listing reproducible
# across runs and machines.
sorted(os.listdir(path))

['LCDataDictionary.xlsx', 'loan.csv']

In [None]:
# Read loan.csv from the downloaded dataset directory into `df`.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference on columns.
file_path = os.path.join(path, "loan.csv")
df = pd.read_csv(
    filepath_or_buffer=file_path,
    low_memory=False,
)

# Display (rows, columns) as the cell output.
df.shape

(2260668, 145)

In [None]:
# Print a heading followed by every column name of `df` on the next line.
print(
    "Available columns in the DataFrame:",
    df.columns.tolist(),
    sep="\n",
)

Available columns in the DataFrame:
['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', 'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med', 'mths_since_last_major_derog', 'policy_code', 'application_type', 'annual_inc_joint', 'dti_joint', 'verification_status_joint', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', '

In [None]:
# Columns we would like to model with; `loan_status` is the raw target and is
# required by the label-construction cell further down.
requested_cols = [
    "loan_amnt",
    "annual_inc",
    "int_rate",
    "dti",
    "delinq_2yrs",
    "open_acc",
    "grade",
    "loan_status"
]

# Keep only the requested columns that actually exist in `df`, but never drop
# one silently: report what was skipped so the feature set is not a surprise.
final_cols = [c for c in requested_cols if c in df.columns]
missing_cols = [c for c in requested_cols if c not in df.columns]

if missing_cols:
    print("WARNING: requested columns not found and skipped:", missing_cols)

# Fail here with a clear message rather than later with an opaque KeyError.
if "loan_status" not in final_cols:
    raise KeyError("'loan_status' is required to build the target but is not in df")

print("Using columns:", final_cols)

# .copy() gives an independent frame so later edits never touch `df`.
df_model = df[final_cols].copy()

Using columns: ['loan_amnt', 'annual_inc', 'int_rate', 'dti', 'loan_status']


In [None]:
# Encode the letter grade as an ordinal number (A=1 ... G=7) in `grade_num`
# and remove the original `grade` column. Values outside A-G map to NaN.
# The guard makes the cell a no-op when `grade` is absent.
if "grade" in df_model.columns:
    grade_mapping = dict(zip("ABCDEFG", range(1, 8)))
    # pop() removes `grade` from the frame and hands back the Series to map.
    df_model["grade_num"] = df_model.pop("grade").map(grade_mapping)

In [None]:
# Loan statuses kept for modelling; rows with any other status are discarded.
valid_status = [
    "Fully Paid",
    "Charged Off",
    "Default",
    "Late (31-120 days)"
]

# Build the modelling frame in one chain so each step returns a new object.
# The original assigned a column onto a boolean-filtered slice, which raises
# SettingWithCopyWarning; .assign on the filtered result avoids that.
df_model = (
    df_model.loc[df_model["loan_status"].isin(valid_status)]
    # Binary target: 0 for "Fully Paid", 1 for every other retained status.
    .assign(default=lambda frame: (frame["loan_status"] != "Fully Paid").astype(int))
    .drop(columns="loan_status")
    # Drop rows with a missing value in any remaining column.
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# Quick sanity check of the modelling frame: column names, first rows, dtypes.
print(
    df_model.columns.tolist(),
    df_model.head(),
    df_model.dtypes,
    sep="\n",
)

['loan_amnt', 'annual_inc', 'int_rate', 'dti', 'default']
   loan_amnt  annual_inc  int_rate    dti  default
0      30000    100000.0     22.35  30.46        0
1      40000     45000.0     16.14  50.53        0
2      20000    100000.0      7.56  18.92        0
3       4500     38500.0     11.31   4.64        0
4       8425    450000.0     27.27  12.37        0
loan_amnt       int64
annual_inc    float64
int_rate      float64
dti           float64
default         int64
dtype: object


In [None]:
# Show the final column names of the modelling frame as a plain list.
df_model.columns.tolist()

['loan_amnt', 'annual_inc', 'int_rate', 'dti', 'default']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Separate the predictors from the binary `default` target.
X = df_model.drop(columns="default")
y = df_model["default"]

# Split BEFORE scaling. Fitting the scaler on the full dataset (as the cell
# originally did) lets test-set statistics leak into preprocessing and biases
# the reported metric. stratify=y keeps the default rate equal in both splits.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn mean/std from the training rows only, then apply them to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix, kept under its original name for any later use; it is
# transformed with the train-fitted scaler and is not used for fitting.
X_scaled = scaler.transform(X)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Probability of the positive class (default == 1) on the held-out rows.
y_prob = model.predict_proba(X_test)[:, 1]
roc = roc_auc_score(y_test, y_prob)

print("ROC-AUC:", roc)

ROC-AUC: 0.6883913718630645


In [None]:
# Pair each predictor name with its fitted logistic-regression coefficient.
# Coefficients are on the standardized feature scale, since the model was
# trained on scaler output.
intercept = model.intercept_[0]

coef_df = (
    pd.Series(model.coef_[0], index=X.columns)
    .rename_axis("feature")
    .reset_index(name="coefficient")
)

print(coef_df)
print("Intercept:", intercept)

      feature  coefficient
0   loan_amnt     0.132738
1  annual_inc    -0.196206
2    int_rate     0.576956
3         dti     0.130706
Intercept: -1.4162572357921508


In [None]:
# Persist the coefficient table to the current working directory, without the
# row index, as lr_coefficients.csv.
coef_df.to_csv(
    path_or_buf="lr_coefficients.csv",
    index=False,
)

SyntaxError: invalid syntax (ipython-input-1088058822.py, line 1)