In [18]:
# 1️⃣ Import libraries
import pandas as pd
import os
import requests  # Added for downloading data

# Remote source of the raw dataset and the local path it is cached at.
url = 'https://raw.githubusercontent.com/Abre1234/credit-risk-xai/main/data/raw/credit_risk_dataset.csv'
save_path = 'data/raw/credit_risk_dataset.csv'

# 2️⃣ Load dataset
# Create 'data/raw' directory if it doesn't exist
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Download the file only when no cached copy exists, so re-running the cell
# does not hit the network again.
if not os.path.exists(save_path):
    print(f"Downloading data from {url} to {save_path}...")
    # requests has no default timeout; without one a stalled connection
    # would block the kernel indefinitely.
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Raise an exception for bad status codes
    # Write to a temporary sibling file and rename it into place: if the write
    # is interrupted, no truncated file is left at save_path for the
    # "already exists" check above to mistake for a complete download.
    tmp_path = save_path + '.part'
    with open(tmp_path, 'wb') as f:
        f.write(response.content)
    os.replace(tmp_path, save_path)
    print("Download complete.")
else:
    print(f"File already exists at {save_path}. Skipping download.")

file_path = save_path  # Use the path where the downloaded file is saved
data = pd.read_csv(file_path)

# 3️⃣ Quick inspection
print("Shape of dataset:", data.shape)
print("\nFirst 5 rows:\n", data.head())
print("\nColumns:\n", data.columns)

# 4️⃣ Clean column names (trim surrounding whitespace, lowercase, replace spaces)
# Stripping first keeps a stray leading/trailing space in a header from
# turning into a leading/trailing underscore.
data.columns = [col.strip().lower().replace(' ', '_') for col in data.columns]

# 5️⃣ Standardise the target column
# The label column in this dataset is 'loan_status'; rename it to 'target'
# for consistency. A frame that already carries 'target' is accepted as-is.
# Anything else is reported here, with the columns that were found, instead of
# surfacing later as a bare KeyError on data['target'].
if 'loan_status' in data.columns:
    data = data.rename(columns={'loan_status': 'target'})
elif 'target' not in data.columns:
    raise KeyError(
        "Expected a 'loan_status' (or already renamed 'target') column; "
        f"found columns: {list(data.columns)}"
    )

# 6️⃣ Optional: Check missing values
print("\nMissing values per column:\n", data.isnull().sum())

# 7️⃣ Quick distribution of target
print("\nTarget value counts:\n", data['target'].value_counts())

# 8️⃣ Decide where the cleaned dataset goes and make sure its folder exists
processed_path = 'data/processed/credit_data.csv'
os.makedirs('data/processed', exist_ok=True)

# 9️⃣ Write the cleaned dataset (without the row index) for the EDA step
data.to_csv(processed_path, index=False)
print(f"\nProcessed dataset saved to: {processed_path}")

File already exists at data/raw/credit_risk_dataset.csv. Skipping download.
Shape of dataset: (32581, 12)

First 5 rows:
    person_age  person_income person_home_ownership  person_emp_length  \
0          22          59000                  RENT              123.0   
1          21           9600                   OWN                5.0   
2          25           9600              MORTGAGE                1.0   
3          23          65500                  RENT                4.0   
4          24          54400                  RENT                8.0   

  loan_intent loan_grade  loan_amnt  loan_int_rate  loan_status  \
0    PERSONAL          D      35000          16.02            1   
1   EDUCATION          B       1000          11.14            0   
2     MEDICAL          C       5500          12.87            1   
3     MEDICAL          C      35000          15.23            1   
4     MEDICAL          C      35000          14.27            1   

   loan_percent_income cb_person_def

In [19]:

import pandas as pd
# Reload the processed dataset from disk and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='data/processed/credit_data.csv')
data.head(5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,target,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
