# In [9]:
import pandas as pd

# Clean the raw credit dataset and write the result to "credit_cleaned.csv".
# Steps: drop ID columns, derive a binary target, encode the delinquency
# feature, impute missing values, export.

# 1. Load the original dataset
df = pd.read_csv("credit_train.csv")

# 2. Drop unnecessary ID columns
df = df.drop(columns=['Loan ID', 'Customer ID'])

# 3. Create a binary target variable: Fully Paid = 1, Others = 0
#    (a missing 'Loan Status' compares unequal and therefore maps to 0)
df['Loan_Status_Binary'] = df['Loan Status'].eq('Fully Paid').astype('int64')

# 4. Drop the original 'Loan Status' column to avoid duplication
df = df.drop(columns='Loan Status')

# 5. Handle the 'Months since last delinquent' feature
# 5.1 Add a 0/1 flag column: 1 when a delinquency value is present
df['Has_Delinquency'] = df['Months since last delinquent'].notna().astype(int)

# 5.2 Add 1 to all non-null values, replace NaNs with 0 (indicating never delinquent)
#     NaN + 1 stays NaN, so shifting first and filling afterwards keeps 0
#     reserved for the "never delinquent" rows.
df['Months since last delinquent'] = (
    df['Months since last delinquent'] + 1
).fillna(0)

# 6. Fill missing values
# Identify numeric and categorical columns (the target is excluded from imputation)
numeric_cols = df.select_dtypes(include=['int64', 'float64']).drop(columns=['Loan_Status_Binary']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# 6.1 Fill missing numeric values with the median
# Assign the result back instead of calling fillna(inplace=True) on df[col]:
# the chained in-place form acts on an intermediate Series and does not
# update df when pandas Copy-on-Write is active.
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

# 6.2 Fill missing categorical values with the mode
for col in categorical_cols:
    modes = df[col].mode()
    if modes.empty:
        # Column has no non-missing values, so there is no mode to fill with.
        continue
    df[col] = df[col].fillna(modes.iloc[0])

# 7. Export the cleaned dataset to a new CSV file
df.to_csv("credit_cleaned.csv", index=False)
