# Credit Classification using Artificial Neural Networks

This project builds and evaluates a basic ANN model and compares it with other models to classify loan status as "Fully Paid" or "Charged Off".


In [12]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Load the data
df = pd.read_csv("credit_train.csv", na_values='?')

# Drop 'Loan ID' and 'Customer ID' columns
df = df.drop(columns=["Loan ID", "Customer ID"])

# Convert 'Loan Status' to binary (Fully Paid = 1, others = 0)
df["Loan Status"] = (df["Loan Status"] == "Fully Paid").astype(int)

# Define feature columns
cat_cols = ['Term', 'Years in current job', 'Home Ownership', 'Purpose']
int_cols = ['Credit Score', 'Number of Open Accounts', 'Number of Credit Problems', 'Bankruptcies', 'Tax Liens']
float_cols = ['Current Loan Amount', 'Annual Income', 'Monthly Debt', 'Years of Credit History', 'Months since last delinquent', 'Current Credit Balance', 'Maximum Open Credit']

# Convert categorical columns to 'category' type
for col in cat_cols:
    df[col] = df[col].astype('category')

# Convert integer columns to 'Int64' type and handle invalid values
for col in int_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')

# Convert float columns to numeric type
for col in float_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Process 'Months since last delinquent': Convert to binary (1 if overdue, 0 otherwise)
df['Months since last delinquent'] = df['Months since last delinquent'].apply(
    lambda x: 1 if pd.notnull(x) and x > 0 else 0
)

# Create modified column: Add 1 to all values, fill NaN with 0
df['Months since last delinquent (modified)'] = df['Months since last delinquent'] + 1

# Fix NaN fill issue: Fill NaN with 0
df['Months since last delinquent (modified)'] = df['Months since last delinquent (modified)'].fillna(0)

# Fill missing values in numerical columns with median
num_cols = int_cols + float_cols
imputer = SimpleImputer(strategy="median")
df[num_cols] = imputer.fit_transform(df[num_cols])

# Fill missing values in categorical columns with the most frequent value
for col in cat_cols:
    imputer = SimpleImputer(strategy="most_frequent")
    df[col] = imputer.fit_transform(df[[col]]).ravel()  # Use ravel() to convert the result to a 1D array

# Display basic statistics and the first few rows
print(df.describe())
print(df.head())




         Loan Status  Current Loan Amount   Credit Score  Annual Income  \
count  100514.000000         1.005140e+05  100514.000000   1.005140e+05   
mean        0.769654         1.170190e+07    1007.489514   1.338337e+06   
std         0.421056         3.171309e+07    1330.570078   9.731830e+05   
min         0.000000         1.080200e+04     585.000000   7.662700e+04   
25%         1.000000         1.799160e+05     711.000000   9.324250e+05   
50%         1.000000         3.122460e+05     724.000000   1.174162e+06   
75%         1.000000         5.232920e+05     738.000000   1.509968e+06   
max         1.000000         1.000000e+08    7510.000000   1.655574e+08   

        Monthly Debt  Years of Credit History  Months since last delinquent  \
count  100514.000000            100514.000000                 100514.000000   
mean    18460.895674                18.192498                      0.464045   
std     12144.885080                 6.997977                      0.498708   
min     

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Numerical pipeline
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical pipeline
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# Combine both pipelines
preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features)
])

# Transform data
X_processed = preprocessor.fit_transform(X)



ValueError: Specifying the columns using strings is only supported for dataframes.

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)


In [7]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

# Load the dataset (using Iris dataset as an example)
data = load_iris()
X = data.data
y = data.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the data (scaling the features to zero mean and unit variance)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the MLPClassifier model
mlp = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu', solver='adam', 
                    max_iter=100, tol=1e-4, random_state=42)

# Train the model
mlp.fit(X_train_scaled, y_train)

# Output the model accuracy on training and testing datasets
print(f"Training accuracy: {mlp.score(X_train_scaled, y_train):.4f}")
print(f"Test accuracy: {mlp.score(X_test_scaled, y_test):.4f}")


Training accuracy: 0.9333
Test accuracy: 0.9111




In [None]:
from sklearn.metrics import classification_report

y_pred = mlp.predict(X_test)
print(classification_report(y_test, y_pred))
