In [1]:
# Part 1: Setup and Data Loading

# Step 1: Install any needed libraries (if not already available)
!pip install -q pandas numpy scikit-learn matplotlib seaborn

# Step 2: Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix

# Step 3: Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data"
column_names = [
    "A1", "A2", "A3", "A4", "A5", "A6", "A7",
    "A8", "A9", "A10", "A11", "A12", "A13", "A14", "A15", "Class"
]

df = pd.read_csv(url, header=None, names=column_names)

# Step 4: Preview the data
print("Dataset shape:", df.shape)
df.head()


Dataset shape: (690, 16)


Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,Class
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [2]:
# Part 2: Data Preprocessing

# Step 1: Replace '?' with NaN
df.replace('?', np.nan, inplace=True)

# Step 2: Identify columns with missing values
missing_cols = df.columns[df.isnull().any()]
print("Columns with missing values:", missing_cols.tolist())

# Step 3: Handle missing values
# For numerical columns, fill with mean
for col in ['A2', 'A3', 'A8', 'A11', 'A14']:
    df[col] = pd.to_numeric(df[col], errors='coerce')  # convert to numeric
    df[col].fillna(df[col].mean(), inplace=True)

# For categorical columns, fill with mode
for col in df.columns:
    if df[col].dtype == 'object':
        df[col].fillna(df[col].mode()[0], inplace=True)

# Step 4: Encode categorical variables using Label Encoding
label_encoders = {}
for col in df.columns:
    if df[col].dtype == 'object':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

# Step 5: Convert target column ('Class') to 0 and 1
# Assuming '+' means approved (1) and '-' means rejected (0)
# After encoding, this has already been done

# Step 6: Feature-target split
X = df.drop("Class", axis=1)
y = df["Class"]

# Step 7: Normalize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 8: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)


Columns with missing values: ['A1', 'A2', 'A4', 'A5', 'A6', 'A7', 'A14']
Training set size: (552, 15)
Test set size: (138, 15)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [1]:
df.head()

NameError: name 'df' is not defined

In [3]:
print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)

Training set size: (552, 15)
Test set size: (138, 15)
