In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [3]:
!unzip /content/credit_card_default_dataset.zip

Archive:  /content/credit_card_default_dataset.zip
  inflating: UCI_Credit_Card.csv     


In [4]:
# Load the extracted credit-card default CSV into the working DataFrame
credit_df = pd.read_csv("/content/UCI_Credit_Card.csv")

In [5]:
# Preview the loaded frame; the notebook display shows the first and last rows only
credit_df

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000.0,1,3,1,39,0,0,0,0,...,88004.0,31237.0,15980.0,8500.0,20000.0,5003.0,3047.0,5000.0,1000.0,0
29996,29997,150000.0,1,3,2,43,-1,-1,-1,-1,...,8979.0,5190.0,0.0,1837.0,3526.0,8998.0,129.0,0.0,0.0,0
29997,29998,30000.0,1,2,2,37,4,3,2,-1,...,20878.0,20582.0,19357.0,0.0,0.0,22000.0,4200.0,2000.0,3100.0,1
29998,29999,80000.0,1,3,1,41,1,-1,0,0,...,52774.0,11855.0,48944.0,85900.0,3409.0,1178.0,1926.0,52964.0,1804.0,1


# Part A : Data Preprocessing and Imputation

In [6]:
# Inject artificial missing values into three numeric columns.
# Only rows with EDUCATION == 3 and MARRIAGE == 1 are eligible, so missingness
# depends on observed columns (the author's assumption: values are harder to
# obtain for married individuals educated up to high school).
np.random.seed(42)
missing_cols = ['LIMIT_BAL', 'AGE', 'BILL_AMT1']

# The eligibility mask does not depend on the column being blanked, so build it once.
mask = (credit_df['EDUCATION'] == 3) & (credit_df['MARRIAGE'] == 1)
eligible_rows = credit_df[mask]

for col in missing_cols:
  # Draw a fresh fraction between 5% and 10% of the eligible rows for each column.
  frac = np.random.uniform(0.05, 0.1)
  # NOTE(review): random_state is fixed at 42 for every column, so the sampled
  # row sets very likely overlap heavily across columns — confirm this is intended.
  mask_idx = eligible_rows.sample(frac=frac, random_state=42).index
  credit_df.loc[mask_idx, col] = np.nan

missing_summary = credit_df[missing_cols].isnull().sum()  # no. of missing values in each column
print(missing_summary)

LIMIT_BAL    197
AGE          279
BILL_AMT1    248
dtype: int64


In [17]:
# Dataset A: fill every injected gap with the median of its own column
dataset_A = credit_df.copy()
column_medians = dataset_A[missing_cols].median()  # one median per incomplete column
dataset_A[missing_cols] = dataset_A[missing_cols].fillna(column_medians)

print('\nMissing values after median imputation for Dataset A : ')
print(dataset_A[missing_cols].isnull().sum())


Missing values after median imputation for Dataset A : 
LIMIT_BAL    0
AGE          0
BILL_AMT1    0
dtype: int64


The median is preferred over the mean for imputation because it provides a more robust measure of central tendency in the presence of outliers or skewed distributions. Extreme values can disproportionately influence the mean since the mean is computed as the arithmetic average of all observations (every value contributes directly to its calculation). As a result, even a single unusually large or small value can pull the mean toward itself, distorting the overall representation of the dataset. This leads to biased imputations, especially when the data are not symmetrically distributed. Therefore, the median serves as a more reliable estimator in such cases, as it remains unaffected by the magnitude of extreme observations.

In [15]:
# Dataset B: impute LIMIT_BAL with a linear regression fitted on the rows where it is observed
dataset_B = credit_df.copy()
target_col = 'LIMIT_BAL'
other_cols = ['AGE', 'BILL_AMT1']

# The other incomplete columns are median-filled first so they can act as predictors
dataset_B[other_cols] = dataset_B[other_cols].fillna(dataset_B[other_cols].median())

# Split rows by whether the imputation target is present
target_is_missing = dataset_B[target_col].isna()
known_data_B = dataset_B[~target_is_missing]
unknown_data_B = dataset_B[target_is_missing]

# Predictors exclude the imputation target, the row identifier and the class label
non_feature_cols = [target_col, 'ID', 'default.payment.next.month']
X_train_B = known_data_B.drop(columns=non_feature_cols)
y_train_B = known_data_B[target_col]
X_test_B = unknown_data_B.drop(columns=non_feature_cols)

lin_reg = LinearRegression().fit(X_train_B, y_train_B)
y_pred_B = lin_reg.predict(X_test_B)

# Write the predictions back into the rows that were missing the target
dataset_B.loc[unknown_data_B.index, target_col] = y_pred_B

print('\nMissing values after imputation for Dataset B : ')
print(dataset_B[target_col].isnull().sum())


Missing values after imputation for Dataset B : 
0


Data is said to be Missing at Random (MAR) when the probability of a data point being missing depends on other observed variables in the dataset, but not on the unobserved (missing) values themselves. The "missingness" of data points is systematically related to known, available data rather than the missing entries. Thus, the underlying assumption is that there exists a statistical relationship between the missingness pattern and certain observed features in the dataset.

In [16]:
# Dataset C: impute LIMIT_BAL with a decision-tree regressor fitted on the rows where it is observed
dataset_C = credit_df.copy()
target_col = 'LIMIT_BAL'
other_cols = ['AGE', 'BILL_AMT1']

for col in other_cols:    # median imputation for handling missing values of other columns
  median_value = dataset_C[col].median()
  dataset_C[col] = dataset_C[col].fillna(median_value)

known_data_C = dataset_C[dataset_C[target_col].notnull()]
unknown_data_C = dataset_C[dataset_C[target_col].isnull()]

# Predictors exclude the imputation target, the row identifier and the class label
X_train_C = known_data_C.drop(columns=[target_col, 'ID', 'default.payment.next.month'])
y_train_C = known_data_C[target_col]
X_test_C = unknown_data_C.drop(columns=[target_col, 'ID', 'default.payment.next.month'])

# Fix random_state so the fitted tree (and therefore the imputed values) is
# reproducible regardless of the global NumPy RNG state or cell execution order.
dec_tree_reg = DecisionTreeRegressor(random_state=42)
dec_tree_reg.fit(X_train_C, y_train_C)
y_pred_C = dec_tree_reg.predict(X_test_C)

# Write the predictions back into the rows that were missing the target
dataset_C.loc[unknown_data_C.index, target_col] = y_pred_C

print('\nMissing values after imputation for Dataset C : ')
print(dataset_C[target_col].isnull().sum())


Missing values after imputation for Dataset C : 
0


# Part B : Model Training and Performance Assessment

In [10]:
# Single StandardScaler instance shared by the dataset cells below; each of them
# calls fit_transform on its own training split before transforming its test split.
scaler=StandardScaler()

Dataset A (Median Imputation)

In [11]:
# Dataset A: hold out 20% for testing, standardise with training-split statistics,
# then fit and score a logistic-regression classifier.
label_col = 'default.payment.next.month'
X_a = dataset_A.drop(columns=['ID', label_col])
y_a = dataset_A[label_col]

X_a_train, X_a_test, y_a_train, y_a_test = train_test_split(
    X_a, y_a, test_size=0.2, random_state=42
)
X_a_train_scaled = scaler.fit_transform(X_a_train)  # fit on the training split only
X_a_test_scaled = scaler.transform(X_a_test)

lgr_reg_a = LogisticRegression().fit(X_a_train_scaled, y_a_train)
y_a_pred = lgr_reg_a.predict(X_a_test_scaled)

print(classification_report(y_a_test, y_a_pred))
# Headline metrics are reported for the positive class (label 1)
for metric_label, metric_fn in (
    ("Accuracy = ", accuracy_score),
    ("Precision = ", precision_score),
    ("Recall = ", recall_score),
    ("F1 Score = ", f1_score),
):
    print(metric_label, metric_fn(y_a_test, y_a_pred))

              precision    recall  f1-score   support

           0       0.82      0.97      0.89      4687
           1       0.70      0.24      0.35      1313

    accuracy                           0.81      6000
   macro avg       0.76      0.60      0.62      6000
weighted avg       0.79      0.81      0.77      6000

Accuracy =  0.8101666666666667
Precision =  0.695067264573991
Recall =  0.2361005331302361
F1 Score =  0.35247299602046617


Dataset B (Linear Regression Imputation)

In [12]:
# Dataset B: hold out 20% for testing, standardise with training-split statistics,
# then fit and score a logistic-regression classifier.
label_col = 'default.payment.next.month'
X_b = dataset_B.drop(columns=['ID', label_col])
y_b = dataset_B[label_col]

X_b_train, X_b_test, y_b_train, y_b_test = train_test_split(
    X_b, y_b, test_size=0.2, random_state=42
)
X_b_train_scaled = scaler.fit_transform(X_b_train)  # fit on the training split only
X_b_test_scaled = scaler.transform(X_b_test)

lgr_reg_b = LogisticRegression().fit(X_b_train_scaled, y_b_train)
y_b_pred = lgr_reg_b.predict(X_b_test_scaled)

print(classification_report(y_b_test, y_b_pred))
# Headline metrics are reported for the positive class (label 1)
for metric_label, metric_fn in (
    ("Accuracy = ", accuracy_score),
    ("Precision = ", precision_score),
    ("Recall = ", recall_score),
    ("F1 Score = ", f1_score),
):
    print(metric_label, metric_fn(y_b_test, y_b_pred))

              precision    recall  f1-score   support

           0       0.82      0.97      0.89      4687
           1       0.70      0.24      0.35      1313

    accuracy                           0.81      6000
   macro avg       0.76      0.60      0.62      6000
weighted avg       0.79      0.81      0.77      6000

Accuracy =  0.8103333333333333
Precision =  0.6957494407158836
Recall =  0.23686214775323686
F1 Score =  0.3534090909090909


Dataset C (Decision Tree Imputation)

In [13]:
# Dataset C: hold out 20% for testing, standardise with training-split statistics,
# then fit and score a logistic-regression classifier.
label_col = 'default.payment.next.month'
X_c = dataset_C.drop(columns=['ID', label_col])
y_c = dataset_C[label_col]

X_c_train, X_c_test, y_c_train, y_c_test = train_test_split(
    X_c, y_c, test_size=0.2, random_state=42
)
X_c_train_scaled = scaler.fit_transform(X_c_train)  # fit on the training split only
X_c_test_scaled = scaler.transform(X_c_test)

lgr_reg_c = LogisticRegression().fit(X_c_train_scaled, y_c_train)
y_c_pred = lgr_reg_c.predict(X_c_test_scaled)

print(classification_report(y_c_test, y_c_pred))
# Headline metrics are reported for the positive class (label 1)
for metric_label, metric_fn in (
    ("Accuracy = ", accuracy_score),
    ("Precision = ", precision_score),
    ("Recall = ", recall_score),
    ("F1 Score = ", f1_score),
):
    print(metric_label, metric_fn(y_c_test, y_c_pred))

              precision    recall  f1-score   support

           0       0.82      0.97      0.89      4687
           1       0.69      0.24      0.35      1313

    accuracy                           0.81      6000
   macro avg       0.76      0.60      0.62      6000
weighted avg       0.79      0.81      0.77      6000

Accuracy =  0.81
Precision =  0.6935123042505593
Recall =  0.2361005331302361
F1 Score =  0.3522727272727273


Dataset D (Drop Missing Values)

In [14]:
# Dataset D: listwise deletion — keep only rows with no missing values
# (dropna returns a new frame, so credit_df itself is left untouched).
dataset_D = credit_df.dropna()

label_col = 'default.payment.next.month'
X_d = dataset_D.drop(columns=['ID', label_col])
y_d = dataset_D[label_col]

X_d_train, X_d_test, y_d_train, y_d_test = train_test_split(
    X_d, y_d, test_size=0.2, random_state=42
)
X_d_train_scaled = scaler.fit_transform(X_d_train)  # fit on the training split only
X_d_test_scaled = scaler.transform(X_d_test)

lgr_reg_d = LogisticRegression().fit(X_d_train_scaled, y_d_train)
y_d_pred = lgr_reg_d.predict(X_d_test_scaled)

print(classification_report(y_d_test, y_d_pred))
# Headline metrics are reported for the positive class (label 1)
for metric_label, metric_fn in (
    ("Accuracy = ", accuracy_score),
    ("Precision = ", precision_score),
    ("Recall = ", recall_score),
    ("F1 Score = ", f1_score),
):
    print(metric_label, metric_fn(y_d_test, y_d_pred))

              precision    recall  f1-score   support

           0       0.82      0.97      0.89      4659
           1       0.68      0.22      0.34      1286

    accuracy                           0.81      5945
   macro avg       0.75      0.60      0.61      5945
weighted avg       0.79      0.81      0.77      5945

Accuracy =  0.8092514718250631
Precision =  0.6784037558685446
Recall =  0.22472783825816486
F1 Score =  0.33761682242990654


# Part C : Comparative Analysis

Model | Accuracy | Precision | Recall | F1 Score
--- | --- | --- | --- | ---
Dataset A (Median Imputation) | 0.8102 | 0.6951 | 0.2361 | 0.3525
Dataset B (Linear Regression Imputation) | 0.8103 | 0.6957 | 0.2369 | 0.3534
Dataset C (Decision Tree Imputation) | 0.8100 | 0.6935 | 0.2361 | 0.3523
Dataset D (Drop Missing Values) | 0.8093 | 0.6784 | 0.2247 | 0.3376

Listwise deletion removes every row that contains a missing value, which yields a smaller and potentially biased dataset, especially when the missingness is not completely random. Imputation, in contrast, preserves the dataset size and the information in the partially observed rows by filling in the missing values. The trade-off is between simplicity and bias. The primary advantage of listwise deletion is that it is easy to implement and does not introduce any artificial data. However, by discarding incomplete observations it reduces the sample size and biases the remaining data whenever the values are not missing completely at random (MCAR). Imputation methods, on the other hand, preserve the dataset size by filling in missing values based on observed information. This may introduce some degree of estimation error, but it generally maintains model robustness and reliability for predictions, especially when the data points are missing at random (MAR). Model D therefore performs slightly worse than the imputed models (F1 score of 0.3376 versus roughly 0.352–0.353), because it discards potentially informative data, distorts the underlying data distribution, and increases the variance of the model estimates. Note that Model D is also evaluated on a smaller test split (5,945 rows instead of 6,000), so the comparison is not made on identical rows. Imputation, though more complex, provides a more balanced approach by leveraging available information to approximate the missing values.

Based on the performance metrics, linear regression imputation performs slightly better than decision tree imputation, although the margin is very small (F1 score of 0.3534 versus 0.3523). Linear regression assumes a linear relationship between the target variable and the predictors. A decision tree regressor does not make any assumption about the functional form relating the predictors to the target; it partitions the feature space and assigns a constant prediction within each region. The relationship between a customer's credit limit (LIMIT_BAL) and the monthly bill amounts (BILL_AMT) and payment amounts (PAY_AMT) is largely linear, as people with a high credit limit tend to spend more. Thus, better results are observed for the linear model. The relationship is not strictly linear, however, which is why the linear model performs only slightly better than the non-linear one.

The best strategy to handle missing data is to perform **imputation using linear regression** for features that are largely linearly dependent on other observed variables. For unrelated variables, simpler methods such as median imputation should suffice. Based on the performance metrics (accuracy, precision, recall and F1 score), the model performs best on the data imputed using linear regression. Conceptually, this makes sense: if certain features are linearly dependent on other variables, linear regression imputation performs better than both simple median imputation and more complex non-linear imputation. Since the credit limit is roughly proportional to customers' spending and repayment behavior, the linear assumption is realistic. For features that do not depend strongly on other observed variables (age, for example, does not depend on spending or repayment behavior), simpler imputation methods such as median imputation can work effectively. Combining these techniques provides the best performance and produces the most reliable classification results for predicting default risk.