In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Fix the seed of NumPy's global random number generator so reruns are repeatable
np.random.seed(seed=42)

# Read the data set from a path relative to the working directory,
# then preview the first five rows
df = pd.read_csv(filepath_or_buffer='Default.csv')
df.head(n=5)

Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.1347
2,No,No,1073.549164,31767.138947
3,No,No,529.250605,35704.493935
4,No,No,785.655883,38463.495879


## (a) Fit a logistic regression model that uses `income` and `balance` to predict `default`


In [3]:
# Convert 'default' to a binary variable: 'Yes' -> 1, 'No' -> 0
df['default_binary'] = df['default'].map({'Yes': 1, 'No': 0})

# Series.map leaves NaN for any label that is not a key of the dict, so fail
# fast here with a clear message instead of an opaque error inside the fit.
assert df['default_binary'].notna().all(), "unexpected label in 'default' column"

# Features (two predictors) and target (the 0/1 response built above)
X = df[['income', 'balance']]
y = df['default_binary']

# Fit logistic regression on the full data set.
# NOTE: the estimator is created with scikit-learn's default settings, which
# apply L2 regularization (C=1.0) rather than a plain maximum-likelihood fit.
logreg = LogisticRegression()
logreg.fit(X, y)

# Display coefficients: one row per predictor. coef_ has a single row for a
# binary target, hence the [0].
coef_df = pd.DataFrame({'Feature': X.columns, 'Coefficient': logreg.coef_[0]})
coef_df


Unnamed: 0,Feature,Coefficient
0,income,2.1e-05
1,balance,0.005647


## (b) Validation Set Approach

In [4]:
# Hold out half of the observations for validation, using a seeded split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.5, random_state=42
)

# Train on the training half only
logreg_val = LogisticRegression().fit(X_train, y_train)

# Column 1 of predict_proba is the estimated probability of class 1 (default);
# label a held-out row as a default when that probability exceeds 0.5
probs = logreg_val.predict_proba(X_val)[:, 1]
y_pred = np.where(probs > 0.5, 1, 0)

# Validation set error = fraction of held-out rows that are misclassified
val_error = 1 - accuracy_score(y_val, y_pred)
print(f'Validation set error (split 1): {val_error:.4f}')


Validation set error (split 1): 0.0258


## (c) Repeat the process in (b) three times with different splits

In [5]:
# Repeat the 50/50 validation-set procedure for several seeded splits and
# collect the misclassification rate obtained on each held-out half.
val_errors = []
for seed in (42, 24, 101):
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.5, random_state=seed
    )
    # Refit from scratch on this split's training half
    logreg_val = LogisticRegression().fit(X_train, y_train)
    # Probability of class 1 for each held-out row, thresholded at 0.5
    probs = logreg_val.predict_proba(X_val)[:, 1]
    y_pred = np.where(probs > 0.5, 1, 0)
    error = 1 - accuracy_score(y_val, y_pred)
    val_errors.append(error)
    print(f'Validation set error (split with random_state={seed}): {error:.4f}')

# Summarise the splits by their mean error
print(f"\nAverage validation set error: {np.mean(val_errors):.4f}")


Validation set error (split with random_state=42): 0.0258
Validation set error (split with random_state=24): 0.0300
Validation set error (split with random_state=101): 0.0246

Average validation set error: 0.0268


### Comment on the results

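The validation set error varies from split to split (from 0.0246 to 0.0300 here) because each random split places different observations in the training and validation halves. The three estimates are nevertheless close to one another and average 0.0268, so the model's test error is estimated at roughly 2.5–3%, with about half a percentage point of variability between splits. Note that the `random_state=42` split is the same one used in (b), which is why its error (0.0258) is repeated; only two of the three splits are new.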


## (d) Add `student` as a dummy variable

We'll now include `student` in the model as an additional predictor.

In [6]:
# Add 'student' dummy variable: 'Yes' -> 1, 'No' -> 0
df['student_dummy'] = df['student'].map({'Yes': 1, 'No': 0})

# Series.map leaves NaN for any label that is not a key of the dict, so fail
# fast here with a clear message instead of an opaque error inside the fit.
assert df['student_dummy'].notna().all(), "unexpected label in 'student' column"

# Features: income, balance, student
X2 = df[['income', 'balance', 'student_dummy']]

# Run the 50/50 validation-set procedure with the same three seeds used for the
# two-predictor model, so the errors can be compared split by split.
val_errors2 = []
for random_state in [42, 24, 101]:
    X_train, X_val, y_train, y_val = train_test_split(X2, y, test_size=0.5, random_state=random_state)
    # Refit from scratch on this split's training half
    logreg_val = LogisticRegression()
    logreg_val.fit(X_train, y_train)
    # Probability of class 1 for each held-out row, thresholded at 0.5
    probs = logreg_val.predict_proba(X_val)[:, 1]
    y_pred = (probs > 0.5).astype(int)
    # Validation set error = fraction of held-out rows that are misclassified
    error = 1 - accuracy_score(y_val, y_pred)
    val_errors2.append(error)
    print(f'Validation set error with student (split {random_state}): {error:.4f}')

# Summarise the splits by their mean error
print(f"\nAverage validation set error (with student): {np.mean(val_errors2):.4f}")


Validation set error with student (split 42): 0.0256
Validation set error with student (split 24): 0.0304
Validation set error with student (split 101): 0.0246

Average validation set error (with student): 0.0269


### Comment

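Comparing the average validation set error with and without the `student` variable shows that it is essentially unchanged: 0.0269 with `student` versus 0.0268 without it, over the same three splits. The per-split errors differ by at most 0.0004 (0.0256 vs 0.0258, 0.0304 vs 0.0300, and 0.0246 vs 0.0246). Had the error decreased noticeably, we would conclude that adding `student` helped; since it did not, including the `student` dummy variable does not appear to reduce the test error rate.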
