## Ch5 exercises: cross-validation and the bootstrap

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy # importing SciPu library to do scientific computing, e.g. numerical integration, optimisaiton, linear algebra
import pandas as pd 
import math
import random
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.graphics.regressionplots import *
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import datasets, linear_model
from sklearn.utils import resample
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from collections import OrderedDict # provides specialised container datatypes that are alternatives to built-in data structures like lists, tuples, dictionaries etc

# OrderedDict is a dictionary subclass that maintains the order of inserted items, useful for when the order of elements is important

### Question 5

In [2]:
data = pd.read_csv('data/Default.csv', header = 0)

In [3]:
data.head()

Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.1347
2,No,No,1073.549164,31767.138947
3,No,No,529.250605,35704.493935
4,No,No,785.655883,38463.495879


In [4]:
# separating the input features (X) from the target variable (Y)
X = data[['income', 'balance']]
y = data['default']

In [5]:
# splitting the data into testing and training sets using sklearn module 'test_train_split'
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

#### Notes on interpretation

The <code>train_test_split</code> function is part of the <code>sklearn.model_selection</code> module in <code>scikit-learn</code>. It allows you to randomly split the data into subsets for training and validation purposes. Here's what each argument in the function does:

- <code>X</code>: This is the input feature matrix (independent variables) that you want to split.
- <code>y</code>: This is the target variable (dependent variable) corresponding to the input features.
- <code>test_size</code>: This parameter specifies the proportion of the data that will be allocated for validation. In this case, <code>test_size=0.2</code> means that 20% of the data will be used for validation, and the remaining 80% will be used for training.
- <code>random_state</code>: This parameter allows you to specify a random seed for reproducibility. Setting it to a specific value like 42 ensures that the data split will be the same every time you run the code, which can be helpful for consistency. It can be set to any random number, e.g. 42 or 1. 42 is often seen because it's become convention in the DS community following the Hitch Hikers Guide to the Galaxy. 


In [6]:
# looking at the datasets we've now created... 
X_train.head()

Unnamed: 0,income,balance
9254,34103.87952,1018.56813
1561,28660.747508,62.17005
1670,40822.447413,1046.743543
6087,44125.718725,763.73528
6669,25730.917583,697.248633


In [7]:
# looking at the datasets we've now created... 
X_val.head()

Unnamed: 0,income,balance
6252,31507.089277,1435.662933
4684,42139.070269,771.789347
1731,21809.218509,0.0
4742,32803.832648,113.571264
4521,49903.597081,1358.132472


In [8]:
# looking at the datasets we've now created... 
y_train.head()

9254    No
1561    No
1670    No
6087    No
6669    No
Name: default, dtype: object

In [9]:
# looking at the datasets we've now created... 
y_val.head()

6252    No
4684    No
1731    No
4742    No
4521    No
Name: default, dtype: object

In [10]:
# creating a logistic regression model using scikit-learn's LogisticRegression class
model = LogisticRegression() 

In [11]:
# training the model with X_train features and y_train target variable
model.fit(X_train, y_train)

In [12]:
y_pred = model.predict(X_val)

In [13]:
print(y_pred)

['No' 'No' 'No' ... 'No' 'No' 'No']


In [14]:
accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.9655


#### Notes on interpretation
- Model seems to perform very well given highest possible accuracy is 1

In [15]:
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)
print("Model Parameters:", model.get_params())
print("Classes:", model.classes_)
print("Number of iterations:", model.n_iter_)
print("Training Accuracy:", model.score(X_train, y_train))

Coefficients: [[-0.00012841  0.0004443 ]]
Intercept: [-1.11621713e-06]
Model Parameters: {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Classes: ['No' 'Yes']
Number of iterations: [17]
Training Accuracy: 0.966375


In [16]:
# using a for loop to repeat the process above with 3 different splits of the test/train sets
# then comparing results to assess variability in the model's performance

for i in range(3):
    # splitting the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=i) # using 'i' to iterate through different values for each split. I.e. creating three different splits of the data with different random seeds

    # creating a logistic regression model
    model = LogisticRegression()

    # training the model on the training set
    model.fit(X_train, y_train)

    # predicting the target variable for the validation set
    y_pred = model.predict(X_val)

    # calculating the accuracy score
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Split {i+1} - Validation Accuracy:", accuracy)

Split 1 - Validation Accuracy: 0.963
Split 2 - Validation Accuracy: 0.9705
Split 3 - Validation Accuracy: 0.9715


#### Notes on interpretation 

- Validation accuracy is consistent
- Overall validation accuracy is high
- Model appears to produce good and reliable results


#### Next I will create a new model, introducing the variable "student" which I will convert into an integer

In [17]:
# converting 'student' from string to integer
data['student'] = data['student'].map({'No': 0, 'Yes': 1})

In [18]:
# checking result of change
data.head()

Unnamed: 0,default,student,balance,income
0,No,0,729.526495,44361.625074
1,No,1,817.180407,12106.1347
2,No,0,1073.549164,31767.138947
3,No,0,529.250605,35704.493935
4,No,0,785.655883,38463.495879


In [19]:
# splitting dataset into features (X) and target variable (y)
X2 = data[['income', 'balance', 'student']] # using double brackets to select multiple coloumns from dataframe and create a new dataframe 
y2 = data['default'] # using single brackets to select one column and create a new series

In [20]:
# checking results
y2.head()

0    No
1    No
2    No
3    No
4    No
Name: default, dtype: object

In [21]:
# creating x3 training/validation sets again to test the accuracy of this model with 'student'

for i in range(3):
    # splitting the data into training and validation sets
    X2_train, X2_val, y2_train, y2_val = train_test_split(X2, y2, test_size=0.2, random_state=i) # using 'i' to iterate through different values for each split. I.e. creating three different splits of the data with different random seeds

    # creating a logistic regression model
    model2 = LogisticRegression()

    # training the model on the training set
    model2.fit(X2_train, y2_train)

    # predicting the target variable for the validation set
    y2_pred = model2.predict(X2_val)

    # calculating the accuracy score
    accuracy2 = accuracy_score(y2_val, y2_pred)
    print(f"Split {i+1} - Validation Accuracy:", accuracy2)
    


Split 1 - Validation Accuracy: 0.965
Split 2 - Validation Accuracy: 0.967
Split 3 - Validation Accuracy: 0.9695


#### Notes on interpretation

- The logistic regression model <i> with </i> the 'student' variable included appears to perform marginally better than the model without it
- However, the gains may be so slight that it might be worth excluding and opting for the simplier model instead

### Question 6

In [26]:
# setting random seed for reproducibility 
np.random.seed(42)

# converting 'default' column into binary values 1 and 0
data['default'] = data['default'].map({'No': 0, 'Yes': 1})

# In logistic regression, the target variable (also known as the dependent variable) should typically be binary. 
# By converting the 'default' column to binary values, we are preparing the data to be compatible with a logistic regression model.

# separating input features (X) from target variable (y)
X = data[['income', 'balance']]
y = data['default']

# fitting model using statsmodels (sm)
X = sm.add_constant(X) # adding a constant term to the predictor variables
logit_model = sm.Logit(y, X)
result = logit_model.fit()

# computing the standard errors using "result.bse" of the statsmodel library 
std_errors = result.bse

ValueError: endog must be in the unit interval.