# Housing Data from Iowa

Model to predict housing prices in Iowa from 79 features

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Read the data
X_full = pd.read_csv('./input/train.csv', index_col='Id')
X_test_full = pd.read_csv('./input/test.csv', index_col='Id')

# Obtain target and predictors
y = X_full.SalePrice
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = X_full[features].copy()
X_test = X_test_full[features].copy()

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)

In [3]:
X_train.head()

Unnamed: 0_level_0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
619,11694,2007,1828,0,2,3,9
871,6600,1962,894,0,1,2,5
93,13360,1921,964,0,1,2,5
818,13265,2002,1689,0,2,3,7
303,13704,2001,1541,0,2,3,6


# Evaluate from several models

the first step to fitting  a model is to define it from several models

In [4]:
from sklearn.ensemble import RandomForestRegressor

In [5]:
# Define the models
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
model_3 = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

models = [model_1, model_2, model_3, model_4, model_5]

The models will be scored using a function based on the mean_absolute_error

In [6]:
from sklearn.metrics import mean_absolute_error

In [7]:
# Function for comparing different models
def score_model(model, X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid):
    model.fit(X_t, y_t)
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)

for i in range(0, len(models)):
    mae = score_model(models[i])
    print(f"Model {i+1} MAE: {mae:,.0f}")

Model 1 MAE: 24,015
Model 2 MAE: 23,741
Model 3 MAE: 23,529
Model 4 MAE: 23,997
Model 5 MAE: 23,707


In [8]:
my_model = model_3

In [9]:
# Fit the model to the training data
my_model.fit(X, y)

# Generate test predictions
preds_test = my_model.predict(X_test)

# Save predictions in format used for competition scoring
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)

# Melbourne: Housing Data

Using the snapshot of Melbournee housing data the missing data columns will be dropped.

In [10]:
# Load Data
sauce = './input/melbourne_housing_data/melb_data.csv'
data = pd.read_csv(sauce)

# Selecting Target
y = data.Price

# Keep things simple: only use numerical predictors
melb_predictors = data.drop(['Price'], axis=1)
X = melb_predictors.select_dtypes(exclude=['object'])

# Divide the data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

Create a funciton in which to score datasets based on different approaches to dropping values.

In [11]:
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

# Dropping values with missing columns

In [12]:
# Get names of columns with missing values
cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]

# Drop columns in training and validation data
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)

print(f"MAE from Approach 1 (Drop columns with missing values): ${score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid):,.0f}")

MAE from Approach 1 (Drop columns with missing values): $183,550


# Simple Imputer

Now imputation, using  SimpleImputer, to replace missing values along each column. Alternative more complex methods include Regression Imputation.

In [13]:
from sklearn.impute import SimpleImputer

# Imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

print(f"MAE from Approach 2 (Imputation): ${score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid):,.0f}")

MAE from Approach 2 (Imputation): $178,166


In [14]:
# Make copy to avoid changing original data (when imputing)
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# Make new columns indicating what will be imputed
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()
    
# Imputation
my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus))

# Imputation removed column names; put them back
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns

print(f"MAE from Approach 3 (An Extension to Imputation): ${score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid):,.0f}")

MAE from Approach 3 (An Extension to Imputation): $178,928


# Using median strategy imputer

In [15]:
# Imputation
final_imputer = SimpleImputer(strategy='median')
final_X_train = pd.DataFrame(final_imputer.fit_transform(X_train))
final_X_valid = pd.DataFrame(final_imputer.transform(X_valid))

# Imputation removed column names; put them back
final_X_train.columns = X_train.columns
final_X_valid.columns = X_valid.columns

# Define and fit model
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(final_X_train, y_train)

# Get validation predictions and MAE
preds_valid = model.predict(final_X_valid)
print(f"MAE (Your approach): ${mean_absolute_error(y_valid, preds_valid):,.0f}")

MAE (Your approach): $169,749


# Categoreical Variables

In [16]:
# Load Data
data = pd.read_csv(sauce)

# Selecting Target
y = data.Price

# Keep things simple: only use numerical predictors
X = data.drop(['Price'], axis=1)
# X = melb_predictors.select_dtypes(exclude=['object'])

# Divide the data into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

# Drop columns with missing values (simplest approach)
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()] 
X_train_full.drop(cols_with_missing, axis=1, inplace=True)
X_valid_full.drop(cols_with_missing, axis=1, inplace=True)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

X_train.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0


In [17]:
# Get list of categorical variables
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables: ", object_cols)

Categorical variables:  ['Type', 'Method', 'Regionname']


# Dropping Categorical Variables

In [18]:
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])

print(f"MAE from Approach 1 (Dropping Categorical Variables): ${score_dataset(drop_X_train, drop_X_valid, y_train, y_valid):,.0f}")

MAE from Approach 1 (Dropping Categorical Variables): $183,550


# Label Encoding

In [19]:
from sklearn.preprocessing import LabelEncoder

# Make copy to avoid changing the original data
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Apply label encoder to each column with categorical data
label_encoder = LabelEncoder()
for col in object_cols:
    label_X_train[col] = label_encoder.fit_transform(X_train[col])
    label_X_valid[col] = label_encoder.transform(X_valid[col])
    
print(f"MAE from Approach 2 (Label Encoding): ${score_dataset(label_X_train, label_X_valid, y_train, y_valid):,.0f}")

MAE from Approach 2 (Label Encoding): $175,062


# One-Hot Encoding

In [20]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

# One-hot encoding removed index; put back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove categorical columns (replaced with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

print(f"MAE from Approach 3 (One-Hot Encoding): ${score_dataset(OH_X_train, OH_X_valid, y_train, y_valid):,.0f}")

MAE from Approach 3 (One-Hot Encoding): $176,704


# Pipelines

In [21]:
# Read the data
data = pd.read_csv(sauce)

# Separate target from predictors
y = data.Price
X = data.drop(['Price'], axis=1)

# Divide data into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [22]:
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,1.0,0.0,,1940.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,1.0,193.0,,,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,1.0,555.0,,,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,1.0,265.0,,1995.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0


`ColumnTransformer` is used to bundel together different preprocessing steps into the pipeline.

In [23]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define the model:
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Build the pipe processes:
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model
my_pipeline.fit(X_train, y_train)

# Preprocessing fo validation data, get preds
preds = my_pipeline.predict(X_valid)

#Eval the model
score = mean_absolute_error(y_valid, preds)
print('MAE: ', score)

MAE:  160679.18917034855


# Cross Validation

In [24]:
# Read the data
data = pd.read_csv(sauce)

# Select subset of predictors
cols_to_use = ['Rooms', 'Distance', 'Landsize', 'BuildingArea', 'YearBuilt']
X = data[cols_to_use]

# Select target
y = data.Price

my_pipeline = Pipeline(steps=[('preprocessor', SimpleImputer()),
                              ('model', RandomForestRegressor(n_estimators=50, random_state=0))
                             ])

In [25]:
from sklearn.model_selection import cross_val_score

# Multiply by -1 since sklearn calculates *negative* MAE
scores = -1 * cross_val_score(my_pipeline, X, y,
                              cv=5,
                              scoring='neg_mean_absolute_error')

print("MAE scores:\n", scores)

MAE scores:
 [301628.7893587  303164.4782723  287298.331666   236061.84754543
 260383.45111427]


The `scoring` parameter allows a model quality have a chosen measure: here the Negative Mean Absolute Error is chosen.

Scikit Learn has a list of other options [here](https://scikit-learn.org/stable/modules/model_evaluation.html).

Scikit-learn's convention defines all metrics so higher numbers are better. Using negatives allows the numbers to be consistent with convention, though Negative MAE is almost never used elsewhere.

A single measure of model quality is desired to compare alternative models, so an average is taken across the experiments.

In [26]:
print(f"Average MAE score (across experiments): ${scores.mean():,.0f}")

Average MAE score (across experiments): $277,707


# Gradient Boosting

In [27]:
# Read the data
data = pd.read_csv(sauce)

# Select subset of predictors
cols_to_use = ['Rooms', 'Distance', 'Landsize', 'BuildingArea', 'YearBuilt']
X = data[cols_to_use]

# Select target
y = data.Price

# Separate data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y)

In [29]:
from xgboost import XGBRegressor

my_model = XGBRegressor()
my_model.fit(X_train, y_train)
pred = my_model.predict(X_valid)
print(f"Mean Absolute Error: {mean_absolute_error(pred, y_valid):,.0f}")

Mean Absolute Error: 237,365


In [31]:
my_model = XGBRegressor(n_estimators=500)
my_model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

The input `early_stopping_rounds` allows the model to automatically find the ideal value. Setting `n_estimators` to a high value, `early_stopping_rounds` will find the optimal time for halting the iterations.

Random choice sometimes halts modeling after 1 validation doesn't see improvement. Setting `early_stopping_rounds=5` is a reasonable amount to halt the model after 5 straight rounds of deterioration.

The `eval_set` parameter sets aside data for the validation process when calculating the error for `early_stopping_round`.

In [34]:
my_model = XGBRegressor(n_estimators=500)
my_model.fit(X_train, y_train,
            early_stopping_rounds=5,
            eval_set=[(X_valid, y_valid)],
            verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

Instead of getting predictions by adding to a list from each component model, the predictions can be multiplied by a small number called the `learning_rate` before adding them in. Each tree that is added to the ensemble has less effect and a higher value of `n_estimators` can be set without overfitting. The appropriate number of trees will be automatically determined if early stopping is used.

Generally a smaller `learning_rate` with a large `n_estimators` gives a more accurate XGBoost model, but it takes longer for the model to train since it has more iterations to cycle through. By default, `learning_rate=0.1` 

In [36]:
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_model.fit(X_train, y_train,
            early_stopping_rounds=5,
            eval_set=[(X_valid, y_valid)],
            verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

Larger datasets, where runtime is a consideration, may use parallelism to build models faster. The parameter `n_jobs` is usually equal to the number of cores of the machine. This does not affect smaller datasets.

This is useful for large datasets that would otherwise take a long time to run the `fit` command.

In [37]:
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4)
my_model.fit(X_train, y_train,
            early_stopping_rounds=5, 
            eval_set=[(X_valid, y_valid)],
            verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=4, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

# Data Leakage

Data leakage is when the training data contains information about the target but similar data is not available for prediciton. The training data and validation data may see high performance in testing but real datasets in production will perform poorly. The model will look accurate until used in real world problems where it will be inaccurate.

The two main types of leakage are:
1. Target Leakage
2. Train-Test Contamination

**Target Leakage** occurs when data included in the predictors will not be available at the time when predictions are made. Target Leakage needs to be thought of in terms of *timing* or *chronological order* as data becomes available, not wether the feature aids in predicitons.

For example, *pnumonia* and *antibiotic medicine*:

|got_pneumonia|	age|	weight|	male|	took_antibiotic_medicine|	...|
|:-:|:-:|:-:|:-:|:-:|:-:
|False|	65|	100|	False|	False|	...|
|False|	72|	130|	True|	False|	...|
|True|	58|	100|	False|	True|	...|

A strong relation is seen between pnumonia and taking antibiotic medicine, but taking medicine is a result of pnumonia. The training and validation data may contain this information but the real world data will state that `took_antibiotic_medicine` is **False**.

To prevent this data leakage any variable that is updated or created after the target value is relized should be excluded.

**Train-Test Contamination** occurs when the preprocessing of training and validation data are carried out individually. For example, using an imputer to fill in missing values on the training set but leaving the validating set blank may skew results.

This can be subtle but dangerous problem on more complex feature engineering. Scikit-learn pipelines makes this easier to avoid.

In [42]:
# Read the data
no_sauce = './input/aer-credit-card-data/AER_credit_card_data.csv'
data = pd.read_csv(no_sauce, 
                   true_values = ['yes'], false_values = ['no'])

# Select target
y = data.card

# Select predictors
X = data.drop(['card'], axis=1)

print("Number of rows in the dataset:", X.shape[0])
X.head()

Number of rows in the dataset: 1319


Unnamed: 0,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,0,37.66667,4.52,0.03327,124.9833,True,False,3,54,1,12
1,0,33.25,2.42,0.005217,9.854167,False,False,3,34,1,13
2,0,33.66667,4.5,0.004156,15.0,True,False,4,58,1,5
3,0,30.5,2.54,0.065214,137.8692,False,False,0,25,1,7
4,0,32.16667,9.7867,0.067051,546.5033,True,False,2,64,1,5


In [44]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

# Since there is no preprocessing, we don't need a pipeline (used anyway as best practice!)
my_pipeline = make_pipeline(RandomForestClassifier(n_estimators=100))
cv_scores = cross_val_score(my_pipeline, X, y, 
                            cv=5,
                            scoring='accuracy')

print("Cross-validation accuracy: %f" % cv_scores.mean())

Cross-validation accuracy: 0.981810


Experience finds that a 98% accurate model is rare. Uncommon enough that the data should be more closely inspected for target leakage.

In [45]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   card         1319 non-null   bool   
 1   reports      1319 non-null   int64  
 2   age          1319 non-null   float64
 3   income       1319 non-null   float64
 4   share        1319 non-null   float64
 5   expenditure  1319 non-null   float64
 6   owner        1319 non-null   bool   
 7   selfemp      1319 non-null   bool   
 8   dependents   1319 non-null   int64  
 9   months       1319 non-null   int64  
 10  majorcards   1319 non-null   int64  
 11  active       1319 non-null   int64  
dtypes: bool(3), float64(4), int64(5)
memory usage: 96.7 KB


* `card`: 1 if credit application accepted, 0 if not
* `reports`: Number of major derogatory reports
* `age`: Age in years plus twelths of a year
* `income`: Yearly income (divided by 10,000)
* `share`: Ratio of monthly credit card expenditure to yearly income
* `expenditure`: Average monthly credit card expenditure
* `owner`: 1 if owns home, 0 if not
* `selfempl`: 1 if self-employed, 0 if not
* `dependents`: 1 + number of dependents
* `months`: Months living at current address
* `majorcards`: Number of major credit cards held
* `active`: Number of active credit accounts

Suspicious looking variables include `expenditure`. Is this expenditure on this card or on cards used before applying?

To confirm this make a simple data comparison

In [50]:
expenditures_cardholders = X.expenditure[y]
expenditures_noncardholders = X.expenditure[~y]

print(f"Fraction of those who did not receive a card and had no expenditures: {(expenditures_noncardholders == 0).mean()}")
print(f"Fraction of those who did receive a card and had no expenditures: {(expenditures_cardholders == 0).mean():.2f}")

Fraction of thos who did not receive a card and had no expenditures: 1.0
Fraction of thos who did receive a card and had no expenditures: 0.02


This shows that everyone who did not receive a card had no expenditure while only 2% of those who recieved the card had expenditures. This is a strong correlation which would create a high accuracy but by data leakage as the high expenditure probably mean with the applied for card.

Partially determined by `expenditure` is `share` so it should be excluded as well. The columns `active` and `majorcards` are not clear and may be concerning. If the people who created the data are unavailable then it is better to remove the columns

In [52]:
# Drop leaky predictors from dataset
potential_leaks = ['expenditure', 'share', 'active', 'majorcards']
X2 = X.drop(potential_leaks, axis=1)

# Evaluate the model with the leaky predictors removed
cv_scores = cross_val_score(my_pipeline, X2, y,
                           cv=5,
                           scoring='accuracy')

print(f"Cross-val accuracy: {cv_scores.mean():.2f}")

Cross-val accuracy: 0.84


Accuracy is lower, which might be dissapointing, but 80% accuracy is much better than a leaky model (despite it's apparent score in cross validation)