In [1]:
import pandas as pd

# Load the train and test datasets from CSV files under the relative data/ directory.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [2]:
def check_missing_columns(df: pd.DataFrame) -> pd.Series:
    """Report, column by column, whether a dataframe holds any missing value.

    Args:
        df (pd.DataFrame): Dataframe to inspect.

    Returns:
        pd.Series: Boolean series indexed by column name; True when the column
            contains at least one missing value.
    """
    # Cell-level mask with the same shape as df: True where a value is missing.
    missing_mask = df.isna()
    # Collapse the mask down the rows so one flag remains per column.
    return missing_mask.any(axis=0)

### Sanity Checks

#### Check for missing values in the datasets

Check if the train dataset contains any missing values in its columns.

In [3]:
# One boolean per train column: True when that column has at least one missing value.
check_missing_columns(train)

id                False
industry           True
state             False
request_date      False
term              False
employee_count    False
business_new      False
business_type     False
location          False
other_loans       False
loan_amount       False
insured_amount    False
default_status    False
dtype: bool

The `industry` column in the train dataset contains missing values. Let's dig deeper to see how many records have a missing industry.

In [4]:
# Select the train rows whose industry value is missing.
train.loc[train['industry'].isna()]

Unnamed: 0,id,industry,state,request_date,term,employee_count,business_new,business_type,location,other_loans,loan_amount,insured_amount,default_status
1946,3771775001,,NH,20-Nov-09,12,1,New,0,Rural,N,$100.00,"$75,000.00",0


Let's replace that missing value with 'Unknown'.

In [6]:
# Replace missing industry values with the placeholder category 'Unknown'.
train['industry'] = train['industry'].fillna('Unknown')


Check if the test dataset contains any missing values in its columns.

In [8]:
# One boolean per test column: True when that column has at least one missing value.
check_missing_columns(test)

id                False
industry          False
state             False
request_date      False
term              False
employee_count    False
business_new      False
business_type     False
location          False
other_loans       False
loan_amount       False
insured_amount    False
dtype: bool

Test dataset does not contain any missing values.

Check if the test dataset contains categorical values that are not present in the train dataset.

In [18]:
def check_categories(test: pd.DataFrame, train: pd.DataFrame, categorical_fields: list = []) -> list:
    """Check if test dataset contains categorical values not present in train dataset.

    For every column in ``categorical_fields`` the unique values of both datasets
    are printed, followed by the values that occur in test but not in train.

    Missing values (NaN/None/NA) are compared by "missingness" rather than by
    equality. NaN never compares equal to itself, so a plain set difference can
    report NaN as unseen in train even when train also contains missing values.
    A missing value in test is only reported when train has no missing value in
    that column.

    Args:
        test (pd.DataFrame): test dataset.
        train (pd.DataFrame): train dataset.
        categorical_fields (list, optional): columns to check. Defaults to [].
            The list is only iterated, never mutated.

    Returns:
        list: list of columns in test dataset which contain categorical values not present in train dataset

    Raises:
        KeyError: if a column in ``categorical_fields`` is absent from either dataset.
    """
    columns_missing_categories = []

    for field in categorical_fields:
        unique_train_categories = set(train[field].unique())
        unique_test_categories = set(test[field].unique())

        print(f'Checking categorical column: {field}')
        print(f'Unique categories in train: {unique_train_categories}')
        print(f'Unique categories in test: {unique_test_categories}')

        missing_values = unique_test_categories - unique_train_categories

        # NaN != NaN, so a missing value in test can survive the set difference
        # even though train has missing values too. Drop such false positives.
        train_has_missing = any(
            pd.api.types.is_scalar(value) and pd.isna(value)
            for value in unique_train_categories
        )
        if train_has_missing:
            missing_values = {
                value for value in missing_values
                if not (pd.api.types.is_scalar(value) and pd.isna(value))
            }

        print(f'Categories in test but not train: {missing_values}\n')

        if missing_values:
            columns_missing_categories.append(field)

    return columns_missing_categories

In [20]:
# Compare test against train for every categorical column, one field per line.
columns_missing_categories = check_categories(
    test=test,
    train=train,
    categorical_fields=[
        'industry',
        'state',
        'business_new',
        'business_type',
        'location',
        'other_loans',
    ],
)
columns_missing_categories

Checking categorical column: industry
Unique categories in train: {'Transportation', 'Education', 'Others', 'Administration', 'Energy', 'Engineering', 'Trading', 'Agriculture', 'Entertainment', 'Finance', 'Unknown', 'Real Estate', 'Hotel', 'Healthcare', 'Manufacturing', 'Consulting', 'Construction'}
Unique categories in test: {'Transportation', 'Others', 'Administration', 'Education', 'Energy', 'Trading', 'Engineering', 'Agriculture', 'Entertainment', 'Finance', 'Real Estate', 'Hotel', 'Healthcare', 'Manufacturing', 'Consulting', 'Construction'}
Categories in test but not train: set()

Checking categorical column: state
Unique categories in train: {'NM', 'NC', 'MA', 'WA', 'WY', 'AK', 'MI', 'CA', 'OH', 'LA', 'MS', 'VA', 'MD', 'IN', 'ID', 'OR', 'IL', 'SD', 'FL', 'IA', 'NJ', 'NV', 'NY', 'MN', 'AR', 'TN', 'UT', 'KY', 'SC', 'MO', 'VT', 'GA', 'PA', 'MT', 'ME', 'KS', 'AL', 'DE', 'ND', 'NE', 'HI', 'NH', 'WV', 'CO', 'CT', 'OK', 'TX', 'WI', 'AZ', 'RI'}
Unique categories in test: {'NM', 'NC', 'MA

[]

All of the categorical values present in the test dataset can also be found in the train dataset.

In [10]:
# Show only the first train record.
train.head(n=1)

Unnamed: 0,id,industry,state,request_date,term,employee_count,business_new,business_type,location,other_loans,loan_amount,insured_amount,default_status
0,4050975007,Others,VA,27-Apr-10,34,4,New,0,Rural,N,"$35,000.00","$35,000.00",1


In [11]:
# Column labels of the train dataset.
train.columns

Index(['id', 'industry', 'state', 'request_date', 'term', 'employee_count',
       'business_new', 'business_type', 'location', 'other_loans',
       'loan_amount', 'insured_amount', 'default_status'],
      dtype='object')

In [None]:
# TODO:
# 1) Sanity check for missing values in train, test (Done)
# 2) Sanity check for categorical values that exist in test but not in train (Done)
# 3) Cast 'request_date' to pandas datetime
# 4) Cast 'loan_amount' and 'insured_amount' to float
# 5) Create new feature, 'loan_insured_amount_diff', 'loan_amount' - 'insured_amount' 
# 6) Check request_date distribution, i.e. min and max date (I don't think we should generate features based on this, but if we really want to, we can generate day of week (1 - 7), day of month (1 - 31), week number (1 - 52), month (1 - 12), year (2009 - 2010))
# 7) Check probability distribution of categorical features ('industry', 'state', 'business_new', 'business_type', 'location', 'other_loans') (5 bar plots?)
# 8) Check distribution of 'term', 'employee_count', 'loan_amount', 'insured_amount' and 'loan_insured_amount_diff' + date based features (df.describe) (5 + 5 box/violin plots?)
# 9) Plot probability of default_status vs each categorical value (in 1 plot) for each categorical feature (5 bar plots)
# 10) Plot distribution of numerical feature for each default_status (5 + 5 bar/violin plots?)
# 11) Encode categorical features with LabelEncoder ('industry', 'state', 'business_new', 'business_type', 'location', 'other_loans')
# 12) Add steps 3, 4, 5, 6, 11 to pipeline for data preprocessing before ingestion into model for inference
# 13) Perform stratified k-fold validation to estimate the final number of training iterations and the average model performance on full dataset; total 16 features
# 14) Train model on full dataset based on the final number of training iterations derived from step 13
# 15) Use SHAP values to explain the contribution of each feature on the predictions (train/test dataset) + generate feature importance
# 16) Perform probability calibration (i.e. Platt Scaling/Isotonic Regression via CalibratedClassifierCV class) on the trained LightGBM model as the scores returned by the model are not true probabilities; have to perform this step in each fold of k-fold cross validation
# 17) Calculate ROC AUC based on calibrated probabilities
# 18) Plot ROC AUC/Precision Recall curve to determine the optimal probability threshold to set to predict default_status 1
# 19) Try multiple models (i.e Logistic Regression, LinearSVC) as part of model selection?
# 20) Feature selection? Feature importance based on SHAP values
# 21) A presentation to share your findings and takeaways to your non-technical business stakeholders to convince them that your approach is correct, effective, and deployable. 

In [26]:
# Frequency of each business_type value, keeping missing values as their own bucket.
train['business_type'].value_counts(dropna=False)

business_type
0    2270
1     132
Name: count, dtype: int64

In [25]:
# Column labels of the train dataset.
train.columns

Index(['id', 'industry', 'state', 'request_date', 'term', 'employee_count',
       'business_new', 'business_type', 'location', 'other_loans',
       'loan_amount', 'insured_amount', 'default_status'],
      dtype='object')

In [24]:
# Per-column missing-value flags, moved from the index into a two-column frame.
train.isna().any().reset_index()

Unnamed: 0,index,0
0,id,False
1,industry,True
2,state,False
3,request_date,False
4,term,False
5,employee_count,False
6,business_new,False
7,business_type,False
8,location,False
9,other_loans,False


In [22]:
# Distinct industry values in train, as a Python set.
set(train['industry'].unique())

{'Administration',
 'Agriculture',
 'Construction',
 'Consulting',
 'Education',
 'Energy',
 'Engineering',
 'Entertainment',
 'Finance',
 'Healthcare',
 'Hotel',
 'Manufacturing',
 'Others',
 'Real Estate',
 'Trading',
 'Transportation',
 nan}

In [19]:
# Distinct industry values in test, in order of first appearance.
test['industry'].unique()

array(['Hotel', 'Construction', 'Transportation', 'Healthcare',
       'Consulting', 'Others', 'Trading', 'Manufacturing',
       'Administration', 'Agriculture', 'Finance', 'Entertainment',
       'Engineering', 'Real Estate', 'Education', 'Energy'], dtype=object)

In [16]:
# Select the train rows whose industry value is missing.
train.loc[train['industry'].isna()]

Unnamed: 0,id,industry,state,request_date,term,employee_count,business_new,business_type,location,other_loans,loan_amount,insured_amount,default_status
1946,3771775001,,NH,20-Nov-09,12,1,New,0,Rural,N,$100.00,"$75,000.00",0


In [13]:
# Parse request_date strings such as '27-Apr-10' (day-abbreviated month-2-digit year).
pd.to_datetime(train['request_date'], format='%d-%b-%y')

0      2010-04-27
1      2009-11-05
2      2010-02-26
3      2010-06-10
4      2010-09-23
          ...    
2397   2009-11-12
2398   2010-07-08
2399   2010-05-28
2400   2010-08-12
2401   2010-05-20
Name: request_date, Length: 2402, dtype: datetime64[ns]

In [6]:
# Number of distinct ids; dropna=False counts a missing id as one value, like len(unique()).
train['id'].nunique(dropna=False)

2402

In [4]:
# Data type of each train column as loaded.
train.dtypes

id                 int64
industry          object
state             object
request_date      object
term               int64
employee_count     int64
business_new      object
business_type      int64
location          object
other_loans       object
loan_amount       object
insured_amount    object
default_status     int64
dtype: object

In [7]:
# Preview the first five train records.
train.head(n=5)

Unnamed: 0,id,industry,state,request_date,term,employee_count,business_new,business_type,location,other_loans,loan_amount,insured_amount,default_status
0,4050975007,Others,VA,27-Apr-10,34,4,New,0,Rural,N,"$35,000.00","$35,000.00",1
1,3735095001,Manufacturing,CA,05-Nov-09,107,1,New,0,Rural,N,"$15,000.00","$13,500.00",1
2,3936555004,Trading,CA,26-Feb-10,84,1,New,0,Rural,Y,"$265,000.00","$100,000.00",0
3,4130405000,Engineering,MI,10-Jun-10,240,21,New,0,Rural,N,"$255,000.00","$255,000.00",0
4,4263615008,Education,NH,23-Sep-10,36,1,Existing,0,Rural,N,"$13,300.00","$6,650.00",0


In [10]:
# Share of train rows per other_loans value (counts divided by the number of rows).
train['other_loans'].value_counts(dropna=False) / len(train)

other_loans
N    0.724813
Y    0.275187
Name: count, dtype: float64