In [1]:
# sklearn utilties 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# classifiers  
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier

# sampling 
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

# useful libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


Bad key "text.kerning_factor" on line 4 in
/Users/eyobmanhardt/opt/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


In [2]:
import os, sys
import dropbox
#import pandas as pd
from zipfile import ZipFile
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(''))))
from utils.data_extract_utils import extract_zip, extract_features_from_bureau

# say someting ....
# sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(''))))

# Connect to dropbox
dbx = dropbox.Dropbox('cHV7yAR0J6YAAAAAAAAAAVQ1NLCrOwerbaNltPWHslYXKuUTJ5_wfgJsuFcmx83o')

data = {}
for entry in dbx.files_list_folder('').entries:
    response = dbx.files_download('/{}'.format(entry.name))
    
    if 'zip' in entry.name:
        content = extract_zip(response[1].content)
    
        for file in content:
            df = pd.read_csv(file[1])
            data[entry.name.replace('.csv.zip', '')] = df
            
import gc


def get_temp_data(data):
    ## combine train and test set
    train_test = data['application_train'].append(data['application_test']).reset_index()

    # join tables: bureau, bureau_balance - 1 min to execute
    bb = extract_features_from_bureau(data['bureau'], data['bureau_balance'])
    
    # Join ID same datatype
    train_test['SK_ID_CURR'] = train_test['SK_ID_CURR'].astype('Int64')
    bb.index = bb.index.astype('Int64')

    # Join Bureau(s) and Application tables
    df = train_test.join(bb, how='left', on='SK_ID_CURR', lsuffix='_left', rsuffix='_right')
    
    # drop index column generated by groupby
    df.drop(['index'], axis=1, inplace=True)
    
    # clean memory 
    del bb
    gc.collect()
    
    # preprocess 
    ## DAYS_EMPLOYED
    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
    df['DAYS_EMPLOYED'] =df['DAYS_EMPLOYED'].apply(lambda x: abs(x))

    df['BIRTH_IN_YEARS'] = df['DAYS_BIRTH'].apply(lambda x: abs(x)/365)

    # remove 4 instances
    df = df.loc[df['CODE_GENDER'] != 'XNA', :]


    # add new features 
    df['ALL_EXT_SOURCE_MEAN']=df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis = 1)
    df['PAYMENT_RATE'] = df['AMT_ANNUITY']/df['AMT_CREDIT']
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL']/df['CNT_FAM_MEMBERS']
    df['INCOME_CREDIT_IN_PERCENTAGE '] = df['AMT_INCOME_TOTAL']/df['AMT_CREDIT']
    df['ANNUITY_INCOME_IN_PERCENTAGE'] = df['AMT_ANNUITY']/df['AMT_INCOME_TOTAL']

    ## label encoder for binary values
    bin_features = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']

    replace_dict = {'CODE_GENDER': {'M': 0, 'F': 1},
                    'FLAG_OWN_CAR': {'Y': 0, 'N': 1}, 
                    'FLAG_OWN_REALTY':{'Y': 0, 'N': 1} }
    df.replace(replace_dict, inplace=True)
    
    # fix inf values
    df['AMT_CREDIT_DEBT_RATIO'] = df['AMT_CREDIT_DEBT_RATIO'].apply(lambda x: x if ~np.isinf(x) else 0)
    
    # test ids
    test_ids = df.loc[df['TARGET'].isnull(), 'SK_ID_CURR']
    
    # drop ids
    df.drop(columns=['SK_ID_CURR'], axis=1, inplace=True)
    
    return df, test_ids

## Cell takes 2m 30 sec to execute 

In [3]:
df, test_ids = get_temp_data(data)

X, y = (df.loc[df['TARGET'].notnull(), :].drop(columns=['TARGET'], axis=1), 
        df.loc[df['TARGET'].notnull(), 'TARGET'])

# test set for kaggle
X_test = df.loc[df['TARGET'].isnull(), :].drop(columns=['TARGET'], axis=1)


# df['AMT_CREDIT_DEBT_RATIO'] = df['AMT_CREDIT_DEBT_RATIO'].apply(lambda x: x if ~np.isinf(x) else 0)
# df.drop(columns=['SK_ID_CURR'], axis=1, inplace=True)

# train_df = df.loc[df['TARGET'].notnull(), :]
# test_df = df.loc[df['TARGET'].isnull(), :]

# y = train_df['TARGET']

# train_df.drop(columns= ['TARGET'], axis=1, inplace=True)
# test_df.drop(columns= ['TARGET'], axis=1, inplace=True)

# X = train_df

# X_train, X_val, y_train, y_val = train_test_split(train_df, y, test_size=0.2, random_state= 42, stratify=y)

# columns for pipeline
cat_cols = list(X.select_dtypes(include=object).columns)
num_cols = list(X.select_dtypes(include=[int, float]).columns)
len(cat_cols),len(num_cols)

## Cell takes 1m 59 sec to execute 

(13, 116)

In [5]:
## check 
test_ids.shape, X_test.shape

((48744,), (48744, 129))

In [4]:
## pipelines

# one hot encoding 
categorical_pipe = Pipeline(steps=[
    ('cat_imp', SimpleImputer(strategy='most_frequent', add_indicator=False)),
  ('one_hot_encoder', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

# numerical 
numerical_pipe = Pipeline(steps=[
    ('num_imp', SimpleImputer(strategy='median', add_indicator=False)),
    ("scale", StandardScaler())  
])


# transform columns 
column_transformer = ColumnTransformer(transformers=[    
    ('num_pip', numerical_pipe, num_cols),
    ('cat_pipe', categorical_pipe, cat_cols)
])

In [5]:
# apply preprocessing to X
X_trans = column_transformer.fit_transform(X)

In [7]:
print('before transformations shape:', X.shape)
print('after transformations shape:', X_trans.shape)

before transformations shape: (307507, 129)
after transformations shape: (307507, 249)


# base-case no sampling 

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X_trans, y, test_size=0.2, random_state= 42, stratify=y)
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_train = rf.predict(X_train)
y_pred_val = rf.predict(X_val)
print('classifaction report on training set')
print(classification_report(y_train, y_pred_train, labels=[0,1]))
print('--------------------------------------------------------------')
print('classifaction report on validation set')
print(classification_report(y_val, y_pred_val, labels=[0,1]))

classifaction report on training set
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    226145
           1       1.00      1.00      1.00     19860

    accuracy                           1.00    246005
   macro avg       1.00      1.00      1.00    246005
weighted avg       1.00      1.00      1.00    246005

--------------------------------------------------------------
classifaction report on validation set
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     56537
           1       0.59      0.00      0.01      4965

    accuracy                           0.92     61502
   macro avg       0.76      0.50      0.48     61502
weighted avg       0.89      0.92      0.88     61502



We know the model above is garbage - since the the the dummy classifier (i.e i==0) will score 96% accuracy; therefore the notion accuracy in this sense is meaningless. Our objective is to increase the f1-score for both classes. Ideally we want the f1-score for both classes to be close to 1 then we can use the accuracy score evaluate the performance of the model. We're going to attempt different sampling techniques to achieve our goal. 

# under sampling - majority class 

In [15]:
# Split data 
X_train, X_val, y_train, y_val = train_test_split(X_trans, y, test_size=0.2, random_state= 42, stratify=y)

# perform cross validation on entire data 
model_under = make_pipeline(
    RandomUnderSampler(random_state=42),
    LogisticRegression(solver='lbfgs', random_state=42)
)

cv_results_under = cross_validate(
     model_under, X_trans, y, scoring="balanced_accuracy",
     return_train_score=True, return_estimator=True,
     n_jobs=-1
 )

# average cross validation score 
print("Balanced accuracy {} +/- {}".format(cv_results_under['test_score'].mean(), 
                                           cv_results_under['test_score'].std()))

scores = []
for fold_id, cv_model in enumerate(cv_results_under["estimator"]):
     scores.append(balanced_accuracy_score(y_val, cv_model.predict(X_val)))

# average score on each k fold model - on left out set: X_val
# this is to confirm avg cross validation score is not too optimistic 
# i.e this should be very similar to our first score 
print("Balanced accuracy {} +/- {}".format(np.mean(scores), 
                                           np.std(scores)))

Balanced accuracy 0.6825066176361723 +/- 0.002928926654912159
Balanced accuracy 0.6871553626682388 +/- 0.001388504080283691


# over-sampling the minority class - SMOTE

In [None]:
# This cell will overload your computer .. at least that's the case for me 

# Split data 
# X_train, X_val, y_train, y_val = train_test_split(X_trans, y, test_size=0.2, random_state= 42, stratify=y)

# # perform cross validation on entire data 
# model_smote = make_pipeline(
#     SMOTE(sampling_strategy='minority'),
#     LogisticRegression(solver='lbfgs', random_state=42)
# )

# cv_results_smote = cross_validate(
#      model_smote, X_trans, y, scoring="balanced_accuracy",
#      return_train_score=True, return_estimator=True,
#     cv= 3,
#      n_jobs=-1
#  )

# # average cross validation score 
# print("Balanced accuracy {} +/- {}".format(cv_results_smote['test_score'].mean(), 
#                                            cv_results_smote['test_score'].std()))

# scores = []
# for fold_id, cv_model in enumerate(cv_results_smote["estimator"]):
#      scores.append(balanced_accuracy_score(y_val, cv_model.predict(X_val)))

# # average score on each k fold model - on left out set: X_val
# # this is to confirm avg cross validation score is not too optimistic 
# # i.e this should be very similar to our first score 
# print("Balanced accuracy {} +/- {}".format(np.mean(scores), 
#                                            np.std(scores)))

***NOTE*** Over-sampling the minority class using synthtic data (i.e SMOTE) is going to increase the number of instances in our data frame. Therefore, calculating the cross-validation score where we put the sampling method alongside the ml model in a single pipeline (what we did above) is going to be unrealistic to compute with our new data set. At least for my computer (EYOB), it breaks my kernal. We can still use cross validation method - to provide a performance score for this sampling technique; however, our methodology will change slightly. Specfically, we will sample the data using smote sampling then we will manually split the data in k folds. The splitting needs to be done manually - so the model can be executed with reasonable time.

In [34]:
# smote sampling
smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample(X_trans,y)

# split data into k folds
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
score_val = []
for train_index, val_index in sss.split(X_sm, y_sm):
    # initalize model 
    model = LogisticRegression(solver='lbfgs', random_state=42,max_iter=1000)
    # fit model 
    model.fit(X_sm[train_index], y_sm[train_index])
    # test model
    score_val.append(balanced_accuracy_score(y_sm[val_index], model.predict(X_sm[val_index])))
## takes about 6 min to execute ... 2 min per fold 

0.7057111412456027


In [37]:
print(np.mean(score_val), '+/-', np.std(score_val))

0.7057111412456027 +/- 0.000725206404900559


In [8]:
X_train, X_val, y_train, y_val = train_test_split(X_sm, y_sm, test_size=0.2, random_state= 42, stratify=y_sm)
logreg= LogisticRegression(solver='lbfgs', random_state=42,max_iter=1000)
logreg.fit(X_train, y_train)
y_pred_train = logreg.predict(X_train)
y_pred_val = logreg.predict(X_val)
print('classifaction report on training set')
print(classification_report(y_train, y_pred_train, labels=[0,1]))
print('--------------------------------------------------------------')
print('classifaction report on validation set')
print(classification_report(y_val, y_pred_val, labels=[0,1]))

classifaction report on training set
              precision    recall  f1-score   support

           0       0.71      0.70      0.70    226145
           1       0.70      0.71      0.71    226146

    accuracy                           0.71    452291
   macro avg       0.71      0.71      0.71    452291
weighted avg       0.71      0.71      0.71    452291

--------------------------------------------------------------
classifaction report on validation set
              precision    recall  f1-score   support

           0       0.71      0.70      0.71     56537
           1       0.71      0.71      0.71     56536

    accuracy                           0.71    113073
   macro avg       0.71      0.71      0.71    113073
weighted avg       0.71      0.71      0.71    113073



In [9]:
X_test_trans = column_transformer.transform(X_test)

In [10]:
logreg_test_pred = logreg.predict_proba(X_test_trans )[:,1]

submission5_dict = {'SK_ID_CURR': test_ids, 
            'TARGET': logreg_test_pred }
logreg_submission5 = pd.DataFrame(submission5_dict)
logreg_submission5.head()

Unnamed: 0,SK_ID_CURR,TARGET
307511,100001,0.482375
307512,100005,0.803179
307513,100013,0.338258
307514,100028,0.240855
307515,100038,0.678379


In [11]:
#logreg_submission5.to_csv('submission5.csv', index=False) ## kaggle score 74%

SMOTE sampling provides the best score without tuning the parameters. The actual test set (unseen data) performed better than the validation set; however, not by too much. More importantly, this model is providing us with scores that's is algined with the validations performance(i.e over-fitting does not occur). Thus, we can consider the SMOTE model performance as our baseline and go from here. ---> SMOTE is going to be the sampling technique we use from now on!!   

# next step: Selecting the best features

How do we select the best feature for our ML model

***Brainstorm - notes from class***

Feature selections methods:
- Percent missing value 
    - remove features with high percent of missing values
- Amount of variation
    - remove feature that don't vary in values
- Pairwise-correlations 
    - drop features that correlate with another (only drop one)
- multicollinearity
- correlation with the target
- cluster analysis
- PCA
- forward/backward/stepwise selection 
- Lasso - drops coeif value to 0
- Tree based models 
    - feature importance 
    
***How do we check the feature we choose impacts our model***
- need to check whether or not the specific feature makes any contribution to the model good/bad

In [None]:
application = data['application_train'].append(data['application_test']).reset_index()


Drop features from application dataset 

- 

# model evaluation procedure 