In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# classifiers  
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

# sampling 
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

# useful libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (20,10)

In [2]:

EXTRACTRED_BUREAU_COLUMNS = ['AMT_CREDIT_DEBT_RATIO', 'CREDIT_DAY_OVERDUE', 'DPD_COUNTS']
                
def extract_features_from_bureau(bureau_df, bureau_balances_df):
    bureau_df['AMT_CREDIT_SUM_DEBT'] = bureau_df['AMT_CREDIT_SUM_DEBT'].fillna(value=0)
    bureau_df['AMT_CREDIT_DEBT_RATIO'] =  bureau_df['AMT_CREDIT_SUM_DEBT']/bureau_df['AMT_CREDIT_SUM']
    bureau_df['AMT_CREDIT_DEBT_RATIO'] = bureau_df['AMT_CREDIT_DEBT_RATIO'].replace([np.inf, -np.inf, np.nan], 0)

    bureau_df = bureau_df.set_index('SK_ID_BUREAU')
    
    DPD_STATUS_MAP = {
        'C': 0, 
        'X': 0,
        '0': 0,
        '1': 1,
        '2': 2,
        '3': 3,
        '4': 4,
        '5': 5,
    }
    def sum_of_dpd(x):
        min_month_balance = -96
        # Normalize by the min_month_balance, the further back the balance was the
        # lest weight we give it
        return np.sum(x['STATUS'].values * np.absolute((min_month_balance - x['MONTHS_BALANCE'].values)/min_month_balance))

    # Map statuses
    dpd_counts_df = bureau_balances_df.replace({"STATUS": DPD_STATUS_MAP})
    dpd_counts_df = dpd_counts_df.groupby(['SK_ID_BUREAU']).apply(sum_of_dpd)
    dpd_counts_df = pd.DataFrame(dpd_counts_df, columns=['DPD_COUNTS']);

    bureau_with_dpds = pd.concat([bureau_df, dpd_counts_df], axis=1)
    
    bureau_with_dpds = bureau_with_dpds[:][['SK_ID_CURR', *EXTRACTRED_BUREAU_COLUMNS]]
    
    # Further aggregation to make sure unique SK_ID_CURR are returned
    bureau_with_dpds = bureau_with_dpds.groupby(['SK_ID_CURR']).mean()
    
    return bureau_with_dpds.fillna(value=0)
                
def extract_features_from_installments_payments(installments_payments_df):
    def mis_instalment_payment(x):
        return np.mean(x['AMT_INSTALMENT'].values - x['AMT_PAYMENT'].values)

    # Let's create mis_instalment_payment attributes
    mis_instalment_payment = installments_payments_df.groupby('SK_ID_CURR').apply(mis_instalment_payment)
    mis_instalment_payment_df = pd.DataFrame(mis_instalment_payment, columns=['MIS_INSTALMENT_PAYMENTS'])
    mis_instalment_payment_df.fillna(value=0, inplace=True)
    
    return mis_instalment_payment_df

def get_clean_credit(df_credit_raw):
    useful = ['MONTHS_BALANCE', 'AMT_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL',
       'AMT_RECEIVABLE_PRINCIPAL', 'AMT_TOTAL_RECEIVABLE','NAME_CONTRACT_STATUS_Completed','SK_ID_CURR','SK_DPD','SK_DPD_DEF']
        

        
    full_dummies = pd.get_dummies(df_credit_raw,columns = ['NAME_CONTRACT_STATUS'])
    full_trimmed = full_dummies[useful]
    dpd_counts_sum = full_trimmed.groupby(['SK_ID_CURR'])['SK_DPD'].sum().reset_index()
    dpd_df_counts_sum = full_trimmed.groupby(['SK_ID_CURR'])['SK_DPD_DEF'].sum().reset_index()
    full_trimmed['SK_DPD_SUM_CREDIT'] = dpd_counts_sum['SK_DPD']
    full_trimmed['SK_DPD_DEF_SUM_CREDIT'] = dpd_df_counts_sum['SK_DPD_DEF']
    
    full_not_nan = full_trimmed.fillna(value=0)
    
    return full_not_nan.drop(columns=['SK_DPD','SK_DPD_DEF'])

# Remove features that are highly correlated with each other for improving model simplicity
def remove_highly_correlated_columns(df_orig, threshold):
    df = df_orig.copy()
    corr = df.corr()
    col_corr = set()
    for i in range(len(corr.columns)):
        for j in range(i):
            if (corr.iloc[i, j] >= threshold) and (corr.columns[j] not in col_corr):
                column_name = corr.columns[i]
                col_corr.add(column_name)
                if column_name in df.columns:
                    print('REMOVING {} which is correlated with {}'.format(column_name, corr.columns[j]))
                    del df[column_name]
    return df

In [3]:
df_credit_card_balance = pd.read_csv('credit_card_balance.csv')
df_application_train = pd.read_csv('application_train.csv')
df_bureau = pd.read_csv('bureau.csv')
df_bureau_balance = pd.read_csv('bureau_balance.csv')

In [4]:
def get_temp_data():

    # join tables: bureau, bureau_balance - 1 min to execute
    bb = extract_features_from_bureau(df_bureau, df_bureau_balance)
    
    # Join ID same datatype
    df_application_train['SK_ID_CURR'] = df_application_train['SK_ID_CURR'].astype('Int64')
    bb.index = bb.index.astype('Int64')

    # Join Bureau(s) and Application tables
    df = df_application_train.join(bb, how='left', on='SK_ID_CURR', lsuffix='_left', rsuffix='_right')
    
    # join credit tables
    df = df.join(get_clean_credit(df_credit_card_balance), how='left', on='SK_ID_CURR', lsuffix='_left', rsuffix='_right')

    
#     # clean memory 

#     # preprocess 
#     ## DAYS_EMPLOYED
    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
    df['DAYS_EMPLOYED'] =df['DAYS_EMPLOYED'].apply(lambda x: abs(x))

    df['BIRTH_IN_YEARS'] = df['DAYS_BIRTH'].apply(lambda x: abs(x)/365)

#     # remove 4 instances
    df = df.loc[df['CODE_GENDER'] != 'XNA', :]


# #     # add new features 
    df['ALL_EXT_SOURCE_MEAN']=df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis = 1)
    df['PAYMENT_RATE'] = df['AMT_ANNUITY']/df['AMT_CREDIT']
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL']/df['CNT_FAM_MEMBERS']
    df['INCOME_CREDIT_IN_PERCENTAGE '] = df['AMT_INCOME_TOTAL']/df['AMT_CREDIT']
    df['ANNUITY_INCOME_IN_PERCENTAGE'] = df['AMT_ANNUITY']/df['AMT_INCOME_TOTAL']

# #     ## label encoder for binary values
    bin_features = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']

    replace_dict = {'CODE_GENDER': {'M': 0, 'F': 1},
                    'FLAG_OWN_CAR': {'Y': 0, 'N': 1}, 
                    'FLAG_OWN_REALTY':{'Y': 0, 'N': 1} }
    df.replace(replace_dict, inplace=True)
    
#     # fix inf values
    df['AMT_CREDIT_DEBT_RATIO'] = df['AMT_CREDIT_DEBT_RATIO'].apply(lambda x: x if ~np.isinf(x) else 0)
    
#     # test ids
    test_ids_left = df.loc[df['TARGET'].isnull(), 'SK_ID_CURR_left']
    test_ids_right =df.loc[df['TARGET'].isnull(), 'SK_ID_CURR_right']
#     # drop ids
    df.drop(columns=['SK_ID_CURR_left','SK_ID_CURR_right'], axis=1, inplace=True)
    
    return df, test_ids_left,test_ids_right


In [5]:
df, left_ids,right_ids = get_temp_data()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_trimmed['SK_DPD_SUM_CREDIT'] = dpd_counts_sum['SK_DPD']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_trimmed['SK_DPD_DEF_SUM_CREDIT'] = dpd_df_counts_sum['SK_DPD_DEF']


In [6]:
print(df)

        TARGET NAME_CONTRACT_TYPE  CODE_GENDER  FLAG_OWN_CAR  FLAG_OWN_REALTY  \
0            1         Cash loans            0             1                0   
1            0         Cash loans            1             1                1   
2            0    Revolving loans            0             0                0   
3            0         Cash loans            1             1                0   
4            0         Cash loans            0             1                0   
...        ...                ...          ...           ...              ...   
307506       0         Cash loans            0             1                1   
307507       0         Cash loans            1             1                0   
307508       0         Cash loans            1             1                0   
307509       1         Cash loans            1             1                0   
307510       0         Cash loans            1             1                1   

        CNT_CHILDREN  AMT_I

In [7]:
y_values = df['TARGET']
x_values = df.drop(columns=['TARGET'])

In [8]:
cat_cols = list(x_values.select_dtypes(include=object).columns)
num_cols = list(x_values.select_dtypes(include=[int, float]).columns)
categorical_pipe = Pipeline(steps=[
    ('cat_imp', SimpleImputer(strategy='most_frequent', add_indicator=False)),
  ('one_hot_encoder', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

# numerical 
numerical_pipe = Pipeline(steps=[
    ('num_imp', SimpleImputer(strategy='median', add_indicator=False)),
    ("scale", StandardScaler())  
])


# transform columns 
column_transformer = ColumnTransformer(transformers=[    
    ('num_pip', numerical_pipe, num_cols),
    ('cat_pipe', categorical_pipe, cat_cols)
])

In [9]:
X_trans = column_transformer.fit_transform(x_values)

In [10]:
print(X_trans)

[[ 0.14212935 -0.47809925 -0.16614903 ...  0.          1.
   0.        ]
 [ 0.42679025  1.72543552  0.59267397 ...  0.          1.
   0.        ]
 [-0.42719245 -1.15288914 -1.40466987 ...  0.          1.
   0.        ]
 ...
 [-0.06662198  0.19537137  0.19804875 ...  0.          1.
   0.        ]
 [ 0.0092876  -0.56876069 -0.47632259 ...  0.          1.
   0.        ]
 [-0.04764458  0.1887526   1.51853738 ...  0.          1.
   0.        ]]


In [11]:
smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample(X_trans,y_values)

In [12]:
logit = LogisticRegression(random_state=0,solver='sag')
linearSVM = svm.LinearSVC()
neigh = KNeighborsClassifier(n_neighbors=3)
clf = RandomForestClassifier(max_depth=2, random_state=0)


In [13]:
accuracy_logit = cross_val_score(logit, X_sm, y_sm, cv=10,scoring='f1')



In [14]:
print("Accuracy of Logicistic Model with Cross Validation is:",accuracy_logit.mean() * 100)

Accuracy of Logicistic Model with Cross Validation is: 69.86540984219758


In [15]:
accuracy_svm = cross_val_score(linearSVM, X_sm, y_sm, cv=10,scoring='f1')



In [16]:
print("Accuracy of SVM Model with Cross Validation is:",accuracy_svm.mean() * 100)

Accuracy of SVM Model with Cross Validation is: 69.95957618751069


In [17]:
accuracy_clf = cross_val_score(clf, X_sm, y_sm, cv=10,scoring='f1')

KeyboardInterrupt: 