<h1><center>Group Assignment</center></h1>

## 1. Data Preprocessing

### 1.1 Data Loading

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Plot settings: seaborn context/style plus the default colour cycle used by all figures.
sns.set_context('notebook') 
sns.set_style('ticks') 
# Default palette (ten hex colours) registered with seaborn below.
# NOTE(review): '#DB2728' looks like a typo of matplotlib's tab10 red '#D62728' - confirm.
colours = ['#1F77B4', '#FF7F0E', '#2CA02C', '#DB2728', '#9467BD', '#8C564B', '#E377C2','#7F7F7F', '#BCBD22', '#17BECF']
# Alternative ten-colour palette; it is not referenced anywhere in the visible part of the notebook.
crayon = ['#4E79A7','#F28E2C','#E15759','#76B7B2','#59A14F', '#EDC949','#AF7AA1','#FF9DA7','#9C755F','#BAB0AB']
sns.set_palette(colours)
%matplotlib inline
plt.rcParams['figure.figsize'] = (9, 6)

In [13]:
# Raw application and bureau tables. Training files are indexed by SK_ID_CURR,
# test files by Index_ID.
train = pd.read_csv('Data/ProjectTrain.csv',index_col='SK_ID_CURR')
train_bur = pd.read_csv('Data/ProjectTrain_Bureau.csv',index_col='SK_ID_CURR')
test = pd.read_csv('Data/ProjectTest.csv',index_col='Index_ID')
test_bur = pd.read_csv('Data/ProjectTest_Bureau.csv',index_col='Index_ID')

# Cached intermediate results written by earlier runs of this notebook.
# Numerical transform applied after the missing-value filter ("50% +" per the original note;
# NOTE(review): MissingValFilter below uses a 0.45 cut-off - confirm which was used).
train_trans_partial = pd.read_csv('Data/train_clean_trans.csv', index_col='SK_ID_CURR')
test_trans_partial = pd.read_csv('Data/test_clean_trans.csv', index_col='Index_ID')
# Numerical transform of the full (unfiltered) data set
train_trans_full = pd.read_csv('Data/train_trans_fullset.csv',index_col='SK_ID_CURR')
test_trans_full = pd.read_csv('Data/test_trans_fullset.csv',index_col='Index_ID')
# Fully imputed feature matrices
X_train_filled = pd.read_csv('Data/X_train_filled.csv',index_col='SK_ID_CURR')
X_test_filled = pd.read_csv('Data/X_test_filled.csv',index_col='Index_ID')
# NOTE(review): the original comment also said "full imputed"; the "_oh" file names suggest
# these are the outputs of oneHotEncoding - confirm.
X_train_oh = pd.read_csv('Data/X_train_oh.csv',index_col='SK_ID_CURR')
X_test_oh = pd.read_csv('Data/X_test_oh.csv',index_col='Index_ID')
# Merged data: left-join the bureau table onto the application table by the index name,
# so a row is repeated for every matching bureau record.
train_join = train.merge(train_bur, how='left', left_on='SK_ID_CURR', right_on='SK_ID_CURR')
test_join = test.merge(test_bur, how='left', left_on='Index_ID', right_on='Index_ID')
train_join_trans = pd.read_csv('Data/train_join_trans.csv',index_col='SK_ID_CURR')
test_join_trans = pd.read_csv('Data/test_join_trans.csv',index_col='Index_ID')
pearson_corr_features_join = pd.read_csv('Data/feature_join_corr.csv',index_col='index')
# Recode 'Bad debt' as 'Closed' in the test set only.
# NOTE(review): presumably because 'Bad debt' never occurs in the training data and the
# label encoding in NumericalTransform fails on unseen categories - confirm.
test_join.loc[test_join['CREDIT_ACTIVE'] == 'Bad debt', 'CREDIT_ACTIVE'] = 'Closed'
# Fully processed "original" feature set (sub-sampled and imputed)
# training split
X_train_filled_original = pd.read_csv('Data/Original_X_train.csv', index_col='SK_ID_CURR')
y_train_sub = pd.read_csv('Data/Original_y_train.csv', index_col='SK_ID_CURR')
# hold-out (validation) split
X_validate_filled_original = pd.read_csv('Data/Original_X_validate.csv', index_col='SK_ID_CURR')
y_validate_sub = pd.read_csv('Data/Original_y_validate.csv', index_col='SK_ID_CURR')
# test set
X_test_filled_original = pd.read_csv('Data/Original_X_test.csv',index_col='Index_ID')

### 1.2 Feature Selection

#### 1.2.1 Filtering for Feature Selection

##### 1.2.1.1 Fold Change

In [165]:
def fold_change(train):
    """Rank features by the absolute log2 fold change between the two TARGET classes.

    For every numeric feature the mean over rows with ``TARGET == 1`` is divided
    by the mean over rows with ``TARGET == 0``; the absolute value of the log2
    of that ratio is the score.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data containing a ``TARGET`` column (values 1 / 0) and the
        feature columns. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per numeric feature, a single column ``'Mean'`` holding the
        score, sorted from largest to smallest.
    """
    # Select the features by name instead of by position, so the result does not
    # depend on TARGET being the first column (drop() returns a new frame).
    train_features = train.drop(columns=['TARGET'])
    # Per-class feature means; numeric_only skips string columns, which would
    # otherwise raise on recent pandas versions.
    mean_positive = train_features.loc[train['TARGET'] == 1, :].mean(numeric_only=True)
    mean_negative = train_features.loc[train['TARGET'] == 0, :].mean(numeric_only=True)
    # |log2(ratio)|: a zero class mean yields inf, a negative ratio yields NaN.
    train_fold_change = np.log2(mean_positive / mean_negative).abs().to_frame(name='Mean')

    return train_fold_change.sort_values(by=['Mean'], ascending=False)

##### 1.2.1.2 Pearson's Correlation

In [16]:
def Pearson_corr(train):
    """Rank features by the absolute Pearson correlation with ``TARGET``.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data holding a ``TARGET`` column plus the feature columns.
        The input frame is left untouched.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name, with one column ``'Corr'`` containing
        ``|corr(feature, TARGET)|`` in descending order.
    """
    target = train['TARGET']
    # work on a copy without the target column
    predictors = train.copy().drop(columns=['TARGET'])
    # column-wise correlation against the target, sign discarded
    abs_corr = predictors.corrwith(target).abs()
    ranking = abs_corr.to_frame(name='Corr')

    return ranking.sort_values(by='Corr', ascending=False)

#### 1.2.2 Filter Out Columns with 45%+ Missing Values

In [8]:
def MissingValFilter(train, test, threshold=0.45):
    """Drop sparse and constant columns, as judged on the training set.

    Two filters are applied in order, both decided on ``train`` only:

    1. columns whose share of missing values in ``train`` exceeds ``threshold``;
    2. of the remaining columns, those with exactly one distinct non-missing
       value in ``train``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Training and test data. Neither input is modified.
    threshold : float, default 0.45
        Maximum tolerated proportion of missing values per column.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Filtered copies of ``train`` and ``test``.
    """
    train_new = train.copy()
    test_new = test.copy()

    # Proportion of missing values per training column.
    miss_propor = train_new.isna().mean()
    sparse_cols = miss_propor[miss_propor > threshold].index
    train_new = train_new.drop(columns=sparse_cols)
    # errors='ignore': a column that exists only in train (e.g. the target)
    # must not make the test-side drop fail.
    test_new = test_new.drop(columns=sparse_cols, errors='ignore')

    # Columns with a single level carry no information (nunique ignores NaN).
    level_summary = train_new.nunique()
    one_level_cols = level_summary[level_summary == 1].index
    train_new = train_new.drop(columns=one_level_cols)
    test_new = test_new.drop(columns=one_level_cols, errors='ignore')

    return train_new, test_new

### 1.3 Encoding

#### 1.3.1 Nominal Data Encoding

In [75]:
def NumericalTransform(train_clean, test_clean):
    """Label-encode the string (object-dtype) columns of the train and test sets.

    For every object-dtype column of ``train_clean`` the distinct string values
    are sorted and numbered 0..k-1 (the same numbering a label encoder fitted
    on the training strings produces). The mapping is applied to both data
    sets:

    * missing values and other non-string cells are passed through unchanged;
    * a test-set string that never occurs in the training column becomes NaN
      instead of raising, so it is treated like any other missing value by the
      imputation step.

    Parameters
    ----------
    train_clean, test_clean : pandas.DataFrame
        Training and test data. Neither input is modified. ``test_clean`` must
        contain every object-dtype column of ``train_clean``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Train and test frames with the numeric columns first, followed by the
        encoded categorical columns. Numeric training columns that are absent
        from the test set (e.g. the target) are simply left out of the test
        result.
    """
    train_clean_trans_tmp = train_clean.copy()
    test_clean_trans_tmp = test_clean.copy()

    # Split the training columns by dtype.
    types = train_clean_trans_tmp.dtypes
    types_num = types[types != 'object'].index  # numerical column names
    types_cat = types[types == 'object'].index  # categorical column names
    # Pick the numeric test columns by name rather than by skipping the first
    # position, so the result does not depend on where the target column sits.
    test_num_cols = [col for col in types_num if col in test_clean_trans_tmp.columns]

    train_clean_trans_num = train_clean_trans_tmp[types_num].copy()
    test_clean_trans_num = test_clean_trans_tmp[test_num_cols].copy()
    train_clean_trans_cat = train_clean_trans_tmp[types_cat].copy()
    test_clean_trans_cat = test_clean_trans_tmp[types_cat].copy()

    for col in types_cat:
        # Codes follow the sorted order of the training strings.
        labels = sorted({v for v in train_clean_trans_tmp[col].unique() if isinstance(v, str)})
        codes = {label: code for code, label in enumerate(labels)}
        # One dictionary lookup per cell; NaN (a float) falls through untouched.
        encode = lambda value, codes=codes: codes.get(value, np.nan) if isinstance(value, str) else value
        train_clean_trans_cat[col] = train_clean_trans_tmp[col].map(encode)
        test_clean_trans_cat[col] = test_clean_trans_tmp[col].map(encode)

    # Re-assemble: numeric columns first, then the encoded categorical ones.
    train_clean_trans = pd.concat([train_clean_trans_num, train_clean_trans_cat], axis=1)
    test_clean_trans = pd.concat([test_clean_trans_num, test_clean_trans_cat], axis=1)

    return train_clean_trans, test_clean_trans

#### 1.3.2 Train Validation Sample

In [19]:
def trainValidateSample(train, y_train, train_frac=0.2185, validate_frac=0.120, random_state=1):
    """Draw disjoint training and validation sub-samples.

    A fraction ``train_frac`` of the rows is sampled (without replacement) as
    the training subset; from the rows that were *not* selected, a fraction
    ``validate_frac`` is sampled as the validation subset.

    Parameters
    ----------
    train : pandas.DataFrame
        Feature table. Not modified.
    y_train : pandas.Series or pandas.DataFrame
        Targets sharing the index labels of ``train``.
    train_frac : float, default 0.2185
        Fraction of all rows drawn for the training subset.
    validate_frac : float, default 0.120
        Fraction of the remaining rows drawn for the validation subset.
    random_state : int, default 1
        Seed used for both draws.

    Returns
    -------
    tuple
        ``(train_ex, y_train_ex, validate_ex, y_validate_ex)``.
    """
    # training subset
    train_ex = train.sample(frac=train_frac, replace=False, random_state=random_state)
    # .loc makes the row-label lookup explicit (plain [] would select columns
    # if y_train were a DataFrame)
    y_train_ex = y_train.loc[train_ex.index]
    # validation subset, drawn only from rows not already used for training
    remaining = train.loc[~train.index.isin(train_ex.index)]
    validate_ex = remaining.sample(frac=validate_frac, replace=False, random_state=random_state)
    y_validate_ex = y_train.loc[validate_ex.index]

    return train_ex, y_train_ex, validate_ex, y_validate_ex

#### 1.3.3 Categorical String Transformation

In [7]:
def oneHotEncoding(X_train_filled, X_validate_filled,X_test_filled, train, pearson_corr_features):
    """Cast the categorical features of the three feature sets to strings.

    Despite its name, this function performs no one-hot encoding: it only
    converts the selected categorical columns with ``astype(str)``.

    A feature is selected when

    * its ``'Corr'`` value in ``pearson_corr_features`` is at least 0.01
      (``EXT_SOURCE_1`` is always excluded), and
    * in ``train`` it is either of object dtype or has at most 70 distinct
      values.

    Parameters
    ----------
    X_train_filled, X_validate_filled, X_test_filled : pandas.DataFrame
        Feature tables; each must contain every selected categorical column.
        The inputs are not modified.
    train : pandas.DataFrame
        Reference training data used to decide which features are categorical.
    pearson_corr_features : pandas.DataFrame
        Indexed by feature name, with a ``'Corr'`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        Copies of the three feature tables with the categorical columns cast
        to ``str``.
    """
    # EXT_SOURCE_1 is excluded from the candidates; errors='ignore' keeps this
    # working when the correlation table does not list it.
    corr = pearson_corr_features.drop(labels=['EXT_SOURCE_1'], errors='ignore')
    selected = corr.loc[corr['Corr'] >= 0.01, :].index

    # A feature counts as categorical if it is stored as object or has few levels.
    types = train.dtypes
    object_cols = set(types[types == 'object'].index)
    unique_count = train.nunique()
    low_cardinality_cols = set(unique_count[unique_count <= 70].index)
    features_ex_cat = [col for col in selected
                       if col in object_cols or col in low_cardinality_cols]

    # DataFrame.astype with a per-column mapping returns a new frame.
    as_string = {col: str for col in features_ex_cat}
    X_train_filled_t = X_train_filled.astype(as_string)
    X_validate_filled_t = X_validate_filled.astype(as_string)
    X_test_filled_t = X_test_filled.astype(as_string)

    return X_train_filled_t, X_validate_filled_t, X_test_filled_t

### 1.4 Missing Value Imputation

#### 1.4.1 MICE

In [8]:
# MICE Imputer
def MiceImpute(train_clean_trans, validate_clean_trans, test_clean_trans):
    """Impute missing values with fancyimpute's IterativeImputer (MICE).

    The imputer is fitted on the training frame only and then applied, in this
    order, to the training, validation and test frames.

    Returns
    -------
    tuple of pandas.DataFrame
        Imputed train, validation and test frames, each carrying the column
        names and row index of its corresponding input.
    """
    from fancyimpute import IterativeImputer

    mice = IterativeImputer(n_iter=5, sample_posterior=True, random_state=1)
    mice.fit(train_clean_trans)

    # transform() output is wrapped in a DataFrame; labels are restored afterwards
    filled_train = pd.DataFrame(mice.transform(train_clean_trans))
    filled_validate = pd.DataFrame(mice.transform(validate_clean_trans))
    filled_test = pd.DataFrame(mice.transform(test_clean_trans))

    # put the original column names and row index back on each result
    for filled, source in ((filled_train, train_clean_trans),
                           (filled_validate, validate_clean_trans),
                           (filled_test, test_clean_trans)):
        filled.columns = source.columns
        filled.index = source.index

    return filled_train, filled_validate, filled_test

Using Theano backend.


#### 1.4.2 MODE

In [9]:
# Mode Imputer
def SimpleImpute(train_clean_trans, test_clean_trans,stra):
    """Impute missing values column-wise with a simple statistic.

    The imputer is fitted on the training frame and then applied to the test
    frame.

    Parameters
    ----------
    train_clean_trans, test_clean_trans : pandas.DataFrame
        Numeric training and test data with the same columns.
    stra : str
        Imputation strategy forwarded to the scikit-learn imputer
        (e.g. ``'mean'``, ``'median'``, ``'most_frequent'``).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Imputed train and test frames carrying the column names and row index
        of their inputs.
    """
    # sklearn.preprocessing.Imputer was removed from scikit-learn in favour of
    # sklearn.impute.SimpleImputer; fall back to the old class on old installs.
    try:
        from sklearn.impute import SimpleImputer as _Imputer
    except ImportError:
        from sklearn.preprocessing import Imputer as _Imputer

    imputer = _Imputer(strategy=stra)
    # Both results are plain arrays, so both are wrapped in a DataFrame before
    # the labels are restored (the training result was previously left as an
    # array, which made the column assignment fail).
    X_train_filled_mode = pd.DataFrame(imputer.fit_transform(train_clean_trans))  # train
    X_test_filled_mode = pd.DataFrame(imputer.transform(test_clean_trans))  # test
    # restore the column names
    X_train_filled_mode.columns = train_clean_trans.columns
    X_test_filled_mode.columns = test_clean_trans.columns
    # restore the row index
    X_train_filled_mode.index = train_clean_trans.index
    X_test_filled_mode.index = test_clean_trans.index

    return X_train_filled_mode, X_test_filled_mode

#### 1.4.3 Final Impute

In [10]:
def final_impute(train, validate, test, pearson_corr_features):
    """Select correlated features, fill the MNAR columns and run MICE.

    Steps applied identically to the train, validation and test frames:

    1. keep the features whose ``'Corr'`` is at least 0.01;
    2. fill missing values with 100 in the columns flagged ``val == -1`` in
       ``Data/col_mnar.csv``;
    3. drop ``EXT_SOURCE_1``;
    4. impute what is still missing with ``MiceImpute``.

    Returns
    -------
    tuple of pandas.DataFrame
        Imputed train, validation and test feature frames.
    """
    # names of the columns flagged with -1 in the MNAR lookup file
    mnar_table = pd.read_csv('Data/col_mnar.csv',index_col='index')
    mnar_colnames = mnar_table[mnar_table['val'] == -1].index
    # features with a correlation of at least 1%
    kept_features = pearson_corr_features[pearson_corr_features['Corr'] >= 0.01].index

    def _prepare(frame):
        # feature subset -> MNAR fill -> drop EXT_SOURCE_1
        subset = frame[kept_features].copy()
        subset[mnar_colnames] = subset[mnar_colnames].fillna(100)
        return subset.drop(columns=['EXT_SOURCE_1'])

    train_ready = _prepare(train)
    validate_ready = _prepare(validate)
    test_ready = _prepare(test)

    # MICE on the prepared frames
    filled_train, filled_validate, filled_test = MiceImpute(train_ready, validate_ready, test_ready)

    return filled_train, filled_validate, filled_test

### 1.5 Factor Analysis 

#### 1.5.1 FAMD

In [11]:
def princeFAMD (train, validate, test, n_comp, n_iter):
    """Project train, validation and test data onto FAMD components.

    The factor analysis is fitted on the training data only; the validation
    and test data are then projected onto those same components, so the
    columns of the three outputs are comparable.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        Feature tables with the same columns.
    n_comp : int
        Number of components, passed to ``FAMD(n_components=...)``.
    n_iter : int
        Passed to ``FAMD(n_iter=...)``.

    Returns
    -------
    tuple
        The transformed train, validation and test data, in that order.
    """
    from prince import FAMD

    famd = FAMD(n_components=n_comp, n_iter=n_iter, copy=True, engine='auto', random_state=4)
    # Fit once, on the training data.
    train_trans = famd.fit_transform(train)
    # transform(), not fit_transform(): refitting on each split would produce a
    # different set of components per data set.
    validate_trans = famd.transform(validate)
    test_trans = famd.transform(test)

    return train_trans, validate_trans, test_trans

#### 1.5.2 FAMD Optimal

In [None]:
def princeFAMD_Opt (train, n_comp, n_iter):
    """Fit a FAMD model on ``train`` and return it with the projection.

    Returns
    -------
    tuple
        ``(fitted FAMD instance, result of fit_transform(train))``.
    """
    from prince import FAMD

    settings = dict(n_components=n_comp, n_iter=n_iter, copy=True, engine='auto', random_state=4)
    reducer = FAMD(**settings)
    # fit and project in one step
    projected = reducer.fit_transform(train)

    return reducer, projected

## 2. Model Fitting

### 2.1 Benchmark (Random Forest)

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def BmModel(X_train_filled, y_train, random_state=1):
    """Cross-validated benchmark score of a default random forest.

    Parameters
    ----------
    X_train_filled : array-like
        Training features without missing values.
    y_train : array-like
        Training targets.
    random_state : int, default 1
        Seed for the forest, so the benchmark is reproducible between runs.

    Returns
    -------
    (RandomForestClassifier, float)
        The classifier instance (it is not fitted by this function) and the
        mean ROC-AUC over the 5 cross-validation folds. The score, rounded to
        3 decimals, is also printed.
    """
    # Benchmark RandomForest with default hyper-parameters
    model_rf = RandomForestClassifier(random_state=random_state)
    scores_rf = cross_val_score(model_rf, X_train_filled, y_train, cv=5, n_jobs=-1, scoring = 'roc_auc')
    # average score over the folds
    cv_score_rf = scores_rf.mean()
    print('Cross Validation Score in random forest Benchmark:', cv_score_rf.round(3), '\n')

    return model_rf, cv_score_rf

### 2.2 Random Forest

In [None]:
def RF_cv(X_train, y_train, max_depth, min_samples_leaf):
    """Tune a balanced random forest with a randomized search.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and targets.
    max_depth, min_samples_leaf : list or distribution
        Candidate values for the corresponding forest hyper-parameters, passed
        unchanged to ``RandomizedSearchCV``.

    Returns
    -------
    (RandomizedSearchCV, estimator)
        The fitted search object and its ``best_estimator_``. The best
        parameter combination is also printed.
    """
    from sklearn.model_selection import RandomizedSearchCV
    from imblearn.ensemble import BalancedRandomForestClassifier

    brf_classifier = BalancedRandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=1)
    tuning_parameters = {
        'max_depth': max_depth,
        'min_samples_leaf': min_samples_leaf,
    }

    # random_state fixes which parameter combinations are sampled, so repeated
    # runs evaluate the same candidates (same seed as the classifier above).
    brf_cls_search = RandomizedSearchCV(brf_classifier, tuning_parameters, cv=5,
                                        return_train_score=True, scoring='f1',
                                        random_state=1)
    brf_cls_search.fit(X_train, y_train)
    brf_cls_search_best = brf_cls_search.best_estimator_
    print('Best parameters found by randomized search:', brf_cls_search.best_params_, '\n')

    return brf_cls_search, brf_cls_search_best

## 3. Performance Evaluation

In [None]:
def evaluate(object, features, target):
    """Score a fitted classifier on one data set.

    Parameters
    ----------
    object : fitted estimator
        Must provide ``predict``; ``predict_proba`` is used for the AUC when
        it is available. (The parameter name shadows the built-in ``object``;
        it is kept so existing keyword calls continue to work.)
    features : array-like
        Feature matrix to predict on.
    target : array-like
        True labels; a Series or a single-column DataFrame both work.

    Returns
    -------
    tuple
        ``(y_pred, confusion, performance, accuracy, auc)`` where ``confusion``
        is the confusion matrix as proportions of all samples (3 decimals),
        ``performance`` is the text classification report, and ``accuracy``
        and ``auc`` are rounded to 3 decimals.
    """
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

    y_pred = object.predict(features)
    # Normalise by the number of samples. len() is used instead of .shape
    # because dividing by the tuple (n, 1) of a single-column DataFrame would
    # scale the two matrix columns differently.
    confusion = (confusion_matrix(target, y_pred) / len(target)).round(3)
    # performance report
    performance = classification_report(target, y_pred, digits = 3)
    # built-in round() works for both numpy and plain Python floats
    accuracy = round(accuracy_score(target, y_pred), 3)
    # ROC-AUC needs a ranking score, not hard labels.
    # Assumes a binary problem with the positive class in column 1 of predict_proba.
    if hasattr(object, 'predict_proba'):
        y_score = object.predict_proba(features)[:, 1]
    else:
        y_score = y_pred
    auc = round(roc_auc_score(target, y_score), 3)

    return y_pred, confusion, performance, accuracy, auc

## 4. Main

#### 4.1 Data Preprocessing

In [None]:
'''
SET 1: Original set generation
'''
# ## STEP 1: extract TARGET N Numerical Transform
# y_train = train['TARGET'] # extract the training target
# # train_trans_full, test_trans_full = NumericalTransform(train, test) #numerical transform of the whole dataset
# ## STEP 2: resample of the sub training n validation set
# train_sub, y_train_sub, validate_sub, y_validate_sub = trainValidateSample(train_trans_full,y_train)
# ## STEP 3: drop the target column in feature set
# train_sub, validate_sub = train_sub.drop(columns=['TARGET']), validate_sub.drop(columns=['TARGET'])
# ## STEP 4: impute missing value
# X_train_filled_original, X_validate_filled_original, X_test_filled_original = MiceImpute(train_sub, validate_sub, test_trans_full)
# ## STEP 5: output the data file
# # training set
# X_train_filled_original.to_csv('Data/Original_X_train.csv')
# y_train_sub.to_csv('Data/Original_y_train.csv')
# # hold-out set
# X_validate_filled_original.to_csv('Data/Original_X_validate.csv')
# y_validate_sub.to_csv('Data/Original_y_validate.csv')
# # test set
# X_test_filled_original.to_csv('Data/Original_X_test.csv')


'''
SET 2: Without Extra Features Set Generation
'''
## STEP 4: Pearson Correlation Calculation
# pearson_corr_features = Pearson_corr(train_trans_full)
## STEP 5: impute missing values
# X_train_filled_ne, X_validate_filled_ne, X_test_filled_ne= final_impute(train_sub, validate_sub, test_trans_full, pearson_corr_features)
# ## STEP 6: categorical string transformation
# NOTE(review): this is the only live statement in this cell. In the visible part of the
# notebook, X_train_filled_ne / X_validate_filled_ne / X_test_filled_ne and
# pearson_corr_features are assigned only by the commented-out STEP 4 and STEP 5 lines above
# (which in turn need train_sub / validate_sub from the commented-out SET 1 steps), so on a
# fresh kernel this line raises NameError unless those steps are re-enabled or their results
# are loaded from disk first.
X_train_oh_ne, X_validate_oh_ne, X_test_oh_ne = oneHotEncoding(X_train_filled_ne, X_validate_filled_ne,X_test_filled_ne, train, pearson_corr_features)
# ## STEP 7: FAMD factor compression
# X_train_famd_ne, X_validate_famd_ne, X_test_famd_ne = princeFAMD(X_train_oh_ne, X_validate_oh_ne, X_test_oh_ne, 40, 5)
# ## STEP 8: output the data file
# # training set
# X_train_famd_ne.to_csv('Data/ne_X_train.csv')
# # hold-out set
# X_validate_famd_ne.to_csv('Data/ne_X_validate.csv')
# # test set
# X_test_famd_ne.to_csv('Data/ne_X_test.csv')


'''
SET 3: With Extra Features Set Generation
'''
# ## STEP 1: Numerical Transform
# train_trans_join, test_trans_join = NumericalTransform(train_join, test_join) #numerical transform of the whole dataset
# ## STEP 2: pearson corr calc
# pearson_corr_features_join = pd.read_csv('Data/feature_join_corr.csv',index_col='index')
# ## STEP 3: resample of the sub training n validation set
# train_join_sub, y_train_join_sub, validate_join_sub, y_validate_sub = trainValidateSample(train_trans_join,y_train)
# ## STEP 4:  drop the target column in feature set
# train_join_sub, validate_join_sub = train_join_sub.drop(columns=['TARGET']), validate_join_sub.drop(columns=['TARGET'])
# ## STEP 5: impute missing values
# X_train_filled_we, X_validate_filled_we, X_test_filled_we= final_impute(train_join_sub, validate_join_sub, test_trans_join, pearson_corr_features_join)
# ## STEP 6: categorial string transformation
# X_train_oh_we, X_validate_oh_we, X_test_oh_we = oneHotEncoding(X_train_filled_we, X_validate_filled_we,X_test_filled_we, train, pearson_corr_features_join)
# ## STEP 7: FAMD factor compression
# X_train_famd_we, X_validate_famd_we, X_test_famd_we = princeFAMD(X_train_oh_we, X_validate_oh_we, X_test_oh_we, 40, 5)
# ## STEP 8: output the data file
# # training set
# X_train_famd_we.to_csv('Data/we_X_train.csv')
# y_train_join_sub.to_csv('Data/we_y_train.csv')
# ## hold-out set
# # X_validate_famd_we.to_csv('Data/we_X_validate.csv')
# y_validate_sub.to_csv('Data/we_y_validate.csv')
# ## test set
# # X_test_famd_we.to_csv('Data/we_X_test.csv')