These are some plotting and model-training tools:

(1)missing_values_table:    Missing value statistics

(2)plot_stats:              Plot feature distribution and default rate

(3)plot_distribution:       Plot probability density (KDE) curves by TARGET

(4)plot_corr:               Plot correlation coefficient heatmap

(5)plot_bin:                Plot binned feature distribution and default rate

(6)model:                   Model training

In [None]:
import pandas as pd 
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import gc
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns # for making plots with seaborn

# Missing value statistics
def missing_values_table(df):
    """Summarise the missing values of every column in ``df``.

    Prints a short report (number of columns, number of columns with
    missing values, and how many of those exceed a 50% / 20% missing
    ratio) and returns the per-column statistics.

    Parameters
    --------
        df (pd.DataFrame):
            dataframe to inspect.

    Return
    --------
        pd.DataFrame:
            one row per column that has at least one missing value,
            indexed by column name, with the columns ``'Missing Values'``
            (count) and ``'% of Total Values'`` (percentage, rounded to
            one decimal), sorted by percentage in descending order.
    """
    # Count the nulls once and reuse the result for both statistics
    missing_counts = df.isnull().sum()

    n_rows = len(df)
    if n_rows:
        missing_percent = 100 * missing_counts / n_rows
    else:
        # A dataframe without rows has nothing missing. Computing 0 / 0
        # would give NaN, and NaN != 0 would wrongly flag every column.
        missing_percent = missing_counts.astype(float)

    table = pd.DataFrame({'Missing Values': missing_counts,
                          '% of Total Values': missing_percent})

    # Keep only columns with missing values; filter before rounding so a
    # tiny non-zero percentage is not dropped
    table = table[table['% of Total Values'] != 0]
    table = table.sort_values('% of Total Values', ascending=False).round(1)

    # Print some summary information
    print(f"Your selected dataframe has {df.shape[1]} columns.\n"
          f"There are {table.shape[0]} columns that have missing values.")
    print(f"missing ratio > 50% : {len(table[table['% of Total Values'] > 50])}")
    print(f"missing ratio > 20% : {len(table[table['% of Total Values'] > 20])}")

    return table

# Plotting function for visually inspecting a feature's distribution and default rate
def _category_label(value):
    """Return the tick label for one category value.

    Values convertible to ``int`` are shown as integers (``1.0`` -> ``'1'``);
    anything else (e.g. text categories) falls back to ``str(value)``.
    """
    try:
        return str(int(value))
    except (TypeError, ValueError, OverflowError):
        return str(value)


def plot_stats(app_train, feature, label_rotation=False, horizontal_layout=True):
    """Plot the category counts and the default rate of one feature.

    Draws two stacked bar charts: the number of rows per category of
    ``feature`` (top) and the mean of ``TARGET`` per category expressed in
    percent, highest first (bottom). The figure is saved to
    ``feature + 'user_profile.png'`` relative to the working directory.

    Parameters
    --------
        app_train (pd.DataFrame):
            data containing ``feature`` and a ``TARGET`` column.
        feature (str):
            name of the column to summarise.
        label_rotation (bool, default = False):
            accepted for backward compatibility; currently ignored
            (tick labels are always rotated by 45 degrees).
        horizontal_layout (bool, default = True):
            accepted for backward compatibility; currently ignored
            (the two charts are always stacked vertically).
    """
    counts = app_train[feature].value_counts()

    # Mean of TARGET per category (share of rows with TARGET == 1), highest first
    cat_perc = app_train[[feature, 'TARGET']].groupby([feature], as_index=False).mean()
    cat_perc.sort_values(by='TARGET', ascending=False, inplace=True)

    categories_count = [str(x) for x in counts.index.to_list()]
    values_count = counts.values

    # Text categories used to crash on int(); they now fall back to str()
    categories_rate = [_category_label(x) for x in cat_perc[feature].tolist()]
    # Scale the fraction to percent so the values match the "[%]" axis label
    values_rate = [100 * x for x in cat_perc['TARGET'].tolist()]

    colors = sns.color_palette("husl", n_colors=50)

    fig = plt.figure()

    # Top chart: number of rows per category
    plt.subplot(2, 1, 1)
    plt.bar(
        x=categories_count,
        height=values_count,
        color=colors[0:len(categories_count)],
        width=0.4
    )
    plt.xticks(rotation=45, ha="right", fontsize=10)
    plt.xlabel(feature, fontsize=10)
    plt.ylabel("Number of Features", fontsize=10)

    # Bottom chart: default rate per category, in percent
    plt.subplot(2, 1, 2)
    plt.bar(
        x=categories_rate,
        height=values_rate,
        color=colors[0:len(categories_rate)],
        width=0.4
    )
    plt.xticks(rotation=45, ha="right", fontsize=10)
    plt.xlabel(feature, fontsize=10)
    plt.ylabel("Percent of Default rate [%]", fontsize=10)

    # Add overall title, located at the top
    plt.suptitle(f'Distribution of {feature} and Default rate by TARGET', fontsize=13, y=0.98)

    # Adjust layout and save
    fig.align_ylabels()
    plt.tight_layout()
    plt.savefig(feature + 'user_profile.png')
    

def plot_distribution(app_train, feature_list):
    """Plot the kernel density of each feature, split by ``TARGET``.

    Every feature gets its own subplot (side by side in one row) showing
    one density curve for rows with ``TARGET == 0`` and one for rows with
    ``TARGET == 1``. The figure is saved to ``'domain_distribution.png'``
    relative to the working directory.

    Parameters
    --------
        app_train (pd.DataFrame):
            data containing the features and a ``TARGET`` column.
        feature_list (iterable of str):
            names of the columns to plot.
    """
    feature_list = list(feature_list)
    n_features = len(feature_list)

    # Widen the figure by one default width per feature; a single feature
    # keeps the default figure size
    base_width, base_height = plt.rcParams['figure.figsize']
    plt.figure(figsize=(base_width * max(n_features, 1), base_height))

    for i, feature in enumerate(feature_list):
        # One subplot per feature; without this every feature was drawn on
        # the same axes and only the last title / x-label survived
        plt.subplot(1, n_features, i + 1)

        # Rows with TARGET == 0
        sns.kdeplot(app_train.loc[app_train['TARGET'] == 0, feature], label='TARGET == 0')
        # Rows with TARGET == 1
        sns.kdeplot(app_train.loc[app_train['TARGET'] == 1, feature], label='TARGET == 1')

        # Label the plot
        plt.title(f"Distribution of {feature}", fontsize=10)
        plt.xlabel(feature, fontsize=10)
        plt.ylabel("Density", fontsize=12)
        plt.xticks(fontsize=8)
        plt.yticks(fontsize=8)
        plt.legend()

    plt.tight_layout(rect=[0, 0.1, 1, 0.9])
    plt.savefig('domain_distribution.png')

def plot_corr(app_train, feature_list):
    """Plot a heatmap of the correlations between features and ``TARGET``.

    Computes the correlation matrix (``DataFrame.corr``) of the given
    features plus ``TARGET`` and saves the annotated heatmap to
    ``'domain_correlation_heatmap.png'`` relative to the working directory.

    Parameters
    --------
        app_train (pd.DataFrame):
            data containing the features and a ``TARGET`` column.
        feature_list (iterable of str):
            names of the columns to correlate. The list is not modified.
    """
    # Work on a copy: appending to the caller's list leaked 'TARGET' into it
    # and duplicated the column on repeated calls
    columns = list(feature_list)
    if 'TARGET' not in columns:
        columns.append('TARGET')

    ext_data_corrs = app_train[columns].corr()
    plt.figure(figsize=(14, 12))

    # Heatmap of correlations
    ax = sns.heatmap(ext_data_corrs,
                     cmap=plt.cm.RdYlBu_r,
                     vmin=-0.25,
                     annot=True,
                     vmax=0.6,
                     annot_kws={"size": 14},
                     cbar_kws={"shrink": 0.8})

    # The original passed {"size": 20} as the colorbar *label*, which
    # matplotlib renders as literal label text. The size is applied to the
    # colorbar tick labels instead (presumed intent - confirm).
    colorbar = ax.collections[0].colorbar
    colorbar.ax.tick_params(labelsize=20)

    ax.set_xticklabels(ax.get_xticklabels(), fontsize=14, rotation=45, ha="right")
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=14, rotation=0)

    plt.title("Correlation Heatmap", fontsize=20, pad=20)
    plt.tight_layout()
    plt.savefig('domain_correlation_heatmap.png')

# Bin division drawing
def plot_bin(app_train, feature_list):
    """Plot row counts and default rate per bin for each numeric feature.

    Each feature is cut into 10 equal-width bins between its minimum and
    maximum. For every feature a figure with two stacked bar charts is
    saved to ``'<feature>_BINNED_user_profile.png'``: the number of rows
    per bin (top) and ``100 * mean(TARGET)`` per bin (bottom).

    Parameters
    --------
        app_train (pd.DataFrame):
            data containing the features and a ``TARGET`` column.
            It is not modified.
        feature_list (iterable of str):
            names of the numeric columns to bin and plot.
    """
    # The palette does not depend on the feature, so build it once
    colors = sns.color_palette("husl", n_colors=50)

    for obj in feature_list:
        binned_col = f'{obj}_BINNED'

        # Copy so that adding the bin column never writes into a view of app_train
        age_data = app_train[['TARGET', obj]].copy()

        bin_edges = np.linspace(age_data[obj].min(), age_data[obj].max(), num=11)
        # include_lowest keeps the rows equal to the minimum, which would
        # otherwise fall outside the first (left-open) interval
        age_data[binned_col] = pd.cut(age_data[obj], bins=bin_edges, include_lowest=True)

        # observed=False keeps empty bins, so counts and rates always share
        # the same bin-ordered index
        grouped = age_data.groupby(binned_col, observed=False)
        # size() follows bin order; value_counts() sorted by frequency and
        # paired the counts with the wrong bin labels
        bin_counts = grouped.size()
        default_rate = 100 * grouped['TARGET'].mean()

        bin_labels = bin_counts.index.astype(str)
        bar_colors = colors[0:len(bin_labels)]

        fig = plt.figure()

        # Top chart: number of rows per bin
        plt.subplot(2, 1, 1)
        plt.bar(bin_labels, bin_counts.values, color=bar_colors, width=0.4)
        plt.xticks(rotation=45, ha="right", fontsize=10)
        plt.xlabel(binned_col, fontsize=10)
        plt.ylabel("Number of Features", fontsize=10)

        # Bottom chart: default rate per bin, in percent
        plt.subplot(2, 1, 2)
        plt.bar(bin_labels, default_rate.values, color=bar_colors, width=0.4)
        plt.xticks(rotation=45, ha="right", fontsize=10)
        plt.xlabel(binned_col, fontsize=10)
        plt.ylabel("Percent of Default rate [%]", fontsize=10)

        plt.suptitle(f'Distribution of {obj} and Default rate by TARGET', fontsize=13, y=0.98)

        fig.align_ylabels()
        plt.tight_layout()
        plt.savefig(f'{obj}_BINNED_user_profile.png')

# model training for validation 
def model(features, test_features, cat_indices = None, n_folds = 5):

    """Train and test a light gradient boosting model using
    cross validation.

    Parameters
    --------
        features (pd.DataFrame):
            dataframe of training features to use
            for training a model. Must include the `SK_ID_CURR` and
            `TARGET` columns.
        test_features (pd.DataFrame):
            dataframe of testing features to use
            for making predictions with the model. Must include
            `SK_ID_CURR`; its remaining columns are used positionally, so
            they are expected to be in the same order as the training
            columns.
        cat_indices (list of int, default = None):
            positions of the categorical columns, counted AFTER
            `SK_ID_CURR` and `TARGET` have been dropped from `features`.
            When None or empty, the positions of `NAME_EDUCATION_TYPE`,
            `NAME_HOUSING_TYPE`, `NAME_INCOME_TYPE` and `NAME_TYPE_SUITE`
            are used (these columns must then exist).
        n_folds (int, default = 5): number of folds to use for cross validation

    Return
    --------
        submission (pd.DataFrame):
            dataframe with `SK_ID_CURR` and `TARGET` probabilities
            predicted by the model.
        feature_importances (pd.DataFrame):
            dataframe with the feature importances from the model.
        valid_metrics (pd.DataFrame):
            dataframe with training and validation metrics (ROC AUC) for each fold and overall.

    """

    # Extract the ids
    test_ids = test_features['SK_ID_CURR']

    # Extract the labels as a plain array so the fold indices below select
    # by position, whatever the index of `features` is
    labels = features['TARGET'].to_numpy()

    # Remove the ids and target
    features = features.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test_features = test_features.drop(columns = ['SK_ID_CURR'])

    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)

    # Honour caller-supplied categorical positions; otherwise fall back to
    # the default categorical columns
    if cat_indices is None or len(cat_indices) == 0:
        object_list = ['NAME_EDUCATION_TYPE','NAME_HOUSING_TYPE','NAME_INCOME_TYPE','NAME_TYPE_SUITE']
        cat_indices = [features.columns.get_loc(obj) for obj in object_list]
    else:
        cat_indices = list(cat_indices)

    # Show which train / test columns each categorical position points at,
    # so a column-order mismatch is visible
    for index in cat_indices:
         print(features.columns[index])
         print(test_features.columns[index])

    # Extract feature names
    feature_names = list(features.columns)

    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)

    # Create the kfold object
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 50)

    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))

    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])

    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])

    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []

    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):

        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]

        # Create the model
        # NOTE(review): both `verbose` and `verbosity` are passed here;
        # confirm which one the installed LightGBM version honours.
        clf = lgb.LGBMClassifier(n_estimators=10000, objective = 'binary',
                                 class_weight = 'balanced', learning_rate = 0.05,
                                 reg_alpha = 0.1, reg_lambda = 0.1,
                                 subsample = 0.8, n_jobs = -1, random_state = 50,
                                 early_stopping_rounds = 100, verbose = 200,
                                 verbosity= -1)

        # Train the model
        clf.fit(train_features, train_labels, eval_metric = 'auc',
                eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                eval_names = ['valid', 'train'], categorical_feature = cat_indices)

        # Record the best iteration
        best_iteration = clf.best_iteration_

        # Record the feature importances (averaged over the folds)
        feature_importance_values += clf.feature_importances_ / k_fold.n_splits

        # Make predictions (averaged over the folds)
        test_predictions += clf.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits

        # Record the out of fold predictions
        out_of_fold[valid_indices] = clf.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]

        # Record the best score
        valid_scores.append(clf.best_score_['valid']['auc'])
        train_scores.append(clf.best_score_['train']['auc'])

        # Clean up memory
        gc.enable()
        del clf, train_features, valid_features
        gc.collect()

    # Make the submission dataframe
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})

    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})

    # Overall validation score, computed on the out of fold predictions
    valid_auc = roc_auc_score(labels, out_of_fold)

    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds)) + ['overall']

    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    return submission, feature_importances, metrics