In [1]:
def score_model(model):
    """Cross-validate `model` with ROC AUC scoring and return the per-fold scores.

    Reads the notebook-level names `df_train` (features), `labels` (target)
    and `KFold` (forwarded unchanged as the `cv` argument).
    """
    # NOTE(review): `KFold` is forwarded as-is; presumably it was rebound to a
    # configured splitter (or an int) earlier in the notebook -- confirm.
    auc_scores = cross_val_score(model, df_train, labels, cv = KFold, scoring = 'roc_auc')
    return auc_scores

In [None]:
def corr(var_name, df):
    """Plot and summarise how `var_name` relates to the `TARGET` column of `df`.

    Draws one KDE curve per TARGET value (0 and 1) on a single figure, then
    prints the correlation between `TARGET` and `var_name` and the median of
    `var_name` within each TARGET group. Returns nothing.
    """
    # Slice the variable once per TARGET class; reused for medians and curves.
    values_target_0 = df.loc[df['TARGET'] == 0, var_name]
    values_target_1 = df.loc[df['TARGET'] == 1, var_name]

    target_corr = df['TARGET'].corr(df[var_name])
    median_repaid = values_target_0.median()
    median_not_repaid = values_target_1.median()

    plt.figure(figsize = (10,8))
    sns.kdeplot(values_target_0, label = 'target = 0')
    sns.kdeplot(values_target_1, label = 'target = 1')
    plt.xlabel(var_name)
    plt.ylabel('Density')
    plt.title('Density with TARGET')
    plt.legend();

    # Text summary, separated by blank lines.
    print('The correlation between {} and the TARGET is {}'.format(var_name, target_corr))
    print('')
    print('The median value for loan that was repaid is :', median_repaid)
    print('')
    print('The median value for loan that was not repaid is :', median_not_repaid)
    
#corr('previous_loan_counts', train)

In [None]:
def count_categorical(df, group_var, df_name):
    """One-hot encode the object columns of `df` and aggregate them per `group_var`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `group_var` and the object-typed columns to count.
    group_var : str
        Column to group by. It is excluded from the one-hot encoding, so an
        object-typed key is never turned into dummy columns itself.
    df_name : str
        Accepted for signature compatibility; it is not used in the generated
        column names.

    Returns
    -------
    pandas.DataFrame
        Indexed by `group_var`, with two columns per dummy variable:
        `<dummy>_count` (sum of the indicator) and `<dummy>_count_norm`
        (mean of the indicator).
    """
    # Drop the key before selecting object columns so it cannot be encoded.
    categorical = pd.get_dummies(df.drop(columns = [group_var]).select_dtypes('object'))
    categorical[group_var] = df[group_var]

    categorical = categorical.groupby(group_var).agg(['sum', 'mean'])

    # Derive the flat names from the actual (variable, statistic) column pairs
    # so every label is guaranteed to sit on the data it describes.
    stat_suffix = {'sum': 'count', 'mean': 'count_norm'}
    categorical.columns = ['%s_%s' % (var, stat_suffix[stat])
                           for var, stat in categorical.columns]

    return categorical

#bureau_counts = count_categorical(bureau, group_var = 'SK_ID_CURR', df_name = 'bureau')

In [None]:
def agg_numeric(df, group_var, df_name):
    """Aggregate the numeric columns of `df` per `group_var`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    group_var : str
        Column to group by.
    df_name : str
        Prefix used in the generated column names.

    Returns
    -------
    pandas.DataFrame
        One row per `group_var` value. The first column is `group_var`,
        followed by `<df_name>_<column>_<stat>` for every numeric column and
        each stat in count, mean, max, min, sum. Any column other than
        `group_var` whose name contains 'SK_ID' is dropped beforehand.
    """
    # Other id columns carry no information worth aggregating.
    other_id_cols = [col for col in df if col != group_var and 'SK_ID' in col]

    # Explicit copy: we add a column below and must not write into a view.
    numeric_df = df.drop(columns = other_id_cols).select_dtypes('number').copy()
    numeric_df[group_var] = df[group_var]

    # Group by the specified variable and calculate the statistics
    agg = numeric_df.groupby(group_var).agg(['count', 'mean', 'max', 'min', 'sum'])

    # Flatten the (variable, stat) pairs taken from the real columns, so the
    # labels cannot drift from the data they describe.
    agg.columns = ['%s_%s_%s' % (df_name, var, stat) for var, stat in agg.columns]

    return agg.reset_index()

#bureau_agg_new = agg_numeric(bureau.drop(columns = ['SK_ID_BUREAU']), group_var = 'SK_ID_CURR', df_name = 'bureau')

In [None]:
def score(df, cv):
    """Cross-validate a default LGBMClassifier on `df` and return the scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'SK_ID_CURR' and 'TARGET'; every other column is a feature.
    cv
        Forwarded unchanged to `cross_validate` as its `cv` argument.

    Returns
    -------
    pandas.DataFrame
        The dictionary returned by `cross_validate` (ROC AUC scoring, train
        scores included) wrapped in a DataFrame.

    Object columns with at most two distinct values are label-encoded; the
    remaining ones are one-hot encoded. Median imputation and min-max scaling
    are part of the cross-validated pipeline, so they are fit on the training
    part of each fold only and never see the validation rows.
    """
    # scikit-learn is already used throughout this notebook; imported locally
    # so this cell does not depend on an import made elsewhere.
    from sklearn.pipeline import make_pipeline

    y = df['TARGET']
    X = df.drop(columns = ['SK_ID_CURR', 'TARGET'])

    le = LabelEncoder()
    for col in X.columns:
        # Binary categoricals get a single 0/1 column instead of two dummies.
        if X[col].dtypes == 'object' and len(X[col].unique()) <= 2:
            X[col] = le.fit_transform(X[col])

    X = pd.get_dummies(X)

    model = make_pipeline(
        SimpleImputer(strategy = 'median'),
        MinMaxScaler(feature_range = (0, 1)),
        LGBMClassifier(),
    )

    scores = cross_validate(model, X, y, cv = cv, scoring = 'roc_auc', return_train_score = True)

    return pd.DataFrame(scores)

In [None]:
def features_importance(df):
    """Fit a default LGBMClassifier on `df` and return its 20 most important features.

    `df` must contain 'SK_ID_CURR' and 'TARGET'. Object columns with at most
    two distinct values are label-encoded, the rest are one-hot encoded; the
    matrix is then median-imputed and min-max scaled before fitting.

    Returns a DataFrame with columns 'feature' and 'importance', sorted by
    importance in descending order and truncated to 20 rows.
    """
    target = df['TARGET']
    features = df.drop(columns = ['SK_ID_CURR', 'TARGET'])

    # Label-encode object columns that hold two or fewer distinct values.
    encoder = LabelEncoder()
    binary_object_cols = [
        name for name in features.columns
        if features[name].dtypes == 'object' and len(list(features[name].unique())) <= 2
    ]
    for name in binary_object_cols:
        features[name] = encoder.fit_transform(features[name])

    features = pd.get_dummies(features)

    # Capture names now: the transformers below return plain arrays.
    feature_names = list(features.columns)

    matrix = SimpleImputer(strategy = 'median').fit_transform(features)
    matrix = MinMaxScaler(feature_range = (0,1)).fit_transform(matrix)

    fitted_model = LGBMClassifier().fit(matrix, target)

    ranking = pd.DataFrame({'feature': feature_names,
                            'importance': fitted_model.feature_importances_})

    return ranking.sort_values(by = 'importance', ascending = False).head(20)

In [None]:
def submission(train, test):
    """Train a default LGBMClassifier on `train` and write test predictions to 'LGBM.csv'.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain 'SK_ID_CURR' and 'TARGET'; other columns are features.
    test : pandas.DataFrame
        Must contain 'SK_ID_CURR'; other columns are features.

    Object columns of `train` with at most two distinct values are
    label-encoded (the encoder fit on train is applied to test); the rest are
    one-hot encoded and both frames are aligned on their common columns.
    Imputer and scaler are fit on train and applied to test.

    Writes a CSV named "LGBM.csv" (columns 'SK_ID_CURR', 'TARGET', no index)
    to the current working directory. Returns nothing.
    """
    y = train['TARGET']
    X_train = train.drop(columns = ['SK_ID_CURR', 'TARGET'])
    X_test = test.drop(columns = ['SK_ID_CURR'])

    le = LabelEncoder()
    for col in X_train.columns:
        if X_train[col].dtypes == 'object' and len(X_train[col].unique()) <= 2:
            X_train[col] = le.fit_transform(X_train[col])
            X_test[col] = le.transform(X_test[col])

    X_train = pd.get_dummies(X_train)
    X_test = pd.get_dummies(X_test)

    # Keep only the columns present in both frames.
    X_train, X_test = X_train.align(X_test, join = 'inner', axis = 1)

    imputer = SimpleImputer(strategy = 'median')
    scaler = MinMaxScaler(feature_range = (0, 1))

    X_train = scaler.fit_transform(imputer.fit_transform(X_train))
    X_test = scaler.transform(imputer.transform(X_test))

    model = LGBMClassifier().fit(X_train, y)

    # Probability of the positive class for each test row.
    predictions = model.predict_proba(X_test)[:, 1]

    # Take the ids from the `test` argument itself rather than from a
    # notebook-level variable, so the ids always match the predicted rows.
    submit = pd.DataFrame({'SK_ID_CURR': test['SK_ID_CURR'].values,
                           'TARGET': predictions})
    submit.to_csv("LGBM.csv", index=False)

In [None]:
def missing_values_table(df):
    """Summarise the missing values of `df`, one row per column.

    Returns a DataFrame indexed by column name with 'Missing Values' (null
    count) and '% of Total Values' (null share in percent). Only columns
    whose percentage is non-zero are kept; rows are sorted by percentage in
    descending order and values are rounded to one decimal. Also prints a
    two-line summary with the column counts.
    """
    null_counts = df.isnull().sum()
    null_percent = 100 * null_counts / len(df)

    # Side-by-side table, labelled directly with the final column names.
    table = pd.concat([null_counts, null_percent], axis=1,
                      keys=['Missing Values', '% of Total Values'])

    has_missing = table['% of Total Values'] != 0
    table = table[has_missing].sort_values('% of Total Values', ascending=False).round(1)

    # Print some summary information
    summary = ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
               "There are " + str(table.shape[0]) +
               " columns that have missing values.")
    print (summary)

    return table

In [None]:
def objective(hyperparameters, iteration):
    """Objective function for grid and random search.

    Runs `lgb.cv` on the notebook-level `train_set` with `N_FOLDS` folds and
    early stopping, and returns `[score, hyperparameters, iteration]`, where
    `score` is the last entry of the 'auc-mean' series. `hyperparameters` is
    modified in place: any incoming 'n_estimators' is discarded and replaced
    by the length of the 'auc-mean' series.
    """
    # Number of estimators is determined by early stopping, not by the caller.
    hyperparameters.pop('n_estimators', None)

    # Cross validation with early stopping on AUC.
    cv_results = lgb.cv(hyperparameters, train_set, num_boost_round = 10000, nfold = N_FOLDS,
                        early_stopping_rounds = 100, metrics = 'auc', seed = 42)

    auc_by_round = cv_results['auc-mean']
    hyperparameters['n_estimators'] = len(auc_by_round)

    return [auc_by_round[-1], hyperparameters, iteration]

In [None]:
import itertools

def grid_search(param_grid, max_evals = MAX_EVALS):
    """Grid search over `param_grid`, limited to `max_evals` evaluations.

    Parameters
    ----------
    param_grid : dict
        Maps each hyperparameter name to the sequence of values to try.
    max_evals : int
        Maximum number of combinations to evaluate. Exactly this many are
        evaluated, or fewer if the grid is smaller.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated combination with columns 'index' (evaluation
        order), 'score', 'params' and 'iteration', best score first.
    """
    keys = list(param_grid)
    values = [param_grid[key] for key in keys]

    records = []

    # islice caps the number of combinations at exactly `max_evals`.
    combinations = itertools.islice(itertools.product(*values), max_evals)
    for i, combo in enumerate(combinations):
        hyperparameters = dict(zip(keys, combo))

        # Set the subsample ratio accounting for boosting type
        if hyperparameters.get('boosting_type') == 'goss':
            hyperparameters['subsample'] = 1.0

        # Evaluate the hyperparameters; each result is [score, params, iteration].
        records.append(objective(hyperparameters, i))

    # Build the frame once, so it holds only rows that were actually evaluated.
    results = pd.DataFrame(records, columns = ['score', 'params', 'iteration'])

    # Sort with best score on top
    return results.sort_values('score', ascending = False).reset_index()

In [None]:
import random

# Seed the standard-library RNG so draws made through `random` are repeatable.
random.seed(50)

def random_search(param_grid, max_evals = MAX_EVALS):
    """Random search for hyperparameter optimization.

    Parameters
    ----------
    param_grid : dict
        Maps each hyperparameter name to the sequence of candidate values.
    max_evals : int
        Number of random configurations to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per evaluation with columns 'index' (evaluation order),
        'score', 'params' and 'iteration', best score first.
    """
    records = []

    # Honour the `max_evals` argument rather than the notebook-level constant.
    for i in range(max_evals):

        # Choose random hyperparameters (one value per key).
        hyperparameters = {k: random.sample(v, 1)[0] for k, v in param_grid.items()}

        # Set the subsample ratio accounting for boosting type
        if hyperparameters.get('boosting_type') == 'goss':
            hyperparameters['subsample'] = 1.0

        # Evaluate the selection; each result is [score, params, iteration].
        records.append(objective(hyperparameters, i))

    results = pd.DataFrame(records, columns = ['score', 'params', 'iteration'])

    # Sort with best score on top
    return results.sort_values('score', ascending = False).reset_index()

In [None]:
def poly_features(train, test, degree):
    """Add polynomial features of the EXT_SOURCE_* and DAYS_BIRTH columns.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain 'SK_ID_CURR', 'TARGET' and the four base columns.
    test : pandas.DataFrame
        Must contain 'SK_ID_CURR' and the four base columns.
    degree : int
        Passed to `PolynomialFeatures(degree=...)`.

    Returns
    -------
    (train_poly, test_poly)
        `train` and `test` left-merged with their polynomial features on
        'SK_ID_CURR', then aligned on the columns they have in common.

    Notes
    -----
    The polynomial frame also contains the four base columns, and 'TARGET' on
    the train side, so the merge yields `_x`/`_y` suffixed copies of those
    columns. The final inner alignment keeps only columns present in both
    frames, so the `TARGET_x`/`TARGET_y` columns are not in the returned
    train frame.

    Progress, shapes and the absolute correlation of each polynomial feature
    with TARGET are printed along the way.
    """
    base_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']

    print('Training shape :', train.shape)
    print('Testing shape :', test.shape)

    poly_target = train['TARGET']

    # Impute with medians learned on train only.
    imputer = SimpleImputer(strategy = 'median')
    poly_feat = imputer.fit_transform(train[base_cols])
    poly_feat_test = imputer.transform(test[base_cols])

    # Polynomial transformations
    poly_transformer = PolynomialFeatures(degree = degree)
    poly_feat = poly_transformer.fit_transform(poly_feat)
    poly_feat_test = poly_transformer.transform(poly_feat_test)

    print('\nPolynomial features shapes :', poly_feat.shape)
    print('Polynomial features shapes:' , poly_feat_test.shape)

    # `get_feature_names` no longer exists in recent scikit-learn releases;
    # prefer `get_feature_names_out` and fall back on older installs.
    name_getter = getattr(poly_transformer, 'get_feature_names_out', None)
    if name_getter is None:
        name_getter = poly_transformer.get_feature_names
    poly_names = list(name_getter(base_cols))

    poly_feat = pd.DataFrame(poly_feat, columns = poly_names)

    # Assign positionally: `poly_feat` has a fresh 0..n-1 index, so aligning
    # on index would misplace values whenever `train` has a non-default index.
    poly_feat['TARGET'] = poly_target.values

    poly_corr = poly_feat.corr()['TARGET'].abs().sort_values(ascending = False)

    print('\nCorrelation between TARGET and polynomial features')
    print('')
    print(poly_corr)

    # Put test features into dataframe
    poly_feat_test = pd.DataFrame(poly_feat_test, columns = poly_names)

    # Merge polynomial features into training dataframe
    poly_feat['SK_ID_CURR'] = train['SK_ID_CURR'].values
    train_poly = train.merge(poly_feat, on = 'SK_ID_CURR', how = 'left')

    # Merge polynomial features into testing dataframe
    poly_feat_test['SK_ID_CURR'] = test['SK_ID_CURR'].values
    test_poly = test.merge(poly_feat_test, on = 'SK_ID_CURR', how = 'left')

    train_poly, test_poly = train_poly.align(test_poly, join = 'inner', axis = 1)

    print('\nTraining polynomial data shapes :', train_poly.shape)
    print('Testing polynomial data shapes :', test_poly.shape)

    return train_poly, test_poly

In [None]:
def feat_sel(train, test):
    """Run the feature-selection pipeline and return the reduced train/test matrices.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain 'SK_ID_CURR' and 'TARGET'; other columns are features.
    test : pandas.DataFrame
        Must contain 'SK_ID_CURR'; other columns are features.

    Steps (each one prints its progress):
      1. Label-encode object columns with at most two distinct values,
         one-hot encode the rest, align train/test on common columns.
      2. Drop every column whose absolute correlation with an earlier column
         exceeds 0.9 (computed on train).
      3. Drop columns with more than 75% missing values (checked separately
         on train and on test), then re-align.
      4. Median-impute and min-max scale (fit on train, applied to test).
      5. Repeatedly fit an LGBMClassifier on two different train/validation
         splits, average the feature importances and remove the features
         whose average importance is 0, until none is left to remove.

    Returns
    -------
    (train, test) : tuple of numpy.ndarray
        The imputed, scaled matrices restricted to the selected features.
        Column names are not returned.
    """
    tr_labels = train['TARGET']

    train = train.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test = test.drop(columns = ['SK_ID_CURR'])

    print('Phase Label encoder')
    le = LabelEncoder()

    for col in train:
        # Only object columns with 2 or fewer unique categories.
        if train[col].dtype == 'object' and len(train[col].unique()) <= 2:
            # Fit on the training data, transform both sets.
            le.fit(train[col])
            train[col] = le.transform(train[col])
            test[col] = le.transform(test[col])

    print('\nDummyfication')

    train = pd.get_dummies(train)
    test = pd.get_dummies(test)

    # Keep only the columns present in both frames.
    train, test = train.align(test, join = 'inner', axis = 1)

    print('\nNumber of feature in the training data after label encoder and get dummy: ', train.shape[1])
    print('Number of feature in the testing data after label encoder and get dummy: ', test.shape[1])

    print('\nPhase correlation')

    # Remove collinear variables
    tr = 0.9

    corr = train.corr().abs()

    # Upper triangle of correlations. The builtin `bool` is used because the
    # `np.bool` alias no longer exists in current NumPy releases.
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

    tr_drop = [column for column in upper.columns if any(upper[column] > tr)]

    print('\nNumber of variable dropped because they were too correlated :',len(tr_drop))

    train1 = train.drop(columns = tr_drop)
    test1 = test.drop(columns = tr_drop)

    print('\nNumber of feature in the training data after the drop of the variable too much correlated',train1.shape[1])
    print('Number of feature in the testing data after the drop of the variable too much correlated',test1.shape[1])

    print('\nPhase Nan')

    # Share of missing values per column, on each set.
    train_missing = (train1.isnull().sum() / len(train1)).sort_values(ascending = False)
    test_missing = (test1.isnull().sum() / len(test1)).sort_values(ascending = False)

    train_missing1 = train_missing.index[train_missing > 0.75]
    test_missing1 = test_missing.index[test_missing > 0.75]

    print('\nNumber of columns with more than 75% of missing values in train :', len(train_missing1))
    print('Number of columns with more than 75% of missing values in test :', len(test_missing1))

    train1 = train1.drop(columns = train_missing1 )
    test1 = test1.drop(columns = test_missing1 )

    # A column dropped on one side only is removed from the other one here.
    train, test = train1.align(test1, join = 'inner', axis = 1)

    print('\nNumber of feature in the training data after removing missing values:', train.shape[1])
    print('Number of feature in the testing data after removing missing values :', test.shape[1])

    # Modelling: impute and scale once; both return plain arrays.
    imputer = SimpleImputer(strategy = 'median')
    scaler = MinMaxScaler(feature_range = (0,1))

    train = imputer.fit_transform(train)
    test = imputer.transform(test)

    train = scaler.fit_transform(train)
    test = scaler.transform(test)

    print('\nStart of the feature selection with LGBM attributes : feature_importances_')
    print('')

    # Start as if every feature were a removal candidate so the loop runs at
    # least once whenever there is at least one feature.
    n_zero_imp = train.shape[1]

    while n_zero_imp > 0:

        model = LGBMClassifier()

        # Average the importances over two different splits.
        feat_imp = np.zeros(train.shape[1])

        for i in range(2):
            X_train, X_valid, y_train, y_valid = train_test_split(train, tr_labels, test_size = 0.20, random_state = i)

            # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords
            # appear to be rejected by LightGBM >= 4.0 -- confirm the installed version.
            model.fit(X_train, y_train, early_stopping_rounds=100,
                      eval_set = [(X_valid, y_valid)], eval_metric = 'auc', verbose = 200)

            feat_imp += model.feature_importances_

        feat_imp = feat_imp / 2

        # The data is already imputed, so dropping the zero-importance columns
        # with a mask is all that is needed between rounds.
        keep = feat_imp != 0.0
        n_zero_imp = int((~keep).sum())

        print('\nThe number of features with 0.0 importance is:', n_zero_imp)

        train = train[:, keep]
        test = test[:, keep]

        print(train.shape)
        print(test.shape)

    print('\n End of the features selection, we now have {} variables'.format(train.shape[1]))
    return train, test