In [None]:
# standard library
import warnings

# dataframe and plotting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# machine learning
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

: 

In [None]:
# Every CSV is read the same way: first column as the index, the literal
# string 'na' treated as a missing value.
_csv_options = dict(index_col=0, na_values='na')


def _announce_shape(frame):
    """Print the row and column count of a freshly loaded frame."""
    n_rows, n_cols = len(frame), frame.shape[1]
    print(f'The DataFrame has {n_rows} rows and {n_cols} columns.')


Train = pd.read_csv('Train.csv', **_csv_options)
_announce_shape(Train)

SampleSubmission = pd.read_csv('SampleSubmission.csv', **_csv_options)
_announce_shape(SampleSubmission)

Test = pd.read_csv('Test.csv', **_csv_options)
_announce_shape(Test)

# The variable dictionary is loaded for reference only; its shape is not reported.
Variables = pd.read_csv('VariableDefinition.csv', **_csv_options)

In [None]:
# Preview the first three rows of the training frame as a sanity check on the load.
Train.head(3)

In [None]:
# Group the training columns by dtype, then print each group under a heading.
Col_int = Train.select_dtypes(np.int64).columns
Col_float = Train.select_dtypes(np.float64).columns
Col_cat = Train.select_dtypes(object).columns

for heading, listing in (
    ('List of all Columns: ', Train.columns),
    ('Integer Type: ', Col_int),
    ('Float Type: ', Col_float),
    ('Object Type: ', Col_cat),
):
    print(heading)
    print(listing)
    print('\n')

# Finish with how many columns fall under each dtype.
print('Count:')
print(Train.dtypes.value_counts())

In [None]:
# Pearson correlation matrix of the numeric columns.
# DataFrame.corr() no longer drops non-numeric columns silently in
# pandas >= 2.0 (it raises on strings), so select the numeric dtypes explicitly.
corr_mat = Train.select_dtypes(include='number').corr(method='pearson')
plt.figure(figsize=(20, 10))
# Trailing semicolon suppresses the Axes repr in the cell output.
sns.heatmap(corr_mat, vmax=1, square=True, annot=True, cmap='cubehelix');

In [None]:
#import preprocessing module
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# The data was loaded as `Train`; work on a copy so the loaded frame keeps its
# original labels and this cell gives the same result when re-run.
train = Train.copy()

# Convert target label to numerical data
le = LabelEncoder()
train['bank_account'] = le.fit_transform(train['bank_account'])

#Separate training features from target
X_train = train.drop(['bank_account'], axis=1)
y_train = train['bank_account']

y_train.head(2)

In [None]:
# function to preprocess our data from train models
def preprocessing_data(data):
    """Encode and scale a raw feature frame into a numeric array.

    Parameters
    ----------
    data : pd.DataFrame
        Raw features. Must contain the one-hot columns listed in ``categ``,
        the three label-encoded columns (``location_type``,
        ``cellphone_access``, ``gender_of_respondent``) and ``uniqueid``.
        The caller's frame is not modified.

    Returns
    -------
    numpy.ndarray
        The encoded features rescaled column-wise to the range [0, 1].

    Notes
    -----
    NOTE(review): the dummy columns, label encodings and scaler are all fit on
    whatever frame is passed in, so two separate calls (e.g. train and test)
    are encoded and scaled independently of each other.
    """
    # categorical features to be converted to One Hot Encoding
    categ = ["relationship_with_head",
             "marital_status",
             "education_level",
             "job_type",
             "country"]

    # One Hot Encoding conversion (returns a new frame)
    data = pd.get_dummies(data, prefix_sep="_", columns=categ)

    # Label Encoder conversion. A fresh encoder is used per column so the
    # module-level `le` that was fit on the target is not refit here.
    for column in ("location_type", "cellphone_access", "gender_of_respondent"):
        data[column] = LabelEncoder().fit_transform(data[column])

    # drop uniqueid column
    data = data.drop(["uniqueid"], axis=1)

    # scale our data into range of 0 and 1; the scaler also converts every
    # column to float, so no separate integer-to-float cast is needed
    scaler = MinMaxScaler(feature_range=(0, 1))
    data = scaler.fit_transform(data)

    return data

In [None]:
# preprocess the train and test data
processed_train = preprocessing_data(X_train)
# The test set was loaded as `Test`; no lowercase `test` is defined in this notebook.
processed_test = preprocessing_data(Test)

In [None]:
# Show the first preprocessed training row; the [:1] slice keeps the
# outer dimension, so a one-row slice is printed rather than a bare row.
print(processed_train[:1])

In [None]:
# Hold out a validation set from the preprocessed training data
from sklearn.model_selection import train_test_split

# 10% hold-out, stratified on the target so both parts keep the same
# class balance; the seed is fixed for reproducibility.
split_options = dict(stratify=y_train, test_size=0.1, random_state=42)

X_Train, X_Val, y_Train, y_val = train_test_split(
    processed_train, y_train, **split_options)

# Best Model

In [None]:
# Import the helper module under an alias: the tuple assignment below binds
# the name `best_model` to the selected estimator, which would otherwise
# shadow the module and make this cell fail when re-run.
import best_model as best_model_module

# NOTE(review): the original call passed `X, y`, which are not defined in this
# notebook. The training split is assumed to be the intended input — confirm.
best_model, best_model_name, acc = best_model_module.bestClassificationModel(X_Train, y_Train)

print(best_model)
print(best_model_name, ":", acc)

In [None]:
from sklearn import metrics

def performance_evaluation_report(model, X_cv, y_cv, show_plot=False, labels=None, show_pr_curve=False):
    '''
    Score a fitted binary classifier on held-out data and optionally plot it.

    Parameters
    ----------
    model : estimator
        Fitted classifier; both ``predict`` and ``predict_proba`` are called
        on it.
    X_cv : array-like
        Features to evaluate on, passed unchanged to the model.
    y_cv : array-like
        True labels aligned with ``X_cv``.
    show_plot : bool
        When True, draw the confusion matrix and the ROC curve.
    labels : list
        Tick labels for the confusion matrix. ``['Negative', 'Positive']``
        is used when omitted.
    show_pr_curve : bool
        Adds a precision-recall panel. Only has an effect when ``show_plot``
        is True.

    Return
    ------
    stats : dict
        Keys: accuracy, precision, recall, specificity, f1_score,
        cohens_kappa, roc_auc, pr_auc.
    '''
    predicted = model.predict(X_cv)
    # Column 1 of predict_proba is used as the score of the positive class.
    positive_scores = model.predict_proba(X_cv)[:, 1]

    conf_matrix = metrics.confusion_matrix(y_cv, predicted)
    # Unpacking into four cells requires a 2x2 (two-class) confusion matrix.
    tn, fp, fn, tp = conf_matrix.ravel()

    fpr, tpr, _ = metrics.roc_curve(y_cv, positive_scores)
    roc_auc = metrics.auc(fpr, tpr)

    precision, recall, _ = metrics.precision_recall_curve(y_cv, positive_scores)
    pr_auc = metrics.auc(recall, precision)

    stats = {'accuracy': metrics.accuracy_score(y_cv, predicted),
             'precision': metrics.precision_score(y_cv, predicted),
             'recall': metrics.recall_score(y_cv, predicted),
             'specificity': (tn / (tn + fp)),
             'f1_score': metrics.f1_score(y_cv, predicted),
             'cohens_kappa': metrics.cohen_kappa_score(y_cv, predicted),
             'roc_auc': roc_auc,
             'pr_auc': pr_auc}

    if not show_plot:
        return stats

    tick_names = ['Negative', 'Positive'] if labels is None else labels

    # The layout is wider and shorter when the third (PR) panel is included.
    if show_pr_curve:
        n_panels, fig_size = 3, (15, 5)
    else:
        n_panels, fig_size = 2, (12, 6)

    fig, axes = plt.subplots(1, n_panels, figsize=fig_size)
    fig.suptitle('Performance Evaluation', fontsize=16)

    # Panel 1: confusion matrix
    matrix_ax = axes[0]
    sns.heatmap(conf_matrix, annot=True, fmt="d", linewidths=.5, cmap='BuGn_r',
                square=True, cbar=False, ax=matrix_ax,
                annot_kws={"ha": 'center', "va": 'center'})
    matrix_ax.set(xlabel='Predicted label',
                  ylabel='Actual label', title='Confusion Matrix')
    matrix_ax.xaxis.set_ticklabels(tick_names)
    matrix_ax.yaxis.set_ticklabels(tick_names)

    # Panel 2: ROC curve, the operating point of predict(), and the chance diagonal
    roc_ax = axes[1]
    roc_ax.plot(fpr, tpr, 'b-', label=f'ROC-AUC = {roc_auc:.2f}')
    roc_ax.set(xlabel='False Positive Rate',
               ylabel='True Positive Rate', title='ROC Curve')
    roc_ax.plot(fp/(fp+tn), tp/(tp+fn), 'ro',
                markersize=8, label='Decision Point')
    roc_ax.plot([0, 1], [0, 1], 'r--')
    roc_ax.legend(loc='lower right')

    # Panel 3 (optional): precision-recall curve
    if show_pr_curve:
        pr_ax = axes[2]
        pr_ax.plot(recall, precision, label=f'PR-AUC = {pr_auc:.2f}')
        pr_ax.set(xlabel='Recall', ylabel='Precision',
                  title='Precision-Recall Curve')
        pr_ax.legend()

    return stats

In [None]:
# Fit a logistic-regression baseline on the preprocessed training split.
# (`X_train` is the raw, un-encoded feature frame; the encoded and scaled
# split produced by train_test_split is X_Train / y_Train.)
classifier = LogisticRegression(random_state=42)
classifier.fit(X_Train, y_Train)

# Evaluate on the held-out validation split (X_Val / y_val).
y_pred = classifier.predict(X_Val)

# NOTE(review): these tick labels are misspelled ('Cliam') and may not match
# the target encoded above — confirm the intended class names.
LABELS = ['No Cliam', 'Cliam']
tree_perf = performance_evaluation_report(classifier,
                                         X_Val,
                                         y_val, labels=LABELS,
                                         show_plot=True)

plt.tight_layout()
plt.show()

# Building the Hyperparameter Grid

In [None]:


# Candidate values for each tree-ensemble hyperparameter.

# Number of estimators: ten evenly spaced values from 10 to 100
n_estimators = np.linspace(start=10, stop=100, num=10).astype(int).tolist()

# Number of features to consider at every split
# NOTE(review): 'auto' was removed as a max_features option for scikit-learn
# tree ensembles in version 1.3 — confirm against the estimator this is for.
max_features = ['auto', 'sqrt']

# Maximum number of levels in a tree
max_depth = [2, 4]

# Minimum number of samples required to split a node
min_sample_split = [2, 5]

# Minimum number of samples required at each leaf node
min_sample_leaf = [1, 2]

# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]

In [None]:
# Search space: maps each estimator keyword argument to its candidate values.
# The two `min_samples_*` keys use the estimator's spelling, while the lists
# they point to are named `min_sample_*`.
Param_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_sample_split,
    min_samples_leaf=min_sample_leaf,
    bootstrap=bootstrap,
)

In [None]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Optimize model parameters
# I run this code in google colab to make the execution much faster and use the best params in the next code
# The first key was misspelled 'min_child_weighth'; the booster parameter is
# `min_child_weight`, so the misspelled entry only multiplied the number of
# fits without tuning anything.
param_grid = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1],
    'subsample': [0.6, 0.8, 1.0],
    'max_depth': [3, 5],
}

# NOTE(review): `xg_model` is not defined in any cell of this notebook; it
# must be created (presumably an XGBoost classifier) before this cell runs.
my_xgb_model = GridSearchCV(xg_model, param_grid, n_jobs=-1, verbose=2, cv=5)
my_xgb_model.fit(X_Train, y_Train)
print(my_xgb_model.best_params_)