# Imports

In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# model explanation
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import auc
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, precision_recall_curve

from sklearn.model_selection import StratifiedKFold

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
import lightgbm as lgb

# catboost model
from catboost import CatBoostClassifier

# Logistic Regression
from sklearn.linear_model import LogisticRegression

# import function
from sklearn.dummy import DummyClassifier

import os

# for warning ignore
import warnings
warnings.filterwarnings("ignore")

# Import Processed Data

In [2]:
# Locate the processed train/test CSV files relative to the parent directory
processed_data_path = os.path.join(os.path.pardir, 'data', 'processed')
train_file_path, test_file_path = (
    os.path.join(processed_data_path, csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Load the processed training and test sets into DataFrames
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)

In [4]:
# Split the training frame into independent features (X) and the dependent target (y)
# Name of the variable we are predicting
target = "Label"
# Every column except the target is kept as a feature
columns = [name for name in train_df.columns.tolist() if name != target]
# Define a random state
state = np.random.RandomState(42)
X, y = train_df[columns], train_df[target]

# Show the shapes of X and y, one per line
print(X.shape, y.shape, sep='\n')

(611772, 35)
(611772,)


In [5]:
# Test features: everything in test_df except the row identifier column
preprocessed_test_data = test_df.drop(columns=["ID X PCODE"])

In [7]:
# Models
# Logistic regression with a fixed random_state; all other settings are library defaults
model_lr_1 = LogisticRegression(random_state=0)
# XGBoost classifier constructed with no explicit hyperparameters
xgb = XGBClassifier()

In [12]:
def get_submission_file(model, filename):
    """Predict on the test set and write the result as a submission CSV.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict(X)``.
    filename : str
        Name of the CSV file to create inside ``../data/external``.

    The written file has two columns: ``ID X PCODE`` (copied from ``test_df``)
    and ``Label`` (the values returned by ``model.predict``). The index is not
    written.
    """
    # `DataFrame.as_matrix()` was removed in pandas 1.0. Cast to float but stay
    # a DataFrame, so the feature names still match the frames used for fitting.
    test_X = preprocessed_test_data.astype('float')
    # make predictions
    predictions = model.predict(test_X)
    # submission dataframe
    df_submission = pd.DataFrame({'ID X PCODE': test_df["ID X PCODE"], 'Label' : predictions})
    # submission file
    submission_data_path = os.path.join(os.path.pardir, 'data', 'external')
    submission_file_path = os.path.join(submission_data_path, filename)
    # write to the file
    df_submission.to_csv(submission_file_path, index=False)

## Logistic Regression

In [9]:
# 10-fold stratified cross-validation of the baseline logistic regression.
# The fit/score steps must run inside the loop; otherwise only the last fold
# is evaluated and the "mean" accuracy is a single value.
skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 4)
scores = []
for train_index, test_index in skf.split(X, y):
    # split() yields positional indices, so select rows with .iloc
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train the model on this fold
    trained_model_lr_1 = model_lr_1.fit(X_train, y_train)
    y_pred = trained_model_lr_1.predict(X_test)

    # Score the model on this fold's validation data
    score = accuracy_score(y_test, y_pred)
    scores.append(score)

# Detailed diagnostics are kept for the last fold only
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Accuracy averaged over all folds
mean_score = np.array(scores).mean()

In [10]:
# Summarise the logistic regression evaluation: accuracy, then report and confusion matrix
divider = '--------------------------------------'
print('Accuracy scores of the model: {:.2f}'.format(mean_score))
for heading, payload in (('\n Classification report of the model', report),
                         ('\n Confusion Matrix of the model', conf_matrix)):
    print(heading)
    print(divider)
    print(payload)

Accuracy scores of the model: 0.89

 Classification report of the model
--------------------------------------
              precision    recall  f1-score   support

           0       0.89      0.99      0.94     54541
           1       0.23      0.02      0.03      6636

    accuracy                           0.89     61177
   macro avg       0.56      0.50      0.49     61177
weighted avg       0.82      0.89      0.84     61177


 Confusion Matrix of the model
--------------------------------------
[[54170   371]
 [ 6527   109]]


In [14]:
# Write the logistic regression predictions for the test set to a submission CSV
get_submission_file(trained_model_lr_1, "2.0-stratified logistic regression.csv")

## Hyperparameter Tuning : Logistic Regression

In [22]:
from sklearn.model_selection import GridSearchCV

In [24]:
# Regularisation grid for the logistic regression.
# The 'l1' penalty is not supported by every solver (the current scikit-learn
# default, 'lbfgs', rejects it), so it gets its own sub-grid with an explicit
# compatible solver. The 'l2' candidates keep the estimator's own solver.
C_values = [1.0,10.0,50.0,100.0,1000.0]
parameters = [
    {'C': C_values, 'penalty': ['l2']},
    {'C': C_values, 'penalty': ['l1'], 'solver': ['liblinear']},
]
# 3-fold grid search over the candidates above
clf = GridSearchCV(model_lr_1, param_grid=parameters, cv=3)

In [25]:
# 10-fold stratified cross-validation of the grid-searched logistic regression.
# The fit/score steps must run inside the loop; otherwise only the last fold
# is evaluated and the "mean" accuracy is a single value.
skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 4)
scores = []
for train_index, test_index in skf.split(X, y):
    # split() yields positional indices, so select rows with .iloc
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Run the grid search on this fold's training data
    trained_model_lr_1 = clf.fit(X_train, y_train)
    y_pred = trained_model_lr_1.predict(X_test)

    # Score the model on this fold's validation data
    score = accuracy_score(y_test, y_pred)
    scores.append(score)

# Detailed diagnostics are kept for the last fold only
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Accuracy averaged over all folds
mean_score = np.array(scores).mean()

In [26]:
# Summarise the tuned logistic regression evaluation
section_rule = '--------------------------------------'
print('Accuracy scores of the model: {:.2f}'.format(mean_score))
print('\n Classification report of the model', section_rule, report, sep='\n')
print('\n Confusion Matrix of the model', section_rule, conf_matrix, sep='\n')

Accuracy scores of the model: 0.89

 Classification report of the model
--------------------------------------
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     54541
           1       0.23      0.01      0.01      6636

    accuracy                           0.89     61177
   macro avg       0.56      0.50      0.48     61177
weighted avg       0.82      0.89      0.84     61177


 Confusion Matrix of the model
--------------------------------------
[[54414   127]
 [ 6598    38]]


## XGBoost

In [15]:
# 10-fold stratified cross-validation of the default XGBoost classifier.
# The fit/score steps must run inside the loop; otherwise only the last fold
# is evaluated and the "mean" accuracy is a single value.
skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 4)
scores = []
for train_index, test_index in skf.split(X, y):
    # split() yields positional indices, so select rows with .iloc
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train the model on this fold
    trained_xgb_1 = xgb.fit(X_train, y_train)
    y_pred = trained_xgb_1.predict(X_test)

    # Score the model on this fold's validation data
    score = accuracy_score(y_test, y_pred)
    scores.append(score)

# Detailed diagnostics are kept for the last fold only
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Accuracy averaged over all folds
mean_score = np.array(scores).mean()

In [16]:
# Summarise the XGBoost evaluation: accuracy, then report and confusion matrix
divider = '--------------------------------------'
print('Accuracy scores of the model: {:.2f}'.format(mean_score))
for heading, payload in (('\n Classification report of the model', report),
                         ('\n Confusion Matrix of the model', conf_matrix)):
    print(heading)
    print(divider)
    print(payload)

Accuracy scores of the model: 0.96

 Classification report of the model
--------------------------------------
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     54541
           1       0.86      0.77      0.81      6636

    accuracy                           0.96     61177
   macro avg       0.92      0.88      0.90     61177
weighted avg       0.96      0.96      0.96     61177


 Confusion Matrix of the model
--------------------------------------
[[53730   811]
 [ 1535  5101]]


In [18]:
# Test features as a float ndarray. `DataFrame.as_matrix()` was removed in
# pandas 1.0; `.values` is the documented replacement.
test_X = preprocessed_test_data.values.astype('float')

In [6]:
def get_submission_file(model, filename):
    """Write the model's test-set predictions to ``../data/external/<filename>``.

    ``model`` must expose ``predict``; it is called on ``preprocessed_test_data``
    as-is. The CSV holds the ``ID X PCODE`` column from ``test_df`` and the
    predictions under ``Label``; the index is not written.
    """
    predicted = model.predict(preprocessed_test_data)
    # pair every test identifier with its prediction
    submission = pd.DataFrame({'ID X PCODE': test_df["ID X PCODE"], 'Label' : predicted})
    # destination inside the external data folder
    out_path = os.path.join(os.path.pardir, 'data', 'external', filename)
    submission.to_csv(out_path, index=False)

In [30]:
# Write the XGBoost (trained_xgb_1) predictions for the test set to a submission CSV
get_submission_file(trained_xgb_1, "stratified_xgboost1.csv")

## Hyperparameter Tuning : XGBoost

In [31]:
# Hyperparameters of the tuned XGBoost classifier, gathered in one mapping
tuned_xgb_params = dict(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1.0,
    gamma=1,
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=4,
    min_child_weight=5,
    missing=None,
    n_estimators=100,
    n_jobs=1,
    nthread=None,
    objective='binary:logistic',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=0.8,
    verbosity=1,
)
tuned_xgb = XGBClassifier(**tuned_xgb_params)

In [32]:
# 10-fold stratified cross-validation of the tuned XGBoost classifier.
# The fit/score steps must run inside the loop; otherwise only the last fold
# is evaluated and the "mean" accuracy is a single value.
skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 4)
scores = []
for train_index, test_index in skf.split(X, y):
    # split() yields positional indices, so select rows with .iloc
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train the model on this fold
    trained_xgb_2 = tuned_xgb.fit(X_train, y_train)
    y_pred = trained_xgb_2.predict(X_test)

    # Score the model on this fold's validation data
    score = accuracy_score(y_test, y_pred)
    scores.append(score)

# Detailed diagnostics are kept for the last fold only
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Accuracy averaged over all folds
mean_score = np.array(scores).mean()

In [33]:
# Summarise the tuned XGBoost evaluation
section_rule = '--------------------------------------'
print('Accuracy scores of the model: {:.2f}'.format(mean_score))
print('\n Classification report of the model', section_rule, report, sep='\n')
print('\n Confusion Matrix of the model', section_rule, conf_matrix, sep='\n')

Accuracy scores of the model: 0.96

 Classification report of the model
--------------------------------------
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     54541
           1       0.86      0.74      0.80      6636

    accuracy                           0.96     61177
   macro avg       0.92      0.86      0.89     61177
weighted avg       0.96      0.96      0.96     61177


 Confusion Matrix of the model
--------------------------------------
[[53762   779]
 [ 1740  4896]]


In [34]:
# Write the tuned XGBoost predictions to a submission CSV.
# This section fits `trained_xgb_2`; the call previously passed `trained_xgb_1`,
# which would re-submit the untuned model's predictions under a new file name.
get_submission_file(trained_xgb_2, "xgboost_submission4.csv")

## LightGBM

In [9]:
# Hold out 20% of the training data as a validation set, with a fixed seed for the split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=1234)

In [37]:
# Candidate values for the tunable LightGBM hyperparameters
SEARCH_PARAMS = dict(
    learning_rate=0.4,
    max_depth=15,
    num_leaves=32,
    feature_fraction=0.8,
    subsample=0.2,
)

# LightGBM settings that stay constant
FIXED_PARAMS = dict(
    objective='binary',
    metric='auc',
    is_unbalance=True,
    bagging_freq=5,
    boosting='dart',
    num_boost_round=300,
    early_stopping_rounds=30,
)

In [41]:
# Wrap the train/validation splits in LightGBM datasets
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid)
# Only the metric and the objective are taken from FIXED_PARAMS here
params = {'metric':FIXED_PARAMS['metric'],
             'objective':FIXED_PARAMS['objective']}
# The `early_stopping_rounds` keyword of lgb.train was removed in LightGBM 4.0;
# the early_stopping callback expresses the same stopping rule across versions.
model = lgb.train(params, train_data,
                     valid_sets=[valid_data],
                     valid_names=['valid'],
                     num_boost_round=FIXED_PARAMS['num_boost_round'],
                     callbacks=[lgb.early_stopping(FIXED_PARAMS['early_stopping_rounds'])])

[LightGBM] [Info] Number of positive: 53176, number of negative: 436241
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 640
[LightGBM] [Info] Number of data points in the train set: 489417, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.108652 -> initscore=-2.104588
[LightGBM] [Info] Start training from score -2.104588
[1]	valid's auc: 0.932586
Training until validation scores don't improve for 30 rounds
[2]	valid's auc: 0.949224
[3]	valid's auc: 0.948716
[4]	valid's auc: 0.950446
[5]	valid's auc: 0.950827
[6]	valid's auc: 0.957779
[7]	valid's auc: 0.958199
[8]	valid's auc: 0.958221
[9]	valid's auc: 0.959992
[10]	valid's auc: 0.96037
[11]	valid's auc: 0.961388
[12]	valid's auc: 0.961773
[13]	valid's auc: 0.961662
[14]	valid's auc: 0.96178
[15]	valid's auc: 0.9623
[16]	valid's auc: 0.963788
[17]	valid's auc: 0.964349
[18]	valid's auc: 0.964577
[19]	valid'

In [42]:
# AUC on the 'valid' set as recorded in the trained model's best_score
score = model.best_score['valid']['auc']
score

0.9763867119096838

In [43]:
# Write the LightGBM predictions for the test set to a submission CSV.
# NOTE(review): `model` is the object returned by lgb.train, so predict() presumably yields
# scores rather than hard class labels — confirm this matches the expected 'Label' format.
get_submission_file(model, "lightgbm_submission5.csv")

## CatBoost model

In [7]:
# CatBoost classifier constructed with no explicit hyperparameters
catboost_model = CatBoostClassifier()

In [None]:
# Columns handed to CatBoost as categorical features
catboost_cat_features = [
    # branch codes
    'branch_code_1X1H', 'branch_code_30H5', 'branch_code_49BM',
    'branch_code_748L', 'branch_code_94KC', 'branch_code_9F9T',
    'branch_code_BOAS', 'branch_code_E5SW', 'branch_code_EU3L',
    'branch_code_O4JC', 'branch_code_O67J', 'branch_code_UAOD',
    'branch_code_X23B', 'branch_code_XX25', 'branch_code_ZFER',
    # occupation category codes
    'occupation_category_code_56SI', 'occupation_category_code_90QI',
    'occupation_category_code_AHH5', 'occupation_category_code_JD7X',
    'occupation_category_code_L44T', 'occupation_category_code_T4MS',
    # marital status
    'marital_status_D', 'marital_status_F', 'marital_status_M',
    'marital_status_P', 'marital_status_R', 'marital_status_S',
    'marital_status_U', 'marital_status_W',
    # gender flag
    'IsMale',
]

# Fit on the training split, evaluating against the held-out validation split
trained_catboost_model = catboost_model.fit(
    X_train,
    y_train,
    cat_features=catboost_cat_features,
    eval_set=(X_valid, y_valid),
    verbose=False,
)