## Algorithm level approaches

In [3]:
import pandas as pd

from imblearn.combine import SMOTETomek

from sklearn.metrics import roc_auc_score as roc_auc, accuracy_score as acc
from sklearn.metrics import recall_score as recall, precision_score as precision

from setup import rfc, wrangle_data, split_data
from xgboost import XGBClassifier

# GDClassifier is a linear classifier (by default in sklearn it is a linear SVM) that uses SGD for training 
from sklearn.linear_model import SGDClassifier as SVM

seed = 42

**Data wrangling**

In [6]:
train = pd.read_csv('./train_ZoGVYWq.csv')
train = wrangle_data(train)

**Train and validation split and sample synthetic generation**

In [7]:
train_X, validation_X, train_Y, validation_Y = split_data(train, segmented=True)

sm = SMOTETomek(random_state=seed)
train_X_res, train_Y_res = sm.fit_sample(train_X, train_Y)

**Baseline**

In [8]:
rfc(train_X_res, train_Y_res, validation_X, validation_Y)

ROC-AUC: 0.61


**Weighted Random Forest Classifier**

In [44]:
print("With sampling technique")
rfc(train_X_res, train_Y_res, validation_X, validation_Y, class_weight={0:2, 1:1})

print("\nWithout sampling technique")
rfc(train_X, train_Y, validation_X, validation_Y, class_weight={0:2, 1:1})

With sampling technique
ROC-AUC: 0.60

Without sampling technique
ROC-AUC: 0.55


<br>**Support Vector Machine (SVM)**<br>
Example: [[link1]](http://scikit-learn.org/stable/auto_examples/svm/plot_separating_hyperplane_unbalanced.html)

In [36]:
svm = SVM(random_state=seed)
svm.fit(train_X_res, train_Y_res)
print("ROC-AUC: {0:.2f}".format(roc_auc(validation_Y, svm.predict(validation_X))))

ROC-AUC: 0.75


<br>**Ensemble methods**<br>
Boosting algorithm - XGBoost

Tunning:
[[link1]](https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/)

In [52]:
xgb = XGBClassifier(learning_rate=0.005,
                    n_estimators=200, 
                    max_depth=4, 
                    subsample=0.8, 
                    colsample_bytree=0.5, 
                    objective= 'binary:logistic', 
                    n_jobs=-1, 
                    scale_pos_weight=1, 
                    random_state=seed)

xgb.fit(train_X_res, train_Y_res, eval_metric=roc_auc)

print("ROC-AUC: {0:.2f}".format(roc_auc(validation_Y, xgb.predict(validation_X))))

ROC-AUC: 0.72


`subsample`: Subsample ratio of the training instance <br>
`colsample_bytree`: Subsample ratio of columns when constructing each tree <br>
`scale_pos_weight`: Balancing of positive and negative weights 

**XGBoost: Precision and Recall**

In [53]:
print("Precision: {0:.2f}".format(precision(validation_Y, xgb.predict(validation_X), pos_label=0)))
print("Recall: {0:.2f}".format(recall(validation_Y, xgb.predict(validation_X)), pos_label=0))

Precision: 0.32
Recall: 0.92


**Further examples** <br>
[Fraud detection with SMOTE and XGBoost](https://www.kaggle.com/bonovandoo/fraud-detection-with-smote-and-xgboost-in-r<br>)
<br>[Imbalanced data - XGBoost Tunning](https://www.kaggle.com/saxinou/imbalanced-data-xgboost-tunning)