In [None]:
# Regular EDA (exploratory data analysis) and plotting libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# we want our plots to appear inside the notebook
%matplotlib inline
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

Load data

In [None]:
# Load the credit-card fraud CSV into `df`, report its size and preview the first rows
df = pd.read_csv(
    filepath_or_buffer='../input/creditcardfraud/creditcard.csv',
)
print(df.shape)
df.head()

In [None]:
# Names of the 27 features kept after our EDA: V1..V28 with V25 left out
cols = [f'V{n}' for n in [*range(1, 25), *range(26, 29)]]
print(cols)


In [None]:
# Feature matrix: the 27 columns chosen during EDA
X = df.loc[:, cols]

# Target variable
y = df.loc[:, 'Class']

X.shape

In [None]:
# Rows per class, then the reciprocal of each count keyed by class label
# (to be used as class weights)
val_count = df['Class'].value_counts()
weights = dict(val_count.rdiv(1))
weights


In [None]:
# Cost charged for every false positive: `cost_saving` multiplies it by the false-positive count
admin_cost = 2.5

In [None]:

# defining a function to calculate cost savings
def cost_saving(ytrue, ypred, amount, fp_cost=None):
    """Return the share of the potential fraud loss avoided by the predictions.

    The cost of a set of predictions is ``fp_cost`` for every false positive
    plus the full ``amount`` of every fraud that was predicted as non-fraud.
    It is compared with the cost of catching nothing, i.e. the summed
    ``amount`` of all true frauds::

        savings = 1 - cost / max_cost

    Parameters
    ----------
    ytrue : array-like
        True labels (1 = fraud, 0 = non-fraud).
    ypred : array-like
        Predicted labels, same length and order as ``ytrue``.
    amount : array-like
        Value attached to each row, same length and order as ``ytrue``.
    fp_cost : float, optional
        Cost of one false positive. Defaults to the module-level ``admin_cost``.

    Returns
    -------
    float
        1.0 when every fraud is caught without false positives; negative when
        the predictions cost more than catching nothing; ``nan`` when the
        frauds in ``ytrue`` carry no amount at all (the ratio is undefined).
    """
    if fp_cost is None:
        fp_cost = admin_cost

    # Work on plain arrays so the comparisons are positional; pandas objects
    # with different indexes would otherwise be aligned by label.
    ytrue = np.asarray(ytrue)
    ypred = np.asarray(ypred)
    amount = np.asarray(amount, dtype=float)

    is_fraud = ytrue == 1
    false_positives = np.sum((ytrue == 0) & (ypred == 1))
    missed_amount = np.sum(amount[is_fraud & (ypred == 0)])

    cost = false_positives * fp_cost + missed_amount
    max_cost = np.sum(amount[is_fraud])
    if max_cost == 0:
        # No fraud value to save: avoid a division by zero
        return np.nan

    return 1 - cost / max_cost

In [None]:
# 4-fold stratified, shuffled CV. The fixed random_state is meant to make the folds
# repeatable: cost_saving_per_split calls cv.split again and pairs the folds with the
# fitted estimators by position.
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)

In [None]:

# defining a function to calculate cost saving per fold (splits) of our cv
def cost_saving_per_split(scores, x, y, cv_object, amount=None):
    """Compute `cost_saving` on the test part of every CV fold.

    Parameters
    ----------
    scores : dict
        Must hold the fitted per-fold estimators under ``scores['estimator']``.
    x : array-like
        Feature matrix; rows are selected by position.
    y : array-like
        True labels, same row order as ``x``.
    cv_object : splitter
        Object exposing ``split(x, y)``. This assumes it yields the same folds,
        in the same order, as when the estimators were fitted (e.g. a splitter
        with a fixed ``random_state``).
    amount : array-like, optional
        Per-row amounts, same row order as ``x``. Defaults to
        ``df['Amount'].values`` from the notebook-level ``df``.

    Returns
    -------
    list
        One cost-saving value per fold, in fold order.
    """
    if amount is None:
        amount = df['Amount'].values

    # Plain arrays: the fold indices are positions, not index labels
    x_values = np.asarray(x)
    y_values = np.asarray(y)
    amount_values = np.asarray(amount)

    results = []
    folds = cv_object.split(x_values, y_values)
    for estimator, (_, test_ind) in zip(scores['estimator'], folds):
        ypred = estimator.predict(x_values[test_ind])
        results.append(cost_saving(y_values[test_ind], ypred, amount_values[test_ind]))

    return results

In [None]:

# defining a function to return a dataframe of metrics results for each fold in our cv
def get_metric_scores(scores, x, y=y, cv_object=cv):
    """Collect the per-fold metrics into one DataFrame.

    Rows are labelled ``split_1`` .. ``split_<n_splits>``. The columns are
    ``f1_score`` (from ``scores['test_f1']``), ``auc_pr`` (from
    ``scores['test_average_precision']``) and ``cost_savings`` (from
    `cost_saving_per_split`). ``y`` and ``cv_object`` default to the
    notebook-level ``y`` and ``cv`` as they were when this cell was run.
    """
    split_labels = [f'split_{k}' for k in range(1, cv_object.n_splits + 1)]

    return pd.DataFrame(
        {
            'f1_score': scores['test_f1'],
            'auc_pr': scores['test_average_precision'],
            'cost_savings': cost_saving_per_split(scores, x, y, cv_object),
        },
        index=split_labels,
    )

In [None]:
from sklearn.pipeline import Pipeline

Fraud-sensitive model (not cost-sensitive)

In [None]:
# combined SMOTE and Edited Nearest Neighbors resampling for imbalanced classification
from numpy import mean
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
# imblearn's Pipeline replaces the sklearn Pipeline imported earlier; it is the one
# that accepts a resampler as a step
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN
# build the lightgbm model
import lightgbm as lgb

# Both the resampler and the model are seeded (same seed as `cv`) so that
# re-running the notebook gives the same scores.
fraud_sensitive_model = lgb.LGBMClassifier(random_state=1)
# define resampling
# NOTE: despite the 'scaler' names this step resamples the data, it does not scale it;
# the names are kept because later cells reuse them.
fraud_sensitive_scaler = SMOTEENN(random_state=1)
fraud_sensitive_pipe = Pipeline([('scaler', fraud_sensitive_scaler), ('model', fraud_sensitive_model)])



In [None]:
# Cross-validate the fraud-sensitive pipeline on the raw feature array, keeping the
# fitted estimator of every fold so cost savings can be computed afterwards
fraud_sensitive_scores = cross_validate(
    fraud_sensitive_pipe,
    np.array(X),
    y,
    scoring=['f1', 'average_precision'],
    cv=cv,
    n_jobs=4,
    return_estimator=True,
    error_score='raise',
)


In [None]:
# Per-fold F1, average precision and cost savings of the fraud-sensitive model
fraud_sensitive_results = get_metric_scores(
    scores=fraud_sensitive_scores,
    x=np.array(X),
)
fraud_sensitive_results

In [None]:
# Average of each metric column over the CV folds
fraud_sensitive_results.mean()

Cost-sensitive model

In [None]:

# Per-row weight: the transaction amount for fraud rows, `admin_cost` for all others.
# Vectorised with np.where: the selection is positional and avoids one Python-level
# Series lookup per row.
sample_weights = np.where(
    np.asarray(y).astype(bool),
    np.asarray(df['Amount'], dtype=float),
    admin_cost,
)

In [None]:

# Cost-sensitive variant: same resampling step as the fraud-sensitive pipeline, but
# LightGBM additionally receives the inverse-frequency class weights.
# Everything is stored under cost_sensitive_* names so the fraud-sensitive results
# are not overwritten and the results cell has the scores it expects.
# NOTE(review): `sample_weights` is computed earlier but never passed to the model;
# only `weights` (class weights) is applied - confirm that is intended.
cost_sensitive_model = lgb.LGBMClassifier(class_weight=weights, random_state=1)
cost_sensitive_pipe = Pipeline([('scaler', fraud_sensitive_scaler), ('model', cost_sensitive_model)])

cost_sensitive_scores = cross_validate(
    cost_sensitive_pipe,
    np.array(X),
    y,
    scoring=['f1', 'average_precision'],
    cv=cv,
    n_jobs=4,
    return_estimator=True,
    error_score='raise',
)

# Score on the same array the estimators were fitted on
# (`scaled_X` is not defined anywhere in this notebook).
cost_sensitive_results = get_metric_scores(cost_sensitive_scores, np.array(X))
cost_sensitive_results

In [None]:
# Average of each metric column over the CV folds
cost_sensitive_results.mean()

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score

# Seeded so the synthetic minority samples are the same on every run
smote = SMOTE(random_state=1)

# `scaled_X` is not defined anywhere in this notebook; use the same raw feature
# array as the cross_validate cells. Plain arrays keep the fold indexing positional.
X_values = np.asarray(X)
y_values = np.asarray(y)

for train_idx, test_idx in cv.split(X_values, y_values):
    X_train, y_train = X_values[train_idx], y_values[train_idx]
    X_test, y_test = X_values[test_idx], y_values[test_idx]

    # Oversample the training fold only; the test fold keeps its original class balance.
    # `fit_resample` is the current imbalanced-learn name for the removed `fit_sample`.
    X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)

    smote_lightGBM_model = lgb.LGBMClassifier(random_state=1)
    smote_lightGBM_model.fit(X_train_oversampled, y_train_oversampled)

    pred = smote_lightGBM_model.predict(X_test)
    # ROC AUC ranks samples by score, so feed it the positive-class probability
    # instead of the hard 0/1 predictions.
    pred_proba = smote_lightGBM_model.predict_proba(X_test)[:, 1]

    print(f'roc_auc_score: {roc_auc_score(y_test, pred_proba)}')
    print(f'f-score: {f1_score(y_test, pred)}')

In [None]:
# Prints the metrics for `y_test` / `pred`, i.e. whatever the last iteration of the
# SMOTE loop left behind.
# NOTE(review): roc_auc_score is given hard class predictions here, not scores - confirm intended.
print(f'roc_auc_score: {roc_auc_score(y_test, pred)}')
print(f'f-score: {f1_score(y_test, pred)}')