In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import gc
import matplotlib.pyplot as plt

In [2]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import timeit
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, RepeatedKFold

In [3]:
%%time

# Load the raw transaction log from the paysim1 input directory into a DataFrame.
df = pd.read_csv('../input/paysim1/PS_20174392719_1491204439457_log.csv')

In [4]:
df.shape

In [5]:
df.info(memory_usage="deep")

In [6]:
df.memory_usage(deep=True) * 1e-6

In [7]:
def convert_columns_to_catg(df, column_list):
    """Cast the listed columns of ``df`` to ``category`` dtype, in place.

    One status line is printed per column: its deep memory usage
    (bytes scaled by 1e-6, rounded to 2 decimals) before and after the cast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten with their categorical version.
    column_list : iterable of str
        Names of the columns to convert.

    Returns
    -------
    None
    """
    def _size(series):
        # deep=True also counts the Python string objects behind object columns
        return round(series.memory_usage(deep=True) * 1e-6, 2)

    for name in column_list:
        size_before = _size(df[name])
        print("converting", name.ljust(30), "size: ", size_before, end="\t")
        df[name] = df[name].astype("category")
        size_after = _size(df[name])
        print("->\t", size_after)

In [8]:
convert_columns_to_catg(df, column_list=["nameDest","type"])

In [9]:
def downcast_df_int_columns(df):
    """Shrink every int32/int64 column of ``df`` to the smallest integer dtype that fits.

    The frame is modified in place. For each converted column a status line
    with the deep memory usage (bytes scaled by 1e-6) before and after is
    printed. A garbage-collection pass is triggered at the end in every case.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose integer columns are downcast.

    Returns
    -------
    None
    """
    int_columns = df.select_dtypes(include=["int32", "int64"]).columns.tolist()

    if not int_columns:
        print("no columns to downcast")
    else:
        # width of the longest column name (+2), used to align the status lines
        name_width = max(map(len, int_columns)) + 2
        print("downcasting integers for:", int_columns, "\n")

        for name in int_columns:
            size_before = str(round(df[name].memory_usage(deep=True) * 1e-6, 2))
            print("reduced memory usage for:  ", name.ljust(name_width)[:name_width],
                  "from", size_before.rjust(8), "to", end=" ")
            df[name] = pd.to_numeric(df[name], downcast="integer")
            size_after = str(round(df[name].memory_usage(deep=True) * 1e-6, 2))
            print(size_after.rjust(8))

    gc.collect()

    print("done")

In [10]:
downcast_df_int_columns(df)

In [11]:
df.info(memory_usage="deep")

In [12]:
df.memory_usage(deep=True) * 1e-6

* **Feature engineering:**

In [13]:
%%time
def amount_oldbalanceOrg(row):
    """Label a row by whether ``oldbalanceOrg`` minus ``amount`` is exactly zero.

    Returns 'equal' when the difference is 0, otherwise 'not equal'.
    """
    remaining = row['oldbalanceOrg'] - row['amount']
    return 'equal' if remaining == 0 else 'not equal'
    
def dest_transaction_error(row):
    """Check the destination balances of a row against the transferred amount.

    Returns "no error" when ``newbalanceDest - oldbalanceDest - amount`` is
    exactly 0, otherwise "error".
    """
    discrepancy = row['newbalanceDest'] - row['oldbalanceDest'] - row['amount']
    if discrepancy == 0:
        return "no error"
    return "error"
    
def orig_transaction_error(row):
    """Check the origin balances of a row against the transferred amount.

    Returns "no error" when ``oldbalanceOrg - newbalanceOrig - amount`` is
    exactly 0, otherwise "error".
    """
    discrepancy = row['oldbalanceOrg'] - row['newbalanceOrig'] - row['amount']
    if discrepancy == 0:
        return "no error"
    return "error"
    
def get_name_prefix(row):
    """Join the first character of ``nameOrig`` and of ``nameDest`` with a dash."""
    orig_initial = row['nameOrig'][0]
    dest_initial = row['nameDest'][0]
    return '-'.join((orig_initial, dest_initial))

def transaction_duration(row):
    """Bucket ``row['step']`` into a coarse duration label.

    The step is compared against 24, 168 and 744 in that order; the first
    threshold it falls below decides the label, and anything at or above
    744 is labelled 'month'.
    """
    buckets = (
        (24, 'less than one day'),
        (168, 'less than a week'),
        (744, 'less than a month'),
    )
    for threshold, label in buckets:
        if row['step'] / threshold < 1:
            return label
    return 'month'
    
# Build the five engineered columns with whole-column (vectorised) expressions.
# The previous df.apply(..., axis=1) calls invoked a Python function once per row,
# five times over, which is very slow on a large frame. The conditions, labels and
# operand order below mirror the row-wise helper functions defined in this cell.
df['amount_oldbalanceOrg'] = np.where(
    df['oldbalanceOrg'] - df['amount'] == 0, 'equal', 'not equal')
df['orig_transaction_error'] = np.where(
    df['oldbalanceOrg'] - df['newbalanceOrig'] - df['amount'] != 0, 'error', 'no error')
df['dest_transaction_error'] = np.where(
    df['newbalanceDest'] - df['oldbalanceDest'] - df['amount'] != 0, 'error', 'no error')
# astype(str) makes the .str accessor work whether the name column is object or category.
df['orig_dest'] = df['nameOrig'].astype(str).str[0] + '-' + df['nameDest'].astype(str).str[0]
# np.select takes the first matching condition, like the if/elif chain it replaces.
df['transaction_duration'] = np.select(
    [df['step'] / 24 < 1, df['step'] / 168 < 1, df['step'] / 744 < 1],
    ['less than one day', 'less than a week', 'less than a month'],
    default='month')

In [14]:
df.info(memory_usage="deep")

* **EDA**:

In [15]:
# The engineered label columns are converted to category dtype.
engineered_columns = [
    "amount_oldbalanceOrg",
    "orig_transaction_error",
    "dest_transaction_error",
    "orig_dest",
    "transaction_duration",
]
convert_columns_to_catg(df, column_list=engineered_columns)

In [16]:
# Write the engineered frame without the row index: with the default index=True the
# index is stored as an unnamed first column and comes back as "Unnamed: 0" on reload.
df.to_csv('processed_data.csv', index=False)

In [20]:
def _fraud_share_by(frame, column):
    """Summarise fraud share and row share for every level of ``column``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``column`` and a numeric ``isFraud`` column.
    column : str
        Column whose levels are summarised.

    Returns
    -------
    pandas.DataFrame
        One row per level with the columns 'index' (the level), 'isFraud'
        (percentage of the summed ``isFraud`` values that fall in the level)
        and 'count' (percentage of all rows that fall in the level).
    """
    # Derived from the data instead of the previously hard-coded total of 8213.
    total_fraud = frame['isFraud'].sum()
    fraud_pct = (frame.groupby(column, observed=False)['isFraud'].agg('sum') / total_fraud) * 100
    count_pct = (frame[column].value_counts() / frame.shape[0]) * 100
    # Name the index explicitly so the level column is always called 'index'
    # (the plotting cells use x='index'); otherwise the name reset_index()
    # produces depends on whether value_counts() names its index.
    return (pd.DataFrame({'isFraud': fraud_pct, 'count': count_pct})
            .rename_axis('index')
            .reset_index())


trans_type = _fraud_share_by(df, 'type')

transaction_duration = _fraud_share_by(df, 'transaction_duration')

orig_transaction_error = _fraud_share_by(df, 'orig_transaction_error')

amount_oldbalanceOrg = _fraud_share_by(df, 'amount_oldbalanceOrg')

dest_transaction_error = _fraud_share_by(df, 'dest_transaction_error')

orig_dest = _fraud_share_by(df, 'orig_dest')

In [19]:
# Percentage of rows with isFraud == 1 and with isFraud == 0 in the full frame.
n_rows = df.shape[0]
n_fraud = df[df['isFraud'] == 1].shape[0]
n_not_fraud = df[df['isFraud'] == 0].shape[0]
print('Fraud transaction = {}'.format((n_fraud / n_rows)*100))
print('Not Fraud transaction = {}'.format((n_not_fraud / n_rows)*100))

In [93]:
# Per isFlaggedFraud value: percentage of its rows that are fraud ('isFraud')
# and percentage of all rows that carry that value ('count').
flag_counts = df['isFlaggedFraud'].value_counts()
fraud_per_flag = df.groupby('isFlaggedFraud')['isFraud'].agg('sum')
flag = pd.DataFrame({
    'isFraud': (fraud_per_flag / flag_counts) * 100,
    'count': (flag_counts / df.shape[0]) * 100,
}).reset_index(level=0)
flag

In [94]:
flag['isFraud'].to_list()

In [66]:
fig_dims = (8, 6)
fig, ax = plt.subplots(figsize=fig_dims)
# Pass the series by keyword: newer seaborn releases treat the first positional
# argument as `data`, not `x`. Draw on the axes created above instead of relying
# on the implicit current axes.
sns.countplot(x=df['type'], ax=ax,
              palette=['#008000', '#556B2F', '#808000', '#6B8E23', "#9ACD32"])

In [67]:
# Point plot of the 'isFraud' column of trans_type against its 'index' column.
fig_dims = (8, 6)
fig, ax = plt.subplots(figsize=fig_dims)
sns.pointplot(data=trans_type, x='index', y='isFraud',
              palette=['#008000'], ax=ax)

In [58]:
trans_type

In [78]:
trans_type['count'].to_list()

In [68]:
fig_dims = (8, 6)
fig, ax = plt.subplots(figsize=fig_dims)
# Pass the series by keyword: newer seaborn releases treat the first positional
# argument as `data`, not `x`. Draw on the axes created above instead of relying
# on the implicit current axes.
sns.countplot(x=df['orig_dest'], ax=ax,
              palette=['#008000', '#556B2F', '#808000', '#6B8E23', "#9ACD32"])

In [80]:
orig_dest 

In [69]:
fig_dims = (8, 6)
fig, ax = plt.subplots(figsize=fig_dims)
# Pass the series by keyword: newer seaborn releases treat the first positional
# argument as `data`, not `x`. Draw on the axes created above instead of relying
# on the implicit current axes.
sns.countplot(x=df['orig_transaction_error'], ax=ax,
              palette=['#008000', '#556B2F', '#808000', '#6B8E23', "#9ACD32"])

In [53]:
orig_transaction_error

In [70]:
# Bar plot of the 'isFraud' column of amount_oldbalanceOrg against its 'index' column.
fig_dims = (8, 6)
fig, ax = plt.subplots(figsize=fig_dims)
sns.barplot(data=amount_oldbalanceOrg, x='index', y='isFraud',
            palette=['#6B8E23', "#9ACD32"], ax=ax)

In [72]:
amount_oldbalanceOrg

* **Data cleaning**

In [120]:
%%time

# Reload the engineered frame that an earlier cell wrote to processed_data.csv.
data = pd.read_csv('./processed_data.csv')

In [118]:
# Keep only the rows whose orig_dest is "C-C" and whose type is CASH_OUT or TRANSFER.
is_cc_pair = data['orig_dest'] == "C-C"
is_kept_type = data['type'].isin(["CASH_OUT", "TRANSFER"])
data = data[is_cc_pair & is_kept_type]
data.shape

In [5]:
# Persist the filtered frame. With to_csv's default index=True the row index is
# written as an extra unnamed first column.
data.to_csv("cleaned_data.csv")

In [121]:
# Percentage of rows with isFraud == 1 and with isFraud == 0 in `data`.
n_rows = data.shape[0]
n_fraud = data[data['isFraud'] == 1].shape[0]
n_not_fraud = data[data['isFraud'] == 0].shape[0]
print('Fraud transaction = {}'.format((n_fraud / n_rows)*100))
print('Not Fraud transaction = {}'.format((n_not_fraud / n_rows)*100))

In [122]:
# Drop the engineered label columns before modelling.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas 2.0 no longer accepts.
data = data.drop(columns=['amount_oldbalanceOrg',
                          'orig_transaction_error', 'dest_transaction_error', 'orig_dest',
                          'transaction_duration'])
data.shape

* **Train test split**

In [123]:
# Hold out 30% of the rows for testing.
# - `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
# - `random_state` makes the split reproducible across kernel restarts.
# - `stratify` keeps the isFraud ratio the same in both parts.
train, test, train_target, test_target = train_test_split(
    data.drop(columns="isFraud"), data["isFraud"],
    test_size=0.3, random_state=42, stratify=data["isFraud"])

In [124]:
(train_target.value_counts() / train.shape[0])*100

In [125]:
(test_target.value_counts()/ test.shape[0])*100

* **Pre-process the data:**

In [47]:
train.columns

In [126]:
# Model inputs: every column except the CSV index artefact and the two account-name columns.
required_features = [col for col in train.columns if col not in ("Unnamed: 0", "nameOrig", "nameDest")]
# Columns still stored as Python objects are treated as categorical.
# The builtin `object` replaces the `np.object` alias, which was removed in NumPy 1.24.
cat_cols = [col for col in required_features if train[col].dtype == object]

In [127]:
# Fit the ordinal encoder on the training categories only, then reuse the fitted
# mapping for the test set.
encoder = OrdinalEncoder()

train_codes = encoder.fit_transform(train[cat_cols])
train[cat_cols] = train_codes
test_codes = encoder.transform(test[cat_cols])
test[cat_cols] = test_codes

In [128]:
# Plain NumPy arrays for the models: feature matrices and label vectors.
X, y = train[required_features].to_numpy(), train_target.to_numpy()
Xtest, ytest = test[required_features].to_numpy(), test_target.to_numpy()

* **Build model, cross-validation and training**

In [129]:
def metrics(yvalid, valid_preds):
    """Score predictions against the true labels.

    Parameters
    ----------
    yvalid : array-like
        True labels.
    valid_preds : array-like
        Predicted labels.

    Returns
    -------
    tuple
        ``(f1, recall, precision, accuracy)``, each computed with the
        corresponding scikit-learn scorer and its default settings.
    """
    scorers = (f1_score, recall_score, precision_score, accuracy_score)
    f1, recall, precision, accuracy = (score(yvalid, valid_preds) for score in scorers)
    return f1, recall, precision, accuracy

In [130]:
def run_model(cv, model,  key, req_features='' ):
    """Cross-validate ``model`` and collect its predictions for the hold-out set.

    For every split produced by ``cv`` the model is refitted on the training
    fold, scored on the validation fold, and used to predict ``Xtest``.
    The arrays ``X``, ``y`` and ``Xtest`` as well as the ``metrics`` helper are
    read from the notebook's global namespace.

    Parameters
    ----------
    cv : splitter
        Object with a ``split(X)`` method yielding ``(train_index, val_index)``
        pairs. This argument is now honoured; the function used to ignore it
        and read the global ``rkf`` instead.
    model : estimator
        Object with ``fit(X, y, **kwargs)`` and ``predict(X)``.
    key : str
        'LGB' or 'CAT'; any other value is treated as xgboost. For 'CAT' the
        validation fold is passed to ``fit`` as ``eval_set`` together with
        ``use_best_model=True``.
    req_features : optional
        Unused; kept so existing calls keep working.

    Returns
    -------
    tuple
        ``(model, final_preds, mean_f1, mean_recall, mean_precision,
        mean_accuracy, train_speed, test_speed)``. ``final_preds`` holds one
        array of ``Xtest`` predictions per fold; the two speeds are the summed
        wall-clock seconds spent in ``fit`` and in ``predict(Xtest)``.
    """
    library = {'LGB': 'lightgbm', 'CAT': 'catboost'}.get(key, 'xgboost')

    final_preds = []
    F1 = []
    Recall = []
    Precision = []
    Accuracy = []
    train_speed = 0
    test_speed = 0

    print('-' * 60)
    print("Start training {}".format(library))
    print('-' * 60)
    for i, (train_index, val_index) in enumerate(cv.split(X)):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        # Only the CatBoost call passes the validation fold to fit().
        if key == 'CAT':
            fit_kwargs = {'eval_set': (X_val, y_val), 'use_best_model': True}
        else:
            fit_kwargs = {}

        print('Training')
        start = timeit.default_timer()
        model.fit(X_train, y_train, **fit_kwargs)
        stop = timeit.default_timer()
        train_speed += stop - start

        print('Testing')
        # Validation-fold predictions are used for scoring and are not timed.
        valid_preds = model.predict(X_val)
        conf_matrix = confusion_matrix(y_true=y_val, y_pred=valid_preds)
        print(conf_matrix)
        # Only the hold-out prediction is timed.
        start = timeit.default_timer()
        test_preds = model.predict(Xtest)
        stop = timeit.default_timer()
        test_speed += stop - start

        print("Evaluate")
        final_preds.append(test_preds)
        f1, recall, precision, accuracy = metrics(y_val, valid_preds)
        F1.append(f1)
        Recall.append(recall)
        Precision.append(precision)
        Accuracy.append(accuracy)
        print("Fold {} ==> f1: {}, recall: {}, precision: {}, accuracy: {} \n".format(i + 1, f1, recall, precision, accuracy))

    print('-' * 30)
    print('Average F1 score: {}'.format(np.mean(F1)))
    print('Average Recall score: {}'.format(np.mean(Recall)))
    print('Average Precision score: {}'.format(np.mean(Precision)))
    print('Average Accuracy score: {}'.format(np.mean(Accuracy)))
    # Closing rule is printed for every model (it used to appear for LGB only).
    print('-' * 30)

    return model, final_preds, np.mean(F1), np.mean(Recall), np.mean(Precision), np.mean( Accuracy),train_speed, test_speed

In [133]:
# Cross-validation splitter: 2 splits repeated 2 times, seeded with random_state=42.
rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=42)
# NOTE(review): `lgbm_params` is defined in a cell that appears further down in this
# file (execution count In [132]); that cell has to be run before this one, so a
# top-to-bottom run on a fresh kernel fails here.
model_lgb_def = lgb.LGBMClassifier(**lgbm_params)
# model_cat_def = cb.CatBoostClassifier(**catboost_params)
# model_xgb_def = xgb.XGBClassifier(**xgb_params)

In [None]:
# Cross-validate the LightGBM model; the CatBoost and XGBoost runs below are disabled.
(model_lgbm, final_preds_lgbm,
 F1_lgbm, Recall_lgbm, Precision_lgbm, accuracy_lgbm,
 train_speed_lgbm, test_speed_lgbm) = run_model(
    rkf, model_lgb_def, key='LGB', req_features=required_features)
# model_cat, final_preds_cat, F1_cat, Recall_cat, Precision_cat, accuracy_cat, train_speed_cat, test_speed_cat = run_model(rkf, model_cat_def, key='CAT',
#                                                                                req_features=required_features)
# model_xgb, final_preds_xgb, F1_xgb, Recall_xgb, Precision_xgb, accuracy_xgb, train_speed_xgb, test_speed_xgb = run_model(rkf, model_xgb_def,
#                                                                                     key='XGB',
#                                                                                     req_features=required_features)

In [114]:
# Report the timings of the model that is actually trained in this notebook.
# The *_xgb names were never assigned because the XGBoost run is commented out,
# so reading them raised NameError on a fresh kernel.
print("train_speed: {}".format(train_speed_lgbm))
print("test_speed: {}".format(test_speed_lgbm))

In [115]:
# Combine the per-fold hold-out predictions: a row is predicted 1 if any fold's
# model predicted 1 (row-wise maximum).
# final_preds_lgbm is used because final_preds_xgb is never assigned (the XGBoost
# run is commented out), which raised NameError on a fresh kernel.
preds = np.max(np.column_stack(final_preds_lgbm), axis=1)
conf_matrix = confusion_matrix(y_true=ytest, y_pred=preds)
fig_dims = (8, 6)
fig, ax = plt.subplots(figsize=fig_dims)
colormap = sns.color_palette("Greens")
# fmt='d' prints the counts as plain integers rather than in scientific notation;
# ax=ax draws on the axes created above.
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap=colormap, ax=ax)

In [116]:
# Score the combined hold-out predictions. Each value is a single score, so
# np.mean leaves it unchanged.
f1, recall, precision, accuracy = metrics(ytest, preds)
report = (
    ('Average F1 score: {}', f1),
    ('Average Recall score: {}', recall),
    ('Average Precision score: {}', precision),
    ('Average Accuracy score: {}', accuracy),
)
for template, score in report:
    print(template.format(np.mean(score)))

In [132]:
# Keyword arguments for the three gradient-boosting classifiers.

# XGBoost settings.
xgb_params = dict(
    objective="binary:logistic",
    eval_metric="logloss",
    learning_rate=0.01,
    max_depth=7,
    min_child_weight=5,
    colsample_bytree=0.5,
    subsample=0.5,
    n_estimators=1000,
    predictor="gpu_predictor",
    random_state=42,
    use_label_encoder=False,
    verbosity=0,
)

# LightGBM settings.
lgbm_params = dict(
    num_leaves=50,
    objective='binary',
    is_unbalance=True,
    learning_rate=0.01,
    max_depth=5,
    n_estimators=1000,
    boosting_type='gbdt',
    verbose=-1,
)

# CatBoost settings.
catboost_params = dict(
    loss_function='Logloss',
    eval_metric="Recall",
    learning_rate=0.01,
    depth=5,
    iterations=1000,
    task_type="GPU",
    silent=True,
)

In [95]:
# Force a garbage-collection pass. `gc` is already imported in the first cell,
# so the re-import here is redundant but harmless.
import gc
gc.collect()