In [3]:
import pandas as pd
import seaborn as sns
import numpy as np
import gc
import matplotlib.pyplot as plt

In [36]:
import timeit

import catboost as cb
import lightgbm as lgb
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, RepeatedKFold

In [4]:
%%time

# Load the raw transaction log into `df`; the path is relative to the notebook's working directory.
df = pd.read_csv('../input/paysim1/PS_20174392719_1491204439457_log.csv')

In [5]:
# (rows, columns) of the freshly loaded frame.
df.shape

In [6]:
# Column dtypes and total memory; "deep" also counts the Python objects held in object columns.
df.info(memory_usage="deep")

In [7]:
# Per-column memory footprint: bytes scaled by 1e-6, i.e. megabytes.
df.memory_usage(deep=True) * 1e-6

In [8]:
def convert_columns_to_catg(df, column_list):
    """Cast the given columns of ``df`` to the pandas ``category`` dtype, in place.

    For each column a single status line is printed showing the deep memory
    usage (bytes * 1e-6, i.e. MB, rounded to two decimals) before and after
    the conversion.

    Args:
        df: DataFrame to modify; its columns are overwritten.
        column_list: Iterable of column names (strings) to convert.

    Returns:
        None. The frame is mutated.
    """
    def _size_mb(series):
        # Deep usage so that the Python string objects of object columns are counted.
        return round(series.memory_usage(deep=True) * 1e-6, 2)

    for name in column_list:
        size_before = _size_mb(df[name])
        print("converting", name.ljust(30), "size: ", size_before, end="\t")
        df[name] = df[name].astype("category")
        print("->\t", _size_mb(df[name]))

In [9]:
# Store nameDest and type as categoricals; the helper prints the memory usage before and after.
convert_columns_to_catg(df, column_list=["nameDest","type"])

In [10]:
def downcast_df_int_columns(df):
    """Downcast every int32/int64 column of ``df`` to the smallest integer dtype, in place.

    Each converted column gets one status line with its deep memory usage
    (bytes * 1e-6, i.e. MB) before and after. When the frame has no int32/int64
    columns a notice is printed instead. A garbage collection pass and a final
    "done" message happen in both cases.

    Args:
        df: DataFrame to modify; integer columns are overwritten.

    Returns:
        None. The frame is mutated.
    """
    def _fmt_mb(series):
        # Right-aligned so the before/after figures line up across rows.
        return str(round(series.memory_usage(deep=True) * 1e-6, 2)).rjust(8)

    int_columns = df.select_dtypes(include=["int32", "int64"]).columns.tolist()

    if not int_columns:
        print("no columns to downcast")
    else:
        # Pad every name to the longest one (+2) for aligned status output.
        label_width = max(len(name) for name in int_columns) + 2
        print("downcasting integers for:", int_columns, "\n")

        for name in int_columns:
            print("reduced memory usage for:  ", name.ljust(label_width),
                  "from", _fmt_mb(df[name]), "to", end=" ")
            df[name] = pd.to_numeric(df[name], downcast="integer")
            print(_fmt_mb(df[name]))

    gc.collect()

    print("done")

In [11]:
# Shrink the int32/int64 columns of df in place; per-column savings are printed.
downcast_df_int_columns(df)

In [12]:
# Same report as before the conversions, for comparing dtypes and total memory.
df.info(memory_usage="deep")

In [13]:
# Per-column memory in MB (bytes * 1e-6), to compare with the earlier figures.
df.memory_usage(deep=True) * 1e-6

* **Feature engineering:**

In [14]:
%%time
def amount_oldbalanceOrg(row):
    """Label whether the transaction amount equals the origin's old balance.

    Args:
        row: Mapping/Series with numeric 'oldbalanceOrg' and 'amount' entries.

    Returns:
        'equal' when oldbalanceOrg - amount is exactly 0, otherwise 'not equal'.
    """
    remainder = row['oldbalanceOrg'] - row['amount']
    return 'equal' if remainder == 0 else 'not equal'
    
def dest_transaction_error(row):
    """Flag rows whose destination balances do not reconcile with the amount.

    Args:
        row: Mapping/Series with numeric 'newbalanceDest', 'oldbalanceDest'
            and 'amount' entries.

    Returns:
        "no error" when newbalanceDest - oldbalanceDest - amount is exactly 0,
        otherwise "error".
    """
    discrepancy = row['newbalanceDest'] - row['oldbalanceDest'] - row['amount']
    if discrepancy == 0:
        return "no error"
    return "error"
    
def orig_transaction_error(row):
    """Flag rows whose origin balances do not reconcile with the amount.

    Args:
        row: Mapping/Series with numeric 'oldbalanceOrg', 'newbalanceOrig'
            and 'amount' entries.

    Returns:
        "no error" when oldbalanceOrg - newbalanceOrig - amount is exactly 0,
        otherwise "error".
    """
    discrepancy = row['oldbalanceOrg'] - row['newbalanceOrig'] - row['amount']
    if discrepancy == 0:
        return "no error"
    return "error"
    
def get_name_prefix(row):
    """Combine the first characters of the origin and destination names.

    Args:
        row: Mapping/Series with string 'nameOrig' and 'nameDest' entries.

    Returns:
        A three-character string "<first of nameOrig>-<first of nameDest>".
    """
    origin_initial = row['nameOrig'][0]
    dest_initial = row['nameDest'][0]
    return '-'.join((origin_initial, dest_initial))

def transaction_duration(row):
    """Bucket a row's 'step' value into a coarse elapsed-time label.

    The thresholds 24, 168 and 744 are checked in ascending order; they look
    like hours per day / week / 31-day month (presumably 'step' is in hours -
    confirm against the dataset description).

    Args:
        row: Mapping/Series with a numeric 'step' entry.

    Returns:
        'less than one day', 'less than a week', 'less than a month' or 'month'.
    """
    step = row['step']
    buckets = (
        (24, 'less than one day'),
        (168, 'less than a week'),
        (744, 'less than a month'),
    )
    for limit, label in buckets:
        if step / limit < 1:
            return label
    return 'month'
    
# Vectorised equivalents of the row-wise helpers defined in this cell.
# df.apply(..., axis=1) calls a Python function once per row, and doing that five
# times over the full transaction log is extremely slow; the column expressions
# below produce the same labels in a single pass each.

# 'equal' when the amount matches the origin's old balance exactly.
df['amount_oldbalanceOrg'] = np.where(
    df['oldbalanceOrg'] - df['amount'] == 0, 'equal', 'not equal')

# 'error' when the origin balances do not reconcile with the amount.
df['orig_transaction_error'] = np.where(
    df['oldbalanceOrg'] - df['newbalanceOrig'] - df['amount'] != 0, 'error', 'no error')

# 'error' when the destination balances do not reconcile with the amount.
df['dest_transaction_error'] = np.where(
    df['newbalanceDest'] - df['oldbalanceDest'] - df['amount'] != 0, 'error', 'no error')

# First character of each party's name, e.g. "C-M". `.str` also works on the
# categorical nameDest column and returns plain strings.
df['orig_dest'] = df['nameOrig'].str[0] + '-' + df['nameDest'].str[0]

# np.select picks the first matching condition, mirroring the if/elif chain.
df['transaction_duration'] = np.select(
    [df['step'] / 24 < 1, df['step'] / 168 < 1, df['step'] / 744 < 1],
    ['less than one day', 'less than a week', 'less than a month'],
    default='month')

In [None]:
# Dtypes and deep memory usage of df, now including the engineered columns.
df.info(memory_usage="deep")

* **EDA**:

In [15]:
# The engineered label columns hold a handful of repeated strings each, so
# store them as categoricals (the helper prints the memory before/after).
engineered_columns = [
    "amount_oldbalanceOrg",
    "orig_transaction_error",
    "dest_transaction_error",
    "orig_dest",
    "transaction_duration",
]
convert_columns_to_catg(df, column_list=engineered_columns)

In [16]:
# Persist the engineered frame. index=False keeps the row index out of the file;
# otherwise it comes back as an extra "Unnamed: 0" column when the CSV is re-read
# and ends up among the model features.
# Note: CSV does not preserve the category dtypes set above.
df.to_csv('processed_data.csv', index=False)

In [20]:
def _fraud_vs_volume(frame, column):
    """Summarise one categorical column against the fraud label.

    Args:
        frame: DataFrame with an 'isFraud' 0/1 column and ``column``.
        column: Name of the column to group by.

    Returns:
        DataFrame with one row per level of ``column`` and the columns
        'index' (the level), 'isFraud' (percentage of all fraud rows that
        fall in the level) and 'count' (percentage of all rows in the level).
    """
    # Total fraud count taken from the data instead of a hard-coded constant.
    total_fraud = frame['isFraud'].sum()
    summary = pd.DataFrame({
        'isFraud': (frame.groupby(column, observed=False)['isFraud'].agg('sum') / total_fraud) * 100,
        'count': (frame[column].value_counts() / frame.shape[0]) * 100,
    })
    # Pin the level column's name to 'index' (the plots use x='index'); without
    # this the name depends on how the pandas version names value_counts' index.
    return summary.rename_axis('index').reset_index()


# NOTE: several of these names rebind the row-wise helper functions of the
# feature-engineering cell; downstream cells expect the DataFrames.
trans_type = _fraud_vs_volume(df, 'type')

transaction_duration = _fraud_vs_volume(df, 'transaction_duration')

orig_transaction_error = _fraud_vs_volume(df, 'orig_transaction_error')

amount_oldbalanceOrg = _fraud_vs_volume(df, 'amount_oldbalanceOrg')

dest_transaction_error = _fraud_vs_volume(df, 'dest_transaction_error')

orig_dest = _fraud_vs_volume(df, 'orig_dest')

In [19]:
# Class balance of the full frame, as percentages of all rows.
n_rows = df.shape[0]
fraud_share = df[df['isFraud'] == 1].shape[0] / n_rows
legit_share = df[df['isFraud'] == 0].shape[0] / n_rows
print('Fraud transaction = {}'.format(fraud_share * 100))
print('Not Fraud transaction = {}'.format(legit_share * 100))

In [93]:
# For each isFlaggedFraud value: the percentage of its rows that are fraud
# ('isFraud') and the percentage of all rows carrying that value ('count').
fraud_per_flag = df.groupby('isFlaggedFraud')['isFraud'].agg('sum')
rows_per_flag = df['isFlaggedFraud'].value_counts()
flag = pd.DataFrame({
    'isFraud': (fraud_per_flag / rows_per_flag) * 100,
    'count': (rows_per_flag / df.shape[0]) * 100,
}).reset_index(level=0)
flag

In [94]:
# Unrounded 'isFraud' percentages from the flag table as a plain list.
flag['isFraud'].to_list()

In [66]:
# Number of transactions per transaction type.
fig_dims = (8, 6)
fig, ax = plt.subplots(figsize=fig_dims)
# Pass the column by keyword: newer seaborn releases no longer accept the
# plotted variable as the first positional argument. ax=ax ties the plot to
# the figure created above.
sns.countplot(x='type', data=df, ax=ax,
              palette=['#008000', '#556B2F', '#808000', '#6B8E23', "#9ACD32"])

In [67]:
# Share of all fraud rows (percent) per transaction type, from trans_type.
fig_dims = (8, 6)
fig, ax = plt.subplots(figsize=fig_dims)
sns.pointplot(data=trans_type, x='index', y='isFraud',
              palette=['#008000'], ax=ax)

In [58]:
# Table behind the plot: per type, its share of fraud rows and of all rows (percent).
trans_type

In [78]:
# Unrounded 'count' percentages per transaction type as a plain list.
trans_type['count'].to_list()

In [68]:
# Number of transactions per origin/destination name-prefix pair.
fig_dims = (8, 6)
fig, ax = plt.subplots(figsize=fig_dims)
# Pass the column by keyword: newer seaborn releases no longer accept the
# plotted variable as the first positional argument. ax=ax ties the plot to
# the figure created above.
sns.countplot(x='orig_dest', data=df, ax=ax,
              palette=['#008000', '#556B2F', '#808000', '#6B8E23', "#9ACD32"])

In [80]:
# Per name-prefix pair: share of fraud rows and share of all rows (percent).
orig_dest 

In [69]:
# Number of transactions with / without an origin-balance discrepancy.
fig_dims = (8, 6)
fig, ax = plt.subplots(figsize=fig_dims)
# Pass the column by keyword: newer seaborn releases no longer accept the
# plotted variable as the first positional argument. ax=ax ties the plot to
# the figure created above.
sns.countplot(x='orig_transaction_error', data=df, ax=ax,
              palette=['#008000', '#556B2F', '#808000', '#6B8E23', "#9ACD32"])

In [53]:
# Per origin-error label: share of fraud rows and share of all rows (percent).
orig_transaction_error

In [70]:
# Share of all fraud rows (percent) by whether the amount equals the old origin balance.
fig_dims = (8, 6)
fig, ax = plt.subplots(figsize=fig_dims)
sns.barplot(data=amount_oldbalanceOrg, x='index', y='isFraud',
            palette=['#6B8E23', "#9ACD32"], ax=ax)

In [72]:
# Per equal / not equal label: share of fraud rows and share of all rows (percent).
amount_oldbalanceOrg

* **Data cleaning**

In [17]:
%%time

# Re-load the engineered frame from disk into `data`.
# NOTE(review): dtypes are re-inferred by read_csv, so the category columns come back as plain strings.
data = pd.read_csv('./processed_data.csv')

In [18]:
# Keep only rows whose name-prefix pair is "C-C" and whose type is CASH_OUT or
# TRANSFER; both conditions are combined into one boolean mask.
is_c_to_c = data['orig_dest'] == "C-C"
is_cash_out_or_transfer = data['type'].isin(["CASH_OUT", "TRANSFER"])
data = data[is_c_to_c & is_cash_out_or_transfer]
data.shape

In [19]:
# Persist the filtered frame. index=False keeps the (now non-contiguous) row
# labels out of the file, so re-reading it does not add an "Unnamed: 0" column.
data.to_csv("cleaned_data.csv", index=False)

In [20]:
# Class balance of the filtered frame, as percentages of its rows.
n_rows = data.shape[0]
fraud_share = data[data['isFraud'] == 1].shape[0] / n_rows
legit_share = data[data['isFraud'] == 0].shape[0] / n_rows
print('Fraud transaction = {}'.format(fraud_share * 100))
print('Not Fraud transaction = {}'.format(legit_share * 100))

In [21]:
# Drop the engineered label columns before modelling.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas 2.0 no longer accepts.
data = data.drop(columns=[
    'amount_oldbalanceOrg',
    'orig_transaction_error',
    'dest_transaction_error',
    'orig_dest',
    'transaction_duration',
])
data.shape

* **Train test split**

In [39]:
# Hold out 30% of the rows for testing.
# - drop(columns=...) replaces the positional axis argument removed in pandas 2.0.
# - stratify keeps the fraud ratio identical in both parts, which matters for a
#   heavily imbalanced target.
# - random_state makes the split reproducible on Restart & Run All.
train, test, train_target, test_target = train_test_split(
    data.drop(columns="isFraud"),
    data["isFraud"],
    test_size=0.3,
    stratify=data["isFraud"],
    random_state=42,
)

In [40]:
# Class balance of the training target, as percentages of the training rows.
(train_target.value_counts() / train.shape[0])*100

In [41]:
# Class balance of the test target, as percentages of the test rows.
(test_target.value_counts()/ test.shape[0])*100

* **Pre-process the data:**

* **Build model, cross-validation and training**

In [26]:
#Binary Cross Entropy/Log Loss

In [37]:
def metrics(run, yvalid, valid_preds):
    """Compute the ROC AUC of the predictions and record it on ``run``.

    Args:
        run: Mapping-like object supporting item assignment; the score is
            stored under the key 'ROC AUC score'.
        yvalid: True labels, passed as the first argument to roc_auc_score.
        valid_preds: Predicted scores/labels, passed as the second argument.

    Returns:
        The value returned by roc_auc_score.
    """
    auc = run['ROC AUC score'] = roc_auc_score(yvalid, valid_preds)
    return auc

In [None]:
# Cross-validated XGBoost training loop that averages test predictions over folds.
# NOTE(review): skf, kfold, params, gini_xgb and sub are not defined anywhere in the
# visible notebook, and X / y are only assigned in a later cell - this cell cannot run
# on a fresh kernel as written. Presumably pasted from another notebook; confirm or remove.
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print('[Fold %d/%d]' % (i + 1, kfold))
    # Positional indexing with the fold indices - assumes X and y are NumPy arrays (TODO confirm).
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    # Convert our data into XGBoost format
    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)
    # Does not depend on the fold, yet it is rebuilt on every iteration.
    d_test = xgb.DMatrix(test.values)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    # Train the model! We pass in a max of 1,600 rounds (with early stopping after 70)
    # and the custom metric (maximize=True tells xgb that higher metric is better)
    mdl = xgb.train(params, d_train, 1600, watchlist, early_stopping_rounds=70, feval=gini_xgb, maximize=True, verbose_eval=100)

    print('[Fold %d/%d Prediciton:]' % (i + 1, kfold))
    # Predict on our test data
    # NOTE(review): ntree_limit / best_ntree_limit may not exist in recent xgboost releases - verify against the installed version.
    p_test = mdl.predict(d_test, ntree_limit=mdl.best_ntree_limit)
    # Accumulate the fold's contribution to the mean prediction (each fold weighted 1/kfold).
    sub['target'] += p_test/kfold

In [None]:
def run_model(run, model, description, key, cat_features=''):
    """Fit ``model``, time training and prediction, and log the ROC AUC on ``run``.

    Behaviour depends on ``key``:

    * ``'LGB'``: repeated K-fold over the notebook-level ``X`` / ``y`` using the
      notebook-level splitter ``rkf``. Each fold fits on the training part,
      scores the validation part via ``metrics`` and predicts the notebook-level
      ``test`` frame.
    * ``'CAT'``: single fit on the notebook-level ``X_train`` / ``y_train`` with
      ``(X_test, y_test)`` as eval set, ``cat_features`` and ``use_best_model=True``.
    * anything else: single plain ``model.fit(X_train, y_train)``.

    NOTE(review): ``X_train``, ``y_train``, ``X_test`` and ``y_test`` are read as
    globals in the non-LGB branches but are not defined in the visible notebook -
    confirm they exist before calling with a non-'LGB' key.

    Fixes relative to the previous version:

    * The per-fold arrays are named ``X_fold_train`` / ``y_fold_train``.
      Assigning to ``X_train`` / ``y_train`` inside the function made those names
      local everywhere, so the non-LGB branches raised UnboundLocalError.
    * ``metrics`` is called with its full ``(run, y_true, preds)`` signature.
    * The fold number is printed from the loop index instead of the undefined
      name ``fold``.
    * The collected test predictions are returned.

    Args:
        run: Mapping-like logger; receives 'Description', 'Training time',
            'Prediction time' and (via ``metrics``) 'ROC AUC score'. In the LGB
            branch the timing entries are overwritten on every fold, so they
            hold the last fold's values.
        model: Estimator exposing ``fit`` and ``predict``.
        description: Free-text description stored on ``run``.
        key: 'LGB', 'CAT' or any other value for the generic path.
        cat_features: Forwarded as ``categorical_feature`` (LGB) or
            ``cat_features`` (CAT); unused otherwise.

    Returns:
        list: Test-set prediction arrays - one per fold for 'LGB', a single
        entry otherwise.
    """
    final_preds = []
    run["Description"] = description

    if key == 'LGB':
        for i, (train_index, test_index) in enumerate(rkf.split(X)):
            X_fold_train, X_val = X[train_index], X[test_index]
            y_fold_train, y_val = y[train_index], y[test_index]
            xtest = test.copy()

            # Training session
            start = timeit.default_timer()
            model.fit(X_fold_train, y_fold_train, categorical_feature=cat_features)
            stop = timeit.default_timer()
            run['Training time'] = stop - start

            # Prediction session (only the test-set prediction is timed)
            valid_preds = model.predict(X_val)
            start = timeit.default_timer()
            test_preds = model.predict(xtest)
            stop = timeit.default_timer()
            run['Prediction time'] = stop - start

            final_preds.append(test_preds)

            # Performance evaluation on the fold's validation part
            score = metrics(run, y_val, valid_preds)
            print("Fold : {} ==> Score: {}".format(i + 1, score))
    else:
        # 'CAT' only differs from the generic path in the extra fit arguments.
        fit_kwargs = {}
        if key == 'CAT':
            fit_kwargs = dict(eval_set=(X_test, y_test),
                              cat_features=cat_features,
                              use_best_model=True)

        # Training session
        start = timeit.default_timer()
        model.fit(X_train, y_train, **fit_kwargs)
        stop = timeit.default_timer()
        run['Training time'] = stop - start

        # Prediction session
        start = timeit.default_timer()
        y_pred_test = model.predict(X_test)
        stop = timeit.default_timer()
        run['Prediction time'] = stop - start

        final_preds.append(y_pred_test)

        # Performance evaluation
        metrics(run, y_test, y_pred_test)

    return final_preds


In [50]:
# Toy 4-sample feature matrix and labels, plus a seeded repeated K-fold splitter
# (2 splits x 2 repeats). These are the module-level X, y and rkf names that
# run_model reads.
# NOTE(review): looks like placeholder data rather than the train/test split built earlier - confirm.
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])
rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=42)