In [15]:
import pandas as pd
import seaborn as sns
import numpy as np
import gc

In [86]:
%%time

# Load the PaySim CSV log into `df`; every later cell reads or mutates this frame.
df = pd.read_csv('../input/paysim1/PS_20174392719_1491204439457_log.csv')

In [87]:
# Column dtypes plus the deep (object-content-aware) memory footprint of `df`.
df.info(memory_usage="deep")

In [88]:
# Per-column deep memory usage, scaled by 1e-6 (bytes -> megabytes).
df.memory_usage(deep=True) * 1e-6

In [89]:
def convert_columns_to_catg(df, column_list):
    """Cast the listed columns of ``df`` to ``category`` dtype, in place.

    For every column one status line is printed showing the column's deep
    memory usage (scaled by 1e-6, i.e. megabytes) before and after the cast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the converted columns are written back onto it.
    column_list : iterable of str
        Names of the columns to convert.

    Returns
    -------
    None
    """
    def _size_mb(name):
        # Deep memory usage of a single column, rounded to two decimals.
        return round(df[name].memory_usage(deep=True)*1e-6, 2)

    for name in column_list:
        print("converting", name.ljust(30), "size: ", _size_mb(name), end="\t")
        df[name] = df[name].astype("category")
        print("->\t", _size_mb(name))

In [90]:
# Store the two listed string columns as categoricals (the helper prints before/after sizes).
convert_columns_to_catg(df, column_list=["nameDest","type"])

In [91]:
def downcast_df_int_columns(df):
    """Downcast every int32/int64 column of ``df`` to the smallest integer dtype.

    Columns are replaced in place via ``pd.to_numeric(..., downcast="integer")``.
    A status line with the deep memory usage (scaled by 1e-6) before and after
    is printed per column. When no int32/int64 column exists a notice is
    printed instead. A garbage collection pass always runs, and "done" is
    always printed last.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose integer columns are overwritten.

    Returns
    -------
    None
    """
    int_columns = list(df.select_dtypes(include=["int32", "int64"]).columns)

    if not int_columns:
        print("no columns to downcast")
    else:
        # Pad all names to the longest one (+2) so the status lines align.
        pad_width = max(len(name) for name in int_columns) + 2
        print("downcasting integers for:", int_columns, "\n")

        for name in int_columns:
            size_before = str(round(df[name].memory_usage(deep=True)*1e-6,2)).rjust(8)
            print("reduced memory usage for:  ", name.ljust(pad_width)[:pad_width],
                  "from", size_before, "to", end=" ")
            df[name] = pd.to_numeric(df[name], downcast="integer")
            size_after = str(round(df[name].memory_usage(deep=True)*1e-6,2)).rjust(8)
            print(size_after)

    gc.collect()

    print("done")

In [92]:
# Shrink the int32/int64 columns of `df` in place (the helper prints per-column sizes).
downcast_df_int_columns(df)

In [93]:
# Column dtypes plus the deep memory footprint of `df` at this point.
df.info(memory_usage="deep")

In [94]:
# Per-column deep memory usage, scaled by 1e-6 (bytes -> megabytes).
df.memory_usage(deep=True) * 1e-6

In [95]:
%%time
def amount_oldbalanceOrg(row):
    """Label whether the transferred amount equals the sender's old balance.

    Parameters
    ----------
    row : mapping
        Must provide the keys 'oldbalanceOrg' and 'amount'.

    Returns
    -------
    str
        'equal' when ``oldbalanceOrg - amount`` is exactly zero, otherwise
        'not equal'.
    """
    difference = row['oldbalanceOrg'] - row['amount']
    return 'equal' if difference == 0 else 'not equal'
    
def dest_transaction_error(row, tol=0.005):
    """Label whether the recipient's balances are inconsistent with the amount.

    The residual ``newbalanceDest - oldbalanceDest - amount`` is compared
    against a tolerance instead of exactly against zero: the two-step float
    subtraction leaves tiny rounding residue (e.g. ``0.3 - 0.2 - 0.1`` is
    about -2.8e-17), which an exact ``!= 0`` test would report as an error.

    Parameters
    ----------
    row : mapping
        Must provide the keys 'newbalanceDest', 'oldbalanceDest' and 'amount'.
    tol : float, optional
        Largest absolute residual still treated as balanced. The default of
        0.005 assumes amounts carry at most two decimals (TODO confirm for
        other data). Pass ``tol=0`` for the previous exact comparison.

    Returns
    -------
    str
        "error" when the residual exceeds ``tol`` (or is NaN), else "no error".
    """
    residual = row['newbalanceDest'] - row['oldbalanceDest'] - row['amount']
    # Written as `not (... <= tol)` so a NaN residual is still reported as
    # "error", exactly as the former `!= 0` comparison did.
    if not abs(residual) <= tol:
        return "error"
    return "no error"
    
def orig_transaction_error(row, tol=0.005):
    """Label whether the sender's balances are inconsistent with the amount.

    The residual ``oldbalanceOrg - newbalanceOrig - amount`` is compared
    against a tolerance instead of exactly against zero: the two-step float
    subtraction leaves tiny rounding residue (e.g. ``0.3 - 0.2 - 0.1`` is
    about -2.8e-17), which an exact ``!= 0`` test would report as an error.

    Parameters
    ----------
    row : mapping
        Must provide the keys 'oldbalanceOrg', 'newbalanceOrig' and 'amount'.
    tol : float, optional
        Largest absolute residual still treated as balanced. The default of
        0.005 assumes amounts carry at most two decimals (TODO confirm for
        other data). Pass ``tol=0`` for the previous exact comparison.

    Returns
    -------
    str
        "error" when the residual exceeds ``tol`` (or is NaN), else "no error".
    """
    residual = row['oldbalanceOrg'] - row['newbalanceOrig'] - row['amount']
    # Written as `not (... <= tol)` so a NaN residual is still reported as
    # "error", exactly as the former `!= 0` comparison did.
    if not abs(residual) <= tol:
        return "error"
    return "no error"
    
def get_name_prefix(row):
    """Join the first characters of 'nameOrig' and 'nameDest' with a dash.

    Parameters
    ----------
    row : mapping
        Must provide the keys 'nameOrig' and 'nameDest'; both values are
        indexed with ``[0]``.

    Returns
    -------
    str
        ``<first char of nameOrig>-<first char of nameDest>``.
    """
    orig_initial = row['nameOrig'][0]
    dest_initial = row['nameDest'][0]
    return orig_initial + '-' + dest_initial

def transaction_duration(row):
    """Bucket a row's 'step' value into a coarse duration label.

    The thresholds 24 / 168 / 744 are compared as ``step / threshold < 1``;
    they presumably denote hours per day / week / 31-day month (TODO confirm
    the unit of 'step').

    Parameters
    ----------
    row : mapping
        Must provide the key 'step' (numeric).

    Returns
    -------
    str
        'less than one day', 'less than a week', 'less than a month' or
        'month' for everything at or above the last threshold.
    """
    step = row['step']
    # Ordered from the smallest threshold up; the first match wins.
    thresholds = (
        (24, 'less than one day'),
        (168, 'less than a week'),
        (744, 'less than a month'),
    )
    for limit, label in thresholds:
        if step / limit < 1:
            return label
    return 'month'
    
# Vectorised feature engineering.
# The previous version ran five separate `df.apply(..., axis=1)` passes, each
# calling a Python function once per row. The expressions below build the same
# five label columns with whole-column pandas / numpy operations instead.

# Balance residuals are compared against half a cent rather than exactly 0:
# the two-step float subtraction leaves rounding residue that an exact test
# would report as "error". `<=` is False for NaN, so NaN rows still map to
# "error". The 0.005 value assumes two-decimal amounts -- TODO confirm.
balance_tol = 0.005
orig_residual = df['oldbalanceOrg'] - df['newbalanceOrig'] - df['amount']
dest_residual = df['newbalanceDest'] - df['oldbalanceDest'] - df['amount']

df['amount_oldbalanceOrg'] = np.where(
    (df['oldbalanceOrg'] - df['amount']) == 0, 'equal', 'not equal')
df['orig_transaction_error'] = np.where(
    orig_residual.abs() <= balance_tol, 'no error', 'error')
df['dest_transaction_error'] = np.where(
    dest_residual.abs() <= balance_tol, 'no error', 'error')

# First character of each party's name, e.g. "C-M". `.str` also works on a
# categorical column whose categories are strings.
df['orig_dest'] = df['nameOrig'].str[0] + '-' + df['nameDest'].str[0]

# np.select picks the first matching condition, mirroring the if/elif chain
# `step/24 < 1`, `step/168 < 1`, `step/744 < 1`, else 'month'.
df['transaction_duration'] = np.select(
    [df['step'] < 24, df['step'] < 168, df['step'] < 744],
    ['less than one day', 'less than a week', 'less than a month'],
    default='month')

In [145]:
# Column dtypes plus the deep memory footprint of `df` at this point.
df.info(memory_usage="deep")

In [144]:
# Store the five engineered label columns as categoricals.
convert_columns_to_catg(
    df,
    column_list=[
        "amount_oldbalanceOrg",
        "orig_transaction_error",
        "dest_transaction_error",
        "orig_dest",
        "transaction_duration",
    ],
)

In [146]:
# Persist the processed frame. index=False keeps the row index out of the
# file; otherwise it is written as an extra unnamed first column that shows
# up as "Unnamed: 0" when the CSV is read back.
df.to_csv('processed_data.csv', index=False)

In [96]:
# Percentage of rows with isFraud == 1 and with isFraud == 0, respectively.
total_rows = df.shape[0]
for message, fraud_value in (('Fraud transaction = {}', 1),
                             ('Not Fraud transaction = {}', 0)):
    matching_rows = df[df['isFraud'] == fraud_value].shape[0]
    print(message.format((matching_rows / total_rows)*100))

In [104]:
def _fraud_share_by(frame, column):
    """Summarise fraud share and row share per value of ``column``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``column`` and an 'isFraud' column that can be summed.
    column : str
        Column to group by.

    Returns
    -------
    pandas.DataFrame
        One row per value of ``column`` with the columns
        'index'   -- the value of ``column``,
        'isFraud' -- percentage of all fraud rows that fall in this group,
        'count'   -- percentage of all rows that fall in this group.
    """
    # Computed from the data instead of the former hard-coded 8213, so the
    # percentages stay correct on a filtered or different dataset.
    fraud_total = frame['isFraud'].sum()
    summary = pd.DataFrame({
        'isFraud': (frame.groupby(column)['isFraud'].agg('sum') / fraud_total) * 100,
        'count': (frame[column].value_counts() / frame.shape[0]) * 100,
    })
    # Pin the label column to 'index': pandas >= 2.0 names the value_counts
    # index after the column, so reset_index would otherwise emit `column`
    # instead of 'index', which the plotting cells pass as x='index'.
    return summary.rename_axis('index').reset_index(level=0)


trans_type = _fraud_share_by(df, 'type')

transaction_duration = _fraud_share_by(df, 'transaction_duration')

orig_transaction_error = _fraud_share_by(df, 'orig_transaction_error')

amount_oldbalanceOrg = _fraud_share_by(df, 'amount_oldbalanceOrg')

dest_transaction_error = _fraud_share_by(df, 'dest_transaction_error')

orig_dest = _fraud_share_by(df, 'orig_dest')

In [131]:
# Fraud share and row share per value of 'isFlaggedFraud'.
# The fraud total is computed from the data instead of the former hard-coded
# 8213, so the percentages stay correct if `df` is filtered or replaced.
fraud_total = df['isFraud'].sum()
flag = pd.DataFrame({'isFraud' : (df.groupby('isFlaggedFraud')['isFraud'].agg('sum') / fraud_total) * 100,
             'count': (df['isFlaggedFraud'].value_counts()/df.shape[0]) * 100})
# Name the label column 'index' explicitly; pandas >= 2.0 would otherwise
# name it after the grouped column.
flag = flag.rename_axis('index').reset_index(level=0)
flag

In [110]:
# Row count per value of 'type'. The column is passed by keyword: since
# seaborn 0.12 the only positional argument of countplot is `data`, so a
# positional Series is no longer interpreted as `x`.
sns.countplot(x='type', data=df, palette=['#008000', '#556B2F', '#808000', '#6B8E23', "#9ACD32"])

In [114]:
# Fraud share per value in `trans_type`. The labels sit in the first column
# (the former index after reset_index); referencing it by position avoids
# depending on whether pandas named that column 'index' or 'type'.
sns.pointplot(x=trans_type.columns[0], y='isFraud', data=trans_type, palette=['#008000'])

In [117]:
# Row count per value of 'orig_dest'. The column is passed by keyword: since
# seaborn 0.12 the only positional argument of countplot is `data`, so a
# positional Series is no longer interpreted as `x`.
sns.countplot(x='orig_dest', data=df, palette=['#008000', '#556B2F', '#808000', '#6B8E23', "#9ACD32"])

In [118]:
# Display the `orig_dest` summary frame.
orig_dest 

In [119]:
# Row count per value of 'orig_transaction_error'. The column is passed by
# keyword: since seaborn 0.12 the only positional argument of countplot is
# `data`, so a positional Series is no longer interpreted as `x`.
sns.countplot(x='orig_transaction_error', data=df, palette=['#008000', '#556B2F', '#808000', '#6B8E23', "#9ACD32"])

In [141]:
# Display the `orig_transaction_error` summary frame.
orig_transaction_error

In [140]:
# Fraud share per value in `amount_oldbalanceOrg`. The labels sit in the first
# column (the former index after reset_index); referencing it by position
# avoids depending on whether pandas named that column 'index' or after the
# grouped column.
sns.barplot(x = amount_oldbalanceOrg.columns[0],
            y = 'isFraud',
            data = amount_oldbalanceOrg, palette=[ '#6B8E23', "#9ACD32"])

In [132]:
# Display the `amount_oldbalanceOrg` summary frame.
amount_oldbalanceOrg