In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import average_precision_score
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance, to_graphviz
from tqdm import tqdm

In [None]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True,
                          save_path='confusion.png'):
    """
    Draw a confusion matrix as an annotated heat map, save it and show it.

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix
                  (any 2-D array-like of counts; rows are drawn as true
                  labels, columns as predicted labels)

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low'];
                  pass None to keep matplotlib's default ticks

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues
                  (None selects 'Blues')

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions (each row divided by its sum)

    save_path:    file the figure is written to; None skips saving

    Returns
    -------
    None. The figure is written to save_path (if given) and displayed.

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools

    # Accept plain nested lists as well as ndarrays.
    cm = np.asarray(cm)

    # Accuracy is always computed from the raw counts, before any normalisation.
    total = cm.sum()
    accuracy = np.trace(cm) / float(total) if total else float('nan')
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class without any true samples has an all-zero row; keep it at 0
        # instead of dividing by zero and filling the plot with NaN.
        values = np.divide(cm, row_totals,
                           out=np.zeros(cm.shape, dtype=float),
                           where=row_totals != 0)
    else:
        values = cm

    plt.figure(figsize=(8, 6))
    # Colour the cells with the same values that are printed in them, so the
    # colour bar and the white/black text threshold refer to one scale.
    plt.imshow(values, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells darker than the threshold get white text so the number stays readable.
    thresh = values.max() / 1.5 if normalize else values.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(values.shape[0]), range(values.shape[1])):
        plt.text(j, i, cell_format.format(values[i, j]),
                 horizontalalignment="center",
                 color="white" if values[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    # Lay out after the labels exist so they are included in the fitted area.
    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path)
    plt.show()

In [None]:
# Read the transactions file and give the four balance columns a consistent
# camel-case spelling.
balance_column_names = {
    'oldbalanceOrg': 'oldBalanceOrig',
    'newbalanceOrig': 'newBalanceOrig',
    'oldbalanceDest': 'oldBalanceDest',
    'newbalanceDest': 'newBalanceDest',
}
df = pd.read_csv('/content/idc4022cMod8Data.csv').rename(columns=balance_column_names)

In [None]:
df.head()

In [None]:
# Distinct transaction types that appear among the rows flagged as fraud.
fraud_rows = df['isFraud'] == 1
df.loc[fraud_rows, 'type'].drop_duplicates().values

In [None]:
# Keep only TRANSFER and CASH_OUT transactions.
# The explicit .copy() makes df an independent frame, so the column assignments
# in later cells (df['Fraud_Heuristic'], df['hour'], ...) write to a real frame
# instead of a slice of the loaded data and do not raise SettingWithCopyWarning.
df = df[df['type'].isin(['TRANSFER', 'CASH_OUT'])].copy()

In [None]:
len(df)

In [None]:
# Median amount of the TRANSFER rows that are flagged as fraud.
fraud_transfer_rows = (df['isFraud'] == 1) & (df['type'] == 'TRANSFER')
df.loc[fraud_transfer_rows, 'amount'].median()

In [None]:
# Median amount of the TRANSFER rows that are not flagged as fraud.
genuine_transfer_rows = (df['isFraud'] == 0) & (df['type'] == 'TRANSFER')
df.loc[genuine_transfer_rows, 'amount'].median()

In [None]:
# Rule-of-thumb baseline: label a row 1 when it is a TRANSFER of more than
# 200000, otherwise 0.
is_large_transfer = (df['type'] == 'TRANSFER') & (df['amount'] > 200000)
df['Fraud_Heuristic'] = np.where(is_large_transfer, 1, 0)

In [None]:
df['Fraud_Heuristic'].sum()

In [None]:
from sklearn.metrics import f1_score

In [None]:
f1_score(y_pred=df['Fraud_Heuristic'],y_true=df['isFraud'])

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
cm = confusion_matrix(y_pred=df['Fraud_Heuristic'],y_true=df['isFraud'])

In [None]:
plot_confusion_matrix(cm,['Genuine','Fraud'], normalize=False)

In [None]:
df.shape

In [None]:
# Fold the step counter into a 0-23 bucket.
# NOTE(review): this treats one step as one hour — confirm against the data set description.
df['hour'] = df['step'] % 24

In [None]:
# Number of fraudulent and genuine transactions for each hour bucket 0..23,
# as plain Python lists indexed by hour.
# One value_counts() per class replaces 48 boolean scans over the whole frame;
# reindex() keeps hours without any transaction in the result as explicit zeros.
hours_of_day = list(range(24))
frauds = (df.loc[df['isFraud'] == 1, 'hour']
            .value_counts()
            .reindex(hours_of_day, fill_value=0)
            .tolist())
genuine = (df.loc[df['isFraud'] == 0, 'hour']
             .value_counts()
             .reindex(hours_of_day, fill_value=0)
             .tolist())

In [None]:
sns.set_style("white")

# Each curve is divided by its own total, so it shows how that class's
# transactions are distributed over the 24 hour buckets.
genuine_share = np.asarray(genuine) / np.sum(genuine)
fraud_share = np.asarray(frauds) / np.sum(frauds)

fig, ax = plt.subplots(figsize=(10,6))
gen = ax.plot(genuine_share, label='Genuine')
fr = ax.plot(fraud_share, dashes=[5, 2], label='Fraud')
ax.set_xticks(np.arange(24))
ax.set_xlabel('Hour bucket (step % 24)')
ax.set_ylabel('Share of the class total')
ax.set_title('Hourly distribution of genuine and fraudulent transactions')
legend = ax.legend(loc='upper center', shadow=True)
fig.savefig('time.png')

In [None]:
sns.set_style("white")

# Fraction of all transactions in each hour bucket that are fraudulent.
# Buckets without any transaction stay NaN (a gap in the line) without
# triggering a divide-by-zero warning.
hourly_total = np.add(genuine, frauds)
fraud_rate = np.divide(frauds, hourly_total,
                       out=np.full(len(frauds), np.nan),
                       where=hourly_total != 0)

fig, ax = plt.subplots(figsize=(10,6))
frgen = ax.plot(fraud_rate, label='Share of fraud')
ax.set_xticks(np.arange(24))
ax.set_xlabel('Hour bucket (step % 24)')
ax.set_ylabel('Fraudulent / all transactions')
ax.set_title('Share of fraudulent transactions per hour')
legend = ax.legend(loc='upper center', shadow=True)
fig.savefig('time_comp.png')

In [None]:
# Fraud-flagged rows of type TRANSFER.
dfFraudTransfer = df.loc[(df['isFraud'] == 1) & (df['type'] == 'TRANSFER')]

In [None]:
# Fraud-flagged rows of type CASH_OUT.
dfFraudCashOut = df.loc[(df['isFraud'] == 1) & (df['type'] == 'CASH_OUT')]

In [None]:
# True if any fraudulent TRANSFER has a destination account that also appears
# as the originator of a fraudulent CASH_OUT.
fraud_cash_out_origins = dfFraudCashOut['nameOrig']
dfFraudTransfer['nameDest'].isin(fraud_cash_out_origins).any()

In [None]:
# All rows that are not flagged as fraud.
dfNotFraud = df.loc[df['isFraud'] == 0]

In [None]:
# All rows that are flagged as fraud.
dfFraud = df.loc[df['isFraud'] == 1]

In [None]:
# Fraudulent TRANSFER rows whose destination account is also the originator of
# a CASH_OUT that is not flagged as fraud.
genuine_cash_out_origins = dfNotFraud.loc[dfNotFraud['type'] == 'CASH_OUT',
                                          'nameOrig'].drop_duplicates()
dfFraudTransfer.loc[dfFraudTransfer['nameDest'].isin(genuine_cash_out_origins)]

In [None]:
# Share of fraudulent rows where a non-zero amount was moved but the
# destination balance is zero both before and after the transaction.
# `amount != 0` states the condition explicitly; the raw float column was
# previously AND-ed into the boolean mask, which relies on an implicit
# float -> bool coercion.
fraud_zero_dest = dfFraud[(dfFraud.oldBalanceDest == 0) &
                          (dfFraud.newBalanceDest == 0) &
                          (dfFraud.amount != 0)]
len(fraud_zero_dest) / (1.0 * len(dfFraud))

In [None]:
# Share of genuine rows where a non-zero amount was moved but the destination
# balance is zero both before and after the transaction.
# `amount != 0` states the condition explicitly; the raw float column was
# previously AND-ed into the boolean mask, which relies on an implicit
# float -> bool coercion.
genuine_zero_dest = dfNotFraud[(dfNotFraud.oldBalanceDest == 0) &
                               (dfNotFraud.newBalanceDest == 0) &
                               (dfNotFraud.amount != 0)]
len(genuine_zero_dest) / (1.0 * len(dfNotFraud))

In [None]:
# Rows where a non-zero amount was moved but the destination balance is zero
# both before and after the transaction.
# `amount != 0` states the condition explicitly instead of AND-ing the raw
# float column into the boolean mask.
dfOdd = df[(df.oldBalanceDest == 0) &
           (df.newBalanceDest == 0) &
           (df.amount != 0)]

In [None]:
# Fraction of the dfOdd rows that are flagged as fraud.
odd_fraud = dfOdd.loc[dfOdd['isFraud'] == 1]
len(odd_fraud) / len(dfOdd)

In [None]:
# Fraction of the dfOdd rows whose originating balance before the transaction
# does not exceed the transferred amount.
odd_emptied = dfOdd.loc[dfOdd['oldBalanceOrig'] <= dfOdd['amount']]
len(odd_emptied) / len(dfOdd)

In [None]:
# Among the fraudulent dfOdd rows: fraction whose originating balance before
# the transaction does not exceed the transferred amount.
odd_is_fraud = dfOdd['isFraud'] == 1
odd_balance_covered = dfOdd['oldBalanceOrig'] <= dfOdd['amount']
len(dfOdd.loc[odd_balance_covered & odd_is_fraud]) / len(dfOdd.loc[odd_is_fraud])

In [None]:
dfOdd.columns

In [None]:
dfOdd.head(20)

In [None]:
df.head()

In [None]:
# Prepend 'type_' to every transaction type so the dummy columns created from
# it get self-describing names. Re-running this cell adds the prefix again.
df['type'] = df['type'].astype(str).radd('type_')

In [None]:
# One-hot encode the transaction type.
# dtype=int keeps the indicator columns numeric: recent pandas versions return
# bool dummies by default, and a frame mixing bool and float columns turns into
# an object array when converted with .values further down.
dummies = pd.get_dummies(df['type'], dtype=int)

# Add dummies to df
df = pd.concat([df,dummies],axis=1)

# Remove the original column
del df['type']

Predictive modeling with Keras

In [None]:
# Remove the two account-name columns and the heuristic flag before modelling.
df = df.drop(columns=['nameOrig', 'nameDest', 'Fraud_Heuristic'])

In [None]:
# 1 for hour buckets 2 through 6 (both ends included), 0 for every other hour.
df['isNight'] = np.where(df['hour'].between(2, 6), 1, 0)

In [None]:
# Mean of isFraud (the fraud rate) over the rows marked as night.
df.loc[df['isNight'] == 1, 'isFraud'].mean()

In [None]:
df.head()

In [None]:
# The raw time columns are dropped; isNight remains as the only time feature.
df = df.drop(columns=['step', 'hour'])

In [None]:
df.head()

In [None]:
df.columns.values

In [None]:
# Separate the feature columns from the isFraud target.
x_df = df.drop(columns='isFraud')
y_df = df['isFraud']

In [None]:
# Convert to NumPy arrays for scikit-learn / Keras.
# The feature matrix is cast to float explicitly: if the frame mixes bool and
# float columns, a plain .values call returns an object array that the models
# cannot consume.
y = y_df.to_numpy()
X = x_df.to_numpy(dtype=float)

In [None]:
y.shape

In [None]:
X.shape

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the rows as the test set.
# stratify=y keeps the fraud/genuine ratio the same in both parts; with a rare
# positive class a purely random split can shift that ratio between the sets.
# NOTE(review): assumes fraud is a small minority of the rows — confirm with y.mean().
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.33,
                                                    random_state=42,
                                                    stratify=y)

In [None]:
# Carve 10% of the training rows off as a validation set.
# stratify=y_train keeps the fraud/genuine ratio the same in both parts.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                  test_size=0.1,
                                                  random_state=42,
                                                  stratify=y_train)

In [None]:
from imblearn.over_sampling import SMOTE, RandomOverSampler

In [None]:
# Resample with SMOTE using only the training split. X_val and X_test are not
# passed through the sampler, so they keep their original class balance.
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD

In [None]:
# Log reg
# A single unit followed by a sigmoid is logistic regression.
# The input width is read from the training matrix instead of being hard-coded,
# so the model keeps working when the feature set changes.
model = Sequential()
model.add(Dense(1, input_dim=X_train_res.shape[1]))
model.add(Activation('sigmoid'))

In [None]:
model.summary()

In [None]:
# Compile with binary cross-entropy loss, plain SGD at a learning rate of 1e-5,
# and accuracy reported as the training metric.
model.compile(loss='binary_crossentropy',
              optimizer=SGD(learning_rate=1e-5),
              metrics=['acc'])

In [None]:
# Train on the SMOTE-resampled training data; the validation data is the
# un-resampled X_val / y_val split.
model.fit(X_train_res,y_train_res,
          epochs=5,
          batch_size=256,
          validation_data=(X_val,y_val))

In [None]:
y_pred = model.predict(X_test)

In [None]:
# Turn the predicted probabilities into hard 0/1 labels.
# One >= comparison assigns every value; two strict comparisons would leave a
# probability of exactly 0.5 unchanged, which the metric functions then reject
# as a non-binary target. ravel() flattens the model's column output into the
# 1-D label vector the metrics expect.
y_pred = (y_pred >= 0.5).astype(int).ravel()

In [None]:
f1_score(y_pred=y_pred,y_true=y_test)

In [None]:
cm = confusion_matrix(y_pred=y_pred,y_true=y_test)

In [None]:
plot_confusion_matrix(cm,['Genuine','Fraud'], normalize=False)

In [None]:
# One hidden layer of 16 tanh units in front of the sigmoid output.
# The input width is read from the training matrix instead of being hard-coded.
model = Sequential()
model.add(Dense(16, input_dim=X_train_res.shape[1]))
model.add(Activation('tanh'))
model.add(Dense(1))
model.add(Activation('sigmoid'))

In [None]:
# Compile with binary cross-entropy loss, plain SGD at a learning rate of 1e-4,
# and accuracy reported as the training metric.
model.compile(loss='binary_crossentropy',optimizer=SGD(learning_rate=1e-4), metrics=['acc'])

In [None]:
# Train on the SMOTE-resampled training data; the validation data is the
# un-resampled X_val / y_val split.
model.fit(X_train_res,y_train_res,
          epochs=5, batch_size=256,
          validation_data=(X_val,y_val))

In [None]:
y_pred = model.predict(X_test)

In [None]:
# Turn the predicted probabilities into hard 0/1 labels.
# One >= comparison assigns every value; two strict comparisons would leave a
# probability of exactly 0.5 unchanged, which the metric functions then reject
# as a non-binary target. ravel() flattens the model's column output into the
# 1-D label vector the metrics expect.
y_pred = (y_pred >= 0.5).astype(int).ravel()

In [None]:
f1_score(y_pred=y_pred,y_true=y_test)

In [None]:
cm = confusion_matrix(y_pred=y_pred,y_true=y_test)

In [None]:
plot_confusion_matrix(cm,['Genuine','Fraud'], normalize=False)

# Tree-based methods

In [None]:
from sklearn.tree import export_graphviz

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fit an unconstrained decision tree on the original (not resampled) training data.
# random_state is fixed, as for the data splits, so repeated runs build the same tree.
dtree=DecisionTreeClassifier(random_state=42)
dtree.fit(X_train,y_train)

In [None]:
from six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
from subprocess import check_call
# NOTE: PIL's Image replaces IPython's Image imported above; the IPython
# class stays reachable as PImage.
from PIL import Image, ImageDraw, ImageFont
from IPython.display import Image as PImage

# Write the top three levels of the fitted tree as Graphviz source.
# export_graphviz returns None when out_file is given, so its result is not kept.
with open("tree1.dot", 'w') as f:
    export_graphviz(dtree,
                    out_file=f,
                    max_depth=3,
                    impurity=True,
                    feature_names=list(df.drop(['isFraud'], axis=1)),
                    class_names=['Genuine', 'Fraud'],
                    rounded=True,
                    filled=True)

# Convert .dot to .png to allow display in web notebook
# (requires the Graphviz `dot` executable on PATH).
check_call(['dot','-Tpng','tree1.dot','-o','tree1.png'])

# Annotating chart with PIL
# Nothing is drawn yet: draw and font are only prepared, and the image is saved
# unchanged under a new name. A missing font file falls back to PIL's built-in
# font instead of aborting the cell.
img = Image.open("tree1.png")
draw = ImageDraw.Draw(img)
try:
    font = ImageFont.truetype('/usr/share/fonts/truetype/liberation/LiberationSerif-Bold.ttf', 26)
except OSError:
    font = ImageFont.load_default()
img.save('sample-out.png')
PImage("sample-out.png")


In [None]:
from sklearn.ensemble import  RandomForestClassifier

In [None]:
# Ten-tree random forest on the original (not resampled) training data, using
# all CPU cores. random_state is fixed, as for the data splits, so repeated
# runs give the same forest and the same scores.
rf = RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=42)
rf.fit(X_train,y_train)

In [None]:
y_pred = rf.predict(X_test)

In [None]:
f1_score(y_pred=y_pred,y_true=y_test)

In [None]:
cm = confusion_matrix(y_pred=y_pred,y_true=y_test)
plot_confusion_matrix(cm,['Genuine','Fraud'], normalize=False)

In [None]:
import xgboost as xgb

In [None]:
# Gradient-boosted trees with default hyper-parameters on the original
# (not resampled) training data; fit() hands back the fitted estimator.
booster = xgb.XGBClassifier(n_jobs=-1).fit(X_train, y_train)

In [None]:
y_pred = booster.predict(X_test)

In [None]:
f1_score(y_pred=y_pred,y_true=y_test)

In [None]:
cm = confusion_matrix(y_pred=y_pred,y_true=y_test)
plot_confusion_matrix(cm,['Genuine','Fraud'], normalize=False)

# Entity embeddings

In [None]:
# Reload data
# Start again from the raw file (all transaction types) and apply the same
# balance-column renaming as at the top of the notebook.
balance_column_names = {
    'oldbalanceOrg': 'oldBalanceOrig',
    'newbalanceOrig': 'newBalanceOrig',
    'oldbalanceDest': 'oldBalanceDest',
    'newbalanceDest': 'newBalanceDest',
}
df = pd.read_csv('/content/idc4022cMod8Data.csv').rename(columns=balance_column_names)

In [None]:
df.head()

In [None]:
# Remove the account-name columns and the step counter.
df = df.drop(columns=['nameDest', 'nameOrig', 'step'])

In [None]:
df['type'].unique()

In [None]:
# Give every transaction type an integer token, numbered in order of first
# appearance in the column.
map_dict = {value: token for token, value in enumerate(df['type'].unique())}

In [None]:
map_dict

In [None]:
# Replace each transaction type by its integer token.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on the df["type"] selection acts on an intermediate Series and no longer
# updates df under pandas Copy-on-Write. astype('int64') fails loudly if a value
# has no token (for example when this cell is re-run on already encoded data).
df['type'] = df['type'].map(map_dict).astype('int64')

In [None]:
df.head()

In [None]:
# Every column except the embedded 'type' feature and the 'isFraud' target.
other_cols = [c for c in df.columns if c not in ('type', 'isFraud')]

In [None]:
other_cols

In [None]:
from keras.models import Model
from keras.layers import Embedding, Dense, Activation, Reshape, Input, Concatenate

In [None]:
# Vocabulary size of the embedding (number of distinct type tokens) and the
# length of the vector learned for each token.
type_embedding_dim = 3
num_types = df['type'].nunique(dropna=False)

In [None]:
inputs = []
outputs = []

In [None]:
# Branch 1: the integer type token is looked up in a learned embedding table
# and flattened to a vector of length type_embedding_dim.
# The deprecated input_length argument is dropped; the sequence length of 1
# already follows from the Input shape.
type_in = Input(shape=(1,))
type_embedding = Embedding(num_types, type_embedding_dim)(type_in)
type_out = Reshape(target_shape=(type_embedding_dim,))(type_embedding)

type_model = Model(type_in,type_out)

# Registered for the merged model. Re-running this cell appends the branch again.
inputs.append(type_in)
outputs.append(type_out)

In [None]:
num_rest = len(other_cols)

In [None]:
# Branch 2: all remaining columns go through one Dense layer of 16 units.
# No activation is passed, so this projection is linear.
rest_in = Input(shape = (num_rest,))
rest_out = Dense(16)(rest_in)

rest_model = Model(rest_in,rest_out)

# Registered for the merged model. Re-running this cell appends the branch again.
inputs.append(rest_in)
outputs.append(rest_out)

In [None]:
concatenated = Concatenate()(outputs)

In [None]:
# Head of the merged model: a 16-unit sigmoid hidden layer followed by a single
# sigmoid output unit.
# The output layer is fed from the hidden layer `x`; wiring it to
# `concatenated` would bypass the hidden layer and leave it out of the model.
x = Dense(16)(concatenated)
x = Activation('sigmoid')(x)
x = Dense(1)(x)
model_out = Activation('sigmoid')(x)

In [None]:
# Build the two-input model from the collected branch inputs and compile it
# with binary cross-entropy loss, the Adam optimizer and accuracy as metric.
merged_model = Model(inputs, model_out)
merged_model.compile(loss='binary_crossentropy',
                     optimizer='adam',
                     metrics=['accuracy'])

In [None]:
types = df['type']

In [None]:
rest = df[other_cols]

In [None]:
target = df['isFraud']

In [None]:
# Fit for one epoch on the complete frame. The inputs are passed in the same
# order as the branches were registered (type tokens first, then the rest).
# No validation data or hold-out split is used in this call.
history = merged_model.fit([types.values,rest.values],target.values,
                           epochs = 1, batch_size = 128)

In [None]:
merged_model.summary()