In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras.layers import Dense,Dropout
from tensorflow.keras.models import Sequential
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import seaborn as sns
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
from scipy.stats import chi2_contingency
from scipy.stats import f_oneway
import tensorflow as tf
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
import xgboost as xgb

In [None]:
# Read the raw transactions once and keep that frame untouched as `data_copy`
# (the CatBoost cell reads from it); `data` is an independent copy that the
# encoding cells below modify in place.
data_copy = pd.read_csv("/kaggle/input/fraud-detection-datafest/Copy of FraudDetectionDataset.csv")
data = data_copy.copy()

In [None]:
# Display the dtype of every column in the loaded frame.
data.dtypes

# **Data Visualization and Feature Engineering**

In [None]:
# Columns treated as numeric inputs.
numeric_features  =['Transaction ID','User ID','Transaction Amount',
                    'Merchant ID','User Age','User Income','Location Distance',
                    'Time Taken for Transaction',"User's Transaction History",
                    "Merchant's Reputation Score","User's Credit Score",
                    "Merchant's Business Age"] 

# Every input column used by the models (the target "Fraudulent Flag" is not included).
all_features = ['Transaction ID', 'User ID', 'Transaction Amount',
       'Transaction Date and Time', 'Merchant ID', 'Payment Method',
       'Country Code', 'Transaction Type', 'Device Type',
       'Browser Type', 'Operating System', 'Merchant Category', 'User Age',
       'User Occupation', 'User Income', 'User Gender', 'User Account Status',
       'Transaction Status', 'Location Distance', 'Time Taken for Transaction',
       'Transaction Time of Day', "User's Transaction History",
       "Merchant's Reputation Score", "User's Device Location",
       'Transaction Currency', 'Transaction Purpose', "User's Credit Score",
       "User's Email Domain", "Merchant's Business Age",
       'Transaction Authentication Method']

# Everything in all_features that is not numeric, kept in all_features order.
# A set difference would give an order that depends on string hashing and can
# change between kernel restarts, which makes slices such as
# categorical_features[0:10] select different columns on every run.
_numeric_lookup = set(numeric_features)
categorical_features = [name for name in all_features if name not in _numeric_lookup]


In [None]:
# Summary statistics for the columns listed in numeric_features.
data[numeric_features].describe()

In [None]:
# describe() output for the columns listed in categorical_features.
data[categorical_features].describe()

In [None]:
# Class balance of the target: number of rows per "Fraudulent Flag" value.
fig, ax = plt.subplots()
sns.countplot(x="Fraudulent Flag", data=data, ax=ax)
plt.show()

In [None]:
#converting the  categorical variables to numbers

def convert(data,columns):
    """Label-encode the given columns of ``data`` in place.

    Each distinct value of a column is replaced by an integer code assigned in
    order of first appearance (the order returned by ``Series.unique()``).

    Args:
        data: DataFrame whose columns are overwritten with their integer codes.
        columns: iterable of column names to encode.

    Returns:
        dict mapping every encoded column name to its ``{original value: code}``
        lookup, so the encoding can be inspected or reversed later.
    """
    allmap = {}
    for column in columns:
        mapdict = {value: code for code, value in enumerate(data[column].unique())}
        data[column] = data[column].map(mapdict)
        print("done with:{}".format(column))
        allmap[column] = mapdict
    # Return the lookups of all columns; returning `mapdict` here would hand
    # back only the mapping of the last column processed.
    return allmap
# Encode the categorical columns of `data` in place and keep the value convert() returns.
lookup_table= convert(data,categorical_features)

In [None]:
# Scatter each of the first ten categorical columns against the fraud label,
# one figure per column; the loop position is printed before each figure.
for position, column in enumerate(categorical_features[:10]):
    plt.figure(figsize=(20, 7))
    sns.scatterplot(x=column, y="Fraudulent Flag", data=data)
    print(position)
    plt.show()

In [None]:
# investigating categories in fradulent transactions

# One figure per column: distribution of the (encoded) category values among
# the rows flagged as fraudulent.
for i, q in enumerate(categorical_features[0:12]):
    fig, ax = plt.subplots(figsize=(20, 7))
    dat_fraud = data[(data["Fraudulent Flag"] == 1)][q]
    # sns.distplot is deprecated; a density-normalised histplot with a KDE
    # overlay is the documented replacement for its default output.
    sns.histplot(dat_fraud, kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
    ax.set_title("Fraudulent transactions: {}".format(q))
    print(i)
    plt.show()

**From all indications, the fraudulent cases appear to be uniformly distributed across the classes of each categorical column in the dataset, with almost no glaring predictors.**

In [None]:
# investigating categories in non-fradulent transactions

# One figure per column: distribution of the (encoded) category values among
# the rows NOT flagged as fraudulent.
for i, q in enumerate(categorical_features[0:12]):
    fig, ax = plt.subplots(figsize=(20, 7))
    dat_genuine = data[(data["Fraudulent Flag"] == 0)][q]
    # sns.distplot is deprecated; a density-normalised histplot with a KDE
    # overlay is the documented replacement for its default output.
    sns.histplot(dat_genuine, kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
    ax.set_title("Non-fraudulent transactions: {}".format(q))
    print(i)
    plt.show()

**From all indications, the non-fraudulent cases also appear to be uniformly distributed across the classes of each categorical column, just as we saw for the fraudulent ones.**

# Data preparation and categorical preprocessing with Target Encoding 

In [None]:
from category_encoders import TargetEncoder

# Fit the target encoder on a reproducible 54.8% sample of the rows, then
# apply it to every row of `data`.
encodeX = data.sample(frac=0.548, random_state=945)
encodeY = encodeX["Fraudulent Flag"]

# `cols` has to be given explicitly: with the default cols=None the encoder
# only picks up string/category columns, and these columns already hold
# integer codes after the label-encoding step, so they would otherwise be
# passed through unchanged.
encoder = TargetEncoder(cols=categorical_features)
encoder.fit(encodeX[categorical_features], encodeY)
encoded = encoder.transform(data[categorical_features])
data[categorical_features] = encoded[categorical_features]

In [None]:
# Preview the first rows of `data` after the encoding cell.
data.head()

In [None]:
# NOTE(review): same preview as the previous cell; `data` is not modified in between.
data.head()

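In the target-encoding cell above we fit the encoder on a random sample (about 55%) of the dataset rather than on all of it, in order to limit data leakage. Note that rows from this sample can still end up in the test split created later, so the leakage is reduced rather than eliminated.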

# Model Training and Evaluation

 ## Approach 1- Plain Logistic Regression classifier

In [None]:
# The data is scaled with StandardScaler and reduced to 15 dimensions with
# Principal Component Analysis. The split happens FIRST and the scaler/PCA are
# fit on the training rows only, so no statistics of the held-out rows leak
# into the preprocessing. The same random_state gives the same row split as
# splitting the transformed array would.
x = data[all_features]
y = data["Fraudulent Flag"].values
x_train_raw, x_test_raw, y_train, y_test = train_test_split(x, y, random_state=97)

Scaler = StandardScaler()
pca = PCA(n_components=15)
x_train = pca.fit_transform(Scaler.fit_transform(x_train_raw))
x_test = pca.transform(Scaler.transform(x_test_raw))

clf = LogisticRegression(solver="lbfgs", random_state=23).fit(x_train, y_train)

y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))

# Row-normalised confusion matrix: each row shows how one true class was predicted.
fig, ax = plt.subplots()
sns.heatmap(confusion_matrix(y_test, y_pred, normalize='true'), annot=True, ax=ax)
ax.set_title("Confusion Matrix")
ax.set_ylabel("Real value")
ax.set_xlabel("predicted value")
plt.show()

This model is about 70% accurate at identifying both legitimate and fraudulent transactions.

 ## Approach 2- Plain Decision Tree classifier

In [None]:
# Depth-limited decision tree trained on the same train/test split.
tree = DecisionTreeClassifier(random_state=1, max_depth=7)
tree.fit(x_train, y_train)

y_pred3 = tree.predict(x_test)
print(classification_report(y_test, y_pred3))

# Row-normalised confusion matrix of the tree's test predictions.
fig, ax = plt.subplots()
sns.heatmap(confusion_matrix(y_test, y_pred3, normalize='true'), ax=ax, annot=True)
ax.set(title="Confusion Matrix", ylabel="Real value", xlabel="predicted value")
plt.show()

 ## Approach 3- Ridge classifier and grid search

In [None]:
# Sweep the regularisation strength alpha of a ridge classifier and record the
# test accuracy of each fit so the best performer can be picked.
alphas = list(np.arange(0.01, 0.9, 0.05))
print(alphas)
model = []
accuracies = []
seeds = []
# One fixed seed for every fit: alpha is then the only thing that varies
# between runs, and the sweep gives the same numbers after a kernel restart.
seed = 23
for a in alphas:
    seeds.append(seed)
    ridge = RidgeClassifier(alpha=a, random_state=seed, solver="saga").fit(x_train, y_train)
    y_pred4 = ridge.predict(x_test)
    print(classification_report(y_test, y_pred4))
    score = accuracy_score(y_test, y_pred4)

    # Row-normalised confusion matrix for this alpha.
    fig, ax = plt.subplots()
    sns.heatmap(confusion_matrix(y_test, y_pred4, normalize='true'), annot=True, ax=ax)
    ax.set_title("Confusion Matrix")
    ax.set_ylabel("Real value")
    ax.set_xlabel("predicted value")
    plt.show()
    accuracies.append(score)
    model.append(ridge)

**After running the grid search, the accuracy and F1 score were about 70% for every value of the hyperparameter alpha, i.e. essentially the same.**

 ## Approach 4- Xgb classifier

In [None]:
# Gradient-boosted trees with default hyperparameters on the same split.
xgb_model = xgb.XGBClassifier(random_state=970, objective="binary:logistic")
xgb_model.fit(x_train, y_train)

y_pred2 = xgb_model.predict(x_test)
print(classification_report(y_test, y_pred2))

# Row-normalised confusion matrix of the test predictions.
fig, ax = plt.subplots()
sns.heatmap(confusion_matrix(y_test, y_pred2, normalize='true'), ax=ax, annot=True)
ax.set(title="Confusion Matrix", ylabel="Real value", xlabel="predicted value")
plt.show()

**These results are almost identical to those of the other models. In the next cell we vary two hyperparameters, learning rate and min child weight, to see whether they can improve performance.**

In [None]:
# Grid search with the Xgb classifier: one subplot per min_child_weight value,
# each showing train/test accuracy as a function of the learning rate.
learning_rate_range = np.arange(0.01, 0.2, 0.05)
min_child_weight_range = np.arange(0, 3, 0.5)
fig = plt.figure(figsize=(19, 17))
idx = 1
# Number of fits = (#min_child_weight values) x (#learning rates).
print("Total number of trials:{}".format(len(min_child_weight_range) * len(learning_rate_range)))
for s, weight in enumerate(min_child_weight_range):

    # Accuracy curves for the current min_child_weight; reset on every outer
    # iteration, so after the loop they hold the last weight's values only.
    train = []
    test = []
    for lr in learning_rate_range:
        xgb_classifier = xgb.XGBClassifier(objective="binary:logistic",eta = lr, reg_lambda=1, min_child_weight=weight,random_state=900)
        xgb_classifier.fit(x_train, y_train)
        train.append(xgb_classifier.score(x_train, y_train))
        test.append(xgb_classifier.score(x_test, y_test))
    print("done with {} trials".format(len(learning_rate_range)*(s+1)))
    fig.add_subplot(3, 3, idx)
    idx += 1
    plt.plot(learning_rate_range, train, c='orange', label='Training')
    plt.plot(learning_rate_range, test, c='m', label='Testing')
    plt.xlabel('Learning rate')
    plt.xticks(learning_rate_range)
    plt.ylabel('Accuracy score')
    plt.ylim(0.6, 1)
    plt.legend(prop={'size': 12}, loc=3)
    title = "Min child weight:" + str(weight)
    plt.title(title, size=16)
plt.show()

In [None]:
# `test` is re-created for every min_child_weight in the grid-search loop, so
# this prints the test accuracies of the last min_child_weight value only.
print(test)

 ## Approach 5- Autoencoder representations feeding a logistic regression classifier

This approach trains an autoencoder, a feed-forward network whose input and output layers have the same shape and whose goal is to reconstruct its input, learning a richer hidden representation of the data along the way.
First, the preprocessing creates two variables, x_norm and x_fraud, which hold the features of the records where the transaction is genuine and fraudulent, respectively.
Next, the autoencoder is trained on x_norm alone (so that it can better distinguish the unseen x_fraud features), and its first 3 layers are added to a new feed-forward network which predicts the hidden representation of x_norm and x_fraud.
Finally, the new representations of x_norm and x_fraud are fed into a logistic regression classifier.

In [None]:
# this cell contains all our neural network helper functions
def preprocessing(data,features, n_components=8):
    """Split ``data`` by class and return scaled, PCA-reduced feature matrices.

    The scaler is fit on the rows of both classes together; the PCA is fit on
    the genuine rows only and then applied to the fraudulent rows.

    Args:
        data: DataFrame containing ``features`` and a "Fraudulent Flag" column.
        features: list of column names to use as inputs.
        n_components: number of principal components to keep (default 8).

    Returns:
        (x_norm, x_fraud): arrays for the genuine (flag == 0) and fraudulent
        (flag == 1) rows, each with ``n_components`` columns.
    """
    # Genuine rows are flag == 0; selecting == 1 for both would make x_norm
    # and x_fraud the very same fraudulent rows.
    dat_norm = data[(data["Fraudulent Flag"] == 0)][features]
    dat_fraud = data[(data["Fraudulent Flag"] == 1)][features]

    # Fit and transform on plain arrays throughout so the scaler is used the
    # same way it was fitted.
    scaly = StandardScaler()
    scaly.fit(pd.concat([dat_norm, dat_fraud]).values)
    x_norm = scaly.transform(dat_norm.values)
    x_fraud = scaly.transform(dat_fraud.values)

    pca = PCA(n_components=n_components)
    x_norm = pca.fit_transform(x_norm)
    x_fraud = pca.transform(x_fraud)
    return x_norm, x_fraud


def build_net_no_dropout(input_size):
    """Build an (uncompiled) feed-forward binary classifier without dropout.

    Args:
        input_size: number of units in the first Dense layer.

    Returns:
        A Keras ``Sequential`` model ending in a single sigmoid unit.
    """
    model = Sequential()
    model.add(Dense(input_size,activation = 'relu'))
    # Flatten lives in tf.keras.layers; `tf.Flatten` does not exist and raised
    # an AttributeError as soon as this function was called.
    model.add(tf.keras.layers.Flatten())
    model.add(Dense(64, activation = "tanh"))
    model.add(Dense(128, activation = "relu"))
    model.add(Dense(32, activation = "tanh"))
    model.add(Dense(1, activation = 'sigmoid'))
    return model

def build_autoencoder(input_size):
    """Build an (uncompiled) autoencoder whose output has ``input_size`` units.

    Layer order: input_size (tanh) -> 150 (tanh) -> 26 (relu) -> 150 (tanh)
    -> 150 (tanh) -> input_size (linear). The first three layers form the
    encoder reused by the hidden-representation model.

    Args:
        input_size: number of features of the data to reconstruct.

    Returns:
        A Keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(input_size,activation = 'tanh'))
    model.add(Dense(150, activation = "tanh"))
    model.add(Dense(26, activation = "relu"))
    model.add(Dense(150, activation = "tanh"))
    model.add(Dense(150, activation = "tanh"))
    # Linear output: the network is trained to reproduce standardized,
    # PCA-transformed inputs, which take negative values that a ReLU output
    # could never produce.
    model.add(Dense(input_size, activation = 'linear'))
    return model

def build_net_with_dropout(input_size):
    """Build an (uncompiled) L1-regularised binary classifier with dropout.

    Args:
        input_size: number of units in the first Dense layer.

    Returns:
        A Keras ``Sequential`` model ending in a single sigmoid unit.
    """
    def l1_penalty():
        # A fresh regulariser instance per layer, all with the same strength.
        return tf.keras.regularizers.L1(0.01)

    return Sequential([
        Dense(input_size, activation='relu', kernel_regularizer=l1_penalty()),
        Dropout(0.5),
        Dense(128, activation="relu", kernel_regularizer=l1_penalty()),
        Dropout(0.7),
        Dense(64, activation="tanh", kernel_regularizer=l1_penalty()),
        Dense(1, activation='sigmoid'),
    ])

def model_run(model,train_x,val_y, train_y, val_x, num_epochs, optimizer="adadelta"):
    """Compile ``model`` for binary classification and train it.

    Note the positional order of the data arguments: ``train_x``, ``val_y``,
    ``train_y``, ``val_x``.

    Args:
        model: Keras model to compile and fit.
        train_x: training inputs.
        val_y: validation targets.
        train_y: training targets.
        val_x: validation inputs.
        num_epochs: number of training epochs.
        optimizer: optimizer name or instance passed to ``compile``. Defaults
            to "adadelta", the optimizer this function has always trained with.

    Returns:
        The ``History`` object returned by ``model.fit``.
    """
    loss = tf.keras.losses.BinaryCrossentropy()
    # An Adam instance used to be created here but was never passed to
    # compile(); the optimizer is now an explicit argument instead.
    model.compile(loss=loss, optimizer=optimizer, metrics=["accuracy"])
    #training
    history = model.fit(train_x,train_y, epochs = num_epochs,batch_size= 70, verbose = 1,validation_data = (val_x,val_y))
    return history

def auto_model_run(model,train_x, num_epochs):
    """Compile ``model`` with an MSE loss and train it to reproduce ``train_x``.

    Args:
        model: Keras model whose output shape matches ``train_x``.
        train_x: array used as both the inputs and the targets.
        num_epochs: number of training epochs.

    Returns:
        The ``History`` object returned by ``model.fit``.
    """
    model.compile(optimizer="adadelta", loss="mse", metrics=["accuracy"])
    # The inputs double as the targets; the last 20% of the rows are held out
    # by Keras for validation.
    return model.fit(
        x=train_x,
        y=train_x,
        batch_size=400,
        epochs=num_epochs,
        validation_split=0.2,
        verbose=1,
    )
def plotter(history,metric):
    """Plot the training and validation curves of ``metric`` over the epochs.

    Args:
        history: object with a ``history`` dict holding ``metric`` and
            ``'val_' + metric`` series (as returned by ``model.fit``).
        metric: name of the series to plot, e.g. "loss".
    """
    val_metric = 'val_' + metric
    for key in (metric, val_metric):
        plt.plot(history.history[key])
    plt.xlabel('Epochs')
    plt.ylabel(metric)
    plt.legend([metric, val_metric])
    plt.show()


In [None]:
#autoencoder
# Build the two PCA-reduced feature matrices, fit the autoencoder on x_norm
# only, then plot its accuracy and loss curves.
x_norm, x_fraud = preprocessing(data, all_features)
autoencoder = build_autoencoder(x_norm.shape[1])
hist_auto = auto_model_run(autoencoder, x_norm, num_epochs=20)
for curve in ("accuracy", "loss"):
    plotter(hist_auto, curve)

In [None]:
#hidden representation prediction
# Encoder half of the trained autoencoder: a new model that reuses (shares)
# its first three layers and their weights.
hidden_representation = Sequential(autoencoder.layers[:3])

In [None]:
#predicting the hidden representation and feeding it to a classifier
# Encode both groups with the shared encoder layers.
norm_hid_rep = hidden_representation.predict(x_norm)
fraud_hid_rep = hidden_representation.predict(x_fraud)

# Stack the encodings row-wise and label them 0 (x_norm rows) / 1 (x_fraud rows).
rep_x = np.concatenate([norm_hid_rep, fraud_hid_rep], axis=0)
rep_y = np.concatenate([np.zeros(len(x_norm)), np.ones(len(x_fraud))])
x_train2, x_test2, y_train2, y_test2 = train_test_split(rep_x, rep_y, random_state=22)

clfauto = LogisticRegression(solver="lbfgs")
clfauto.fit(x_train2, y_train2)
y_predauto = clfauto.predict(x_test2)
print(classification_report(y_test2, y_predauto))

# Row-normalised confusion matrix of the test predictions.
fig, ax = plt.subplots()
sns.heatmap(confusion_matrix(y_test2, y_predauto, normalize='true'), ax=ax, annot=True)
ax.set(title="Confusion Matrix", ylabel="Real value", xlabel="predicted value")
plt.show()

Apparently this model fell short of expectations, although it seems to have distinguished the fraudulent from the non-fraudulent transactions as expected. We suspect that some hyperparameter tuning could help improve the accuracy.

 ## Approach 6-  CatBoost

In [None]:
from catboost import CatBoostClassifier

# Start from data_copy, the untouched copy of the loaded CSV; the categorical
# columns are handed to CatBoost through cat_features.
X = data_copy[all_features]
y = data_copy["Fraudulent Flag"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# Positions of the categorical columns within all_features.
cat_features = list(map(all_features.index, categorical_features))

cat = CatBoostClassifier(
    loss_function='CrossEntropy',
    learning_rate=0.7,
    iterations=26,
)

# Progress is logged every 5 iterations; the validation split is the eval set.
cat.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    cat_features=cat_features,
    verbose=5,
)

In [None]:
# Classification report of the CatBoost model's predictions on the validation split.
print(classification_report(y_val,cat.predict(X_val)))

**Overall, most of our models have similar accuracy and F1 scores, so any of them can be used to classify fraudulent transactions. That said, there is always room for improvement, and with more compute and newer algorithms we should be able to do better.**