# PLENTINA ML CHALLENGE

You are provided with a synthetic dataset for a mobile payments application. The dataset gives
the sender and recipient of each transaction, as well as whether the transaction is tagged as
fraud or not fraud. Your task is to build a fraud detection API that can be called to predict
whether or not a transaction is fraudulent.

In [None]:
# Import all necessary python libraries
# Data Manipulation Libraries
import pandas as pd 
import numpy as np
# Machine Learning Libraries
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer,f1_score,fbeta_score,precision_score,recall_score,confusion_matrix
from catboost import CatBoostClassifier
# Hyper-parameter Optimization Library
from skopt import BayesSearchCV,gbrt_minimize,gp_minimize
from skopt.space import Integer,Real,Categorical
from skopt.utils import use_named_args
#Pickle library for saving and loading model
from pickle import load,dump
#Data Visualization Library
#import plotly.graph_objects as go
import plotly.express as px

# Exploratory Data Analysis

In [None]:
#Read synthethic mobile payments dataset using pandas
df=pd.read_csv('transactions_train.csv')

In [None]:
#check the first 10 rows of the data set
df.head(10)

In [None]:
#check dataframe info
df.info()

In [None]:
#check if null
df.isnull().sum()

In [None]:
#describe data frame
df.describe().applymap('{:,.2f}'.format)

In [None]:
#No of unique customers who started transactions
len(df['nameOrig'].unique())

In [None]:
#Unique Customer ID
print(df['nameOrig'].unique())

In [None]:
#Customer ID first character 
print(df.nameOrig.str[0].unique())

In [None]:
#Histogram of the Customer ID length
px.histogram(df,x=df.nameOrig.str.len(),
             title='Histogram: Length of Customer ID',
             labels={'x':'length of customer ID', 'y':'count'})

In [None]:
#No of unique receipients who started transactions
len(df['nameDest'].unique())

In [None]:
#Unique Receipient ID
print(df['nameDest'].unique())

In [None]:
#Receipient ID first character 
print(df['nameDest'].str[0].unique())

In [None]:
px.histogram(df,x=df['nameDest'].str.len(),
             title='Histogram: Length of Customer ID',
             labels={'x':'length of customer ID', 'y':'count'})

# Fraud

In [None]:
#fraudulent transactions by step
df_step=df.groupby(['step']).sum().reset_index()[['step','isFraud']]
fig = px.bar(df_step, x='step', y='isFraud')
fig.show()

In [None]:
#fraudulent transactions by type
df_type=df.groupby(['type']).sum().reset_index()[['type','isFraud']]
fig = px.bar(df_type, x='type', y='isFraud')
fig.show()

In [None]:
#Total No. of Fraudulent Transactions
df_type['isFraud'].sum()

In [None]:
#Total No. of Fraudulent Transactions (Percentage)
np.round(df_type['isFraud'].sum()*100/df.shape[0],2)

In [None]:
#transaction dataframe where transactions are fraudulent
fraud=df[df['isFraud']==1]

In [None]:
px.histogram(fraud,x='amount',
             title='Histogram: Fraudulent Transactions',
             labels={'x':'AMOUNT', 'y':'COUNT'})

In [None]:
px.histogram(fraud,x=fraud['newbalanceOrig']-fraud['oldbalanceOrig'],
             title='Histogram: Fraudulent Transactions',
             labels={'x':'CUSTOMER BALANCE AFTER TRANSACTION', 'y':'COUNT'},
             histnorm='percent' )

In [None]:
px.histogram(fraud,x=fraud['newbalanceDest']-fraud['oldbalanceDest'],
             title='Histogram: Fraudulent Transactions',
             labels={'x':'RECEIPIENT ACCOUNT BALANCE AFTER TRANSACTION', 'y':'COUNT'},
             histnorm='percent' )

# MACHINE LEARNING MODEL

Feature Engineering 

In [None]:
#Feature Engineering Function
def transform_data(df):
    """Build the model feature table from raw transaction columns.

    Reads ``step``, ``type``, ``amount`` and the old/new balance columns of
    the origin (``*Orig``) and destination (``*Dest``) accounts. ``df`` is
    not modified; a new DataFrame is returned.

    Returned columns, in this order:
        step, type
            Copied from ``df``.
        percentage_diff_balanceOrig, NoAmountBalanceOrig
            ``(new / old - 1) * 100`` rounded to 3 decimals (an old balance
            that is not > 0 is replaced by 1 as the divisor), and a flag
            that is 1 when the old balance is > 0, else 0.
        percentage_diff_balanceDest, NoAmountBalanceDest
            Same two features for the destination account.
        signbalanceOrig, signbalanceDest
            Sign of ``new - old``: 1 inflow, 0 no change, -1 outflow.
        AmountBalanceDiffOrig, AmountBalanceDiffDest
            ``|new - old| / amount`` rounded to 3 decimals (an amount that
            is not > 0 is replaced by 1 as the divisor).
        isAmount
            1 when ``amount`` is > 0, else 0.
    """
    old_orig, new_orig = df['oldbalanceOrig'], df['newbalanceOrig']
    old_dest, new_dest = df['oldbalanceDest'], df['newbalanceDest']
    amount = df['amount']

    orig_has_balance = old_orig > 0
    dest_has_balance = old_dest > 0
    amount_positive = amount > 0

    # Divisors with every non-positive entry swapped for 1, so the ratios
    # below never divide by zero.
    orig_divisor = np.where(orig_has_balance, old_orig, 1)
    dest_divisor = np.where(dest_has_balance, old_dest, 1)
    amount_divisor = np.where(amount_positive, amount, 1)

    # Signed balance movement of each account
    orig_change = new_orig - old_orig
    dest_change = new_dest - old_dest

    engineered = pd.DataFrame({
        'percentage_diff_balanceOrig': ((new_orig / orig_divisor - 1) * 100).round(3),
        'NoAmountBalanceOrig': np.where(orig_has_balance, 1, 0),
        'percentage_diff_balanceDest': ((new_dest / dest_divisor - 1) * 100).round(3),
        'NoAmountBalanceDest': np.where(dest_has_balance, 1, 0),
        'signbalanceOrig': np.sign(orig_change),
        'signbalanceDest': np.sign(dest_change),
        'AmountBalanceDiffOrig': (orig_change.abs() / amount_divisor).round(3),
        'AmountBalanceDiffDest': (dest_change.abs() / amount_divisor).round(3),
        'isAmount': np.where(amount_positive, 1, 0),
    })

    return pd.concat([df[['step', 'type']], engineered], axis=1)

In [None]:
X=transform_data(df).values
X

In [None]:
Y=df['isFraud'].values
Y

Gradient Boosting Classification Algorithm: CatBoostClassifier

In [None]:
# Classifier algorithm: CatBoost with log-loss objective.
# NOTE(review): task_type='GPU' presumably requires a CUDA-capable device on
# the machine running this cell -- confirm for the deployment target.
cat=CatBoostClassifier(loss_function='Logloss',
                       grow_policy='SymmetricTree',
                       bootstrap_type='Bayesian',task_type='GPU',
                       metric_period=25,
                       verbose=False)
# One-hot encoding for the `type` column (column index 1 of X); every other
# column is passed through unchanged.
# The dense-output flag is spelled `sparse_output` in scikit-learn >= 1.2 and
# `sparse` in older releases (the old spelling was later removed), so try the
# current name first and fall back when the constructor rejects it.
try:
    onehot=OneHotEncoder(sparse_output=False,dtype=int)
except TypeError:
    onehot=OneHotEncoder(sparse=False,dtype=int)
ct = ColumnTransformer([("ONEHOT",onehot, [1])],remainder='passthrough')
# Model pipeline: column encoding followed by the classifier
model=Pipeline([('column_transform',ct),('cat',cat)])

Hyperparameter Optimization using Bayesian Search

In [None]:
# Hyperparameter search space.
# Every key is written as `<pipeline step>__<parameter>`, so each entry
# addresses the 'cat' step of the pipeline.
params = {
    'cat__iterations': Integer(50, 1000),
    'cat__depth': Integer(3, 16),
    'cat__learning_rate': Real(0.01, 1, prior='log-uniform'),
    'cat__random_strength': Real(1e-9, 5, prior='log-uniform'),  # randomness for scoring splits
    'cat__bagging_temperature': Real(0.0, 1.0),  # settings of the Bayesian bootstrap
    'cat__border_count': Integer(2, 500),  # splits for numerical features
    'cat__l2_leaf_reg': Real(1, 50),
    'cat__min_data_in_leaf': Integer(3, 50),
    #'cat__max_leaves':Integer(2,75),
    'cat__scale_pos_weight': Real(1, 1200),
    #'subsample':Real(0.5, 1.0),
}

# Cross-validation: 10 stratified folds, shuffled with a fixed seed
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# Bayesian optimisation over `params`, scored by average precision.
# One candidate point is evaluated per iteration, for 10 iterations.
search = BayesSearchCV(
    model,
    search_spaces=params,
    scoring='average_precision',
    cv=cv,
    n_iter=10,
    n_points=1,
    n_jobs=1,
    verbose=3,
    optimizer_kwargs={'base_estimator': 'GBRT', 'n_jobs': -1},
    random_state=0,
)
search_best = search.fit(X, Y)

In [None]:
#Optimized Parameters
para=dict(search_best.best_params_)
para

In [None]:
para={'cat__bagging_temperature': 0.23251402823026496,
 'cat__border_count': 404,
 'cat__depth': 10,
 'cat__iterations': 306,
 'cat__l2_leaf_reg': 39.42720773809657,
 'cat__learning_rate': 0.019116354652261806,
 'cat__min_data_in_leaf': 16,
 'cat__random_strength': 2.8820006880385564e-07,
 'cat__scale_pos_weight': 20.07122072628452}

In [None]:
#Lossguide Parameters
#para={'cat__bagging_temperature': 0.25837166197274813,
# 'cat__border_count': 16,
# 'cat__depth': 17,
# 'cat__iterations': 758,
# 'cat__l2_leaf_reg': 27.557949627695663,
# 'cat__learning_rate': 0.014570946584675377,
# 'cat__max_leaves': 35,
# 'cat__min_data_in_leaf': 26,
# 'cat__random_strength': 0.005257823862746219,
# 'cat__scale_pos_weight': 302.93935619924673}

# MODEL TESTING

In [None]:
# Split the dataset into a train set and a held-out test set
from sklearn.model_selection import train_test_split

# stratify=Y keeps the fraud / non-fraud ratio the same in both splits. The
# target is heavily imbalanced, and the hyperparameter search already uses
# stratified folds, so the hold-out evaluation should be stratified as well.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42, stratify=Y)


# Rebuild an unfitted pipeline with the same structure as the one used for the
# search: one-hot encode column index 1 (`type`), pass the rest through.
cat=CatBoostClassifier(loss_function='Logloss',grow_policy='SymmetricTree',bootstrap_type='Bayesian',task_type='GPU',metric_period=25,verbose=False)
# The dense-output flag is `sparse_output` in scikit-learn >= 1.2 and `sparse`
# in older releases; try the current name first and fall back on TypeError.
try:
    onehot=OneHotEncoder(sparse_output=False,dtype=int)
except TypeError:
    onehot=OneHotEncoder(sparse=False,dtype=int)
ct = ColumnTransformer([("ONEHOT",onehot, [1])],remainder='passthrough')
model=Pipeline([('column_transform',ct),('cat',cat)])
# Apply the selected hyperparameters (keys are `cat__<parameter>`)
model=model.set_params(**para)

# Fit on the training split only; the test split is kept for evaluation
model=model.fit(X_train,y_train)

In [None]:
# Show the full parameter set of the fitted pipeline (all steps)
model.get_params()

In [None]:
# Precision-recall curve on the held-out test split; used here because the
# dataset is imbalanced.

from sklearn.metrics import PrecisionRecallDisplay

# NOTE(review): in IPython/Jupyter sessions `display` is also the name of a
# built-in helper; binding it here shadows that helper for later cells --
# consider renaming if nothing else relies on this variable.
display = PrecisionRecallDisplay.from_estimator(model, X_test, y_test, name="GBDT")


# Assign to `_` so the return value of set_title is not echoed by the cell
_ = display.ax_.set_title("FRAUD DETECTION MODEL:Precision-Recall curve")

In [None]:
# Classification report for the model's predictions on the test split
from sklearn.metrics import classification_report

print(classification_report(y_test,model.predict(X_test)))

# SAVE MODEL

In [None]:
#Retrain model on entire data set
model=model.fit(X,Y)

In [None]:
dump(model,open('gbdtmodel.pkl','wb'))