In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw training transactions; the path is resolved relative to the
# notebook's working directory.
df=pd.read_csv(filepath_or_buffer='transactions_train.csv')

In [None]:
import pandas as pd 
import numpy as np 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier
from pickle import load,dump
import plotly.graph_objects as go
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer,f1_score,fbeta_score,precision_score,recall_score,confusion_matrix
from skopt import BayesSearchCV,gbrt_minimize,gp_minimize
from skopt.space import Integer,Real,Categorical
from skopt.utils import use_named_args

In [None]:
#Feature Engineering Function
def transform_data(df):
    """Derive the modelling features from the raw transaction columns.

    Reads ``step``, ``type``, ``amount``, ``oldbalanceOrig``,
    ``newbalanceOrig``, ``oldbalanceDest`` and ``newbalanceDest`` from ``df``
    (``df`` itself is not modified).

    Returns a new DataFrame whose columns are, in order:

    * ``step``, ``type`` -- copied through unchanged.
    * ``percentage_diff_balanceOrig`` / ``percentage_diff_balanceDest`` --
      ``((new / old) - 1) * 100`` rounded to 3 decimals, where an old balance
      that is not > 0 is replaced by 1 before dividing.
    * ``NoAmountBalanceOrig`` / ``NoAmountBalanceDest`` -- 1 when the old
      balance is > 0, otherwise 0 (note: despite the name, 1 means the
      account *had* a positive opening balance).
    * ``signbalanceOrig`` / ``signbalanceDest`` -- ``np.sign(new - old)``:
      1 inflow, 0 no change, -1 outflow.
    * ``AmountBalanceDiffOrig`` / ``AmountBalanceDiffDest`` --
      ``|new - old| / amount`` rounded to 3 decimals, where an amount that
      is not > 0 is replaced by 1 before dividing.
    * ``isAmount`` -- 1 when ``amount`` is > 0, otherwise 0.
    """
    def _positive_or_one(values):
        # Swap every entry that is not strictly positive for 1 so the result
        # can be used as a divisor.
        return np.where(values>0,values,1)

    def _positive_flag(values):
        # 1 where the entry is strictly positive, 0 everywhere else.
        return np.where(values>0,1,0)

    def _percent_change(new_balance,old_balance):
        # Relative change of the balance, expressed in percent.
        ratio=new_balance/_positive_or_one(old_balance)
        return np.round((ratio-1)*100,3)

    # Signed balance movement of each account caused by the transaction.
    delta_orig=df['newbalanceOrig']-df['oldbalanceOrig']
    delta_dest=df['newbalanceDest']-df['oldbalanceDest']
    amount_divisor=_positive_or_one(df['amount'])

    # Columns are inserted in the exact order they must appear in the output.
    engineered=pd.DataFrame()
    engineered['percentage_diff_balanceOrig']=_percent_change(df['newbalanceOrig'],df['oldbalanceOrig'])
    engineered['NoAmountBalanceOrig']=_positive_flag(df['oldbalanceOrig'])
    engineered['percentage_diff_balanceDest']=_percent_change(df['newbalanceDest'],df['oldbalanceDest'])
    engineered['NoAmountBalanceDest']=_positive_flag(df['oldbalanceDest'])
    engineered['signbalanceOrig']=np.sign(delta_orig)
    engineered['signbalanceDest']=np.sign(delta_dest)
    engineered['AmountBalanceDiffOrig']=(np.absolute(delta_orig)/amount_divisor).round(3)
    engineered['AmountBalanceDiffDest']=(np.absolute(delta_dest)/amount_divisor).round(3)
    engineered['isAmount']=_positive_flag(df['amount'])

    passthrough=df[['step','type']]
    return pd.concat([passthrough,engineered],axis=1)

In [None]:
# Feature matrix as a plain NumPy array; column 1 holds the 'type' strings
# (transform_data returns 'step', 'type' first).
X=transform_data(df).to_numpy()

In [None]:
# Target vector: the 'isFraud' column of the raw frame as a NumPy array.
Y=df['isFraud'].to_numpy()

In [None]:
# Fixed hyperparameters applied to the pipeline via set_params(**para); the
# `cat__` prefix addresses the pipeline step named 'cat'.
# NOTE(review): the values look like the output of a hyperparameter search --
# confirm where they came from.
para=dict(
    cat__bagging_temperature=0.25837166197274813,
    cat__border_count=16,
    cat__depth=17,
    cat__iterations=758,
    cat__l2_leaf_reg=27.557949627695663,
    cat__learning_rate=0.014570946584675377,
    cat__max_leaves=35,
    cat__min_data_in_leaf=26,
    cat__random_strength=0.005257823862746219,
    cat__scale_pos_weight=302.93935619924673,
)

In [None]:
# NOTE: the statement `model=model.fit(X,Y)` that used to live in this cell was
# removed. On a fresh kernel (Restart & Run All) the name `model` is not bound
# yet at this point -- the pipeline is first created a few cells further down --
# so the call raised NameError. Even with a leftover `model` in the kernel, the
# fitted result was thrown away when the pipeline cell re-binds `model`.
# If a fit on the complete data set is wanted, call `model.fit(X,Y)` only after
# the pipeline has been built and configured.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Build the modelling pipeline: one-hot encode column 1 of X (the 'type'
# column produced by transform_data), pass every other column through
# untouched, then feed the result to CatBoost.
cat=CatBoostClassifier(
    loss_function='Logloss',
    grow_policy='Lossguide',
    bootstrap_type='Bayesian',
    task_type='GPU',
    metric_period=25,
    verbose=False,
)
ct=ColumnTransformer(
    transformers=[("ONEHOT",OneHotEncoder(sparse=False,dtype=int),[1])],
    remainder='passthrough',
)
# set_params returns the pipeline itself, so construction and configuration
# can be chained; `para` keys carry the `cat__` prefix of the 'cat' step.
model=Pipeline(steps=[('column_transform',ct),('cat',cat)]).set_params(**para)

In [None]:
# Hold out 33% of the rows for evaluation (fixed seed for reproducibility).
# `stratify=Y` keeps the proportion of isFraud labels identical in the train
# and test partitions, matching the StratifiedKFold scheme used for the
# hyperparameter search in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42, stratify=Y
)

In [None]:
# Train the whole pipeline (encoding + classifier) on the training partition;
# fit returns the pipeline itself, which is re-bound to `model`.
model=model.fit(X=X_train,y=y_train)

In [None]:
# Show the pipeline's parameters, including those of the nested steps.
model.get_params(deep=True)

In [None]:
#Hyperparameter Optimization

#CATBOOST ALGORITHM
#Lossguide
# Search space for BayesSearchCV. Every key carries the `cat__` prefix so the
# value is routed to the pipeline step named 'cat'.
params={
        'cat__iterations':Integer(50,1000),
        'cat__depth':Integer(12,50),
        'cat__learning_rate':Real(0.01,1,'log-uniform'),
        'cat__random_strength': Real(1e-9, 5, 'log-uniform'), # randomness for scoring splits
        'cat__bagging_temperature': Real(0.0, 1.0), # settings of the Bayesian bootstrap
        'cat__border_count': Integer(2, 500), # splits for numerical features
        'cat__l2_leaf_reg':Real(1,50),
        'cat__min_data_in_leaf':Integer(3,50),
        'cat__max_leaves':Integer(2,75),
        'cat__scale_pos_weight':Real(1,1200)
        #'subsample':Real(0.5, 1.0),
        }
#Cross Validation
#StratifiedKFold
# 10 shuffled, class-stratified folds with a fixed seed.
cv = StratifiedKFold(n_splits=10,shuffle=True,random_state=0)

#Classifier Algorithm
# The search gets its own pipeline under search-specific names. Re-binding
# `cat`, `ct` and `model` here would replace the pipeline trained earlier in
# the notebook with a fresh, unfitted one (the search fits clones, not the
# object passed in), and the cells that plot and pickle `model` afterwards
# would then operate on an unfitted estimator.
search_cat=CatBoostClassifier(loss_function='Logloss',grow_policy='Lossguide',bootstrap_type='Bayesian',task_type='GPU',metric_period=25,verbose=False)
search_ct = ColumnTransformer([("ONEHOT",OneHotEncoder(sparse=False,dtype=int), [1])],remainder='passthrough')
search_model=Pipeline([('column_transform',search_ct),('cat',search_cat)])

#Bayesian Optimization
# NOTE(review): `frecall` (the scoring object) is not defined in any visible
# cell of this notebook; it has to be created before this cell runs, otherwise
# the next line raises NameError. Presumably it is a make_scorer(...) wrapper
# around one of the imported metrics -- confirm and define it explicitly.
search=BayesSearchCV(search_model,search_spaces=params,cv=cv,scoring=frecall,n_jobs=1,verbose=3,n_iter=10,n_points=1,optimizer_kwargs={'base_estimator': 'GBRT','n_jobs':-1},random_state=0)
# fit returns the search object itself, so `search_best` and `search` refer to
# the same fitted BayesSearchCV instance.
search_best=search.fit(X,Y)

In [None]:
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve of `model` on the held-out test partition.
# `model` must already be fitted when this cell runs; from_estimator only
# predicts with it.
# The result is bound to `pr_display` instead of `display` so that IPython's
# built-in display() helper is not shadowed for the rest of the session.
pr_display = PrecisionRecallDisplay.from_estimator(model, X_test, y_test, name="GBDT")

# Trailing semicolon suppresses the Text(...) repr without re-binding
# IPython's last-output variable `_`.
pr_display.ax_.set_title("2-class Precision-Recall curve");

In [None]:
# Persist the pipeline with pickle. The context manager guarantees the file
# handle is flushed and closed even if pickling fails; the bare
# open(...) passed straight into dump() was never closed explicitly.
with open('model.pkl','wb') as model_file:
    dump(model,model_file)