In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# library imports

In [2]:
import warnings

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn import tree
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline
warnings.simplefilter("ignore")

# Data loading and basic EDA

In [3]:
df=pd.read_csv("/kaggle/input/plentina-challenge/transactions_train.csv")

In [4]:
df.head()

In [5]:
df.isna().sum()

In [6]:
len(df)-len(df.drop_duplicates())

In [None]:
# Bar chart of how many transactions fall into each transaction type.
df['type'].value_counts().plot.bar()

In [7]:
# Correlation heatmap of the numeric columns only. The frame also holds string
# columns (e.g. `type`), and DataFrame.corr() raises on those in pandas >= 2.0
# instead of silently dropping them, so select the numeric columns explicitly.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

In [8]:
plt.figure()
plt.subplot(211)
df.loc[df['isFraud']==1,['step']].value_counts()[:50].plot(kind='bar')
plt.subplot(212)
df.loc[df['isFraud']==1,['step']].value_counts()[-50:].plot(kind='bar')

In [9]:
plt.figure(figsize=(30, 30))
df.loc[df['isFraud']==1,['step']].value_counts()[:50].plot(kind='bar',rot=90)

In [10]:
df.loc[df['isFraud']==1,['step']].value_counts()

In [None]:
df.loc[df['isFraud']==1,['step']].value_counts().index.max()

In [None]:
df.loc[df['isFraud']==1,['step']].value_counts()[:25]

In [None]:
df.loc[df['isFraud']==1,['step']].value_counts()[-25:]

In [None]:
df['step'].value_counts()

# train-test split, feature engineering and pre processing
- a new feature, the change in the receiver's balance, is generated using
    - dif_dest=newbalanceDest - oldbalanceDest
- a new feature comparing the sender's new balance with the transaction amount is generated using
     - amount_dif=newbalanceOrig-amount
- train-test split has been done using stratified technique.
   - train data has been used for k fold validation
   - test data has been used as hold out for later validation

In [11]:
# Engineered features:
#   dif_dest   - receiver's new balance minus old balance
#   amount_dif - sender's new balance minus the transaction amount
df['dif_dest'] = df['newbalanceDest'].sub(df['oldbalanceDest'])
df['amount_dif'] = df['newbalanceOrig'].sub(df['amount'])

# Separate the target from the features, then hold out a stratified 33% of
# the rows for the final test.
y_train = df['isFraud']
x_train = df.drop(columns=['isFraud'])
X_train, X_test, Y_train, Y_test = train_test_split(
    x_train,
    y_train,
    test_size=0.33,
    stratify=y_train,
    random_state=42,
)

In [None]:
df.loc[df['dif_dest']>0]['isFraud'].value_counts()

In [None]:
df.loc[df['amount_dif']>0]['isFraud'].value_counts()

**Preprocessing Steps:**
- An ordinal encoder is used for the categorical variable (`type`).
- A quantile transformer is used for the numerical variables.

In [31]:
# Column groups shared by the plotting and inference cells.
cat_col = ['type']  # categorical feature
num_col = [
    'step',
    'amount',
    'oldbalanceOrig',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
    'dif_dest',
    'amount_dif',
]  # numerical features

In [12]:
def preprocess(tc,tt,y_c,y_t):
    '''
    Encode the categorical column, quantile-transform the numeric columns and
    return model-ready numpy arrays for the train and test splits.

    tc: X_train, DataFrame holding at least the columns in cat_col and num_col.
    tt: X_test, DataFrame with the same columns as tc.
    y_c: y_train, target values for tc (pandas Series or any array-like).
    y_t: y_test, target values for tt (pandas Series or any array-like).

    returns: (train_X, test_X, y_train, y_test). In both feature arrays the
    encoded categorical column comes first, followed by the numeric columns
    in the order of num_col.

    side effects: writes the fitted scaler to "scale2.joblib" and the fitted
    encoder to "le2.joblib" in the current working directory.
    '''
    cat_col=['type']
    num_col=['step', 'amount', 'oldbalanceOrig', 'newbalanceOrig', 'oldbalanceDest',
       'newbalanceDest', 'dif_dest', 'amount_dif']

    le=preprocessing.OrdinalEncoder()
    # Seeded so the quantile subsampling done on large inputs is repeatable
    # from run to run.
    scaler=preprocessing.QuantileTransformer(random_state=42)

    # The encoder learns its categories from train and test together so that a
    # category present only in the test split can still be transformed.
    le.fit(pd.concat([tc[cat_col],tt[cat_col]]))
    le_train=le.transform(tc[cat_col])
    le_test=le.transform(tt[cat_col])

    # The scaler is fitted on the training rows only.
    scale_train=scaler.fit_transform(tc[num_col])
    scale_test=scaler.transform(tt[num_col])

    # Column order of the result: categorical first, then numeric.
    train_X=np.concatenate([le_train,scale_train],axis=1)
    test_X=np.concatenate([le_test,scale_test],axis=1)

    # np.asarray accepts Series, arrays and lists alike.
    y_train=np.asarray(y_c)
    y_test=np.asarray(y_t)

    # Persist the fitted transformers (compression level 9) for later inference.
    joblib.dump(scaler, "scale2.joblib", 9)
    joblib.dump(le,"le2.joblib",9)

    return train_X,test_X,y_train,y_test

In [13]:
train_X,test_X,y_train,y_test=preprocess(X_train, X_test, Y_train, Y_test)

In [14]:
print(train_X.shape,test_X.shape,y_train.shape,y_test.shape)

**Evaluation Metric**
- since the data is imbalanced, ROC-AUC has been used for evaluation

In [15]:
def loss(y_true,y_pred):
    '''
    objective: compute the ROC-AUC score of the predictions.
    y_true: true labels.
    y_pred: predicted labels.

    returns: the value of roc_auc_score(y_true, y_pred).
    '''
    return roc_auc_score(y_true,y_pred)

# Stratified K-Fold Cross Validation
- stratified k fold has been used due to data imbalance
- 5 fold is due to the large data size

In [16]:
def validation(X_train,y_train,model,n_splits=5,use_proba=False):
    '''
    objective: stratified k-fold validation loop to check model performance.
    X_train: independent variables of training data (positionally indexable
             array, e.g. a numpy array).
    y_train: dependent/target variable of training data (same kind of array).
    model: estimator exposing fit() and predict(); it is refitted on every
           fold, so after the call it holds the fit from the last fold.
    n_splits: number of stratified folds (default 5).
    use_proba: when False (default) the metric is computed on the hard labels
               returned by predict(), as before. When True it is computed on
               predict_proba()[:, 1], i.e. the score of the second class,
               which gives a ranking-based ROC-AUC.

    Prints the per-fold and the mean train/validation scores and returns a
    fixed end-marker string.
    '''
    kfold=StratifiedKFold(n_splits=n_splits, random_state=None, shuffle=False)

    def _scores(features):
        # Model output that is fed to the metric for the given rows.
        if use_proba:
            return model.predict_proba(features)[:,1]
        return model.predict(features)

    train_los=0
    test_los=0
    for fold,(train_idx, val_idx) in enumerate(kfold.split(X_train,y_train)):
        train_x=X_train[train_idx]
        train_y=y_train[train_idx]
        test_x=X_train[val_idx]
        test_y=y_train[val_idx]
        model.fit(train_x,train_y)
        # Score each split once and reuse the value for both the running
        # total and the printout.
        fold_train=loss(train_y,_scores(train_x))
        fold_test=loss(test_y,_scores(test_x))
        train_los+=fold_train
        test_los+=fold_test
        print(fold,'train loss------->',fold_train)
        print(fold,'test loss------->',fold_test)
    # Average over the actual number of folds rather than a hard-coded 5.
    print('train',train_los/n_splits)
    print("test",test_los/n_splits)
    return "----------end--------------"

In [17]:
# Baseline decision tree with unbounded depth, scored with the stratified
# k-fold validation loop.
model = DecisionTreeClassifier(
    max_depth=None,
    min_samples_leaf=5,
    min_samples_split=7,
    random_state=42,
)
validation(train_X, y_train, model)

In [32]:
# Draw the fitted decision tree and save the figure to disk.
fig = plt.figure(figsize=(25,20))
# feature_names follows the column order of the feature matrix (categorical
# first, then numeric). class_names must be listed in ascending class order,
# so label 0 comes before label 1.
_ = tree.plot_tree(model,
                   feature_names=cat_col+num_col,
                   class_names=['0','1'],
                   filled=True)

plt.savefig('foo2.png')

# Hyper Parameter Tuning
- Randomized search CV is used for the hyperparameter tuning.
- I have avoided GridSearchCV due to the large data size.
- P.S.: I have commented out the cell below because it takes a lot of time to commit and save the notebook.

In [None]:
# Disabled randomized hyper-parameter search over the decision tree (kept for
# reference; uncomment to run). It scores candidates with the ROC-AUC `loss`
# helper on stratified 5-fold splits.
# NOTE(review): the fold splitter is unshuffled, so no random_state is passed;
# recent scikit-learn versions reject random_state together with shuffle=False.
# custom_scorer = make_scorer(loss, greater_is_better=True)
# model=DecisionTreeClassifier(random_state=42)
# kfold = StratifiedKFold(5,shuffle=False)
# gsc = RandomizedSearchCV(
#     estimator=model,
#     param_distributions={
#       "max_depth": [3,5,10,15,20,None],
#     "min_samples_split": [2,5,7,10],
#     "min_samples_leaf": [1,2,5],
#     'criterion': ["gini", "entropy"]
# }
#     ,
#     scoring=custom_scorer,
#     cv=kfold, n_jobs=-1
# )

# grid_result = gsc.fit(train_X, y_train)

# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [34]:
total_col=num_col+cat_col

In [35]:
pd.DataFrame([model.feature_importances_],columns=total_col).plot.barh(figsize=(10,10))

In [36]:
# Refit the chosen tree on the full training split and score it on both the
# training split and the held-out test split.
model=DecisionTreeClassifier(min_samples_split=7,min_samples_leaf=5,max_depth=None,random_state=42)
model.fit(train_X,y_train)
pred_train=model.predict(train_X)
pred_test=model.predict(test_X)
# loss() takes (y_true, y_pred). ROC-AUC is not symmetric in its arguments,
# so the ground truth has to come first.
print('train',loss(y_train,pred_train))
print('test',loss(y_test,pred_test))

In [37]:
# confusion matrix for the train set
confusion_matrix(pred_train,y_train)

In [39]:
from sklearn.metrics import classification_report
print(classification_report(pred_train, y_train, target_names=['0','1']))

In [38]:
# confusion matrix for the test set
confusion_matrix(pred_test,y_test)

In [40]:
from sklearn.metrics import classification_report
print(classification_report(pred_test, y_test, target_names=['0','1']))

In [None]:
# Model export: persist the fitted decision tree with compression level 9.
# The fitted scaler and encoder are written by preprocess() as
# "scale2.joblib" and "le2.joblib", so they are not dumped again here.
joblib.dump(model, "model2.joblib", compress=9)

In [None]:
# single value inference
# One raw transaction, given in the original column order. The target and the
# two engineered columns are excluded from the header because the row does not
# carry them; the engineered features are derived just below.
val=[[1,'CASH_OUT',181,'C1305486145',181,0,'C553264065',0,0]]
raw_columns=[col for col in df.columns if col not in ('isFraud','dif_dest','amount_dif')]
df2=pd.DataFrame(val,columns=raw_columns)
df2['dif_dest']=df2['newbalanceDest']-df2['oldbalanceDest']
df2['amount_dif']=df2['newbalanceOrig']-df2['amount']

# The encoder and scaler fitted inside preprocess() are local to that
# function, so reload the copies it saved to disk.
le=joblib.load("le2.joblib")
scaler=joblib.load("scale2.joblib")

# Same transformation and column order as the training features:
# encoded categorical column first, then the scaled numeric columns.
le_df2=le.transform(df2[cat_col])
scale_df2=scaler.transform(df2[num_col])
train_df2=np.concatenate([le_df2,scale_df2],axis=1)
model.predict(train_df2)