<a href="https://colab.research.google.com/github/AlessandroVol23/ieee_cis_fraud_detection_kaggle/blob/master/notebooks/0.3_AV_Baseline_Model_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Baseline Model

This notebook builds a baseline model for the IEEE-CIS Fraud Detection Kaggle competition.

Reference kernel: https://www.kaggle.com/artkulak/ieee-fraud-simple-baseline-0-9383-lb

All non-numerical columns are simply label-encoded.

## Imports

In [0]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, train_test_split

## Colab / Kaggle setup

In [0]:
# Upload the Kaggle API credentials file (kaggle.json); the next cell copies it
# from the working directory into ~/.kaggle.
# NOTE(review): kaggle.json is a secret - make sure it is never committed or left in outputs.
from google.colab import files
files.upload()

In [6]:
# Place the uploaded kaggle.json under ~/.kaggle (created if missing).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json
# Confirm the file landed where expected.
!ls ~/.kaggle

kaggle.json


In [8]:
# Install the Kaggle command-line tooling used by the download and submit cells.
# NOTE(review): versions are unpinned, so a re-run may pick up different releases.
# NOTE(review): later cells only invoke the `kaggle` command; kaggle-cli is
# presumably redundant - confirm before removing.
!pip install -q kaggle
!pip install -q kaggle-cli

[K     |████████████████████████████████| 81kB 3.2MB/s 
[K     |████████████████████████████████| 5.3MB 8.4MB/s 
[K     |████████████████████████████████| 112kB 59.9MB/s 
[K     |████████████████████████████████| 102kB 35.3MB/s 
[K     |████████████████████████████████| 51kB 22.7MB/s 
[?25h  Building wheel for kaggle-cli (setup.py) ... [?25l[?25hdone
  Building wheel for pyperclip (setup.py) ... [?25l[?25hdone


In [10]:
# Download the ieee-fraud-detection competition archives into the working directory.
!kaggle competitions download -c ieee-fraud-detection

Downloading train_transaction.csv.zip to /content
 93% 49.0M/52.5M [00:00<00:00, 53.0MB/s]
100% 52.5M/52.5M [00:00<00:00, 97.4MB/s]
Downloading train_identity.csv.zip to /content
  0% 0.00/3.02M [00:00<?, ?B/s]
100% 3.02M/3.02M [00:00<00:00, 99.2MB/s]
Downloading test_transaction.csv.zip to /content
 97% 46.0M/47.3M [00:00<00:00, 82.5MB/s]
100% 47.3M/47.3M [00:00<00:00, 158MB/s] 
Downloading test_identity.csv.zip to /content
  0% 0.00/2.97M [00:00<?, ?B/s]
100% 2.97M/2.97M [00:00<00:00, 200MB/s]
Downloading sample_submission.csv.zip to /content
  0% 0.00/1.14M [00:00<?, ?B/s]
100% 1.14M/1.14M [00:00<00:00, 163MB/s]


In [14]:
# List the working directory to check that the archives were downloaded.
!ls

kaggle.json		   test_identity.csv.zip     train_transaction.csv.zip
sample_data		   test_transaction.csv.zip
sample_submission.csv.zip  train_identity.csv.zip


In [16]:
!unzip \*.zip

Archive:  train_identity.csv.zip
  inflating: train_identity.csv      

Archive:  sample_submission.csv.zip
  inflating: sample_submission.csv   

Archive:  test_transaction.csv.zip
  inflating: test_transaction.csv    

Archive:  test_identity.csv.zip
  inflating: test_identity.csv       

Archive:  train_transaction.csv.zip
  inflating: train_transaction.csv   

5 archives were successfully processed.


In [0]:
def _read_by_transaction_id(csv_name):
    """Load one competition CSV, using its TransactionID column as the index."""
    return pd.read_csv(csv_name, index_col='TransactionID')


# Identity tables
df_train_ident = _read_by_transaction_id('train_identity.csv')
df_test_ident = _read_by_transaction_id('test_identity.csv')

# Transaction tables
df_train_trans = _read_by_transaction_id('train_transaction.csv')
df_test_trans = _read_by_transaction_id('test_transaction.csv')

# Submission template
df_sample_submission = _read_by_transaction_id('sample_submission.csv')

In [0]:
# Left-join the identity table onto the transaction table by index, so every
# transaction row is kept whether or not it has a matching identity row.
_left_on_index = dict(how='left', left_index=True, right_index=True)
df_train = pd.merge(df_train_trans, df_train_ident, **_left_on_index)
df_test = pd.merge(df_test_trans, df_test_ident, **_left_on_index)


In [19]:
# Show (rows, columns) of the merged train and test frames, one per line.
print(df_train.shape, df_test.shape, sep='\n')

(590540, 433)
(506691, 432)


## Preprocessing

In [0]:
# The raw tables were already merged into df_train / df_test; free them first.
del df_train_trans, df_train_ident, df_test_ident, df_test_trans

# Target vector: an independent copy of the isFraud column.
y_train = df_train['isFraud'].copy()

# Feature matrices: train without the target, test as-is.
X_train = df_train.drop(columns='isFraud')
X_test = df_test.copy()

# The merged frames are no longer needed once X/y exist.
del df_train, df_test

In [0]:
# Label-encode every column that is non-numerical in either frame.
# The encoder is fitted on train and test values together so both frames share
# one label mapping.
for col in X_train.columns:
    has_text = X_train[col].dtype == 'object' or X_test[col].dtype == 'object'
    if not has_text:
        continue

    encoder = preprocessing.LabelEncoder()
    encoder.fit(list(X_train[col].values) + list(X_test[col].values))
    for frame in (X_train, X_test):
        frame[col] = encoder.transform(list(frame[col].values))

## Reduce Memory

Source kernel for `reduce_mem_usage`: https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65

In [0]:

def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to smaller dtypes and report the saving.

    For each column:

    * ``object`` columns are converted to ``category``.
    * signed / unsigned integer columns are cast to the narrowest integer type
      of the same signedness whose range (bounds inclusive) holds the column's
      min and max.
    * float columns are cast to the narrowest float type whose range holds the
      column's min and max.
    * every other dtype (bool, datetime, category, pandas extension dtypes) is
      left untouched.

    A column is never widened, and a column whose min/max cannot be compared
    (empty or all-NaN) is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default True
        Allow float16 as a target type. float16 only checks the value *range*;
        it keeps roughly 3 significant decimal digits, so values are rounded.
        Pass ``False`` to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32, np.float64) if use_float16 \
        else (np.float32, np.float64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy int/uint/float columns have a meaningful numeric
        # range to downcast; bool, datetime, category etc. are skipped.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            candidates, limits = float_types, np.finfo
        else:
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            limits = np.iinfo

        for candidate in candidates:
            # Candidates are ordered narrow -> wide; once they are no smaller
            # than the current dtype there is nothing left to gain.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            # Inclusive bounds: a value equal to the type's limit still fits.
            # NaN min/max compare False, so such columns stay unchanged.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte starting footprint.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [23]:
# Downcast both feature frames; reduce_mem_usage prints the memory footprint
# before and after, plus the percentage saved.
X_train = reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)

Memory usage of dataframe is 1970.87 MB
Memory usage after optimization is: 547.14 MB
Decreased by 72.2%
Memory usage of dataframe is 1673.87 MB
Memory usage after optimization is: 460.02 MB
Decreased by 72.5%


## Model

In [0]:
# Hold out 33% of the training rows for a quick single-split validation.
# random_state pins the shuffle so the split is the same on every run.
# (train_test_split is imported in the notebook's import cell.)
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.33, random_state=42)


In [0]:
# First pass without K-fold: fit once on the training part of the hold-out
# split and predict on the validation part.
clf = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=9,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    tree_method='auto'
)

clf.fit(X_tr, y_tr)
# Despite its name, y_pred_train holds the predicted fraud probability
# (column 1 of predict_proba) for the *validation* rows; the name is kept
# because the following cells read it.
y_pred_train = clf.predict_proba(X_val)[:,1]


In [0]:
# Sanity check: number of validation predictions.
y_pred_train.shape

(194879,)

In [0]:
# Sanity check: number of validation labels (should match the predictions).
y_val.shape

(194879,)

In [0]:
# Score the hold-out predictions against the validation labels.
holdout_auc = roc_auc_score(y_val, y_pred_train)
print('ROC AUC {}'.format(holdout_auc))

ROC AUC 0.9283440202051165


In [0]:
# Peek at the predicted fraud probabilities for the validation rows.
y_pred_train

array([0.01855968, 0.01234012, 0.00583749, ..., 0.04052692, 0.01094123,
       0.04641531], dtype=float32)

In [27]:
% time
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
EPOCHS = 3
kf = KFold(n_splits = EPOCHS, shuffle = True)
y_preds = np.zeros(df_sample_submission.shape[0])
y_oof = np.zeros(X_train.shape[0])
i = 0
for tr_idx, val_idx in kf.split(X_train, y_train):
    i += 1
    print("Split {}".format(i))
    clf = xgb.XGBClassifier(
        n_estimators=500,
        max_depth=9,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        tree_method='gpu_hist'
    )
    
    X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]
    y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]
    clf.fit(X_tr, y_tr)
    y_pred_train = clf.predict_proba(X_vl)[:,1]
    y_oof[val_idx] = y_pred_train
    print('ROC AUC {}'.format(roc_auc_score(y_vl, y_pred_train)))
    
    y_preds+= clf.predict_proba(X_test)[:,1] / EPOCHS

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs
Split 1
ROC AUC 0.9631735781893863
Split 2
ROC AUC 0.9629258488568295
Split 3
ROC AUC 0.9631861721026433


In [30]:
# First ten fold-averaged fraud probabilities for the test set.
y_preds[:10]

array([0.00294482, 0.00089532, 0.00268825, 0.00259867, 0.00119732,
       0.00432064, 0.0107135 , 0.01641461, 0.0002049 , 0.00641528])

In [0]:
# Build the prediction frame directly: a single isFraud column aligned to the
# test index. This avoids copying every feature column of X_test when only
# isFraud is kept for the submission.
X_test_preds = pd.DataFrame({'isFraud': y_preds}, index=X_test.index)

In [0]:
# Keep only the prediction column (as a one-column DataFrame) for the submission.
X_test_preds = X_test_preds.loc[:, ['isFraud']]

In [0]:
# Write the submission file; the frame's index is written as the first column.
X_test_preds.to_csv('preds.csv')

In [47]:
# Read the file back to sanity-check the columns that will be submitted.
pd.read_csv('preds.csv').head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.002945
1,3663550,0.000895
2,3663551,0.002688
3,3663552,0.002599
4,3663553,0.001197


In [48]:
# Submit preds.csv to the ieee-fraud-detection competition with a short message.
!kaggle competitions submit -c ieee-fraud-detection -f preds.csv -m "First Baseline model"

100% 14.1M/14.1M [00:00<00:00, 26.1MB/s]
Successfully submitted to IEEE-CIS Fraud Detection