In [0]:
# %% [code]
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import gc,sys
import re
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
pd.options.display.float_format = '{:,.3f}'.format
import os
# Walk the Kaggle input mount and print the full path of every file found there.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# Any results you write to the current directory are saved as output.

# %% [code]
# Identity table of the training set; shape and first rows are printed as a sanity check.
train_identity = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')
print(train_identity.shape, train_identity.head(), sep='\n')

# %% [code]
# Transaction table of the training set (this is the frame that carries `isFraud`).
train_transaction = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')
print(train_transaction.shape, train_transaction.head(), sep='\n')

# %% [code]
# Test-set counterparts of the two tables above.
test_identity = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_identity.csv')
test_transaction = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_transaction.csv')

# %% [code]
import matplotlib.pyplot as plt
%matplotlib inline
# Relative frequency of each isFraud value, drawn as a bar chart.
train_transaction['isFraud'].value_counts(normalize=True).plot(kind='bar')

# %% [code]
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are overwritten in place.
    use_float16 : bool, default True
        When True (the historical behaviour) float columns whose range fits
        are stored as ``float16``. ``float16`` keeps only about three
        significant decimal digits, so pass ``False`` to stop at ``float32``
        when exact values matter (e.g. monetary amounts).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    * Object / string columns become ``category``.
    * Signed-integer and float columns get the smallest candidate dtype whose
      range contains the column's min and max (bounds are inclusive).
    * Every other dtype (bool, unsigned int, datetime, category, pandas
      extension dtypes) is left untouched. The previous implementation sent
      them through the float branch, which turned bool/unsigned columns into
      float16 and raised on unordered categoricals.
    * A column is never widened: if the first fitting candidate is not
      smaller than the current dtype, the column is left as is.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Already categorical: nothing to do (min()/max() would raise on it).
        if str(col_type) == 'category':
            continue
        if pd.api.types.is_object_dtype(col_type) or pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype('category')
            continue
        # Only plain numpy signed ints and floats are downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind == 'i':
            candidates, limits_of = int_candidates, np.iinfo
        else:
            candidates, limits_of = float_candidates, np.finfo

        for candidate in candidates:
            limits = limits_of(candidate)
            # NaN min/max (empty or all-NaN column) fails both comparisons,
            # so such columns keep their dtype.
            if limits.min <= c_min and c_max <= limits.max:
                if np.dtype(candidate).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

# %% [code]
# Downcast all four raw frames, in the same order as before (identity/transaction, train then test).
train_identity, train_transaction, test_identity, test_transaction = [
    reduce_mem_usage(frame)
    for frame in (train_identity, train_transaction, test_identity, test_transaction)
]

# %% [code]
# Bucket TransactionDT into day and week indices (the divisor treats it as a count of seconds).
train_transaction['trn_day'] = train_transaction['TransactionDT'].floordiv(24 * 60 * 60)
train_transaction['trn_week'] = train_transaction['trn_day'].floordiv(7)

# %% [code]
# Mean of isFraud per day index.
train_transaction.groupby('trn_day')['isFraud'].mean().plot(kind='line', figsize=(15, 4))


# %% [code]
# Mean of isFraud per week index.
train_transaction.groupby('trn_week')['isFraud'].mean().plot(kind='line', figsize=(15, 4))

# %% [code]
import datetime as dt
# NOTE(review): the reference date is an assumption made by this notebook; the data only
# provides TransactionDT as an offset.
start_date = '2017-12-01'
startdate = dt.datetime.strptime(start_date, "%Y-%m-%d")

# Vectorised offset -> timestamp conversion (replaces a per-row Python lambda).
train_transaction['Date'] = startdate + pd.to_timedelta(train_transaction['TransactionDT'], unit='s')
train_transaction['_year_month'] = train_transaction['Date'].dt.year.astype(str) + '/' + train_transaction['Date'].dt.month.astype(str)
train_transaction['_weekday'] = train_transaction['Date'].dt.dayofweek
train_transaction['_hour'] = train_transaction['Date'].dt.hour
train_transaction['_day'] = train_transaction['Date'].dt.day

fig,ax = plt.subplots(4, 1, figsize=(16,16))

# One panel per calendar feature: mean isFraud for each value of that feature.
for panel, key in zip(ax, ['_weekday', '_hour', '_day', '_year_month']):
    train_transaction.groupby(key)['isFraud'].mean().plot.bar(ax=panel, title='mean isFraud by ' + key)


# %% [code]
# Zero-pad the hour so the string keys sort chronologically ('0_02' < '0_10').
# Without padding, groupby orders them lexicographically ('0_1', '0_10', '0_11', ..., '0_2'),
# which scrambles the x-axis of the line plot.
train_transaction['weekday_hour'] = (
    train_transaction['_weekday'].astype(str)
    + '_'
    + train_transaction['_hour'].astype(str).str.zfill(2)
)
train_transaction.groupby('weekday_hour')['isFraud'].mean().plot.line(figsize=(16,4))

# %% [code]
# Cut TransactionAmt into ten quantile bins and print the mean isFraud per bin.
train_transaction['amount_qcut10'] = pd.qcut(train_transaction['TransactionAmt'], q=10)
print(train_transaction.groupby(by='amount_qcut10')['isFraud'].mean())

# %% [code]
# Attach the label to the identity rows (merge on the columns the two frames share,
# default inner join), then split the result by class.
train_id_trn = train_identity.merge(train_transaction[['isFraud', 'TransactionID']])
train_id_fraud = train_id_trn.loc[train_id_trn['isFraud'].eq(1)]
train_id_notfraud = train_id_trn.loc[train_id_trn['isFraud'].eq(0)]
print(train_id_fraud.shape, train_id_notfraud.shape)

# %% [code]
def plot1(col, bins=10, density=False):
    """Histogram of an identity column, fraud rows versus non-fraud rows.

    Reads the module-level frames ``train_id_fraud`` and ``train_id_notfraud``.

    Parameters
    ----------
    col : str
        Column to plot.
    bins : int, default 10
        Number of histogram bins (10 is matplotlib's own default).
    density : bool, default False
        Pass True to normalise each series, which makes the two classes
        comparable despite their very different row counts.
    """
    # Drop missing values explicitly: NaNs passed to plt.hist trigger
    # "invalid value" warnings (or a range error on older matplotlib).
    fraud_vals = train_id_fraud[col].dropna()
    normal_vals = train_id_notfraud[col].dropna()
    plt.hist([fraud_vals, normal_vals], bins=bins, density=density,
             color=['green', 'blue'], label=['fraud', 'not fraud'])
    plt.xlabel(col)
    plt.legend()
plot1('id_01')

# %% [code]
# Histogram of id_02 for fraud vs. non-fraud identity rows.
plot1('id_02')

# %% [code]
# Histogram of id_07 for fraud vs. non-fraud identity rows.
plot1('id_07')

# %% [code]
def plot2(col):
    """Bar chart of the share of each ``col`` value within each isFraud group.

    Reads the module-level frame ``train_id_trn``; shares are normalised
    separately per isFraud value.
    """
    shares_by_class = train_id_trn.groupby(['isFraud'])[col].value_counts(normalize=True)
    shares_by_class.plot.bar()
plot2('id_15')

# %% [code]
# Share of each id_16 value within each isFraud group.
plot2('id_16')

# %% [code]
# Histogram of id_17, fraud vs. non-fraud.
plot1('id_17')

# %% [code]
# Histogram of id_19, fraud vs. non-fraud.
plot1('id_19')

# %% [code]
# Histogram of id_20, fraud vs. non-fraud.
plot1('id_20')

# %% [code]
# Share of each id_23 value within each isFraud group.
plot2('id_23')

# %% [code]
# Histogram of id_26, fraud vs. non-fraud.
plot1('id_26')

# %% [code]
# Share of each id_28 value within each isFraud group.
plot2('id_28')

# %% [code]
# Share of each id_29 value within each isFraud group.
plot2('id_29')

# %% [code]
# id_31 is plotted for the fraud rows only (train_id_fraud), largest share first.
# The two commented-out lines are earlier variants kept by the author.
#plot2('id_31')
#train_id_trn.groupby(['isFraud'])['id_31'].value_counts(normalize=True).sort_values(ascending=False).plot.bar(figsize=(15,5))
train_id_fraud.groupby(['isFraud'])['id_31'].value_counts(normalize=True).sort_values(ascending=False).plot.bar(figsize=(20,5))

# %% [code]
# Histogram of id_32, fraud vs. non-fraud.
plot1('id_32')

# %% [code]
# id_33 shares among the fraud rows only, largest share first.
train_id_fraud.groupby(['isFraud'])['id_33'].value_counts(normalize=True).sort_values(ascending=False).plot.bar(figsize=(20,5))

# %% [code]
# Share of each id_34 value within each isFraud group.
plot2('id_34')

# %% [code]
# Share of each id_35 value within each isFraud group.
plot2('id_35')

# %% [code]
# Share of each id_38 value within each isFraud group.
plot2('id_38')

# %% [code]
# Share of each DeviceType value within each isFraud group.
plot2('DeviceType')

# %% [code]
# First 20 DeviceInfo shares among the fraud rows only, then sorted descending for plotting.
train_id_fraud.groupby(['isFraud'])['DeviceInfo'].value_counts(normalize=True)[:20].sort_values(ascending=False).plot.bar(figsize=(20,5))

# %% [code]
# Column-name families used below: C1..C14, D1..D15, M1..M9 and V1..V339.
ccols = ['C' + str(n) for n in range(1, 15)]
dcols = ['D' + str(n) for n in range(1, 16)]
mcols = ['M' + str(n) for n in range(1, 10)]
vcols = ['V' + str(n) for n in range(1, 340)]

# %% [code]
# Class-specific views of the transaction table: f0 = isFraud == 0, f1 = isFraud == 1.
train_trn_f0 = train_transaction.loc[train_transaction['isFraud'].eq(0)]
train_trn_f1 = train_transaction.loc[train_transaction['isFraud'].eq(1)]
print(train_trn_f0.shape, train_trn_f1.shape)

def plotTrnHistByFraud(col, bins=20):
    """Density histogram of ``col`` for non-fraud (royalblue) and fraud (orange) transactions.

    Reads the module-level frames ``train_trn_f0`` and ``train_trn_f1`` and
    draws on a new 8x3 figure. Floating-point "invalid" warnings are
    suppressed while plotting.
    """
    per_class = [train_trn_f0[col], train_trn_f1[col]]
    palette = ['royalblue', 'orange']
    with np.errstate(invalid='ignore'):
        plt.figure(figsize=(8, 3))
        plt.hist(per_class, bins=bins, density=True, color=palette)

def plotTrnLogHistByFraud(col, bins=20):
    """Density histogram of ``log1p(col)`` for non-fraud (royalblue) and fraud (orange) transactions.

    Reads the module-level frames ``train_trn_f0`` and ``train_trn_f1`` and
    draws on a new 8x3 figure. The log transform and the plotting both run
    with floating-point "invalid" warnings suppressed.
    """
    palette = ['royalblue', 'orange']
    with np.errstate(invalid='ignore'):
        plt.figure(figsize=(8, 3))
        logged = [np.log1p(frame[col]) for frame in (train_trn_f0, train_trn_f1)]
        plt.hist(logged, bins=bins, density=True, color=palette)
        
def plotTrnCategoryRateBar(col, topN=np.nan):
    """Side-by-side bars of value shares of ``col`` for non-fraud and fraud transactions.

    Reads the module-level frames ``train_trn_f0`` (normal) and
    ``train_trn_f1`` (fraud).

    Parameters
    ----------
    col : str
        Column whose value distribution is compared.
    topN : int, optional
        When given, keep only the ``topN`` most frequent values among fraud
        rows; the normal shares are then normalised over the normal rows that
        take one of those values. ``None`` or NaN (the default) plots every
        value.
    """
    normal, fraud = train_trn_f0, train_trn_f1
    fraud_share = fraud[col].value_counts(normalize=True)

    if topN is None or pd.isna(topN):
        normal_share = normal[col].value_counts(normalize=True)
        df = pd.DataFrame({'normal': normal_share, 'fraud': fraud_share})
    else:
        fraud_share = fraud_share.iloc[:int(topN)]
        kept = normal.loc[normal[col].isin(fraud_share.index.values), col]
        normal_share = kept.value_counts(normalize=True)
        df = pd.DataFrame({'normal': normal_share, 'fraud': fraud_share})
        # For categorical columns value_counts() lists every category (also
        # those with zero rows), so the index union above would bring back
        # values outside the top N. Keep only the selected ones.
        df = df.dropna(subset=['fraud'])

    df.sort_values('fraud', ascending=False).plot.bar(figsize=(8,3))


# %% [code]
def appendLagDT(df):
    """Return a copy of ``df`` with a ``_date_lag`` column.

    ``_date_lag`` is TransactionDT minus the TransactionDT of the previous row
    that has the same (card1, card2) pair; it is NaN where no previous row
    exists. The input frame is not modified.
    """
    previous_dt = df.groupby(['card1', 'card2'])['TransactionDT'].shift(1)
    return df.assign(_date_lag=df['TransactionDT'] - previous_dt)

train_transaction = appendLagDT(train_transaction)
# Rebuild the per-class views so that they include the new _date_lag column.
train_trn_f0 = train_transaction.loc[train_transaction['isFraud'].eq(0)]
train_trn_f1 = train_transaction.loc[train_transaction['isFraud'].eq(1)]

# %% [code]
# describe() of _date_lag for normal (first column) and fraud (second column) rows, side by side.
pd.concat([frame['_date_lag'].describe() for frame in (train_trn_f0, train_trn_f1)], axis=1)

# %% [code]
# log1p-scaled density histogram of the time since the previous same-card transaction.
plotTrnLogHistByFraud('_date_lag')

# %% [code]
# Transaction amount on the raw scale and on the log1p scale.
plotTrnHistByFraud('TransactionAmt')
plotTrnLogHistByFraud('TransactionAmt')

# %% [code]
# Summary statistics of the amount per class.
print("Normal", train_trn_f0['TransactionAmt'].describe(), sep='\n')
print("Fraud", train_trn_f1['TransactionAmt'].describe(), sep='\n')

# %% [code]
def appendLagAmt(df):
    """Return a copy of ``df`` with ``_amt_lag`` and ``_amt_lag_sig`` columns.

    ``_amt_lag`` is TransactionAmt minus the TransactionAmt of the previous row
    with the same (card1, card2) pair. ``_amt_lag_sig`` labels it as '0' when
    the lag is NaN, '+' when it is >= 0 and '-' otherwise. The input frame is
    not modified.
    """
    def _sign_label(delta):
        if np.isnan(delta):
            return '0'
        return '+' if delta >= 0 else '-'

    previous_amt = df.groupby(['card1', 'card2'])['TransactionAmt'].shift(1)
    out = df.assign(_amt_lag=df['TransactionAmt'] - previous_amt)
    out['_amt_lag_sig'] = out['_amt_lag'].apply(_sign_label)
    return out

train_transaction = appendLagAmt(train_transaction)
# Rebuild the per-class views so that they include _amt_lag and _amt_lag_sig.
train_trn_f0 = train_transaction.loc[train_transaction['isFraud'].eq(0)]
train_trn_f1 = train_transaction.loc[train_transaction['isFraud'].eq(1)]

# %% [code]
# Distribution of the amount change versus the previous same-card transaction, and of its sign label.
plotTrnHistByFraud('_amt_lag')
plotTrnCategoryRateBar('_amt_lag_sig')

# %% [code]
# Share of each ProductCD value, normal vs. fraud.
plotTrnCategoryRateBar('ProductCD')

# %% [code]

# Largest TransactionAmt per ProductCD, broadcast to every row, then shown once per product (descending).
train_transaction['_amount_max_ProductCD'] = train_transaction.groupby(['ProductCD'])['TransactionAmt'].transform('max')
train_transaction[['ProductCD','_amount_max_ProductCD']].drop_duplicates().sort_values(by='_amount_max_ProductCD', ascending=False)

# %% [code]
# card1: 15 most frequent fraud values as bars, plus a 30-bin histogram of the raw values.
plotTrnCategoryRateBar('card1', 15)
plotTrnHistByFraud('card1', bins=30)

# %% [code]
# card2: same pair of views as for card1.
plotTrnCategoryRateBar('card2', 15)
plotTrnHistByFraud('card2', bins=30)

# %% [code]
# card3: 10 most frequent fraud values.
plotTrnCategoryRateBar('card3', 10)

# %% [code]
# card4: all values.
plotTrnCategoryRateBar('card4')

# %% [code]
# card5: 10 most frequent fraud values.
plotTrnCategoryRateBar('card5', 10)

# %% [code]
# card6: all values.
plotTrnCategoryRateBar('card6')

# %% [code]
# Row count, followed by the number of distinct values of card1, card2, card3 and card5.
print(len(train_transaction))
print(*(train_transaction[name].nunique() for name in ('card1', 'card2', 'card3', 'card5')))

# Composite card key: the four card columns rendered as strings and joined with '_'.
_card_parts = [train_transaction[name].astype(str) for name in ('card1', 'card2', 'card3', 'card5')]
train_transaction['card_n'] = _card_parts[0].str.cat(_card_parts[1:], sep='_')
print('unique cards:', train_transaction['card_n'].nunique())

# %% [code]
# Row counts of the 40 most frequent composite keys.
train_transaction['card_n'].value_counts().sort_values(ascending=False).iloc[:40].plot.bar(figsize=(15,3))

# %% [code]
# Mean isFraud per composite key, highest first.
train_transaction.groupby('card_n')['isFraud'].mean().sort_values(ascending=False)

# %% [code]
# addr1: 20 most frequent fraud values as bars, plus a 30-bin histogram of the raw values.
plotTrnCategoryRateBar('addr1', 20)
plotTrnHistByFraud('addr1', bins=30)

# %% [code]
# dist1 / dist2: 20 most frequent fraud values each.
plotTrnCategoryRateBar('dist1', 20)
plotTrnCategoryRateBar('dist2', 20)

# %% [code]
# P_emaildomain / R_emaildomain: 10 most frequent fraud values each.
plotTrnCategoryRateBar('P_emaildomain',10)
plotTrnCategoryRateBar('R_emaildomain',10)

# %% [code]
# Replace missing e-mail domains with an explicit 'unknown' category.
# The previous form called fillna(..., inplace=True) on the temporary Series returned by
# add_categories(), so train_transaction itself was never changed. Assign the result back.
for _email_col in ('P_emaildomain', 'R_emaildomain'):
    _domains = train_transaction[_email_col]
    # Guard keeps the cell re-runnable: add_categories raises if the category already exists.
    if 'unknown' not in _domains.cat.categories:
        _domains = _domains.cat.add_categories('unknown')
    train_transaction[_email_col] = _domains.fillna('unknown')


# %% [code]
# Prefix = part of the purchaser e-mail domain before the first '.', with missing domains
# mapped to 'unknown'. assign() builds a new frame instead of writing a column into the
# filtered slice train_trn_f1 (which triggers SettingWithCopyWarning).
train_trn_f1 = train_trn_f1.assign(
    P_emaildomain_prefix=train_trn_f1['P_emaildomain'].astype(object).fillna('unknown').str.split('.').str[0]
)
pd.crosstab(train_trn_f1['P_emaildomain_prefix'], train_trn_f1['ProductCD']).T

# %% [code]
# Same prefix for every transaction. Missing domains are handled here as well, so this cell
# does not depend on an earlier fillna having taken effect.
train_transaction['P_emaildomain_prefix'] = (
    train_transaction['P_emaildomain'].astype(object).fillna('unknown').str.split('.').str[0]
)
ct = pd.crosstab(train_transaction['P_emaildomain_prefix'], train_transaction['ProductCD'])
# Keep the 15 prefixes with the most ProductCD == 'W' rows (ascending, so the largest is drawn last).
ct = ct.sort_values(by='W').tail(15)
ct.plot.barh(stacked=True, figsize=(12,4))

# %% [code]
# C1..C14: ten most frequent fraud values of each column, normal vs. fraud.
for c_name in ccols:
    plotTrnCategoryRateBar(c_name, 10)

# %% [code]
# Selected describe() rows for the C columns.
c_summary = train_transaction[ccols].describe()
c_summary.loc[['count','mean','std','min','max']]

# %% [code]
plt.figure(figsize=(10,5))

# Correlation matrix of isFraud with the C columns, annotated to two decimals.
corr = train_transaction[['isFraud'] + ccols].corr()
sns.heatmap(corr, annot=True, fmt='.2f')

# %% [code]
# D1..D15: ten most frequent fraud values of each column, normal vs. fraud.
for d_name in dcols:
    plotTrnCategoryRateBar(d_name, 10)

# %% [code]
# Selected describe() rows for the D columns.
d_summary = train_transaction[dcols].describe()
d_summary.loc[['count','mean','std','min','max']]

# %% [code]
# D1 against TransactionDT: non-fraud rows in the default colour, fraud rows in red.
plt.figure(figsize=(12,4))

plt.scatter(train_trn_f0['TransactionDT'], train_trn_f0['D1'], s=2)
plt.scatter(train_trn_f1['TransactionDT'], train_trn_f1['D1'], s=2, c='r')


# %% [code]
# D1 against TransactionDT for the test transactions (green).
plt.scatter(test_transaction['TransactionDT'], test_transaction['D1'], s=2, c='g')

# %% [code]
# D15 against TransactionDT: train non-fraud (default colour), train fraud (red) and test (green) on one figure.
plt.figure(figsize=(12,4))

plt.scatter(train_trn_f0['TransactionDT'], train_trn_f0['D15'], s=2)
plt.scatter(train_trn_f1['TransactionDT'], train_trn_f1['D15'], s=2, c='r')
plt.scatter(test_transaction['TransactionDT'], test_transaction['D15'], s=2, c='g')

# %% [code]
# Correlation matrix of isFraud with the D columns, annotated to two decimals.
plt.figure(figsize=(10,5))

corr = train_transaction[['isFraud'] + dcols].corr()
sns.heatmap(corr, annot=True, fmt='.2f')

# %% [code]
# Number of missing D columns per row: left panel isFraud == 0, right panel isFraud == 1.
fig, ax = plt.subplots(1, 2, figsize=(15, 3))
for panel, label in enumerate((0, 1)):
    class_rows = train_transaction['isFraud'] == label
    missing_per_row = train_transaction.loc[class_rows, dcols].isnull().sum(axis=1)
    missing_per_row.to_frame().hist(ax=ax[panel], bins=20)

# %% [code]
# M1..M4: share of every value, normal vs. fraud.
for m_name in mcols[:4]:
    plotTrnCategoryRateBar(m_name)

# %% [code]
# M5..M9: share of every value, normal vs. fraud.
for m_name in mcols[4:]:
    plotTrnCategoryRateBar(m_name)

# %% [code]
# Number of missing V columns per row: left panel isFraud == 0, right panel isFraud == 1.
fig, ax = plt.subplots(1, 2, figsize=(15, 3))
for panel, label in enumerate((0, 1)):
    class_rows = train_transaction['isFraud'] == label
    missing_per_row = train_transaction.loc[class_rows, vcols].isnull().sum(axis=1)
    missing_per_row.to_frame().hist(ax=ax[panel], bins=20)

# %% [code]
# Re-read the raw CSVs: the frames loaded at the top of the notebook have been downcast and
# extended with EDA-only columns in the meantime.
train_id = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')
train_trn = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')
test_id = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_identity.csv')
test_trn = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_transaction.csv')

id_cols = list(train_id.columns.values)
trn_cols = list(train_trn.drop('isFraud', axis=1).columns.values)

# Left-join identity onto transactions so that transactions without an identity row are kept.
X_train = pd.merge(train_trn[trn_cols + ['isFraud']], train_id[id_cols], how='left')
X_train = reduce_mem_usage(X_train)
X_test = pd.merge(test_trn[trn_cols], test_id[id_cols], how='left')
X_test = reduce_mem_usage(X_test)

# Keep the ids aside; they are needed again only for the submission file.
X_train_id = X_train.pop('TransactionID')
X_test_id = X_test.pop('TransactionID')
del train_id,train_trn,test_id,test_trn

# Stack train and test so features are engineered on both at once; test rows end up with
# isFraud == NaN. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
all_data = pd.concat([X_train, X_test], sort=False, ignore_index=True)

# %% [code]
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
vcols=[f'V{i}' for i in range(1,340)]

sc = MinMaxScaler()

# Fixed seed: with only two components requested from a matrix this large, PCA's default
# solver choice can be the randomized one, so the projection is not reproducible unseeded.
pca = PCA(n_components=2, random_state=42) #0.99
# Missing values are encoded as -1 before min-max scaling and projection.
vcol_pca = pca.fit_transform(sc.fit_transform(all_data[vcols].fillna(-1)))

all_data['_vcol_pca0'] = vcol_pca[:,0]
all_data['_vcol_pca1'] = vcol_pca[:,1]
# Count of missing V columns per row, computed before the V columns are dropped.
all_data['_vcol_nulls'] = all_data[vcols].isnull().sum(axis=1)

# Rebind instead of dropping in place, so the statement produces a new frame.
all_data = all_data.drop(vcols, axis=1)

# %% [code]
import datetime

# NOTE(review): the reference date is an assumption made by this notebook; the data only
# provides TransactionDT as an offset.
START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, "%Y-%m-%d")
# Vectorised offset -> timestamp conversion (replaces a per-row Python lambda).
all_data['Date'] = startdate + pd.to_timedelta(all_data['TransactionDT'], unit='s')
# Weekday and hour are stored as strings so they are label-encoded as categories later on.
all_data['_weekday'] = all_data['Date'].dt.dayofweek.astype(str)
all_data['_hour'] = all_data['Date'].dt.hour.astype(str)
all_data['_day'] = all_data['Date'].dt.day
all_data['_weekday__hour'] = all_data['_weekday'] + all_data['_hour']

# Relative activity of each day-of-month: its row count divided by the mean count over days.
cnt_day = all_data['_day'].value_counts()
cnt_day = cnt_day / cnt_day.mean()
all_data['_count_rate'] = all_data['_day'].map(cnt_day)

all_data = all_data.drop(['TransactionDT','Date','_day'], axis=1)

# %% [code]
# String keys built by joining columns with '__'.
_addr1 = all_data['addr1'].astype(str)
_card1 = all_data['card1'].astype(str)
_card2 = all_data['card2'].astype(str)

# astype(object) keeps the concatenation working if P_emaildomain is still categorical;
# rows with a missing domain stay NaN, as before.
all_data['_P_emaildomain__addr1'] = all_data['P_emaildomain'].astype(object) + '__' + _addr1
all_data['_card1__card2'] = _card1 + '__' + _card2
all_data['_card1__addr1'] = _card1 + '__' + _addr1
all_data['_card2__addr1'] = _card2 + '__' + _addr1
all_data['_card12__addr1'] = all_data['_card1__card2'] + '__' + _addr1
# Previously this was an exact copy of _card12__addr1, i.e. a duplicated feature.
# NOTE(review): "card_all" is taken to mean card1, card2, card3 and card5, the same columns
# used for the composite card key elsewhere in this notebook -- confirm this is the intent.
all_data['_card_all__addr1'] = (
    all_data['_card1__card2']
    + '__' + all_data['card3'].astype(str)
    + '__' + all_data['card5'].astype(str)
    + '__' + _addr1
)

# %% [code]
_amt = all_data['TransactionAmt']
# Fractional part scaled by 1000 and truncated to an integer.
all_data['_amount_decimal'] = ((_amt - _amt.astype(int)) * 1000).astype(int)
# Number of digits after the decimal point once trailing zeros are stripped.
all_data['_amount_decimal_len'] = _amt.apply(lambda x: len(re.sub('0+$', '', str(x)).split('.')[1]))
# Fractional part as a float, rebuilt from the printed digits.
# The old pattern '^[0-9]|\.|0+$' removed only the FIRST integer digit, so 117.5 became
# 0.175 instead of 0.5. The pattern below removes the whole integer part and the point.
# NOTE(review): this changes the feature values -- confirm the old output was not intended.
all_data['_amount_fraction'] = _amt.apply(lambda x: float('0.' + re.sub(r'^[0-9]*\.|0+$', '', str(x))))
all_data[['TransactionAmt','_amount_decimal','_amount_decimal_len','_amount_fraction']].head(10)

# %% [code]
cols = ['ProductCD','card1','card2','card5','card6','P_emaildomain','_card_all__addr1']


# amount mean&std
for f in cols:
    # Compute the per-group statistics once and reuse them for all three features.
    grouped_amt = all_data.groupby([f])['TransactionAmt']
    group_mean = grouped_amt.transform('mean')
    group_std = grouped_amt.transform('std')
    # Amount relative to its group's mean / standard deviation.
    all_data[f'_amount_mean_{f}'] = all_data['TransactionAmt'] / group_mean
    all_data[f'_amount_std_{f}'] = all_data['TransactionAmt'] / group_std
    # Z-score of the amount within its group. The earlier expression subtracted and divided
    # by the two ratio columns above instead of the group mean and std.
    all_data[f'_amount_pct_{f}'] = (all_data['TransactionAmt'] - group_mean) / group_std

# freq encoding
for f in cols:
    # Row count of each value (NaN counted as its own value), mapped back onto the rows.
    vc = all_data[f].value_counts(dropna=False)
    all_data[f'_count_{f}'] = all_data[f].map(vc)


# %% [code]
# id_12 .. id_38 are treated as categorical: cast to string, missing values -> 'unknown'.
cat_cols = [f'id_{i}' for i in range(12,39)]
for i in cat_cols:
    if i in all_data.columns:
        raw = all_data[i]
        # Take the missing-value mask from the raw column: after astype(str) NaN has already
        # become the string 'nan', which is why the previous fillna('unknown') never matched.
        all_data[i] = raw.astype(str).where(raw.notna(), 'unknown')

# Integer-encode every remaining string column (the label column is excluded).
enc_cols = []
# Series.items() replaces iteritems(), which was removed in pandas 2.0.
for i, t in all_data.loc[:, all_data.columns != 'isFraud'].dtypes.items():
    if pd.api.types.is_object_dtype(t) or pd.api.types.is_string_dtype(t):
        enc_cols.append(i)
        # factorize() assigns codes in order of first appearance; missing values get -1.
        all_data[i] = pd.factorize(all_data[i])[0]
print(enc_cols)

# %% [code]
# Rows with a label form the training set; rows whose isFraud is missing form the test set.
_has_label = all_data['isFraud'].notnull()
X_train = all_data[_has_label]
X_test = all_data[~_has_label].drop('isFraud', axis=1)
Y_train = X_train.pop('isFraud')

# %% [code]
print(X_train.shape, X_test.shape)

# %% [code]
%%time
from mlxtend.classifier import StackingClassifier
import lightgbm as lgb

# First base learner: 256 leaves, learning rate 0.01, 3000 trees.
params = dict(
    learning_rate=0.01,
    objective='binary',
    metric='auc',
    num_threads=-1,
    num_leaves=256,
    verbose=1,
    random_state=42,
    bagging_fraction=1,
    feature_fraction=0.85,
)

# Prediction buffers, one slot per train / test row.
oof_preds = np.zeros(len(X_train))
sub_preds = np.zeros(len(X_test))

clf1 = lgb.LGBMClassifier(n_estimators=3000, **params)

# %% [code]
# Second base learner: 512 leaves, learning rate 0.005, 4000 trees.
params2 = dict(
    learning_rate=0.005,
    objective='binary',
    metric='auc',
    num_threads=-1,
    num_leaves=512,
    verbose=1,
    random_state=41,
    bagging_fraction=1,
    feature_fraction=0.85,
)
clf2 = lgb.LGBMClassifier(n_estimators=4000, **params2)

# %% [code]
# Third model: 384 leaves, learning rate 0.007, 3500 trees.
params3 = dict(
    learning_rate=0.007,
    objective='binary',
    metric='auc',
    num_threads=-1,
    num_leaves=384,
    verbose=1,
    random_state=41,
    bagging_fraction=1,
    feature_fraction=0.85,
)
clf3 = lgb.LGBMClassifier(n_estimators=3500, **params3)

# %% [code]
# Stack clf1 and clf2; clf3 is the meta-classifier trained on their predicted probabilities.
sclf = StackingClassifier(classifiers=[clf1, clf2], use_probas=True, meta_classifier=clf3)
sclf.fit(X_train, Y_train)
# StackingClassifier.predict_proba takes only X. The LightGBM-specific `num_iteration`
# keyword and the `best_iteration_` attribute do not exist on the stacker, so the previous
# calls failed before producing any prediction.
# NOTE: oof_preds holds in-sample predictions on the training rows, not out-of-fold ones.
oof_preds = sclf.predict_proba(X_train)[:, 1]
sub_preds = sclf.predict_proba(X_test)[:, 1]

# %% [code]
# Submission file: one row per test transaction with its predicted fraud probability.
submission = pd.DataFrame({'TransactionID': X_test_id, 'isFraud': sub_preds})
submission.to_csv('submission1.csv', index=False)