In [4]:
PERCENT_MISSING_DROP = 60
CORRELATION_DROP = 90

In [5]:
import numpy as np 
import pandas as pd 
import gc
import sklearn
import pickle

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [6]:
folder_path = '../input/ieee-fraud-detection/'
print('Loading data...')

train_identity = pd.read_csv(f'{folder_path}train_identity.csv', index_col='TransactionID')
print('\tSuccessfully loaded train_identity!')

train_transaction = pd.read_csv(f'{folder_path}train_transaction.csv', index_col='TransactionID')
print('\tSuccessfully loaded train_transaction!')

test_identity = pd.read_csv(f'{folder_path}test_identity.csv', index_col='TransactionID')
print('\tSuccessfully loaded test_identity!')

test_transaction = pd.read_csv(f'{folder_path}test_transaction.csv', index_col='TransactionID')
print('\tSuccessfully loaded test_transaction!')

sub = pd.read_csv(f'{folder_path}sample_submission.csv')
print('\tSuccessfully loaded sample_submission!')
print('Data was successfully loaded!\n')

Loading data...
	Successfully loaded train_identity!
	Successfully loaded train_transaction!
	Successfully loaded test_identity!
	Successfully loaded test_transaction!
	Successfully loaded sample_submission!
Data was successfully loaded!



In [7]:
print('Merging data...')
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')
print('Data was successfully merged!\n')
del train_identity, train_transaction, test_identity, test_transaction

Merging data...
Data was successfully merged!



In [8]:
print("fraudulent transaction : ", len(train.loc[train.isFraud == 1])*100/len(train))

fraudulent transaction :  3.499000914417313


In [9]:
def reduce_mem_usage(props):
    start_mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings
            
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            mean = props[col].mean()
            
            if not np.isfinite(props[col]).all(): 
                NAlist.append(col)
                props[col].fillna(mean, inplace=True)  
                   
            # test if column can be converted to an integer
            asint = props[col].fillna(0).astype(np.int64)
            result = (props[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True

            
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            else:
                props[col] = props[col].astype(np.float32)
            
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

In [10]:
def cat_to_num(table):
    c = (table.dtypes == 'object')
    n = (table.dtypes != 'object')
    cat_id_cols = list(c[c].index)
    num_id_cols = list(n[n].index)
    
    print(cat_id_cols, "\n")
    print("number categorical identity features: ", len(cat_id_cols), "\n\n")
    print(num_id_cols, "\n")
    print("number numerical identity features: \n", len(num_id_cols))
    
    print("converting cat. in num. feature")
    numerical = pd.get_dummies(table[cat_id_cols], dummy_na = False)
    table = pd.concat([table, numerical], axis = 1)
    print("convertion done.")
    
    for elem in cat_id_cols:
        del table[elem]
        
    return table

In [11]:
deleted_column = []

def format(table):

    percent_missing = table.isnull().sum() * 100 / len(table)
    lst = list(percent_missing.index)
    lst2 = []
        
    for elem in range(len(lst)):
        if percent_missing.iloc[elem] > PERCENT_MISSING_DROP:
            lst2.append(lst[elem])

    for elem in lst2:
        deleted_column.append(elem)
        del table[elem]
        
    print("supression done")
 
    print("calculating correlation matrix ...")
    #correlation matrix 
    corr_matrix = table.corr().abs()
    print("calculation done.")
    # Select upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

    # Find index of feature columns with correlation greater than 0.95
    to_drop = [column for column in upper.columns if any(upper[column] > (CORRELATION_DROP/100))]
    print("column to drop : ", to_drop)
    
    #del highly coralated column
    for elem in to_drop:
        deleted_column.append(elem)
        del table[elem]
    
    table.reset_index(drop=True, inplace=True)
    
    return table, deleted_column

In [12]:
train.info()
train.sample(frac=1)
test.sample(frac=1)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 590540 entries, 2987000 to 3577539
Columns: 433 entries, isFraud to DeviceInfo
dtypes: float64(399), int64(3), object(31)
memory usage: 1.9+ GB


Unnamed: 0_level_0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id-31,id-32,id-33,id-34,id-35,id-36,id-37,id-38,DeviceType,DeviceInfo
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3710948,19998889,44.000,W,15651,417.0,150.0,visa,226.0,debit,330.0,...,,,,,,,,,,
3700768,19673034,35.351,C,1320,555.0,117.0,visa,137.0,credit,,...,chrome 67.0 for android,,,,F,F,T,F,mobile,SM-G950F Build/R16NW
4041735,31283864,466.020,W,9500,321.0,150.0,visa,226.0,debit,272.0,...,,,,,,,,,,
4118913,33219452,25.000,H,13143,170.0,150.0,mastercard,102.0,credit,123.0,...,mobile safari 12.0,32.0,1334x750,match_status:2,T,F,F,T,mobile,iOS Device
4134255,33457600,14.386,C,16136,204.0,185.0,visa,138.0,debit,,...,mobile safari 12.0,,,,F,F,F,F,mobile,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3967006,28918916,30.950,W,15813,251.0,150.0,visa,226.0,debit,441.0,...,,,,,,,,,,
4014243,30401426,54.000,W,10908,582.0,150.0,visa,226.0,debit,476.0,...,,,,,,,,,,
3720858,20360459,57.950,W,11794,555.0,150.0,visa,226.0,debit,170.0,...,,,,,,,,,,
3769738,21994939,72.000,W,1939,360.0,150.0,visa,166.0,debit,269.0,...,,,,,,,,,,


In [13]:
train.head()

Unnamed: 0_level_0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,
2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,
2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,
2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [14]:
y_train = train['isFraud']
del train['isFraud']
x_train, deleted_column = format(train)
del train
x_train = cat_to_num(x_train)
x_train.head()

supression done
calculating correlation matrix ...
calculation done.
column to drop :  ['C2', 'C4', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C14', 'D2', 'V5', 'V11', 'V13', 'V16', 'V18', 'V20', 'V21', 'V22', 'V28', 'V30', 'V31', 'V32', 'V33', 'V34', 'V36', 'V40', 'V42', 'V43', 'V45', 'V48', 'V49', 'V50', 'V51', 'V52', 'V54', 'V57', 'V58', 'V59', 'V60', 'V63', 'V64', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V76', 'V79', 'V80', 'V81', 'V84', 'V85', 'V88', 'V89', 'V90', 'V91', 'V92', 'V93', 'V94', 'V96', 'V97', 'V101', 'V102', 'V103', 'V105', 'V106', 'V113', 'V126', 'V127', 'V128', 'V132', 'V133', 'V134', 'V137', 'V279', 'V280', 'V292', 'V293', 'V294', 'V295', 'V296', 'V297', 'V298', 'V299', 'V301', 'V302', 'V304', 'V306', 'V307', 'V308', 'V309', 'V315', 'V316', 'V317', 'V318', 'V321']
['ProductCD', 'card4', 'card6', 'P_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9'] 

number categorical identity features:  13 


['TransactionDT', 'TransactionAmt', 'card1

Unnamed: 0,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,C1,...,M5_F,M5_T,M6_F,M6_T,M7_F,M7_T,M8_F,M8_T,M9_F,M9_T
0,86400,68.5,13926,,150.0,142.0,315.0,87.0,19.0,1.0,...,1,0,0,1,0,0,0,0,0,0
1,86401,29.0,2755,404.0,150.0,102.0,325.0,87.0,,1.0,...,0,1,0,1,0,0,0,0,0,0
2,86469,59.0,4663,490.0,150.0,166.0,330.0,87.0,287.0,1.0,...,1,0,1,0,1,0,1,0,1,0
3,86499,50.0,18132,567.0,150.0,117.0,476.0,87.0,,2.0,...,0,1,1,0,0,0,0,0,0,0
4,86506,50.0,4497,514.0,150.0,102.0,420.0,87.0,,1.0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
for elem in deleted_column:
    if elem.startswith('id'):
        num = elem.replace('id_','')
        del test['id-'+num]
    else:
        del test[elem]
        
test = cat_to_num(test)
test.head()

['ProductCD', 'card4', 'card6', 'P_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9'] 

number categorical identity features:  13 


['TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2', 'dist1', 'C1', 'C3', 'C5', 'C13', 'D1', 'D3', 'D4', 'D5', 'D10', 'D11', 'D15', 'V1', 'V2', 'V3', 'V4', 'V6', 'V7', 'V8', 'V9', 'V10', 'V12', 'V14', 'V15', 'V17', 'V19', 'V23', 'V24', 'V25', 'V26', 'V27', 'V29', 'V35', 'V37', 'V38', 'V39', 'V41', 'V44', 'V46', 'V47', 'V53', 'V55', 'V56', 'V61', 'V62', 'V65', 'V66', 'V67', 'V75', 'V77', 'V78', 'V82', 'V83', 'V86', 'V87', 'V95', 'V98', 'V99', 'V100', 'V104', 'V107', 'V108', 'V109', 'V110', 'V111', 'V112', 'V114', 'V115', 'V116', 'V117', 'V118', 'V119', 'V120', 'V121', 'V122', 'V123', 'V124', 'V125', 'V129', 'V130', 'V131', 'V135', 'V136', 'V281', 'V282', 'V283', 'V284', 'V285', 'V286', 'V287', 'V288', 'V289', 'V290', 'V291', 'V300', 'V303', 'V305', 'V310', 'V311', 'V312', 'V313', 'V314', 'V319', 'V320']

Unnamed: 0_level_0,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,C1,...,M5_F,M5_T,M6_F,M6_T,M7_F,M7_T,M8_F,M8_T,M9_F,M9_T
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3663549,18403224,31.95,10409,111.0,150.0,226.0,170.0,87.0,1.0,6.0,...,0,0,1,0,0,1,0,1,0,1
3663550,18403263,49.0,4272,111.0,150.0,226.0,299.0,87.0,4.0,3.0,...,0,0,1,0,0,0,0,0,0,0
3663551,18403310,171.0,4476,574.0,150.0,226.0,472.0,87.0,2635.0,2.0,...,1,0,1,0,1,0,1,0,1,0
3663552,18403310,284.95,10989,360.0,150.0,166.0,205.0,87.0,17.0,5.0,...,0,0,0,1,0,0,0,0,0,0
3663553,18403317,67.95,18018,452.0,150.0,117.0,264.0,87.0,6.0,6.0,...,0,0,1,0,1,0,0,1,0,1


In [16]:
import datetime

START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')

x_train['TransactionDT'] = x_train['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x)))
x_train['dow'] = x_train['TransactionDT'].dt.dayofweek
x_train['hour'] = x_train['TransactionDT'].dt.hour
x_train['day'] = x_train['TransactionDT'].dt.day

test['TransactionDT'] = test['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x)))
test['dow'] = test['TransactionDT'].dt.dayofweek
test['hour'] = test['TransactionDT'].dt.hour
test['day'] = test['TransactionDT'].dt.day

In [17]:
x_train['TransactionAmt'].round()
test['TransactionAmt'].round()

TransactionID
3663549     32.0
3663550     49.0
3663551    171.0
3663552    285.0
3663553     68.0
           ...  
4170235     95.0
4170236     12.0
4170237     49.0
4170238    202.0
4170239     24.0
Name: TransactionAmt, Length: 506691, dtype: float64

In [18]:
del x_train['TransactionDT']
del test['TransactionDT']

In [19]:
x_train, na = reduce_mem_usage(x_train)
test, na = reduce_mem_usage(test)

Memory usage of properties dataframe is : 564.872501373291  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  295.10792541503906  MB
This is  52.24328050977642 % of the initial size
Memory usage of properties dataframe is : 488.5335931777954  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  228.07898712158203  MB
This is  46.6864490603363 % of the initial size


In [20]:
y_train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [28]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=400, min_samples_leaf=20, n_jobs=-1, class_weight={0:0.5, 1:14}, verbose=1, max_features=0.5).fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 16.9min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 34.9min finished


In [29]:
prediction = pd.DataFrame(rfc.predict_proba(test))

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    9.3s
[Parallel(n_jobs=4)]: Done 400 out of 400 | elapsed:   19.3s finished


In [30]:
del prediction[0]

In [31]:
submission = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')
submission['isFraud'] = prediction
submission.to_csv('submission.csv', index=False)

In [27]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_features': [0.2, 0.3, 0.4, 0.5]
}

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 3)
CV_rfc.fit(x_train, y_train)
print(CV_rfc.best_params_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:  9.3min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    3.6s
[Parallel(n_jobs=4)]: Done 400 out of 400 | elapsed:    7.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:  9.2min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | e

{'max_features': 0.5}


[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 35.6min finished
