data source: https://www.kaggle.com/competitions/ieee-fraud-detection/data

In [4]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Transaction

In [7]:
# Relative path to the directory holding the raw competition CSV files.
folder = "../data/raw/ieee-fraud-detection/"

# Load the training transactions table into memory.
train_transaction_df = pd.read_csv(f"{folder}train_transaction.csv")

# Only the last expression of a cell is displayed, so the row count is the
# single output here; previewing the rows is done in a dedicated cell.
len(train_transaction_df)

590540

### Transactions table


- **TransactionDT**: timedelta from a given reference datetime (not an actual timestamp)
- **TransactionAmt**: transaction payment amount in USD
- **ProductCD**: product code, the product for each transaction
- **card1 - card6**: payment card information, such as card type, card category, issue bank, country, etc.
- **addr**: address
- **dist**: distance
- **P_emaildomain, R_emaildomain**: purchaser and recipient email domain
- **C1-C14**: counting, such as how many addresses are found to be associated with the payment card, etc. The actual meaning is masked.
- **D1-D15**: timedelta, such as days between previous transaction, etc.
- **M1-M9**: match, such as names on card and address, etc.
- **Vxxx**: Vesta engineered rich features, including ranking, counting, and other entity relations.


**Categorical Features:**
- ProductCD
- card1 - card6
- addr1, addr2
- P_emaildomain
- R_emaildomain
- M1 - M9

In [8]:
# Preview the first five rows of the transactions table.
train_transaction_df.head(n=5)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Print the full list of column names, followed by the dtype of each column.
print(train_transaction_df.columns.tolist())
print(train_transaction_df.dtypes)

['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V

In [17]:
# Keep only the columns whose name contains "card" (card1 ... card6).
card_df = train_transaction_df.filter(like="card", axis="columns")
card_df

Unnamed: 0,card1,card2,card3,card4,card5,card6
0,13926,,150.0,discover,142.0,credit
1,2755,404.0,150.0,mastercard,102.0,credit
2,4663,490.0,150.0,visa,166.0,debit
3,18132,567.0,150.0,mastercard,117.0,debit
4,4497,514.0,150.0,mastercard,102.0,credit
...,...,...,...,...,...,...
590535,6550,,150.0,visa,226.0,debit
590536,10444,225.0,150.0,mastercard,224.0,debit
590537,12037,595.0,150.0,mastercard,224.0,debit
590538,7826,481.0,150.0,mastercard,224.0,debit


In [23]:
# Both address columns refer to the purchaser:
#   addr1 - billing region
#   addr2 - billing country
addr_df = train_transaction_df.loc[:, ["addr1", "addr2"]]
addr_df

Unnamed: 0,addr1,addr2
0,315.0,87.0
1,325.0,87.0
2,330.0,87.0
3,476.0,87.0
4,420.0,87.0
...,...,...
590535,272.0,87.0
590536,204.0,87.0
590537,231.0,87.0
590538,387.0,87.0


In [24]:
# Select the two distance columns so they can be inspected side by side.
dist_df = train_transaction_df.loc[:, ["dist1", "dist2"]]
dist_df

Unnamed: 0,dist1,dist2
0,19.0,
1,,
2,287.0,
3,,
4,,
...,...,...
590535,48.0,
590536,,
590537,,
590538,3.0,


In [22]:
# D1-D15 are timedelta features (e.g. days between previous transactions).
# NOTE: the description that used to sit on this cell -- "distances between
# (not limited) billing address, mailing address, zip code, IP address, phone
# area, etc." -- appears to describe the dist1/dist2 columns, not D1-D15.
d_df = train_transaction_df[[f"D{i}" for i in range(1, 16)]]
d_df

Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15
0,14.0,,13.0,,,,,,,13.0,13.0,,,,0.0
1,0.0,,,0.0,,,,,,0.0,,,,,0.0
2,0.0,,,0.0,,,,,,0.0,315.0,,,,315.0
3,112.0,112.0,0.0,94.0,0.0,,,,,84.0,,,,,111.0
4,0.0,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,29.0,29.0,30.0,,,,,,,56.0,56.0,,,,56.0
590536,0.0,,,0.0,,,,,,0.0,0.0,,,,0.0
590537,0.0,,,0.0,,,,,,0.0,0.0,,,,0.0
590538,22.0,22.0,0.0,22.0,0.0,,,,,22.0,22.0,,,,22.0


In [25]:
# Purchaser (P_) and recipient (R_) email domains, side by side.
email_df = train_transaction_df.loc[:, ["P_emaildomain", "R_emaildomain"]]
email_df

Unnamed: 0,P_emaildomain,R_emaildomain
0,,
1,gmail.com,
2,outlook.com,
3,yahoo.com,
4,gmail.com,
...,...,...
590535,,
590536,gmail.com,
590537,gmail.com,
590538,aol.com,
