### Data Description
The dataset we use is a synthetic dataset from  IEEE-CIS dataset (https://www.kaggle.com/competitions/ieee-fraud-detection/data) created to mimic typical examples of financial transactions dataset that many companies have. The dataset consists of two tables:

* **Transactions** table: Records transactions and metadata about transactions between two users. Examples of columns include the product code for the transaction and features on the card used for the transaction, and a column indicating whether the corresponded transcation is fraud or not.
* **Identity** table: Contains information about the identity users performing transactions. Examples of columns here include the device type and device ids used.

The two tables can be joined together using the unique identified-key column **TransactionID**.

In [65]:
from faker import Faker
import datetime
import itertools
import numpy as np
import pandas as pd

In [32]:
Faker.seed(0)
np.random.seed(0)

NUM_UNIQUE_CCS = 40*10**3
START_TRANS_DATE = datetime.datetime(2012, 1, 15)
END_TRANS_DATE = datetime.datetime(2012, 3, 15)

num_unique_ccs=NUM_UNIQUE_CCS
start_trans_date=START_TRANS_DATE
end_trans_date=END_TRANS_DATE

In [33]:
fake = Faker() #en_US
cc_nums = [fake.credit_card_number() for _ in range(num_unique_ccs)]
cc_types = [fake.credit_card_provider()for _ in range(num_unique_ccs)]
num_trans_per_cc = np.ceil(np.random.exponential(scale=3, size=num_unique_ccs)).astype(np.int32)
cc_ipv4 = [fake.ipv4() for _ in range(num_unique_ccs)]
cc_phone_number = [fake.phone_number()for _ in range(num_unique_ccs)]
cc_device_id = [fake.msisdn()for _ in range(num_unique_ccs)]

In [73]:
# num_unique_ccs=5
# cc_nums = [fake.credit_card_number() for _ in range(num_unique_ccs)]
# cc_types = [fake.credit_card_provider()for _ in range(num_unique_ccs)]
# num_trans_per_cc = np.ceil(np.random.exponential(scale=3, size=num_unique_ccs)).astype(np.int32)
# data = {
#     'card_no': list(itertools.chain.from_iterable([[cc_num]*num_trans for cc_num, num_trans in zip(cc_nums, num_trans_per_cc)])),
#     'card_type': list(itertools.chain.from_iterable([[card]*num_trans for card, num_trans in zip(cc_types, num_trans_per_cc)])),
# }
# print(cc_nums,'\n', cc_types,'\n', num_trans_per_cc)
# pd.DataFrame(data)

['5333409934527854', '213133594532469', '6011517270188698', '3598220511127041', '373929579836866'] 
 ['JCB 16 digit', 'VISA 16 digit', 'VISA 16 digit', 'Diners Club / Carte Blanche', 'Discover'] 
 [6 8 1 5 5]


Unnamed: 0,card_no,card_type
0,5333409934527854,JCB 16 digit
1,5333409934527854,JCB 16 digit
2,5333409934527854,JCB 16 digit
3,5333409934527854,JCB 16 digit
4,5333409934527854,JCB 16 digit
5,5333409934527854,JCB 16 digit
6,213133594532469,VISA 16 digit
7,213133594532469,VISA 16 digit
8,213133594532469,VISA 16 digit
9,213133594532469,VISA 16 digit


In [78]:
#[fake.phone_number()for _ in range(num_unique_ccs)]
# the notation in the end: x#### indicate a specific branch or department used within organizations or institutions.

['912-565-2636',
 '001-896-825-6972x9902',
 '(252)554-6472x5763',
 '+1-817-329-3451x6372',
 '612-860-6914x63297']

In [82]:
data = {
    'TransactionID': [fake.uuid4() for _ in range(sum(num_trans_per_cc))],
    'TransactionDT': [fake.date_time_between_dates(datetime_start=start_trans_date, datetime_end=end_trans_date) 
                      for _ in range(sum(num_trans_per_cc))],
    'card_no': list(itertools.chain.from_iterable([[cc_num]*num_trans for cc_num, num_trans in zip(cc_nums, num_trans_per_cc)])),
    'card_type': list(itertools.chain.from_iterable([[card]*num_trans for card, num_trans in zip(cc_types, num_trans_per_cc)])),
    'email_domain': [fake.ascii_email().split("@")[1] for _ in range(sum(num_trans_per_cc))],
    'ProductCD': np.random.choice(['45', 'AB', 'L', 'Y', 'T'], size=sum(num_trans_per_cc)),
    'TransactionAmt': np.abs(np.ceil(np.random.exponential(scale=10, size=sum(num_trans_per_cc))*100)).astype(np.int32),
}


In [84]:
transactions = pd.DataFrame(data)
transactions.head(20)

Unnamed: 0,TransactionID,TransactionDT,card_no,card_type,email_domain,ProductCD,TransactionAmt
0,0d6305fa-d1d2-4afc-98c9-07058cbe1b46,2012-01-17 01:14:56,5333409934527854,JCB 16 digit,cuevas.com,Y,1318
1,b965d446-289c-4999-8643-365e12e042d4,2012-02-06 01:36:35,5333409934527854,JCB 16 digit,anderson.info,L,49
2,11e9d362-50bf-4d6d-99ba-f65c2fcf5ebd,2012-03-03 02:13:00,5333409934527854,JCB 16 digit,holloway.com,Y,832
3,418f9ec0-d465-4e28-a81f-64804af5717d,2012-02-26 09:22:17,5333409934527854,JCB 16 digit,hopkins.com,AB,573
4,a27dba3f-b9aa-411a-a6ab-857f04895bed,2012-02-26 15:33:39,5333409934527854,JCB 16 digit,mcdaniel.com,Y,1470
5,c68af0f9-3dea-4512-bc0e-f0d38bf43fb4,2012-03-06 04:58:41,5333409934527854,JCB 16 digit,hotmail.com,45,1073
6,3729233a-7034-4c3c-b4e4-7793e29ab3a1,2012-02-03 09:02:35,213133594532469,VISA 16 digit,barker.com,Y,3043
7,b06c9da4-96ad-4eb1-9b97-781f41da33c2,2012-01-25 08:54:18,213133594532469,VISA 16 digit,gonzales.info,T,1127
8,558e0d42-cbaf-4aa7-922e-c5f1344bdcf4,2012-02-13 01:31:40,213133594532469,VISA 16 digit,wong.com,AB,3380
9,efab6866-9641-4470-9a6f-f833d62d9172,2012-01-20 14:54:32,213133594532469,VISA 16 digit,hotmail.com,45,2060


In [85]:
transactions = pd.DataFrame(data).sort_values(by=['TransactionDT'])
transactions.head(5)

Unnamed: 0,TransactionID,TransactionDT,card_no,card_type,email_domain,ProductCD,TransactionAmt
11,145d7027-f3be-4112-8164-996ab12117ad,2012-01-16 04:00:42,213133594532469,VISA 16 digit,powers.com,L,655
0,0d6305fa-d1d2-4afc-98c9-07058cbe1b46,2012-01-17 01:14:56,5333409934527854,JCB 16 digit,cuevas.com,Y,1318
13,f95fee70-9721-47ad-bfbd-2d337385c57d,2012-01-19 07:37:12,213133594532469,VISA 16 digit,yahoo.com,45,322
20,dec7f416-4010-4382-9ee2-53d8b584ab5c,2012-01-20 03:24:42,373929579836866,Discover,yahoo.com,Y,112
9,efab6866-9641-4470-9a6f-f833d62d9172,2012-01-20 14:54:32,213133594532469,VISA 16 digit,hotmail.com,45,2060


In [46]:
identity_transactions_idx = np.random.choice(transactions.shape[0], size=int(transactions.shape[0]*1.0), replace=False)
id_data = {
    'IpAddress': list(itertools.chain.from_iterable([[ipv4]*num_trans for ipv4, num_trans in zip(cc_ipv4, num_trans_per_cc)])),
    'PhoneNo' : list(itertools.chain.from_iterable([[phone_num]*num_trans for phone_num, num_trans in zip(cc_phone_number, num_trans_per_cc)])),
    'DeviceID': list(itertools.chain.from_iterable([[device_id]*num_trans for device_id, num_trans in zip(cc_device_id, num_trans_per_cc)])),
}
identity = pd.DataFrame(id_data)
identity["TransactionID"] = transactions.TransactionID
assert identity.shape[0] == transactions.shape[0]

identity = identity.loc[identity_transactions_idx]
identity.reset_index(drop=True, inplace=True)
identity = identity[["TransactionID", "IpAddress", "PhoneNo", "DeviceID"]]

In [47]:
identity.head(5)

Unnamed: 0,TransactionID,IpAddress,PhoneNo,DeviceID
0,71d58b45-4f6a-4605-badb-969c924544a0,209.188.252.150,+1-861-832-8029x51571,574553513526
1,dff26dab-c64a-440d-98aa-112a33b82848,121.64.77.189,(000)074-0237,8949188452208
2,240962d0-5940-4217-af69-05547d6802e2,66.137.103.97,(234)051-3687x220,5660104100156
3,e8a07810-cd3b-4836-9224-5f971e1edbc5,121.127.219.91,+1-500-135-9467x79371,642016005317
4,bcd2a3b1-adf9-4a8a-bdbb-56b0e25654fd,167.28.147.7,(950)578-6717x8951,1944467055799


In [52]:
full_identity_df = transactions_df.merge(identity_df, on='TransactionID', how='left')

# drop transcations time column as it is not useful for constructing graph.
#full_identity_df.drop(["TransactionDT"], axis=1, inplace=True)

In [53]:
full_identity_df.head(5)

Unnamed: 0,TransactionID,TransactionDT,card_no,card_type,email_domain,ProductCD,TransactionAmt,isFraud,IpAddress,PhoneNo,DeviceID
0,9c90c7e2-2600-4628-a868-179287eee169,2012-01-15 00:00:20,30545481171260,JCB 16 digit,smith-henry.info,T,1198,0,121.92.230.58,+1-038-395-3162x55186,7163249193818
1,02951e68-f8ff-4f00-b515-5df8bcb0edcb,2012-01-15 00:00:44,6549624810102543,VISA 13 digit,yahoo.com,Y,409,0,118.244.38.209,001-923-541-7445x63231,3807588936118
2,17071ccd-3c3d-476a-a3cd-b73638a803d4,2012-01-15 00:02:18,4792410992636220,JCB 15 digit,hotmail.com,Y,1101,0,66.25.190.213,955-326-1614,9673128747953
3,5288f832-6673-473a-80f6-fdb98dd99278,2012-01-15 00:03:01,3573503852773765,VISA 16 digit,hotmail.com,Y,2228,0,42.93.10.194,+1-339-892-9034x139,9965894358908
4,e37a6dd1-182e-4b87-8b8e-7dd9bf78cea0,2012-01-15 00:03:24,3597910599495184,Diners Club / Carte Blanche,gmail.com,L,2393,0,154.168.168.7,278.713.5654,7523612045351


### Signs of Credit Card Fraud for Merchants

#### Suspicious identity
1. Use of likely false information (for example, fake phone numbers and email addresses like asdkf12495@freemail.example.com).

2. Inconsistencies in customer details across multiple purchases (for example, using the same e-mail address but a different name for another payment).

3. If an IP address shows a purchase being made from different country than the card holder’s address the legitimacy of the transaction may be questionable. 

4. Same address, different cards: Someone uses 3 cards across 5 orders with the same billing and shipping address. This is a common fraud tactic in electronic equipment retailers.

5. One card, multiple shipping addresses: Someone buys a bunch of consecutive items to different addresses — especially if they are in different states and are to personal addresses instead of business addresses.

6. Same IP address, different cards: You’re a B2C and the same computer is ordering a series of products from you with different cards to the same address.

7. Multiple orders, similar card numbers: An order comes through where the first 12 digits are the same and the last four are switched just slightly.


#### Suspicious behavior:
1. Communication that doesn’t sounds quite right. Fraudsters often use a canned response that is sent to multiple sellers using common phrases. If any communication appears scripted, use a search engine (putting the short phrase in quotes) to see if it’s been used elsewhere (for example, this particular phrase has been used many times).

2. Unusually large orders (for example, multiples of the same item, only your most expensive merchandise, expensive items or total order amount that seems inconsistent with normal customer behavior).

3. Any requests to:
    * Split a large order into multiple payments across different cards that don’t share the same verified billing address information.   
    * Process a payment manually, either through the Dashboard or your store. Fraudsters may make this request in order to have the charge run with your local IP address instead of their own.   
    * Charge a card more than the required amount (known as an “overcharge”) and pay out a third-party (for example, driver, shipper or freight company) using a different payment method (for example, cash, money order).   
    * Charge a card and then provide a refund outside the card network (for example, check, wire transfer).


### The rules used for the fraud labels below:

* the product type is "45"-- 90% possibility it's fraud.
* the device id is ending with "16", "78", "23" -- 10%
* the email is not a common one:
    1. the payment of the order is over 3000 -- 80%
    2. specital card type
    3. some certain ip address

In [None]:
is_fraud = []
for idx, row in full_df.iterrows():
    card_no, card_type, email, product_type, transcation_amount, ip_address, phone_no, device_id = str(row["card_no"]), row["card_type"], row["email_domain"], row["ProductCD"], row["TransactionAmt"], str(row["IpAddress"]), str(row["PhoneNo"]), str(row["DeviceID"])

    if email in ["hotmail.com", "gmail.com", "yahoo.com"]:
        if product_type in ["45"]:
            is_fraud.append(int(np.random.uniform() < 0.9))
        else:
            if (device_id != "nan") and (device_id.endswith("16") or device_id.endswith("78") or device_id.endswith("23")):
                is_fraud.append(int(np.random.uniform() < 0.1))
            else:
                is_fraud.append(int(np.random.uniform() < 0.05))
    else:
        if transcation_amount > 3000:
            is_fraud.append(int(np.random.uniform() < 0.8))
        else:
            if card_type in ["Diners Club / Carte Blanche", "JCB 15 digit", "Maestro"]: # about 35,000 observations are in this categires
                if (card_no.endswith("001") or card_no.endswith("002") or card_no.endswith("003") or card_no.endswith("004") or card_no.endswith("005") or card_no.endswith("007") or card_no.endswith("008") or card_no.endswith("009")) or ((phone_no != "nan") and (phone_no.endswith(".227") or phone_no.endswith(".104") or phone_no.endswith(".251") or phone_no.endswith(".181"))): 
                    is_fraud.append(int(np.random.uniform() < 0.3))
                else:
                    if (ip_address != "nan") and (ip_address.endswith(".227") or ip_address.endswith(".104") or ip_address.endswith(".251") or ip_address.endswith(".181")):
                        is_fraud.append(int(np.random.uniform() < 0.2))
                    else:
                        is_fraud.append(int(np.random.uniform() < 0.1))
            else:
                is_fraud.append(int(np.random.uniform() < 0.0001))
print("fraud ratio", sum(is_fraud)/ len(is_fraud))

transactions['isFraud'] = is_fraud

### DEF generating data

In [103]:
from faker import Faker
import datetime
import itertools
import numpy as np
import pandas as pd

Faker.seed(0)
np.random.seed(0)

NUM_UNIQUE_CCS = 40*10**3
START_TRANS_DATE = datetime.datetime(2012, 1, 15)
END_TRANS_DATE = datetime.datetime(2012, 3, 15)

def gen_fraud_data(num_unique_ccs=NUM_UNIQUE_CCS, start_trans_date=START_TRANS_DATE, end_trans_date=END_TRANS_DATE):
    fake = Faker()
    cc_nums = [fake.credit_card_number() for _ in range(num_unique_ccs)]
    cc_types = [fake.credit_card_provider()for _ in range(num_unique_ccs)]
    num_trans_per_cc = np.ceil(np.random.exponential(scale=3, size=num_unique_ccs)).astype(np.int32)
    cc_ipv4 = [fake.ipv4() for _ in range(num_unique_ccs)]
    cc_phone_number = [fake.phone_number()for _ in range(num_unique_ccs)]
    cc_device_id = [fake.msisdn()for _ in range(num_unique_ccs)]

    data = {
        'TransactionID': [fake.uuid4() for _ in range(sum(num_trans_per_cc))],
        'TransactionDT': [fake.date_time_between_dates(datetime_start=start_trans_date, datetime_end=end_trans_date) 
                          for _ in range(sum(num_trans_per_cc))],
        'card_no': list(itertools.chain.from_iterable([[cc_num]*num_trans for cc_num, num_trans in zip(cc_nums, num_trans_per_cc)])),
        'card_type': list(itertools.chain.from_iterable([[card]*num_trans for card, num_trans in zip(cc_types, num_trans_per_cc)])),
        'email_domain': [fake.ascii_email().split("@")[1] for _ in range(sum(num_trans_per_cc))],
        'ProductCD': np.random.choice(['45', 'AB', 'L', 'Y', 'T'], size=sum(num_trans_per_cc)),
        'TransactionAmt': np.abs(np.ceil(np.random.exponential(scale=10, size=sum(num_trans_per_cc))*100)).astype(np.int32),
    }
    transactions = pd.DataFrame(data).sort_values(by=['TransactionDT'])
    
    # if you want to make the # of observations in the identity table less than that in the transactions table which may be more realistic in a practical scenario, change the size argument below.
    identity_transactions_idx = np.random.choice(transactions.shape[0], size=int(transactions.shape[0]*1.0), replace=False)
    id_data = {
        'IpAddress': list(itertools.chain.from_iterable([[ipv4]*num_trans for ipv4, num_trans in zip(cc_ipv4, num_trans_per_cc)])),
        'PhoneNo' : list(itertools.chain.from_iterable([[phone_num]*num_trans for phone_num, num_trans in zip(cc_phone_number, num_trans_per_cc)])),
        'DeviceID': list(itertools.chain.from_iterable([[device_id]*num_trans for device_id, num_trans in zip(cc_device_id, num_trans_per_cc)])),
    }
    identity = pd.DataFrame(id_data)
    identity["TransactionID"] = transactions.TransactionID
    assert identity.shape[0] == transactions.shape[0]
    
    identity = identity.loc[identity_transactions_idx]
    identity.reset_index(drop=True, inplace=True)
    identity = identity[["TransactionID", "IpAddress", "PhoneNo", "DeviceID"]]
    #identity = pd.DataFrame(id_data) this code seems to be repeated.
    
    
    # join two tables for the convenience of generating label column 'isFraud'
    full_two_df = transactions[["TransactionID", "card_no", "card_type", "email_domain", "ProductCD", "TransactionAmt"]].merge(identity, on='TransactionID', how='left')

    is_fraud = []
    for idx, row in full_two_df.iterrows():
        card_no, card_type, email, product_type, transcation_amount, ip_address, phone_no, device_id = str(row["card_no"]), row["card_type"], row["email_domain"], row["ProductCD"], row["TransactionAmt"], str(row["IpAddress"]), str(row["PhoneNo"]), str(row["DeviceID"])
        
        if email in ["hotmail.com", "gmail.com", "yahoo.com"]:
            if product_type in ["45"]:
                is_fraud.append(int(np.random.uniform() < 0.9))
            else:
                if (device_id != "nan") and (device_id.endswith("16") or device_id.endswith("78") or device_id.endswith("23")):
                    is_fraud.append(int(np.random.uniform() < 0.1))
                else:
                    is_fraud.append(int(np.random.uniform() < 0.05))
        else:
            if transcation_amount > 3000:
                is_fraud.append(int(np.random.uniform() < 0.8))
            else:
                if card_type in ["Diners Club / Carte Blanche", "JCB 15 digit", "Maestro"]: # about 35,000 observations are in this categires
                    if (card_no.endswith("001") or card_no.endswith("002") or card_no.endswith("003") or card_no.endswith("004") or card_no.endswith("005") or card_no.endswith("007") or card_no.endswith("008") or card_no.endswith("009")) or ((phone_no != "nan") and (phone_no.endswith(".227") or phone_no.endswith(".104") or phone_no.endswith(".251") or phone_no.endswith(".181"))): 
                        is_fraud.append(int(np.random.uniform() < 0.3))
                    else:
                        if (ip_address != "nan") and (ip_address.endswith(".227") or ip_address.endswith(".104") or ip_address.endswith(".251") or ip_address.endswith(".181")):
                            is_fraud.append(int(np.random.uniform() < 0.2))
                        else:
                            is_fraud.append(int(np.random.uniform() < 0.1))
                else:
                    is_fraud.append(int(np.random.uniform() < 0.0001))
    print("fraud ratio", sum(is_fraud)/ len(is_fraud))
    
    transactions['isFraud'] = is_fraud
    return transactions, identity

if __name__ == '__main__':
    transaction, identity = gen_fraud_data()
    transaction.to_csv('transaction.csv', index=False)
    identity.to_csv('identity.csv', index=False)


fraud ratio 0.14167740465732165


In [104]:
import os
import pandas as pd

#raw_data_dir = "input_raw_data"
#transactions_df = pd.read_csv(os.path.join(raw_data_dir, "transaction.csv"))

transactions_df = pd.read_csv("transaction.csv")
identity_df = pd.read_csv("identity.csv")

In [105]:
full_identity_df = transactions_df.merge(identity_df, on='TransactionID', how='left')

# drop transcations time column as it is not useful for constructing graph.
#full_identity_df.drop(["TransactionDT"], axis=1, inplace=True)

# Re-arange the order of column names for better visualization
full_identity_df = full_identity_df[
    ["TransactionID", "TransactionDT", "card_no", "card_type", "email_domain", "IpAddress", "PhoneNo", "DeviceID", "ProductCD", "TransactionAmt", "isFraud"]
]
full_identity_df.head(5)

Unnamed: 0,TransactionID,TransactionDT,card_no,card_type,email_domain,IpAddress,PhoneNo,DeviceID,ProductCD,TransactionAmt,isFraud
0,9c90c7e2-2600-4628-a868-179287eee169,2012-01-15 00:00:20,30545481171260,JCB 16 digit,smith-henry.info,121.92.230.58,+1-038-395-3162x55186,7163249193818,T,1198,0
1,02951e68-f8ff-4f00-b515-5df8bcb0edcb,2012-01-15 00:00:44,6549624810102543,VISA 13 digit,yahoo.com,118.244.38.209,001-923-541-7445x63231,3807588936118,Y,409,0
2,17071ccd-3c3d-476a-a3cd-b73638a803d4,2012-01-15 00:02:18,4792410992636220,JCB 15 digit,hotmail.com,66.25.190.213,955-326-1614,9673128747953,Y,1101,0
3,5288f832-6673-473a-80f6-fdb98dd99278,2012-01-15 00:03:01,3573503852773765,VISA 16 digit,hotmail.com,42.93.10.194,+1-339-892-9034x139,9965894358908,Y,2228,0
4,e37a6dd1-182e-4b87-8b8e-7dd9bf78cea0,2012-01-15 00:03:24,3597910599495184,Diners Club / Carte Blanche,gmail.com,154.168.168.7,278.713.5654,7523612045351,L,2393,0


In [106]:
print('data shape',full_df.shape)
print("Percentage of fraud transactions for all data: {}".format(sum(full_df['isFraud'])/len(full_df)*100))

data shape (140467, 9)
Percentage of fraud transactions for all data: 17.889611083030175


In [86]:
import argparse
import logging
import os

import pandas as pd
import numpy as np
from itertools import combinations

In [87]:
def get_relations_and_edgelist(transactions_df, identity_df, transactions_id_cols, output_dir):
    # Get relations
    edge_types = transactions_id_cols.split(",") + list(identity_df.columns)
    logging.info("Found the following distinct relation types: {}".format(edge_types))
    id_cols = ['TransactionID'] + transactions_id_cols.split(",")
    full_identity_df = transactions_df[id_cols].merge(identity_df, on='TransactionID', how='left')
    logging.info("Shape of identity columns: {}".format(full_identity_df.shape))

    # extract edges
    edges = {}
    for etype in edge_types:
        edgelist = full_identity_df[['TransactionID', etype]].dropna()
        edgelist.to_csv(os.path.join(output_dir, 'relation_{}_edgelist.csv').format(etype), index=False, header=True)
        logging.info("Wrote edgelist to: {}".format(os.path.join(output_dir, 'relation_{}_edgelist.csv').format(etype)))
        edges[etype] = edgelist
    return edges


def create_homogeneous_edgelist(edges, output_dir):
    homogeneous_edges = []
    for etype, relations in edges.items():
        for edge_relation, frame in relations.groupby(etype):
            new_edges = [(a, b) for (a, b) in combinations(frame.TransactionID.values, 2)
                         if (a, b) not in homogeneous_edges and (b, a) not in homogeneous_edges]
            homogeneous_edges.extend(new_edges)

    with open(os.path.join(output_dir, 'homogeneous_edgelist.csv'), 'w') as f:
        f.writelines(map(lambda x: "{}, {}\n".format(x[0], x[1]), homogeneous_edges))
    logging.info("Wrote homogeneous edgelist to file: {}".format(os.path.join(output_dir, 'homogeneous_edgelist.csv')))


usage: ipykernel_launcher.py [-h] [--data-dir DATA_DIR] [--output-dir OUTPUT_DIR] [--transactions TRANSACTIONS]
                             [--identity IDENTITY] [--id-cols ID_COLS] [--cat-cols CAT_COLS]
                             [--cat-cols-xgboost CAT_COLS_XGBOOST] [--train-data-ratio TRAIN_DATA_RATIO]
                             [--valid-data-ratio VALID_DATA_RATIO] [--construct-homogeneous]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\Ashley He\AppData\Roaming\jupyter\runtime\kernel-824f6db3-8a1a-42c8-81b8-06dbe0d3c1d8.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [123]:
# extract out transactions for train, validation, and test data
train_data_ratio = 0.7 # fraction of data to use in training set
valid_data_ratio = 0.2 # fraction of data to use in validation set
n_train = int(transactions_df.shape[0]*train_data_ratio)
n_valid = int(transactions_df.shape[0]*(train_data_ratio+valid_data_ratio))
train_ids = transactions_df.TransactionID.values[:n_train]
valid_ids = transactions_df.TransactionID.values[n_train:n_valid]
test_ids = transactions_df.TransactionID.values[n_valid:]
print("Training, validation, and test data fraction are {}, {}, and {:.1f}, respectively".format(train_data_ratio, valid_data_ratio, 1-train_data_ratio-valid_data_ratio))

Training, validation, and test data fraction are 0.7, 0.2, and 0.1, respectively


In [108]:
get_fraud_frac = lambda series: 100 * sum(series)/len(series)
print("Percentage of fraud transactions for train data: {}".format(get_fraud_frac(transactions_df.isFraud[:n_train])))
print("Percentage of fraud transactions for validation data: {}".format(get_fraud_frac(transactions_df.isFraud[n_train:n_valid])))
print("Percentage of fraud transactions for test data: {}".format(get_fraud_frac(transactions_df.isFraud[n_valid:])))
print("Percentage of fraud transactions for all data: {}".format(get_fraud_frac(transactions_df.isFraud)))

Percentage of fraud transactions for train data: 14.226145678660782
Percentage of fraud transactions for validation data: 13.942478821100591
Percentage of fraud transactions for test data: 14.209439738022354
Percentage of fraud transactions for all data: 14.167740465732164


In [111]:
transactions_id_cols = 'card_no,card_type,email_domain'
transactions_cat_cols = 'ProductCD'
transactions_cat_cols_xgboost = 'card_type,ProductCD'

In [118]:
# Get features
non_feature_cols = ['isFraud', 'TransactionDT'] + transactions_id_cols.split(",")
feature_cols = [col for col in transactions_df.columns if col not in non_feature_cols]
print("Categorical columns: {}".format(transactions_cat_cols.split(",")))

# Performing numerical encoding for categorical variables.
features = pd.get_dummies(transactions_df[feature_cols], columns=transactions_cat_cols.split(",")).fillna(0)

# Performing logarithmic transformation for transaction amount.
features['TransactionAmt'] = features['TransactionAmt'].apply(np.log10)

display(features)
print("Transformed feature columns: {}".format(list(features.columns)))
print("Shape of features: {}".format(features.shape))
#features.to_csv(os.path.join(output_dir, 'features.csv'), index=False, header=False)
#print("Wrote features to file: {}".format(os.path.join(output_dir, 'features.csv')))


print("\nProcessing feature columns for XGBoost.")
cat_cols_xgb = transactions_cat_cols_xgboost.split(",")
print("Categorical feature columns for XGBoost: {}".format(cat_cols_xgb))
print("Numerical feature column for XGBoost: 'TransactionAmt'")

# Performing numerical encoding for categorical variables
features_xgb = pd.get_dummies(transactions_df[['TransactionID']+cat_cols_xgb], columns=cat_cols_xgb).fillna(0)

# Performing logarithmic transformation for transaction amount.
features_xgb['TransactionAmt'] = features['TransactionAmt']
display(features_xgb)
# features_xgb.to_csv(os.path.join(output_dir, 'features_xgboost.csv'), index=False, header=False)
# logging.info("Wrote features to file: {}".format(os.path.join(output_dir, 'features_xgboost.csv')))

Categorical columns: ['ProductCD']


Unnamed: 0,TransactionID,TransactionAmt,ProductCD_45,ProductCD_AB,ProductCD_L,ProductCD_T,ProductCD_Y
0,9c90c7e2-2600-4628-a868-179287eee169,3.078457,0,0,0,1,0
1,02951e68-f8ff-4f00-b515-5df8bcb0edcb,2.611723,0,0,0,0,1
2,17071ccd-3c3d-476a-a3cd-b73638a803d4,3.041787,0,0,0,0,1
3,5288f832-6673-473a-80f6-fdb98dd99278,3.347915,0,0,0,0,1
4,e37a6dd1-182e-4b87-8b8e-7dd9bf78cea0,3.378943,0,0,1,0,0
...,...,...,...,...,...,...,...
140462,5413b45c-a381-4de7-865d-faf2f264ca58,1.462398,0,0,1,0,0
140463,b14c6e19-ea16-4234-bb82-2105e15416f2,3.526339,0,0,0,1,0
140464,855be236-7e2a-4fd7-a86a-3de47d6ad799,2.836324,0,1,0,0,0
140465,78ac5313-ede6-43cb-84ed-1748e27cd4c8,2.835691,0,0,0,0,1


Transformed feature columns: ['TransactionID', 'TransactionAmt', 'ProductCD_45', 'ProductCD_AB', 'ProductCD_L', 'ProductCD_T', 'ProductCD_Y']
Shape of features: (140467, 7)

Processing feature columns for XGBoost.
Categorical feature columns for XGBoost: ['card_type', 'ProductCD']
Numerical feature column for XGBoost: 'TransactionAmt'


Unnamed: 0,TransactionID,card_type_American Express,card_type_Diners Club / Carte Blanche,card_type_Discover,card_type_JCB 15 digit,card_type_JCB 16 digit,card_type_Maestro,card_type_Mastercard,card_type_VISA 13 digit,card_type_VISA 16 digit,card_type_VISA 19 digit,ProductCD_45,ProductCD_AB,ProductCD_L,ProductCD_T,ProductCD_Y,TransactionAmt
0,9c90c7e2-2600-4628-a868-179287eee169,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,3.078457
1,02951e68-f8ff-4f00-b515-5df8bcb0edcb,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,2.611723
2,17071ccd-3c3d-476a-a3cd-b73638a803d4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,3.041787
3,5288f832-6673-473a-80f6-fdb98dd99278,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,3.347915
4,e37a6dd1-182e-4b87-8b8e-7dd9bf78cea0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,3.378943
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140462,5413b45c-a381-4de7-865d-faf2f264ca58,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1.462398
140463,b14c6e19-ea16-4234-bb82-2105e15416f2,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,3.526339
140464,855be236-7e2a-4fd7-a86a-3de47d6ad799,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,2.836324
140465,78ac5313-ede6-43cb-84ed-1748e27cd4c8,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,2.835691


In [142]:
X_train = features_xgb.copy()
X_train = X_train.set_index('TransactionID')
y_train = transactions_df[['TransactionID', 'isFraud']].copy()
y_train = y_train.set_index('TransactionID')
cols = list(X_train.columns)
display(X_train, y_train,cols)

Unnamed: 0_level_0,card_type_American Express,card_type_Diners Club / Carte Blanche,card_type_Discover,card_type_JCB 15 digit,card_type_JCB 16 digit,card_type_Maestro,card_type_Mastercard,card_type_VISA 13 digit,card_type_VISA 16 digit,card_type_VISA 19 digit,ProductCD_45,ProductCD_AB,ProductCD_L,ProductCD_T,ProductCD_Y,TransactionAmt
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
9c90c7e2-2600-4628-a868-179287eee169,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,3.078457
02951e68-f8ff-4f00-b515-5df8bcb0edcb,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,2.611723
17071ccd-3c3d-476a-a3cd-b73638a803d4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,3.041787
5288f832-6673-473a-80f6-fdb98dd99278,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,3.347915
e37a6dd1-182e-4b87-8b8e-7dd9bf78cea0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,3.378943
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5413b45c-a381-4de7-865d-faf2f264ca58,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1.462398
b14c6e19-ea16-4234-bb82-2105e15416f2,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,3.526339
855be236-7e2a-4fd7-a86a-3de47d6ad799,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,2.836324
78ac5313-ede6-43cb-84ed-1748e27cd4c8,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,2.835691


Unnamed: 0_level_0,isFraud
TransactionID,Unnamed: 1_level_1
9c90c7e2-2600-4628-a868-179287eee169,0
02951e68-f8ff-4f00-b515-5df8bcb0edcb,0
17071ccd-3c3d-476a-a3cd-b73638a803d4,0
5288f832-6673-473a-80f6-fdb98dd99278,0
e37a6dd1-182e-4b87-8b8e-7dd9bf78cea0,0
...,...
5413b45c-a381-4de7-865d-faf2f264ca58,0
b14c6e19-ea16-4234-bb82-2105e15416f2,0
855be236-7e2a-4fd7-a86a-3de47d6ad799,0
78ac5313-ede6-43cb-84ed-1748e27cd4c8,0


['card_type_American Express',
 'card_type_Diners Club / Carte Blanche',
 'card_type_Discover',
 'card_type_JCB 15 digit',
 'card_type_JCB 16 digit',
 'card_type_Maestro',
 'card_type_Mastercard',
 'card_type_VISA 13 digit',
 'card_type_VISA 16 digit',
 'card_type_VISA 19 digit',
 'ProductCD_45',
 'ProductCD_AB',
 'ProductCD_L',
 'ProductCD_T',
 'ProductCD_Y',
 'TransactionAmt']

In [143]:
idxT = X_train.index[:n_train]
idxV = X_train.index[n_train:n_valid]

In [163]:
from sklearn.pipeline import Pipeline
from xgboost.sklearn import XGBClassifier
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector

In [170]:
import xgboost as xgb
print("XGBoost version:", xgb.__version__)

if True:
    clf = xgb.XGBClassifier( 
        n_estimators=2000,
        max_depth=12, 
        learning_rate=0.02, 
        subsample=0.8,
        colsample_bytree=0.4, 
        missing=-1, 
        eval_metric='auc',
        # USE CPU
        #nthread=4,
        #tree_method='hist' 
        # USE GPU
        tree_method='gpu_hist' 
    )
    h = clf.fit(X_train.loc[idxT,cols], y_train.loc[idxT], 
        eval_set=[(X_train.loc[idxV,cols],y_train.loc[idxV])],
        verbose=20, early_stopping_rounds=200)

XGBoost version: 1.7.5
[0]	validation_0-auc:0.78743




[20]	validation_0-auc:0.84619
[40]	validation_0-auc:0.85254
[60]	validation_0-auc:0.85310
[80]	validation_0-auc:0.85352
[100]	validation_0-auc:0.85331
[120]	validation_0-auc:0.85306
[140]	validation_0-auc:0.85310
[160]	validation_0-auc:0.85303
[180]	validation_0-auc:0.85333
[200]	validation_0-auc:0.85323
[220]	validation_0-auc:0.85307
[240]	validation_0-auc:0.85312
[260]	validation_0-auc:0.85334
[280]	validation_0-auc:0.85363
[300]	validation_0-auc:0.85351
[320]	validation_0-auc:0.85320
[340]	validation_0-auc:0.85300
[360]	validation_0-auc:0.85302
[380]	validation_0-auc:0.85311
[400]	validation_0-auc:0.85294
[420]	validation_0-auc:0.85307
[440]	validation_0-auc:0.85313
[460]	validation_0-auc:0.85311
[480]	validation_0-auc:0.85310
[490]	validation_0-auc:0.85293


In [168]:
print("Best Iteration: {}".format(clf.best_iteration))

Best Iteration: 291


In [151]:
clf.best_iteration

291

In [157]:
clf_best = xgb.XGBClassifier(
    n_estimators=clf.best_iteration,
    max_depth=12, 
        learning_rate=0.02, 
        subsample=0.8,
        colsample_bytree=0.4, 
        missing=-1, 
        eval_metric='auc',
        # USE CPU
        #nthread=4,
        #tree_method='hist' 
        # USE GPU
        tree_method='gpu_hist'
    )

In [162]:
a = clf_best.fit(X_train.loc[idxT,cols], y_train.loc[idxT], 
        eval_set=[(X_train.loc[idxV,cols],y_train.loc[idxV])],verbose=clf.best_iteration)

[0]	validation_0-auc:0.78743
[290]	validation_0-auc:0.85362
