In [21]:
import pandas as pd

df = pd.read_parquet("transactions_clean.parquet")

# Datetime
df["date"] = pd.to_datetime(df["date"])

# ID columns
df["client_id"]   = df["client_id"].astype("int32")
df["card_id"]     = df["card_id"].astype("int32")
df["merchant_id"] = df["merchant_id"].astype("int32")
df["mcc"]         = df["mcc"].astype("int16")

# Amount
df["amount"] = df["amount"].astype("float32")

# Categorical features
for c in ["use_chip", "merchant_city", "merchant_state", "zip"]:
    df[c] = df[c].astype("category")

# Error flags
for c in [
    "has_error",
    "err_card_credential",
    "err_authentication",
    "err_financial",
    "err_system"
]:
    df[c] = df[c].astype("int8")

# Target
df["fraud"] = df["fraud"].astype("int8")

df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8851561 entries, 0 to 8851560
Data columns (total 16 columns):
 #   Column               Dtype         
---  ------               -----         
 0   date                 datetime64[ns]
 1   client_id            int32         
 2   card_id              int32         
 3   amount               float32       
 4   use_chip             category      
 5   merchant_id          int32         
 6   merchant_city        category      
 7   merchant_state       category      
 8   zip                  category      
 9   mcc                  int16         
 10  fraud                int8          
 11  has_error            int8          
 12  err_card_credential  int8          
 13  err_authentication   int8          
 14  err_financial        int8          
 15  err_system           int8          
dtypes: category(4), datetime64[ns](1), float32(1), int16(1), int32(3), int8(6)
memory usage: 323.8 MB


In [22]:
df

Unnamed: 0,date,client_id,card_id,amount,use_chip,merchant_id,merchant_city,merchant_state,zip,mcc,fraud,has_error,err_card_credential,err_authentication,err_financial,err_system
0,2010-01-01 00:01:00,1556,2972,-77.000000,Swipe Transaction,59935,Beulah,ND,58523.0,5499,0,0,0,0,0,0
1,2010-01-01 00:02:00,561,4575,14.570000,Swipe Transaction,67570,Bettendorf,IA,52722.0,5311,0,0,0,0,0,0
2,2010-01-01 00:02:00,1129,102,80.000000,Swipe Transaction,27092,Vista,CA,92084.0,4829,0,0,0,0,0,0
3,2010-01-01 00:06:00,848,3915,46.410000,Swipe Transaction,13051,Harwood,MD,20776.0,5813,0,0,0,0,0,0
4,2010-01-01 00:07:00,1807,165,4.810000,Swipe Transaction,20519,Bronx,NY,10464.0,5942,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8851556,2019-10-31 23:54:00,672,5001,12.930000,Chip Transaction,47508,Cosby,TN,37722.0,7230,0,0,0,0,0,0
8851557,2019-10-31 23:54:00,1384,3723,67.010002,Chip Transaction,58136,Williamson,GA,30292.0,5812,0,0,0,0,0,0
8851558,2019-10-31 23:56:00,1718,2379,1.110000,Chip Transaction,86438,West Covina,CA,91792.0,5499,0,0,0,0,0,0
8851559,2019-10-31 23:56:00,1766,2066,12.800000,Online Transaction,39261,ONLINE,ONLINE,ONLINE,5815,0,0,0,0,0,0


In [23]:
card = pd.read_csv("cards_data.csv")
user = pd.read_csv("users_data.csv")

In [24]:
print("[Transactions Data]")
for col in df.columns:
    print("column: ", col)
    print("values: ", df[col].unique)

print("[Card Data]")
for col in card.columns:
    print("column: ", col)
    print("values: ", card[col].unique)

print("[User Data]")
for col in user.columns:
    print("column: ", col)
    print("values: ", user[col].unique)


[Transactions Data]
column:  date
values:  <bound method Series.unique of 0         2010-01-01 00:01:00
1         2010-01-01 00:02:00
2         2010-01-01 00:02:00
3         2010-01-01 00:06:00
4         2010-01-01 00:07:00
                  ...        
8851556   2019-10-31 23:54:00
8851557   2019-10-31 23:54:00
8851558   2019-10-31 23:56:00
8851559   2019-10-31 23:56:00
8851560   2019-10-31 23:57:00
Name: date, Length: 8851561, dtype: datetime64[ns]>
column:  client_id
values:  <bound method Series.unique of 0          1556
1           561
2          1129
3           848
4          1807
           ... 
8851556     672
8851557    1384
8851558    1718
8851559    1766
8851560     199
Name: client_id, Length: 8851561, dtype: int32>
column:  card_id
values:  <bound method Series.unique of 0          2972
1          4575
2           102
3          3915
4           165
           ... 
8851556    5001
8851557    3723
8851558    2379
8851559    2066
8851560    1031
Name: card_id, Length: 88515

In [25]:
card

Unnamed: 0,id,client_id,card_brand,card_type,card_number,expires,cvv,has_chip,num_cards_issued,credit_limit,acct_open_date,year_pin_last_changed,card_on_dark_web
0,4524,825,Visa,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
1,2731,825,Visa,Debit,4956965974959986,12/2020,393,YES,2,$21968,04/2014,2014,No
2,3701,825,Visa,Debit,4582313478255491,02/2024,719,YES,2,$46414,07/2003,2004,No
3,42,825,Visa,Credit,4879494103069057,08/2024,693,NO,1,$12400,01/2003,2012,No
4,4659,825,Mastercard,Debit (Prepaid),5722874738736011,03/2009,75,YES,1,$28,09/2008,2009,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6141,5361,185,Amex,Credit,300609782832003,01/2024,663,YES,1,$6900,11/2000,2013,No
6142,2711,185,Visa,Credit,4718517475996018,01/2021,492,YES,2,$5700,04/2012,2012,No
6143,1305,1007,Mastercard,Credit,5929512204765914,08/2020,237,NO,2,$9200,02/2012,2012,No
6144,743,1110,Mastercard,Debit,5589768928167462,01/2020,630,YES,1,$28074,01/2020,2020,No


In [26]:
user

Unnamed: 0,id,current_age,retirement_age,birth_year,birth_month,gender,address,latitude,longitude,per_capita_income,yearly_income,total_debt,credit_score,num_credit_cards
0,825,53,66,1966,11,Female,462 Rose Lane,34.15,-117.76,$29278,$59696,$127613,787,5
1,1746,53,68,1966,12,Female,3606 Federal Boulevard,40.76,-73.74,$37891,$77254,$191349,701,5
2,1718,81,67,1938,11,Female,766 Third Drive,34.02,-117.89,$22681,$33483,$196,698,5
3,708,63,63,1957,1,Female,3 Madison Street,40.71,-73.99,$163145,$249925,$202328,722,4
4,1164,43,70,1976,9,Male,9620 Valley Stream Drive,37.76,-122.44,$53797,$109687,$183855,675,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,986,32,70,1987,7,Male,6577 Lexington Lane,40.65,-73.58,$23550,$48010,$87837,703,3
1996,1944,62,65,1957,11,Female,2 Elm Drive,38.95,-84.54,$24218,$49378,$104480,740,4
1997,185,47,67,1973,1,Female,276 Fifth Boulevard,40.66,-74.19,$15175,$30942,$71066,779,3
1998,1007,66,60,1954,2,Male,259 Valley Boulevard,40.24,-76.92,$25336,$54654,$27241,618,1


In [27]:
card = card.sort_values(by="client_id", ascending=True)
df = df.sort_values(by="client_id", ascending=True)
user = user.sort_values(by="id", ascending=True)

In [28]:
card_id = card["client_id"].unique()
trans_id = df["client_id"].unique()
user_id = user["id"].unique()

In [29]:
common = set(card_id) & set(trans_id) & set(user_id)
print("len of common: ", len(common))
print("unique of common:", common)

len of common:  1219
unique of common: {np.int32(0), np.int32(1), np.int32(2), np.int32(3), np.int32(4), np.int32(5), np.int32(11), np.int32(13), np.int32(14), np.int32(16), np.int32(17), np.int32(19), np.int32(20), np.int32(21), np.int32(22), np.int32(24), np.int32(27), np.int32(28), np.int32(32), np.int32(33), np.int32(34), np.int32(37), np.int32(38), np.int32(39), np.int32(40), np.int32(42), np.int32(44), np.int32(45), np.int32(46), np.int32(47), np.int32(48), np.int32(50), np.int32(51), np.int32(52), np.int32(53), np.int32(55), np.int32(57), np.int32(58), np.int32(59), np.int32(60), np.int32(61), np.int32(62), np.int32(63), np.int32(64), np.int32(65), np.int32(68), np.int32(69), np.int32(73), np.int32(74), np.int32(75), np.int32(77), np.int32(78), np.int32(79), np.int32(80), np.int32(81), np.int32(84), np.int32(86), np.int32(87), np.int32(89), np.int32(90), np.int32(92), np.int32(94), np.int32(96), np.int32(98), np.int32(100), np.int32(103), np.int32(104), np.int32(106), np.int32(1

In [30]:
card_common = card[card["client_id"].isin(common)]
trans_common = df[df["client_id"].isin(common)]
user_common = user[user["id"].isin(common)]
print("nunique check:", card_common["client_id"].nunique(), trans_common["client_id"].nunique(), user_common["id"].nunique())
print("card shape:", card_common.shape)
print("trans shape:", trans_common.shape)
print("user shape:", user_common.shape)

nunique check: 1219 1219 1219
card shape: (4514, 13)
trans shape: (8851561, 16)
user shape: (1219, 14)


In [31]:
common_card_ids = (
    set(trans_common["card_id"].unique()) & set(card_common["id"].unique())
)
len(common_card_ids)

4070

In [32]:
card_common["id"].duplicated().any()

np.False_

In [33]:
trans_common["card_id"].nunique()

4070

In [34]:
card_common["id"].nunique()

4514

In [38]:
card_common = card_common[card_common["id"].isin(common_card_ids)]

In [39]:
card_common

Unnamed: 0,id,client_id,card_brand,card_type,card_number,expires,cvv,has_chip,num_cards_issued,credit_limit,acct_open_date,year_pin_last_changed,card_on_dark_web
2911,1271,0,Mastercard,Debit,5050211780967429,04/2021,316,YES,2,$31490,02/2011,2011,No
2910,4639,0,Mastercard,Credit,5802759460691737,12/2019,312,YES,1,$17600,09/2007,2014,No
3986,4652,1,Visa,Credit,4419924074647230,12/2021,1,NO,1,$12800,09/2007,2011,No
3985,3682,1,Visa,Credit,4417513283605637,04/2014,84,YES,1,$10900,07/2002,2013,No
3984,4400,1,Visa,Debit,4843491272960882,01/2017,625,YES,1,$18105,08/2012,2012,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,5106,1997,Mastercard,Debit,5138861544730253,09/2014,695,YES,2,$3991,10/2008,2013,No
1091,4807,1997,Mastercard,Debit,5974521557482725,08/2020,89,YES,1,$25048,09/2011,2011,No
4410,2160,1998,Visa,Debit (Prepaid),4969817527377037,11/2022,756,YES,1,$110,03/2007,2007,No
4412,1258,1998,Visa,Credit,4573665962578726,11/2020,452,YES,2,$100,02/2010,2011,No


In [41]:
card_common.to_csv("card_common.csv")
user_common.to_csv("user_common.csv")
trans_common.to_csv("trans_common.csv")