# Import Libraries

In [1]:
import pandas as pd
import os

# Import Datasets

In [22]:
# Switch the working directory to the dataset folder so that every file below
# can be opened by its bare file name.
os.chdir('/content/drive/MyDrive/Colab/Datasets/fraud_detection_sets')

# --- CSV sources -----------------------------------------------------------
df_transactions = pd.read_csv('transactions_data.csv')
df_users = pd.read_csv('users_data.csv')
df_cards = pd.read_csv('cards_data.csv')

# --- JSON source -----------------------------------------------------------
# The labels are stored as JSON, so they are parsed with read_json. The
# follow-up reset_index() moves the parsed index into an ordinary column named
# 'index' (presumably the record identifiers -- confirm against the file).
df_fraud_labels = pd.read_json('train_fraud_labels.json')
df_fraud_labels = df_fraud_labels.reset_index()

# --- Lookup table ----------------------------------------------------------
df_mcc_codes = pd.read_csv('mcc_codes.csv')

# Function to display dataset details
def display_df_info(df, name, n_rows=100):
    """Print a structural summary and the leading rows of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    name : str
        Label used in the printed section headings.
    n_rows : int, default 100
        Number of leading rows to print in the "Head" section. The default
        preserves the previous hard-coded value; pass a smaller number to keep
        the notebook output short.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print(f"\n{name} - Info:")
    # DataFrame.info() writes its report to stdout itself and returns None, so
    # it must not be wrapped in print() -- doing so emits a stray "None" line.
    df.info()
    print(f"\n{name} - Head:")
    print(df.head(n_rows))

# Summarise every loaded dataframe, in the same order they were loaded.
for _frame, _label in (
    (df_transactions, "Transactions"),
    (df_users, "Users"),
    (df_cards, "Cards"),
    (df_fraud_labels, "Fraud Labels"),
    (df_mcc_codes, "MCC Codes"),
):
    display_df_info(_frame, _label)


Transactions - Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2661183 entries, 0 to 2661182
Data columns (total 12 columns):
 #   Column          Dtype  
---  ------          -----  
 0   id              int64  
 1   date            object 
 2   client_id       int64  
 3   card_id         int64  
 4   amount          object 
 5   use_chip        object 
 6   merchant_id     int64  
 7   merchant_city   object 
 8   merchant_state  object 
 9   zip             float64
 10  mcc             int64  
 11  errors          object 
dtypes: float64(1), int64(5), object(6)
memory usage: 243.6+ MB
None

Transactions - Head:
         id                 date  client_id  card_id   amount  \
0   7565784  2010-01-24 09:24:00       1645     1047    $3.39   
1   7571127  2010-01-25 13:52:00       1448     5907   $75.62   
2   7499145  2010-01-07 05:45:00        327     5512   $48.88   
3   7566958  2010-01-24 13:29:00       1940     4218   $37.81   
4   7586425  2010-01-29 12:16:00       1654

# Data Preprocessing

In [28]:
# Check for missing values in all datasets
def check_missing_values(df):
    """Return the number of missing entries in each column of ``df``.

    The result is a Series indexed by column name whose values are the
    per-column counts of null cells.
    """
    null_mask = df.isna()  # boolean frame: True where a cell is missing
    per_column_counts = null_mask.sum()
    return per_column_counts

# Report the per-column missing-value counts of each dataset, one block per
# dataset, separated by a blank line.
for _frame in (df_transactions, df_users, df_cards, df_fraud_labels, df_mcc_codes):
    _null_counts = check_missing_values(_frame)
    print(f"\n{_null_counts}")

# NOTE: every step in this cell is written to be safe to re-run. The previous
# version raised KeyError (columns already dropped) and AttributeError (.str on
# an already-numeric column) when the cell was executed a second time.

# Drop duplicates for all datasets if any (each frame is modified in place)
for _frame in (df_transactions, df_users, df_cards, df_fraud_labels, df_mcc_codes):
    _frame.drop_duplicates(inplace=True)

# Drop merchant_state, zip and errors columns on transactions df.
# errors='ignore' skips columns that are no longer present, so a re-run of this
# cell does not fail after the columns have already been removed.
df_transactions.drop(
    columns=['merchant_state', 'zip', 'errors'], errors='ignore', inplace=True
)


def _dollars_to_float(series):
    """Convert a column of '$'-prefixed strings to float.

    A column that is already numeric is returned as float unchanged, which
    makes the conversion idempotent across repeated executions of the cell.
    """
    if pd.api.types.is_numeric_dtype(series):
        return series.astype(float)
    return series.str.lstrip('$').astype(float)


# Convert the amount column from string to numeric on transactions df
df_transactions['amount'] = _dollars_to_float(df_transactions['amount'])

# Convert the income and debt columns from string to numeric on users df
for _col in ('per_capita_income', 'yearly_income', 'total_debt'):
    df_users[_col] = _dollars_to_float(df_users[_col])

# Convert the credit_limit column from string to numeric on cards df
df_cards['credit_limit'] = _dollars_to_float(df_cards['credit_limit'])

# Display the cleaned cards frame as the cell result
df_cards


id               0
date             0
client_id        0
card_id          0
amount           0
use_chip         0
merchant_id      0
merchant_city    0
mcc              0
dtype: int64

id                   0
current_age          0
retirement_age       0
birth_year           0
birth_month          0
gender               0
address              0
latitude             0
longitude            0
per_capita_income    0
yearly_income        0
total_debt           0
credit_score         0
num_credit_cards     0
dtype: int64

id                       0
client_id                0
card_brand               0
card_type                0
card_number              0
expires                  0
cvv                      0
has_chip                 0
num_cards_issued         0
credit_limit             0
acct_open_date           0
year_pin_last_changed    0
card_on_dark_web         0
dtype: int64

index     0
target    0
dtype: int64

MCC            0
Description    0
dtype: int64


Unnamed: 0,id,client_id,card_brand,card_type,card_number,expires,cvv,has_chip,num_cards_issued,credit_limit,acct_open_date,year_pin_last_changed,card_on_dark_web
0,4524,825,Visa,Debit,4344676511950444,12/2022,623,YES,2,24295.0,09/2002,2008,No
1,2731,825,Visa,Debit,4956965974959986,12/2020,393,YES,2,21968.0,04/2014,2014,No
2,3701,825,Visa,Debit,4582313478255491,02/2024,719,YES,2,46414.0,07/2003,2004,No
3,42,825,Visa,Credit,4879494103069057,08/2024,693,NO,1,12400.0,01/2003,2012,No
4,4659,825,Mastercard,Debit (Prepaid),5722874738736011,03/2009,75,YES,1,28.0,09/2008,2009,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6141,5361,185,Amex,Credit,300609782832003,01/2024,663,YES,1,6900.0,11/2000,2013,No
6142,2711,185,Visa,Credit,4718517475996018,01/2021,492,YES,2,5700.0,04/2012,2012,No
6143,1305,1007,Mastercard,Credit,5929512204765914,08/2020,237,NO,2,9200.0,02/2012,2012,No
6144,743,1110,Mastercard,Debit,5589768928167462,01/2020,630,YES,1,28074.0,01/2020,2020,No
