In [1]:
import pandas as pd
import json

# --- 1. Load Data ---
# Read the three raw CSV tables (cards, transactions, users) in that order.
# NOTE(review): cards_data is not referenced in the cells visible here;
# confirm a later cell needs it before paying the load cost.
cards_data, transactions_data, users_data = [
    pd.read_csv(csv_path)
    for csv_path in ('cards_data.csv', 'transactions_data.csv', 'users_data.csv')
]

# Read mcc_codes.json (a JSON object iterated as key -> value pairs) and turn
# it into a two-column lookup table: 'mcc' and 'mcc_description'.
with open("mcc_codes.json", "r", encoding="utf-8") as f:
    mcc_dict = json.load(f)

# Each key is cast to int before being stored in the 'mcc' column; the value
# is stored unchanged as the description.
rows_mcc = [
    {"mcc": int(mcc_key), "mcc_description": mcc_text}
    for mcc_key, mcc_text in mcc_dict.items()
]
mcc_codes_df = pd.DataFrame(rows_mcc)

# Read train_fraud_labels.json and flatten its "target" mapping into a table
# with an integer 'id' column and a 0/1 'is_fraud' column.
with open("train_fraud_labels.json", "r", encoding="utf-8") as f:
    labels_dict = json.load(f)

inner_labels = labels_dict["target"]

# A label counts as fraud only when it equals "yes" (case-insensitive);
# every other value maps to 0.
rows_fraud = [
    {"id": int(txn_key), "is_fraud": int(answer.lower() == "yes")}
    for txn_key, answer in inner_labels.items()
]
fraud_labels_df = pd.DataFrame(rows_fraud)

# --- 2. Merge DataFrames ---
# Build the modelling table in one chain:
#   1. left-join MCC descriptions, so every transaction is kept;
#   2. left-join fraud labels, then set missing labels to 0 and store as int8;
#   3. inner-join users on client_id == users_data.id, which drops
#      transactions that have no matching user.
# Both sides of step 3 have an 'id' column: the transaction id keeps the name
# 'id', while the users' one is suffixed to 'id_user'.
# NOTE(review): step 2 labels every transaction absent from the labels file as
# non-fraud; confirm those rows are truly negatives rather than unlabeled.
merged = (
    transactions_data
    .merge(mcc_codes_df, on='mcc', how='left')
    .merge(fraud_labels_df, on='id', how='left')
    .assign(is_fraud=lambda frame: frame['is_fraud'].fillna(0).astype('int8'))
    .merge(
        users_data,
        left_on='client_id',
        right_on='id',
        how='inner',
        suffixes=('', '_user'),
    )
)

# --- 3. Reduce Dataset Size (as per original notebook's intent for training) ---
# Keep every fraud row and top up with randomly sampled non-fraud rows, aiming
# for `target_total` rows overall.
fraud_df = merged[merged['is_fraud'] == 1]
nonfraud_df = merged[merged['is_fraud'] == 0]

num_fraud = len(fraud_df)
target_total = 1_000_000

# DataFrame.sample (without replacement) raises ValueError when n is negative
# or larger than the population. Clamp the request so the cell still runs when
# there are fewer non-fraud rows than needed, or when the fraud rows alone
# already exceed the target; in those cases the result simply has a different
# size than target_total.
num_to_sample = max(0, min(target_total - num_fraud, len(nonfraud_df)))

# Fixed seed keeps the sampled subset reproducible across runs.
sampled_nonfraud = nonfraud_df.sample(n=num_to_sample, random_state=42)

# Concatenate all fraud records with the sampled non-fraud records.
reduced_df = pd.concat([fraud_df, sampled_nonfraud])

# Shuffle so fraud rows are not all grouped at the top, then renumber the index.
merged = reduced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# --- 4. Data Cleaning and Feature Engineering ---
# Monetary columns: strip '$' and ',' characters, then parse as numbers.
for money_col in ('amount', 'per_capita_income', 'yearly_income', 'total_debt'):
    # Cast to str first so the .str accessor works whatever the stored dtype is.
    stripped = merged[money_col].astype(str).str.replace(r'[\$,]', '', regex=True)
    # errors='coerce' turns anything that still fails to parse into NaN.
    merged[money_col] = pd.to_numeric(stripped, errors='coerce')

# Parse 'date' with an explicit format, then derive calendar features from it.
merged['date'] = pd.to_datetime(merged['date'], format='%Y-%m-%d %H:%M:%S')
txn_time = merged['date']

# Hour of the day.
merged['hour'] = txn_time.dt.hour

# Day of the week (0 = Monday, 6 = Sunday).
merged['weekday'] = txn_time.dt.weekday

# Weekend indicator: weekday 5 (Saturday) or 6 (Sunday) -> 1, otherwise 0.
merged['is_weekend'] = (merged['weekday'] >= 5).astype(int)

# Month of the year.
merged['month'] = txn_time.dt.month

# Night transaction indicator: 1 for hours before 6 AM or from 10 PM onward
# (hours 0-5, 22 and 23), otherwise 0.
# The upper bound is inclusive (>= 22): a strict `> 22` would flag only hour 23
# and miss transactions made between 10:00 PM and 10:59 PM.
# Computed as a vectorized boolean mask instead of a row-wise apply.
merged['is_night'] = ((merged['hour'] < 6) | (merged['hour'] >= 22)).astype(int)

# --- 5. Export Processed Dataset to CSV ---
# Write every column except the raw 'date' timestamp, whose information is
# carried by the derived hour/weekday/is_weekend/month/is_night features.
# Both identifier columns stay in the export for traceability: 'id' is the
# transaction id and 'id_user' is the users table's id (suffixed by the merge).
output_columns = [name for name in merged.columns if name != 'date']

export_frame = merged[output_columns]
export_frame.to_csv('processed_dataset.csv', index=False)

print("Dataset successfully processed and saved to 'processed_dataset.csv'")


Dataset successfully processed and saved to 'processed_dataset.csv'


In [5]:
# Inspect the columns of the in-memory frame. 'date' is still listed because
# only the exported CSV omits it; `merged` itself keeps the column.
merged.columns

Index(['id', 'date', 'client_id', 'card_id', 'amount', 'use_chip',
       'merchant_id', 'merchant_city', 'merchant_state', 'zip', 'mcc',
       'errors', 'mcc_description', 'is_fraud', 'id_user', 'current_age',
       'retirement_age', 'birth_year', 'birth_month', 'gender', 'address',
       'latitude', 'longitude', 'per_capita_income', 'yearly_income',
       'total_debt', 'credit_score', 'num_credit_cards', 'hour', 'weekday',
       'is_weekend', 'month', 'is_night'],
      dtype='object')